### Import Ludwig

Install the Ludwig libraries for the `conda_python3` kernel

In [None]:
!pip install ludwig[text] # Will install tensorflow.1.14 and spacy

In [None]:
# Download spaCy's `en` model via the spaCy CLI.
# Presumably required by Ludwig's text preprocessing; confirm.
!python -m spacy download en

### Download Data

In [None]:
# Start from a clean local input directory, then fetch the CSVs from S3.
!rm -Rf input
!mkdir -p input/data/training input/data/test

# NOTE(review): both copies read the same S3 object (open-banking-test.csv), so
# train.csv and test.csv are identical and the test metrics below are measured
# on the training data. Confirm whether a separate training file was intended.
!aws s3 cp s3://open-banking-classificaiton-ap-southeast-2/open-banking-test.csv input/data/training/train.csv
!aws s3 cp s3://open-banking-classificaiton-ap-southeast-2/open-banking-test.csv input/data/test/test.csv

In [None]:
import glob
import os
import pandas as pd

def read_csv_dataframe(path, engine='python'):
    """Load every ``*.csv`` file directly inside ``path`` into one DataFrame.

    Args:
        path: Directory to search (non-recursively) for ``*.csv`` files.
        engine: Parser engine forwarded to ``pandas.read_csv``.

    Returns:
        A single DataFrame holding the rows of all matching files,
        concatenated in sorted path order with a fresh 0..n-1 index, or
        ``None`` when the directory contains no CSV files.
    """
    # glob yields paths in arbitrary, filesystem-dependent order; sorting makes
    # the row order of the result reproducible across runs and machines.
    files = sorted(glob.glob(os.path.join(path, '*.csv')))
    if not files:
        # Explicit "nothing to load" result; callers must check for None.
        return None
    frames = [pd.read_csv(fn, engine=engine) for fn in files]
    return pd.concat(frames, axis=0, ignore_index=True)

# Load the train/validation/test files
data_train_df = read_csv_dataframe('input/data/training')
data_validation_df = None  # no validation set is loaded; None is passed to training as-is
data_test_df = read_csv_dataframe('input/data/test')

# read_csv_dataframe returns None when a directory holds no CSV files (e.g. the
# S3 copy failed). Stop here with a clear message instead of failing later
# with an AttributeError on None.
if data_train_df is None:
    raise FileNotFoundError('No CSV files found in input/data/training')
if data_test_df is None:
    raise FileNotFoundError('No CSV files found in input/data/test')

data_train_df.head()

### Model Definition

Create a model definition file

In [None]:
%%writefile model_definition.yml
# Model definition passed to LudwigModel via model_definition_file in the
# training cell.
# `name` values presumably match column headers in the CSV data; confirm
# against the downloaded files.

# One text input, processed at word level with the parallel_cnn encoder.
input_features:
    -
        name: text
        type: text
        level: word
        encoder: parallel_cnn

# One categorical output: the label to predict.
output_features:
    -
        name: class
        type: category

### Train Model

Load the model definition and train

In [None]:
# Remove any previous run's results and recreate an empty output/model directory.
!rm -Rf output
!mkdir -p output/model

In [None]:
%%time

# Import ludwig library
import json
import ludwig
from ludwig.api import LudwigModel

print('ludwig: {}'.format(ludwig.__version__))

# Create model from definition
ludwig_model = LudwigModel(None, model_definition_file='model_definition.yml')
print('model definition', json.dumps(ludwig_model.model_definition))

print('training model...')
train_stats = ludwig_model.train(
    skip_save_log=True, # Don't save tensorboard
    skip_save_processed_input=True, # Don't save pre-processed input
    data_train_df=data_train_df,
    data_validation_df=data_validation_df,
    data_test_df=data_test_df,
    output_directory='output'
)

# TODO: Output stats for logging
print('train stats', json.dumps(train_stats))
        
# Save the ludwig model 
ludwig_model.save('output/model')
ludwig_model.close()    

Inspect the experiment output and model files

In [None]:
# Recursively list everything written under output/ by the training cell.
!ls -R output

### Test Model

Get predictions and stats for the test dataset

In [None]:
%%time
# Run the trained model on the test DataFrame; returns the predictions and the
# test statistics, both inspected in the following cells.
predictions, test_stats = ludwig_model.test(data_df=data_test_df)

In [None]:
# Preview the first rows of the predicted class and its probability columns.
predictions[['class_predictions', 'class_probability']].head()

In [None]:
# Overall statistics reported for the `class` output feature on the test data.
test_stats['class']['overall_stats']