In [None]:
# Refresh the apt package index, then install the system packages this notebook
# relies on: git, libsndfile1, cmake, and the cuDNN 7 / NCCL 2 libraries pinned
# to their CUDA 10.1 builds.
!apt-get update --yes && apt-get install --yes \
    git \
    libsndfile1 \
    cmake \
    libcudnn7=7.6.5.32-1+cuda10.1 \
    libnccl2=2.7.8-1+cuda10.1 \
    libnccl-dev=2.7.8-1+cuda10.1

In [None]:
# Bring pip itself up to date before installing anything else.
!pip install -U pip

In [None]:
# Download the test / train / validation CSV splits into data/.
# All three files live under the same S3 prefix, so it is spelled out once.
DATA_BASE_URL = "https://sagemaker-us-east-1-249959045939.s3.amazonaws.com/suicide-transformer/suicide-classification/suicide-classify-trial/data"
test_url = f"{DATA_BASE_URL}/test/test.csv"
train_url = f"{DATA_BASE_URL}/training/train.csv"
validate_url = f"{DATA_BASE_URL}/validation/validation.csv"
# -nc (no-clobber) skips files that already exist, so re-running this cell does
# not leave numbered duplicates such as data/test.csv.1 behind.
!wget -nc $test_url $train_url $validate_url -P data/

In [None]:
# Shallow-clone the Ludwig sources. The clone is skipped when the ludwig/
# directory already exists, so the cell can be re-run without git failing on
# a non-empty destination path.
!test -d ludwig || git clone --depth=1 https://github.com/ludwig-ai/ludwig.git

In [None]:
# Remove any installed Ludwig package and drop its entries from pip's cache
# before it is installed again further down.
!pip uninstall --yes ludwig
!pip cache remove ludwig

In [None]:
# Install Ludwig from the cloned sources with the listed extras.
# The HOROVOD_* settings are written as environment prefixes of the pip command
# itself (no `&&` between them and `pip`): a bare `VAR=value && pip ...` only
# sets unexported shell variables, which pip and the builds it spawns never see.
!cd ludwig/ \
    && HOROVOD_GPU_OPERATIONS=NCCL \
       HOROVOD_WITH_TENSORFLOW=1 \
       HOROVOD_WITHOUT_MPI=1 \
       HOROVOD_WITHOUT_PYTORCH=1 \
       HOROVOD_WITHOUT_MXNET=1 \
       pip install --no-cache-dir '.[text,audio,image,hyperopt,serve,viz]'
# Alternative kept for reference: install every extra at once.
# !cd ludwig && pip install --no-cache-dir '.[full]'

In [None]:
# Uninstall horovod to get ludwig to work correctly in the notebook, then
# install/upgrade Ludwig with its text extra.
# The requirement is quoted (as in the source-install cell) so the shell cannot
# interpret `[text]` as a glob pattern.
!pip uninstall -y horovod
!pip install -U 'ludwig[text]'

In [None]:
# Install extra Python packages: numpy and pandas are imported by train.py below.
# NOTE(review): petastorm is not referenced anywhere in this notebook — presumably
# a Ludwig backend dependency; confirm it is still needed.
!pip install numpy pandas petastorm

In [None]:
# Install the Dask requirement set shipped inside the cloned Ludwig repository.
!pip install --requirement ludwig/requirements_dask.txt

In [None]:
# Sanity check: show the installed packages whose name contains "tensorflow".
!pip list | grep tensorflow

In [None]:
%%writefile train.py
import argparse
import json
import numpy as np
import pandas as pd
import os
import logging

def read_csv(input_dir):
    """Read every CSV file found directly inside ``input_dir`` into one DataFrame.

    Files are matched on a ``.csv`` extension (case-insensitive) and read in
    sorted path order, so the row order of the result is reproducible across
    runs and file systems. The combined frame gets a fresh 0..n-1 index
    instead of repeating each file's own row labels.

    Args:
        input_dir: Path of a directory containing one or more CSV files.

    Returns:
        A pandas DataFrame with the rows of all CSV files concatenated.

    Raises:
        ValueError: If the directory contains no CSV files.
    """
    # os.listdir returns entries in arbitrary order; sort for determinism.
    input_files = sorted(
        os.path.join(input_dir, name)
        for name in os.listdir(input_dir)
        if name.lower().endswith('.csv')
    )
    if not input_files:
        raise ValueError(f'No csv files found in {input_dir}')
    df = pd.concat((pd.read_csv(path) for path in input_files), ignore_index=True)
    print(f'Loaded {len(input_files)} files from {input_dir}, shape: {df.shape}')
    return df
    
def _load_dataset(path):
    """Load a dataset from either a single CSV file or a directory of CSV files.

    The SM_CHANNEL_* values may point at one file (as in this notebook) or at a
    channel directory; directories are delegated to ``read_csv``.
    """
    if os.path.isdir(path):
        return read_csv(path)
    return pd.read_csv(path)


def train(args):
    """Train a Ludwig model, save it, print final metrics and evaluate on the test set.

    Args:
        args: Namespace with the attributes ``training_set``, ``validation_set``,
            ``testing_set`` (CSV file or directory of CSV files), ``model_dir``,
            ``output_data_dir``, ``config``, ``experiment_name``, ``model_name``
            and ``integration`` ('comet', 'wandb' or None).

    Returns:
        The trained ``LudwigModel`` instance.
    """
    # output directories
    print(f'input train: {args.training_set}, val: {args.validation_set}, test: {args.testing_set}')
    print(f'output model: {args.model_dir}, data: {args.output_data_dir}')

    # configure integrations https://ludwig-ai.github.io/ludwig-docs/user_guide/#integrations
    # Best effort on purpose: a missing/unsupported integration must not abort training.
    try:
        import ludwig.contrib
        if args.integration == 'comet':
            ludwig.contrib.use_contrib('comet')
            print('using comet integration')
        elif args.integration == 'wandb':
            ludwig.contrib.use_contrib('wandb')
            print('using wandb integration')
    except Exception as e:
        print('integration not supported: {}'.format(e))

    # import ludwig after contrib in case we need to hook TF prior to loading
    from ludwig.api import LudwigModel

    # loading csv dataframes (each path may be a file or a directory of CSVs)
    train_df = _load_dataset(args.training_set)
    val_df = _load_dataset(args.validation_set)
    test_df = _load_dataset(args.testing_set)

    # train the model based on config yaml file
    ludwig_model = LudwigModel(args.config, logging_level=logging.DEBUG, gpus=0)
    train_stats, _, _ = ludwig_model.train(
        experiment_name=args.experiment_name,
        model_name=args.model_name,
        training_set=train_df,
        validation_set=val_df,
        test_set=test_df,
        output_directory=args.output_data_dir,  # Save experiment to output data dir
        skip_save_training_statistics=False,  # Save training results to file
        skip_save_log=False,  # Save tensorboard logs
        skip_save_progress=False,
    )

    print('saving model')

    # Save the latest model to model_directory
    ludwig_model.save(args.model_dir)

    # Save the compiled SavedModel to model directory
    ludwig_model.save_savedmodel(args.model_dir)

    print('emmiting metrics')

    # Enumerate the channels and output features and print each metric's
    # last-epoch value. The statistics are read from the channel being
    # reported, so validation/test lines carry their own values rather than
    # repeating the training ones.
    output_names = [feature['name'] for feature in ludwig_model.config['output_features']]
    for channel in train_stats:
        channel_stats = train_stats[channel]
        for output_name in output_names:
            feature_stats = channel_stats.get(output_name, {})
            for metric in feature_stats:
                values = feature_stats[metric]
                if not values:
                    # nothing recorded for this metric on this channel
                    continue
                # get the metric from last epoch
                print('{}_{}={};'.format(channel, metric, values[-1]))

    print('evaluating test dataset')

    # output evaluations based on test
    ludwig_model.evaluate(
        test_df,
        output_directory=args.output_data_dir,
        skip_save_unprocessed_output=True,  # Only save CSV values
        skip_save_predictions=False,  # Write predictions to file
        skip_save_eval_stats=False,  # Write evaluation stats to file
        collect_predictions=True,
        collect_overall_stats=True,
    )

    # Return the model
    return ludwig_model

if __name__ == "__main__":
    # Command-line entry point: parse the arguments and run training.
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", type=str, default='config.yml')

    # Input channels and output directories default to the SM_* environment
    # variables. A variable that is not set no longer raises KeyError while the
    # parser is being built; instead the matching flag becomes required, so the
    # script also runs when every path is passed explicitly on the command line.
    env_backed_flags = (
        ("--training-set", "SM_CHANNEL_TRAINING"),
        ("--validation-set", "SM_CHANNEL_VALIDATION"),
        ("--testing-set", "SM_CHANNEL_TESTING"),
        ("--model-dir", "SM_MODEL_DIR"),
        ("--output-data-dir", "SM_OUTPUT_DATA_DIR"),
    )
    for flag, env_name in env_backed_flags:
        env_value = os.environ.get(env_name)
        parser.add_argument(
            flag,
            type=str,
            default=env_value,
            required=env_value is None,
            help=f"defaults to the {env_name} environment variable",
        )

    parser.add_argument("--experiment-name", type=str, default='api_experiment')
    parser.add_argument("--model-name", type=str, default='run')
    parser.add_argument("--integration", type=str, required=False)
    args = parser.parse_args()
    train(args)

In [None]:
%%writefile config.yml
# Ludwig model definition: one text input encoded with DistilBERT, one
# categorical output, and the training hyper-parameters.
input_features:
    -   name: text
        type: text
        level: word
        encoder: distilbert
        # `trainable` is an encoder option (fine-tune the pretrained weights);
        # it has no effect when placed in the `training` section.
        trainable: true
output_features:
    -   name: class
        type: category
training:
    epochs: 2
    batch_size: 16 # OOM for bert if we don't keep this small
    learning_rate: 0.00001
    decay: true

In [None]:
# Set the SM_* environment variables that train.py reads as defaults for its
# --training-set / --validation-set / --testing-set / --model-dir /
# --output-data-dir arguments, pointing them at the local files and directories.
%env SM_CHANNEL_TRAINING=data/train.csv
%env SM_CHANNEL_VALIDATION=data/validation.csv
%env SM_CHANNEL_TESTING=data/test.csv
%env SM_MODEL_DIR=model/
%env SM_OUTPUT_DATA_DIR=output/
# Run the training script with the config written above.
!python train.py --config config.yml --experiment-name=suicide-classification