In [None]:
!python3 -m pip install --upgrade pip
!python3 -m pip install --upgrade setuptools

In [None]:
# Hardware switch for the whole notebook. It is a *string*, not a bool: the
# install, data-loading and training cells all test `isGpuEnabled == 'false'`
# to select the CPU path, and any other value selects the GPU path.
isGpuEnabled = 'false'

In [None]:
if isGpuEnabled == 'false':
    # CPU Install
    !python3 -m pip install -U pip
    !python3 -m pip install -U setuptools wheel
    !python3 -m pip install "mxnet<2.0.0, >=1.7.0"
    !python3 -m pip install autogluon
    !pip install "scikit-learn-intelex<2021.3" // speeds up KNN models on CPU
else:
    # GPU Install
    !python3 -m pip install -U pip
    !python3 -m pip install -U setuptools wheel

    # Here we assume CUDA 10.1 is installed.  You should change the number
    # according to your own CUDA version (e.g. mxnet_cu100 for CUDA 10.0).
    !python3 -m pip install "mxnet_cu111<2.0.0, >=1.7.0"
    !python3 -m pip install autogluon

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pprint
import random
from autogluon.tabular import TabularPredictor
import mxnet as mx

# Seed NumPy, the stdlib `random` module and MXNet (in that order) with the
# same fixed value so repeated runs draw the same random numbers.
for seed_rng in (np.random.seed, random.seed, mx.random.seed):
    seed_rng(123)

In [None]:
test_url = "https://sagemaker-us-east-1-249959045939.s3.amazonaws.com/suicide-transformer/suicide-classification/suicide-classify-trial/data/test/test.csv"
train_url = "https://sagemaker-us-east-1-249959045939.s3.amazonaws.com/suicide-transformer/suicide-classification/suicide-classify-trial/data/training/train.csv"
validate_url = "https://sagemaker-us-east-1-249959045939.s3.amazonaws.com/suicide-transformer/suicide-classification/suicide-classify-trial/data/validation/validation.csv"
!wget $test_url $train_url $validate_url  -P data/

In [None]:
# Load the three CSV splits, optionally subsample them for CPU runs, and keep
# only the columns the model needs.
subsample_size = 2000  # for quick demo, try setting to larger values
feature_columns = ['text']
label = 'class'

train_df = pd.read_csv('data/train.csv')
dev_df = pd.read_csv('data/validation.csv')
test_df = pd.read_csv('data/test.csv')

if isGpuEnabled == 'false':
    # On CPU, work on a fixed-seed random subsample of each split.  The size
    # is clamped to the split length because DataFrame.sample raises
    # ValueError when asked for more rows than exist (replace=False).
    train_df = train_df.sample(min(subsample_size, len(train_df)), random_state=123)
    dev_df = dev_df.sample(min(subsample_size, len(dev_df)), random_state=123)
    test_df = test_df.sample(min(subsample_size, len(test_df)), random_state=123)

# Train/dev keep the label column; the test split keeps the features only.
train_df = train_df[feature_columns + [label]]
dev_df = dev_df[feature_columns + [label]]
test_df = test_df[feature_columns]
print('Number of training samples:', len(train_df))
print('Number of dev samples:', len(dev_df))
print('Number of test samples:', len(test_df))

In [None]:
from autogluon.tabular import TabularPredictor

# Everything the predictor writes goes under this folder.
outputFolder = 'output'
predictor = TabularPredictor(label=label, path=outputFolder, verbosity=3)

# CPU runs call fit() with no extra arguments; GPU runs request the
# 'multimodal' hyperparameter preset with 5-fold bagging and one stack level.
# (Left disabled, as before:  %env AUTOGLUON_TEXT_TRAIN_WITHOUT_GPU=1)
if isGpuEnabled == 'false':
    fit_options = {}
else:
    fit_options = dict(hyperparameters='multimodal', num_bag_folds=5, num_stack_levels=1)
predictor.fit(train_df, **fit_options)

In [None]:
# Per-column count of missing values in the dev split; the bare name on the
# last line makes the notebook display the result.
null_count = dev_df.isna().sum()
null_count

In [None]:
# Display the predictor's leaderboard computed on dev_df (the split loaded
# from data/validation.csv); as the cell's last expression it is rendered
# as the cell output.
predictor.leaderboard(dev_df)

In [None]:
import os
import json

# Score the fitted predictor on the dev split, then predict dev and test.
dev_metric_score = predictor.evaluate(dev_df)
dev_predictions = predictor.predict(dev_df, as_pandas=True)
test_predictions = predictor.predict(test_df, as_pandas=True)

# Write both prediction sets as CSV files inside the output folder.
for frame, filename in (
    (dev_predictions, 'dev_prediction.csv'),
    (test_predictions, 'test_prediction.csv'),
):
    frame.to_csv(os.path.join(outputFolder, filename))

# Store the dev-set evaluation result next to the predictions as JSON.
scores_path = os.path.join(outputFolder, 'final_model_scores.json')
with open(scores_path, 'w') as scores_file:
    json.dump(dev_metric_score, scores_file)