In [None]:
import torch
# Display the name of the current CUDA device
# (presumably a check that a GPU runtime is attached — confirm).
torch.cuda.get_device_name()

In [None]:
!pip install simpletransformers

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the labelled paragraphs and show (rows, columns) of the frame.
food_df = pd.read_csv("food_training_df.csv")
food_df.shape

In [None]:
# Preview the first rows of the raw data.
food_df.head()

In [None]:
# Encode the label column numerically: 'Food Effect' -> 1.0, 'Non Food Effect' -> 0.0.
topic_codes = {'Food Effect': 1, 'Non Food Effect': 0}
food_df['Topic'] = food_df['Topic'].replace(topic_codes).astype(float)
# Number of rows for every (label, data source) combination.
food_df[['Topic', 'Data_Source']].value_counts()

##### Split into training and test sets

In [None]:
# Draw a seeded sample of 1200 paragraphs from each data source.
from_dailymed = food_df['Data_Source'] == 'DailyMed'
from_drugsfda = food_df['Data_Source'] == 'DrugsFDA'
dm_food_df = food_df.loc[from_dailymed].sample(n=1200, random_state=1234)
df_food_df = food_df.loc[from_drugsfda].sample(n=1200, random_state=1234)
# Label balance of each sample.
for sampled_df in (dm_food_df, df_food_df):
    print(sampled_df['Topic'].value_counts())

In [None]:
def prepare_data(df, frac=0.8, random_state=1234):
    """Split ``df`` into a random training part and the remaining test part.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split. Rows are told apart by index label, so the index
        should be unique.
    frac : float, default 0.8
        Fraction of rows sampled into the training part.
    random_state : int, default 1234
        Seed passed to ``DataFrame.sample`` so the split is repeatable.

    Returns
    -------
    (train_df, test_df) : tuple of pandas.DataFrame
        Both parts with their index reset to 0..n-1.

    Also prints ``"<n_train>,<n_test>"``.
    """
    train_df = df.sample(frac=frac, random_state=random_state)
    # Everything that was not sampled for training becomes the test part;
    # this must happen before the training index is reset.
    test_df = df.drop(train_df.index).reset_index(drop=True)
    train_df = train_df.reset_index(drop=True)
    print('{},{}'.format(len(train_df), len(test_df)))
    return train_df, test_df

In [None]:
# Split the DailyMed sample and show the label balance of both parts.
dm_train_df, dm_test_df = prepare_data(dm_food_df)
for dm_part in (dm_train_df, dm_test_df):
    print(dm_part['Topic'].value_counts())

In [None]:
# Split the DrugsFDA sample and show the label balance of both parts.
df_train_df, df_test_df = prepare_data(df_food_df)
for df_part in (df_train_df, df_test_df):
    print(df_part['Topic'].value_counts())

In [None]:
# Combine the DailyMed and DrugsFDA parts. Each half arrives with its own
# 0..n-1 index, so a plain concat would repeat every index label;
# ignore_index=True gives the combined frames a fresh, unique index.
dmdf_train_df = pd.concat([dm_train_df, df_train_df], ignore_index=True)
print(dmdf_train_df['Topic'].value_counts())
dmdf_test_df = pd.concat([dm_test_df, df_test_df], ignore_index=True)
print(dmdf_test_df['Topic'].value_counts())

In [None]:
from simpletransformers.classification import ClassificationModel
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, recall_score, precision_score

###### Set the arguments and train the model.

In [None]:
# Arguments passed unchanged to every ClassificationModel in this notebook.
train_args = {
   'output_dir': 'outputs/',
   'cache_dir': 'cache/',
   'max_seq_length': 100,
   'train_batch_size': 32,
   'eval_batch_size': 8,
   'gradient_accumulation_steps': 1,
   'num_train_epochs': 1,
   'weight_decay': 0,
   'learning_rate': 4e-5,
   'adam_epsilon': 1e-8,
   'warmup_ratio': 0.06,
   'warmup_steps': 0,
   'max_grad_norm': 1.0,
   'logging_steps': 50,
   'evaluate_during_training': False,
   'save_steps': 2000,
   'eval_all_checkpoints': True,
   'use_tensorboard': True,
   'overwrite_output_dir': True,
   'reprocess_input_data': True,
   # Seed training as well; the data sampling is already seeded with 1234,
   # but without this the fine-tuning results differ from run to run.
   'manual_seed': 1234,
}

In [None]:
def _text_and_label(df):
    """Return a new frame holding only Paragraph (lower-cased) and Topic.

    ``assign`` builds a fresh DataFrame, so the column is not written into a
    selection of the original frame (which raises SettingWithCopyWarning).
    """
    return df[['Paragraph', 'Topic']].assign(
        Paragraph=lambda part: part['Paragraph'].str.lower()
    )


# Combined (DailyMed + DrugsFDA) frames.
dmdf_train_df = _text_and_label(dmdf_train_df)
dmdf_test_df = _text_and_label(dmdf_test_df)
print(dmdf_train_df['Topic'].value_counts())
print(dmdf_test_df['Topic'].value_counts())

# DailyMed-only frames.
dm_train_df = _text_and_label(dm_train_df)
dm_test_df = _text_and_label(dm_test_df)
print(dm_train_df['Topic'].value_counts())
print(dm_test_df['Topic'].value_counts())

# DrugsFDA-only frames.
df_train_df = _text_and_label(df_train_df)
df_test_df = _text_and_label(df_test_df)
print(df_train_df['Topic'].value_counts())
print(df_test_df['Topic'].value_counts())

In [None]:
%%writefile setup.sh
# Install NVIDIA apex, which makes training with use_cuda much faster.
# (The %%writefile magic has to be the first line of the cell, so this note
# lives inside the script as a shell comment.)

git clone https://github.com/NVIDIA/apex
cd apex
pip install -v --no-cache-dir ./

In [None]:
# Execute setup.sh with sh.
!sh setup.sh

#### Define a simple function to calculate (using sklearn.metrics) evaluation measures.

In [None]:
def report_results(A, B):
    """Print classification metrics for candidate labels ``A`` against ground truth ``B``.

    Both arguments are named pandas Series. They are aligned on their index
    and rows where either value is missing are dropped before scoring.
    Accuracy, precision, recall, F1 and ROC AUC are printed with two decimals.

    Returns
    -------
    (precision, recall, f1) : tuple
    """
    candidate_name, truth_name = A.name, B.name

    # Pair the two series up and keep only rows where both values are present.
    paired = pd.DataFrame({'A': A, 'B': B}).dropna()
    y_pred, y_true = paired['A'], paired['B']

    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    ROC = roc_auc_score(y_true, y_pred)

    header = ''.join(['Candidate: ', candidate_name, ' | Ground Truth: ', truth_name, '\n'])
    print(header)
    summary = 'accuracy: %0.2f \nprecision: %0.2f \nrecall: %0.2f \nF1 score: %0.2f \nROC AUC: %0.2f \n'
    print(summary % (acc, prec, rec, f1, ROC))
    return prec, rec, f1
    

In [None]:
# Empty result tables, filled row by row (via .loc) in the cells below:
# one F1 value per row, and precision/recall/F1 per row respectively.
data_source_result_df = pd.DataFrame(columns=['F1'])
method_result_df = pd.DataFrame(columns=['Precision', 'Recall', 'F1'])

In [None]:
# bert-base-uncased: train on the combined training set, evaluate on the combined test set.
# Delete the runs, cache/, outputs/ and cache_dir/ paths left by any earlier run.
!rm -rf runs cache/ outputs/ cache_dir/
model_bert = ClassificationModel('bert', 'bert-base-uncased', args=train_args)
model_bert.train_model(dmdf_train_df)
result, model_outputs, wrong_predictions = model_bert.eval_model(dmdf_test_df, acc=accuracy_score)
# Predicted label = position of the largest value along axis 1
# (assumes model_outputs is (n_rows, n_classes) — TODO confirm).
dmdf_test_df['BERT_topic'] = np.argmax(model_outputs, axis = 1)
prec, rec, f1 = report_results(dmdf_test_df['BERT_topic'], dmdf_test_df['Topic'])
method_result_df.loc['bert-base-uncased'] = [prec, rec, f1]

In [None]:
# roberta-base: train on the combined training set, evaluate on the combined test set.
# Delete the runs, cache/, outputs/ and cache_dir/ paths left by any earlier run.
!rm -rf runs cache/ outputs/ cache_dir/
model_roberta = ClassificationModel('roberta', 'roberta-base', args=train_args)
model_roberta.train_model(dmdf_train_df)
result, model_outputs, wrong_predictions = model_roberta.eval_model(dmdf_test_df, acc=accuracy_score)
# Predicted label = position of the largest value along axis 1
# (assumes model_outputs is (n_rows, n_classes) — TODO confirm).
dmdf_test_df['roberta_topic'] = np.argmax(model_outputs, axis = 1)
prec, rec, f1 = report_results(dmdf_test_df['roberta_topic'], dmdf_test_df['Topic'])
method_result_df.loc['roberta-base'] = [prec, rec, f1]

In [None]:
!rm -rf runs cache/ outputs/ cache_dir/
model_distilbert = ClassificationModel('distilbert', 'distilbert-base-uncased', args=train_args)
model_distilbert.train_model(dmdf_train_df)
result, model_outputs, wrong_predictions = model_distilbert.eval_model(dmdf_test_df, acc=accuracy_score)
dmdf_test_df['distilbert_topic'] = np.argmax(model_outputs, axis = 1)
prec, rec, f1 = report_results(dmdf_test_df['distilbert_topic'], dmdf_test_df['Topic'])
data_source_result_df.loc['dm+df_dm+df'] = [f1]
method_result_df.loc['distilbert-base-uncased'] = [prec, rec, f1]

!rm -rf runs cache/ outputs/ cache_dir/
model_distilbert.train_model(dm_train_df)
result, model_outputs, wrong_predictions = model_distilbert.eval_model(dmdf_test_df, acc=accuracy_score)
dmdf_test_df['distilbert_topic'] = np.argmax(model_outputs, axis = 1)
prec, rec, f1 = prec, rec, f1 = report_results(dmdf_test_df['distilbert_topic'], dmdf_test_df['Topic'])
data_source_result_df.loc['dm_dm+df'] = [f1]

!rm -rf runs cache/ outputs/ cache_dir/
model_distilbert.train_model(df_train_df)
result, model_outputs, wrong_predictions = model_distilbert.eval_model(dmdf_test_df, acc=accuracy_score)
dmdf_test_df['distilbert_topic'] = np.argmax(model_outputs, axis = 1)
prec, rec, f1 = prec, rec, f1 = report_results(dmdf_test_df['distilbert_topic'], dmdf_test_df['Topic'])
data_source_result_df.loc['df_dm+df'] = [f1]


!rm -rf runs cache/ outputs/ cache_dir/
model_distilbert.train_model(dmdf_train_df)
result, model_outputs, wrong_predictions = model_distilbert.eval_model(dm_test_df, acc=accuracy_score)
dm_test_df['distilbert_topic'] = np.argmax(model_outputs, axis = 1)
prec, rec, f1 = report_results(dm_test_df['distilbert_topic'], dm_test_df['Topic'])
data_source_result_df.loc['dm+df_dm'] = [f1]

!rm -rf runs cache/ outputs/ cache_dir/
model_distilbert.train_model(dm_train_df)
result, model_outputs, wrong_predictions = model_distilbert.eval_model(dm_test_df, acc=accuracy_score)
dm_test_df['distilbert_topic'] = np.argmax(model_outputs, axis = 1)
prec, rec, f1 = report_results(dm_test_df['distilbert_topic'], dm_test_df['Topic'])
data_source_result_df.loc['dm_dm'] = [f1]

!rm -rf runs cache/ outputs/ cache_dir/
model_distilbert.train_model(df_train_df)
result, model_outputs, wrong_predictions = model_distilbert.eval_model(dm_test_df, acc=accuracy_score)
dm_test_df['distilbert_topic'] = np.argmax(model_outputs, axis = 1)
prec, rec, f1 = report_results(dm_test_df['distilbert_topic'], dm_test_df['Topic'])
data_source_result_df.loc['df_dm'] = [f1]


!rm -rf runs cache/ outputs/ cache_dir/
model_distilbert.train_model(dmdf_train_df)
result, model_outputs, wrong_predictions = model_distilbert.eval_model(df_test_df, acc=accuracy_score)
df_test_df['distilbert_topic'] = np.argmax(model_outputs, axis = 1)
prec, rec, f1 = report_results(df_test_df['distilbert_topic'], df_test_df['Topic'])
data_source_result_df.loc['dm+df_df'] = [f1]

!rm -rf runs cache/ outputs/ cache_dir/
model_distilbert.train_model(dm_train_df)
result, model_outputs, wrong_predictions = model_distilbert.eval_model(df_test_df, acc=accuracy_score)
df_test_df['distilbert_topic'] = np.argmax(model_outputs, axis = 1)
prec, rec, f1 = report_results(df_test_df['distilbert_topic'], df_test_df['Topic'])
data_source_result_df.loc['dm_df'] = [f1]

!rm -rf runs cache/ outputs/ cache_dir/
model_distilbert.train_model(df_train_df)
result, model_outputs, wrong_predictions = model_distilbert.eval_model(df_test_df, acc=accuracy_score)
df_test_df['distilbert_topic'] = np.argmax(model_outputs, axis = 1)
prec, rec, f1 = report_results(df_test_df['distilbert_topic'], df_test_df['Topic'])
data_source_result_df.loc['df_df'] = [f1]

In [None]:
# Write both result tables to CSV; index=True keeps the row labels
# assigned via .loc as the first column.
data_source_result_df.to_csv('data_source_result_df_bert.csv', index=True)
method_result_df.to_csv('method_result_df_bert.csv', index=True)