In [None]:
!pip3 install sklearn

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the labelled paragraphs and encode the topic label as a float:
# 'Food Effect' -> 1.0, 'Non Food Effect' -> 0.0.
food_df = pd.read_csv("food_training_df.csv").assign(
    Topic=lambda frame: frame['Topic']
    .replace({'Food Effect': 1, 'Non Food Effect': 0})
    .astype(float)
)
# Number of rows for every (label, data source) combination.
food_df.value_counts(subset=['Topic', 'Data_Source'])

In [None]:
# Draw a fixed-size, seeded sample of 1200 rows from each data source.
dm_food_df = food_df.loc[food_df['Data_Source'].eq('DailyMed')].sample(n=1200, random_state=1234)
df_food_df = food_df.loc[food_df['Data_Source'].eq('DrugsFDA')].sample(n=1200, random_state=1234)

In [None]:
# Class balance of the DailyMed sample.
print(dm_food_df.Topic.value_counts())

In [None]:
# Class balance of the DrugsFDA sample.
print(df_food_df.Topic.value_counts())

In [None]:
def prepare_data(df, frac=0.8, random_state=1234):
    """Split ``df`` into a randomly sampled training part and the remaining test part.

    Args:
        df: DataFrame to split. It is not modified.
        frac: Fraction of rows placed in the training part (default 0.8).
        random_state: Seed passed to ``DataFrame.sample`` so the split is
            reproducible (default 1234).

    Returns:
        ``(train_df, test_df)``; both are re-indexed 0..n-1. The sizes are
        also printed as ``'<n_train>,<n_test>'``.
    """
    # Relabel rows 0..n-1 before sampling: sample() picks rows by position,
    # but drop() removes rows by label, so duplicate labels in the incoming
    # index would otherwise remove unsampled rows from the test part as well.
    rows = df.reset_index(drop=True)
    train_df = rows.sample(frac=frac, random_state=random_state)
    test_df = rows.drop(train_df.index).reset_index(drop=True)
    train_df = train_df.reset_index(drop=True)
    print('{},{}'.format(len(train_df), len(test_df)))
    return train_df, test_df

In [None]:
# Split the DailyMed sample and show the class balance of each part.
dm_train_df, dm_test_df = prepare_data(df=dm_food_df)
print(dm_train_df.Topic.value_counts())
print(dm_test_df.Topic.value_counts())

In [None]:
# Split the DrugsFDA sample and show the class balance of each part.
df_train_df, df_test_df = prepare_data(df=df_food_df)
print(df_train_df.Topic.value_counts())
print(df_test_df.Topic.value_counts())

In [None]:
# Pool the DailyMed and DrugsFDA splits into combined train/test frames.
# ignore_index=True renumbers the rows 0..n-1: both inputs were re-indexed
# from 0 by prepare_data, so without it every row label would appear twice.
dmdf_train_df = pd.concat([dm_train_df, df_train_df], ignore_index=True)
print(dmdf_train_df['Topic'].value_counts())
dmdf_test_df = pd.concat([dm_test_df, df_test_df], ignore_index=True)
print(dmdf_test_df['Topic'].value_counts())

In [None]:
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, recall_score, precision_score
def report_results(A, B):
    """Print classification metrics for predictions ``A`` against labels ``B``.

    Pairs where either value is missing are dropped before scoring.

    Args:
        A: Candidate (predicted) labels.
        B: Ground-truth labels, aligned with ``A``.

    Returns:
        ``(precision, recall, f1)``. Accuracy and ROC AUC are printed only.
    """
    # Pair the two inputs so that dropna() removes incomplete pairs together.
    paired = pd.DataFrame({'A': A, 'B': B}).dropna()
    y_pred = paired['A']
    y_true = paired['B']

    # scikit-learn metrics take the ground truth first, then the predictions.
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    ROC = roc_auc_score(y_true, y_pred)

    print('accuracy: %0.4f \nprecision: %0.4f \nrecall: %0.4f \nF1 score: %0.4f \nROC AUC: %0.4f \n' % (acc, prec, rec, f1, ROC))
    return prec, rec, f1

In [None]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
    
def perform_model(train_df, test_df):
    """Train three text classifiers on ``train_df`` and score them on ``test_df``.

    The 'Paragraph' column of both frames is vectorised with a TF-IDF model
    fitted on the training paragraphs only; the 'Topic' column is the label.
    Logistic regression, a linear SVC and a random forest are fitted in that
    order. For each one the metrics from ``report_results`` and the number of
    misclassified test rows are printed.

    Args:
        train_df: Training frame with 'Paragraph' and 'Topic' columns.
        test_df: Test frame with 'Paragraph' and 'Topic' columns. It is not
            modified; predictions are written to an internal copy.

    Returns:
        A 9-tuple ``(lr_prec, lr_rec, lr_f1, ls_prec, ls_rec, ls_f1,
        rf_prec, rf_rec, rf_f1)``.
    """
    # Work on a private copy so the caller's frame (which is reused across
    # several runs) does not gain or carry over a 'Prediction' column.
    test_df = test_df.copy()

    tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), max_features=100)
    x_train = tfidf.fit_transform(train_df['Paragraph'])
    y_train = train_df['Topic']
    x_test = tfidf.transform(test_df['Paragraph'])

    # (printed header, estimator) in reporting order. LinearSVC is seeded like
    # the random forest so repeated runs give the same scores.
    classifiers = [
        ('Logistic Regression\n', LogisticRegression()),
        ('\nLinear SVC\n', LinearSVC(random_state=100)),
        ('\nRandom Forest\n', RandomForestClassifier(random_state=100)),
    ]

    scores = []
    for header, model in classifiers:
        model.fit(x_train, y_train)
        test_df['Prediction'] = model.predict(x_test)
        print(header)
        scores.extend(report_results(test_df['Prediction'], test_df['Topic']))
        incorrect = test_df[test_df['Prediction'] != test_df['Topic']]
        print('incorrect: ' + str(len(incorrect)))

    return tuple(scores)

In [None]:
# Empty result tables, filled one labelled row at a time by the cells below:
# per-classifier scores, and F1 per train/test data-source combination.
method_result_df = pd.DataFrame(columns=['Precision', 'Recall', 'F1'])
data_source_result_df = pd.DataFrame(columns=['F1'])

In [None]:
# Train and test on the pooled data; this run also supplies the
# per-classifier comparison table.
(lr_prec, lr_rec, lr_f1,
 ls_prec, ls_rec, ls_f1,
 rf_prec, rf_rec, rf_f1) = perform_model(dmdf_train_df, dmdf_test_df)

method_result_df.loc['Logistic Regression'] = [lr_prec, lr_rec, lr_f1]
method_result_df.loc['Linear SVC'] = [ls_prec, ls_rec, ls_f1]
method_result_df.loc['Random Forest'] = [rf_prec, rf_rec, rf_f1]
# Only the random-forest F1 goes into the data-source comparison.
data_source_result_df.loc['dm+df_dm+df'] = [rf_f1]

In [None]:
lr_prec, lr_rec, lr_f1, ls_prec, ls_rec, ls_f1, rf_prec, rf_rec, rf_f1 = perform_model(dm_train_df, dmdf_test_df)
data_source_result_df.loc['dm_dm+df'] = [rf_f1]

In [None]:
lr_prec, lr_rec, lr_f1, ls_prec, ls_rec, ls_f1, rf_prec, rf_rec, rf_f1 = perform_model(df_train_df, dmdf_test_df)
data_source_result_df.loc['df_dm+df'] = [rf_f1]

In [None]:
lr_prec, lr_rec, lr_f1, ls_prec, ls_rec, ls_f1, rf_prec, rf_rec, rf_f1 = perform_model(dmdf_train_df, dm_test_df)
data_source_result_df.loc['dm+df_dm'] = [rf_f1]

In [None]:
lr_prec, lr_rec, lr_f1, ls_prec, ls_rec, ls_f1, rf_prec, rf_rec, rf_f1 = perform_model(dm_train_df, dm_test_df)
data_source_result_df.loc['dm_dm'] = [rf_f1]

In [None]:
lr_prec, lr_rec, lr_f1, ls_prec, ls_rec, ls_f1, rf_prec, rf_rec, rf_f1 = perform_model(df_train_df, dm_test_df)
data_source_result_df.loc['df_dm'] = [rf_f1]

In [None]:
lr_prec, lr_rec, lr_f1, ls_prec, ls_rec, ls_f1, rf_prec, rf_rec, rf_f1 = perform_model(dmdf_train_df, df_test_df)
data_source_result_df.loc['dm+df_df'] = [rf_f1]

In [None]:
lr_prec, lr_rec, lr_f1, ls_prec, ls_rec, ls_f1, rf_prec, rf_rec, rf_f1 = perform_model(dm_train_df, df_test_df)
data_source_result_df.loc['dm_df'] = [rf_f1]

In [None]:
lr_prec, lr_rec, lr_f1, ls_prec, ls_rec, ls_f1, rf_prec, rf_rec, rf_f1 = perform_model(df_train_df, df_test_df)
data_source_result_df.loc['df_df'] = [rf_f1]

In [None]:
# Persist both result tables; the row labels are written as the first CSV column.
data_source_result_df.to_csv(path_or_buf='data_source_result_df.csv', index=True)
method_result_df.to_csv(path_or_buf='method_result_df.csv', index=True)