In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import warnings
import model_report as mr
import pickle
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
#load data... this data is the optimal 4100 features we identified
#NOTE: pickle.load can execute arbitrary code; only load files from a trusted source
#each file is opened in a with-block so its handle is closed as soon as it is read
with open('tv_train_features_sub.pkl','rb') as fh:
    X_train = pickle.load(fh)
with open('tv_test_features_sub.pkl','rb') as fh:
    X_test = pickle.load(fh)
with open('tv_holdout_features_sub.pkl','rb') as fh:
    X_holdout = pickle.load(fh)

#load labels
with open('train_label.pkl','rb') as fh:
    y_train = pickle.load(fh)
with open('test_label.pkl','rb') as fh:
    y_test = pickle.load(fh)
with open('holdout_label.pkl','rb') as fh:
    y_holdout = pickle.load(fh)

In [3]:
from sklearn import metrics
def get_metrics(true_labels, predicted_labels):
    """Return (accuracy, precision, recall, F1) for a set of predictions.

    Precision, recall and F1 are computed with ``average='weighted'``.
    Every score is rounded to 4 decimal places before being returned.
    """
    #accuracy takes no averaging argument; the other three share the same call shape
    raw_scores = [metrics.accuracy_score(true_labels, predicted_labels)]
    for scorer in (metrics.precision_score, metrics.recall_score, metrics.f1_score):
        raw_scores.append(scorer(true_labels, predicted_labels, average='weighted'))

    my_accuracy, my_precision, my_TPR, my_F1 = (np.round(score, 4) for score in raw_scores)
    return my_accuracy, my_precision, my_TPR, my_F1

In [4]:
#identify the correct predictions
def get_test_results(preds, test_labels):
    """Pair predictions with true labels and count wrong 'sx' / 'dr' predictions.

    Returns a tuple ``(df_matches, fp_sx, fp_dr)``:
      * df_matches - DataFrame with columns 'prediction', 'category' and
        'match' (1 when prediction equals category, otherwise 0)
      * fp_sx - number of rows predicted 'sx' whose category is something else
      * fp_dr - number of rows predicted 'dr' whose category is something else
    """
    paired = list(zip(preds, test_labels))
    df_matches = pd.DataFrame(paired, columns=['prediction', 'category'])
    is_match = df_matches['category'] == df_matches['prediction']
    df_matches['match'] = np.where(is_match, 1, 0)

    def _count_wrong(label):
        #rows where the model predicted `label` but the true category differs
        wrong = (df_matches['prediction'] == label) & (df_matches['match'] == 0)
        #per-category counts (missing categories are not counted), then the total
        return df_matches.loc[wrong, 'category'].value_counts().sum()

    fp_sx = _count_wrong('sx')
    fp_dr = _count_wrong('dr')

    return df_matches, fp_sx, fp_dr

In [5]:

#candidate models to compare: display name and pickled model file for each
candidate_models = {1: {'model_name':'svm_rbf','file_name':'svm_rbf_comp_nouns_v1.sav'},
                     2: {'model_name':'svm_lin','file_name':'svm_comp_nouns_v1.sav'},
                     3: {'model_name':'sgd_lin','file_name':'sgd_comp_nouns_v1.sav'},
                     4: {'model_name':'ovo_lin','file_name':'ovo_comp_nouns_v1.sav'},
                     5: {'model_name':'ovr_lin','file_name':'ovr_comp_nouns_v1.sav'},
                     6: {'model_name':'ens_svm','file_name':'ensemble_comp_nouns_v1.sav'},
                    }

#category codes whose accuracy is read from the report, in output column order
report_categories = ('sx', 'dr', 'ed', 'sp', 'mk', 'os')

#create an object to collect metrics for comparison (one dict per model)
data_dict = []

for mdl in candidate_models:
    model_name = candidate_models[mdl]['model_name']
    model_file = candidate_models[mdl]['file_name']
    #NOTE: pickle.load can execute arbitrary code; only load model files from a trusted source
    #the with-block closes the file handle once the model is loaded
    with open(model_file,'rb') as model_fh:
        model = pickle.load(model_fh)
    model_predictions = model.predict(X_test)

    #get accuracy by class
    matches, missSx, missDr = get_test_results(model_predictions,y_test)
    rpt = mr.generate_report(matches)
    #keyed '<category>_acc'; a dict avoids a module-level variable named `os`
    category_acc = {cat + '_acc': rpt['byCategory'][cat]['accuracy'] for cat in report_categories}

    #get model metrics
    accuracy, precision, tpr, f1 = get_metrics(true_labels=y_test,predicted_labels=model_predictions)

    #store metrics in dictionary (insertion order fixes the dataframe column order)
    tmp_dict = {'model_name':model_name,
                'test_acc':accuracy,
                'test_precision': precision,
                'test tpr/recall': tpr,
                'test F1 Score': f1,
               }
    tmp_dict.update(category_acc)
    tmp_dict['fp_sx'] = missSx
    tmp_dict['fp_dr'] = missDr

    #append metrics from latest model to the list of per-model metrics
    data_dict.append(tmp_dict)

#create dataframe from the list of per-model metrics
df_overall = pd.DataFrame(data_dict)

#view all metrics
df_overall

Unnamed: 0,model_name,test_acc,test_precision,test tpr/recall,test F1 Score,sx_acc,dr_acc,ed_acc,sp_acc,mk_acc,os_acc,fp_sx,fp_dr
0,svm_rbf,0.8641,0.8664,0.8641,0.8647,0.991078,0.938111,0.879949,0.878543,0.796957,0.785355,7,5
1,svm_lin,0.8637,0.8668,0.8637,0.8645,0.994796,0.947883,0.872286,0.868421,0.801638,0.783202,10,7
2,sgd_lin,0.8651,0.8655,0.8651,0.865,0.997026,0.95114,0.88378,0.894737,0.768871,0.79397,23,7
3,ovo_lin,0.8619,0.8662,0.8619,0.8632,0.985874,0.925081,0.869732,0.870445,0.802224,0.786073,4,2
4,ovr_lin,0.8678,0.8682,0.8678,0.8678,0.991822,0.938111,0.886335,0.90081,0.776477,0.80402,12,5
5,ens_svm,0.8655,0.867,0.8655,0.8659,0.988848,0.905537,0.882503,0.88664,0.779988,0.796841,8,2


# Get Baseline for Comparison

In [6]:
#load data for baseline... this data is the optimal 3800 features we identified for baseline model
#NOTE: pickle.load can execute arbitrary code; only load files from a trusted source
#each file is opened in a with-block so its handle is closed as soon as it is read
with open('tv_train_features_sub_baseline_v1.pkl','rb') as fh:
    X_train_b = pickle.load(fh)
with open('tv_test_features_sub_baseline_v1.pkl','rb') as fh:
    X_test_b = pickle.load(fh)
with open('tv_holdout_features_sub_baseline_v1.pkl','rb') as fh:
    X_holdout_b = pickle.load(fh)

#load labels
with open('train_label_baseline.pkl','rb') as fh:
    y_train_b = pickle.load(fh)
with open('test_label_baseline.pkl','rb') as fh:
    y_test_b = pickle.load(fh)
with open('holdout_label_baseline.pkl','rb') as fh:
    y_holdout_b = pickle.load(fh)

In [7]:
#load baseline model (with-block closes the file handle once the model is read)
#NOTE: pickle.load can execute arbitrary code; only load model files from a trusted source
with open('svm_baseline_v1.sav','rb') as model_fh:
    svm_lin_base = pickle.load(model_fh)

In [8]:
#run the baseline model on the baseline test features; these predictions are scored against y_test_b below
svm_lin_base_predictions = svm_lin_base.predict(X_test_b)

In [9]:
#get accuracy by class for baseline model
matches, missSx, missDr = get_test_results(svm_lin_base_predictions,y_test_b)
rpt = mr.generate_report(matches)
#accuracies keyed '<category>_acc'; a dict avoids a module-level variable named `os`
category_acc = {cat + '_acc': rpt['byCategory'][cat]['accuracy']
                for cat in ('sx', 'dr', 'ed', 'sp', 'mk', 'os')}

#get model metrics
accuracy, precision, tpr, f1 = get_metrics(true_labels=y_test_b,predicted_labels=svm_lin_base_predictions)

#store metrics in dictionary (insertion order fixes the dataframe column order)
tmp_dict = {'model_name':'svm_lin_base',
            'test_acc':accuracy,
            'test_precision': precision,
            'test tpr/recall': tpr,
            'test F1 Score': f1,
           }
tmp_dict.update(category_acc)
tmp_dict['fp_sx'] = missSx
tmp_dict['fp_dr'] = missDr

#drop any baseline row left by an earlier run of this cell so re-running does not duplicate it
data_dict = [row for row in data_dict if row.get('model_name') != 'svm_lin_base']
#append baseline metrics to the list of per-model metrics
data_dict.append(tmp_dict)
#create dataframe that includes baseline
df_all = pd.DataFrame(data_dict)
#view all metrics
df_all

Unnamed: 0,model_name,test_acc,test_precision,test tpr/recall,test F1 Score,sx_acc,dr_acc,ed_acc,sp_acc,mk_acc,os_acc,fp_sx,fp_dr
0,svm_rbf,0.8641,0.8664,0.8641,0.8647,0.991078,0.938111,0.879949,0.878543,0.796957,0.785355,7,5
1,svm_lin,0.8637,0.8668,0.8637,0.8645,0.994796,0.947883,0.872286,0.868421,0.801638,0.783202,10,7
2,sgd_lin,0.8651,0.8655,0.8651,0.865,0.997026,0.95114,0.88378,0.894737,0.768871,0.79397,23,7
3,ovo_lin,0.8619,0.8662,0.8619,0.8632,0.985874,0.925081,0.869732,0.870445,0.802224,0.786073,4,2
4,ovr_lin,0.8678,0.8682,0.8678,0.8678,0.991822,0.938111,0.886335,0.90081,0.776477,0.80402,12,5
5,ens_svm,0.8655,0.867,0.8655,0.8659,0.988848,0.905537,0.882503,0.88664,0.779988,0.796841,8,2
6,svm_lin_base,0.856,0.8587,0.856,0.8567,0.991822,0.934853,0.868455,0.856275,0.790521,0.773869,13,11


# Get Keyword Model for Comparison

In [10]:
#load data for keywords... this data is the optimal 2000 features we identified for keyword model using only keywords from yake
#NOTE: pickle.load can execute arbitrary code; only load files from a trusted source
#each file is opened in a with-block so its handle is closed as soon as it is read
with open('tv_train_features_sub_keywords_v1.pkl','rb') as fh:
    X_train_k = pickle.load(fh)
with open('tv_test_features_sub_keywords_v1.pkl','rb') as fh:
    X_test_k = pickle.load(fh)
with open('tv_holdout_features_sub_keywords_v1.pkl','rb') as fh:
    X_holdout_k = pickle.load(fh)

#load labels
with open('train_label_keywords.pkl','rb') as fh:
    y_train_k = pickle.load(fh)
with open('test_label_keywords.pkl','rb') as fh:
    y_test_k = pickle.load(fh)
with open('holdout_label_keywords.pkl','rb') as fh:
    y_holdout_k = pickle.load(fh)

In [11]:
#load keyword model (with-block closes the file handle once the model is read)
#NOTE: pickle.load can execute arbitrary code; only load model files from a trusted source
with open('svm_keywords_v1.sav','rb') as model_fh:
    svm_key_base = pickle.load(model_fh)

In [12]:
#run the keyword model on the keyword test features; these predictions are scored against y_test_k below
svm_key_base_predictions = svm_key_base.predict(X_test_k)

In [13]:
#get accuracy by class for keyword model
matches, missSx, missDr = get_test_results(svm_key_base_predictions,y_test_k)
rpt = mr.generate_report(matches)
#accuracies keyed '<category>_acc'; a dict avoids a module-level variable named `os`
category_acc = {cat + '_acc': rpt['byCategory'][cat]['accuracy']
                for cat in ('sx', 'dr', 'ed', 'sp', 'mk', 'os')}

#get model metrics
accuracy, precision, tpr, f1 = get_metrics(true_labels=y_test_k,predicted_labels=svm_key_base_predictions)

#store metrics in dictionary (insertion order fixes the dataframe column order)
tmp_dict = {'model_name':'svm_key_lin',
            'test_acc':accuracy,
            'test_precision': precision,
            'test tpr/recall': tpr,
            'test F1 Score': f1,
           }
tmp_dict.update(category_acc)
tmp_dict['fp_sx'] = missSx
tmp_dict['fp_dr'] = missDr

#drop any keyword-model row left by an earlier run of this cell so re-running does not duplicate it
data_dict = [row for row in data_dict if row.get('model_name') != 'svm_key_lin']
#append keyword model metrics to the list of per-model metrics
data_dict.append(tmp_dict)
#create dataframe that includes every model appended to data_dict so far plus the keyword model
df_all = pd.DataFrame(data_dict)


In [14]:
#view all metrics (one row per model collected in data_dict)
df_all

Unnamed: 0,model_name,test_acc,test_precision,test tpr/recall,test F1 Score,sx_acc,dr_acc,ed_acc,sp_acc,mk_acc,os_acc,fp_sx,fp_dr
0,svm_rbf,0.8641,0.8664,0.8641,0.8647,0.991078,0.938111,0.879949,0.878543,0.796957,0.785355,7,5
1,svm_lin,0.8637,0.8668,0.8637,0.8645,0.994796,0.947883,0.872286,0.868421,0.801638,0.783202,10,7
2,sgd_lin,0.8651,0.8655,0.8651,0.865,0.997026,0.95114,0.88378,0.894737,0.768871,0.79397,23,7
3,ovo_lin,0.8619,0.8662,0.8619,0.8632,0.985874,0.925081,0.869732,0.870445,0.802224,0.786073,4,2
4,ovr_lin,0.8678,0.8682,0.8678,0.8678,0.991822,0.938111,0.886335,0.90081,0.776477,0.80402,12,5
5,ens_svm,0.8655,0.867,0.8655,0.8659,0.988848,0.905537,0.882503,0.88664,0.779988,0.796841,8,2
6,svm_lin_base,0.856,0.8587,0.856,0.8567,0.991822,0.934853,0.868455,0.856275,0.790521,0.773869,13,11
7,svm_key_lin,0.7645,0.7654,0.7645,0.7636,0.97026,0.895765,0.704981,0.728745,0.653599,0.659727,83,42


In [15]:
#write results to disk
#NOTE(review): absolute, user-specific Windows path; consider a configurable output directory
#raw string keeps every backslash literal; the previous literal contained the invalid escape "\D",
#which Python only passes through with a warning. The resulting path is unchanged.
file_out = r"C:\Users\blgai\OneDrive\Documents\School\SMU\Courses\Fall 2021\Capstone A\Data\model_comparison_test_v1.csv"
df_all.to_csv(file_out,index=False)

In [16]:
import altair as alt

#bar chart of overall test accuracy: one bar per model, bars sorted by descending accuracy
alt.Chart(df_all).mark_bar().encode(
    #x channel uses alt.X (it was previously built with the y-channel class alt.Y)
    x=alt.X('model_name:N', sort='-y'),
    #y axis domain is narrowed to [.75, .89]
    y=alt.Y('test_acc:Q',scale=alt.Scale(domain=[.75,.89])),
    color='model_name',
    tooltip=['model_name','test_acc']
).properties(width=700,height=400,title='Model Accuracy on Test Data').configure_title(fontSize=20,anchor='start')


In [17]:
import altair as alt

#bar chart of weighted test precision: one bar per model, bars sorted by descending precision
alt.Chart(df_all).mark_bar().encode(
    #x channel uses alt.X (it was previously built with the y-channel class alt.Y)
    x=alt.X('model_name:N', sort='-y'),
    #y axis domain is narrowed to [.75, .89]
    y=alt.Y('test_precision:Q',scale=alt.Scale(domain=[.75,.89])),
    color='model_name',
    tooltip=['model_name','test_precision']
).properties(width=700,height=400,title='Model Precision on Test Data').configure_title(fontSize=20,anchor='start')

In [18]:
#models to compare against the baseline (the candidates plus the keyword model)
candidate_models = {1: {'model_name':'svm_rbf','file_name':'svm_rbf_comp_nouns_v1.sav'},
                     2: {'model_name':'svm_lin','file_name':'svm_comp_nouns_v1.sav'},
                     3: {'model_name':'sgd_lin','file_name':'sgd_comp_nouns_v1.sav'},
                     4: {'model_name':'ovo_lin','file_name':'ovo_comp_nouns_v1.sav'},
                     5: {'model_name':'ovr_lin','file_name':'ovr_comp_nouns_v1.sav'},
                     6: {'model_name':'ens_svm','file_name':'ensemble_comp_nouns_v1.sav'},
                     7: {'model_name':'svm_key_lin','file_name':'svm_keywords_v1.sav'},
                    }

#row of df_all that every other model is compared against
baseline_name = 'svm_lin_base'

#output difference column -> metric column in df_all (order fixes the output column order)
diff_columns = {'accuracy_diff': 'test_acc',
                'precision_diff': 'test_precision',
                'tpr_diff': 'test tpr/recall',
                'f1_diff': 'test F1 Score',
                'sx_acc_diff': 'sx_acc',
                'dr_acc_diff': 'dr_acc',
                'ed_acc_diff': 'ed_acc',
                'sp_acc_diff': 'sp_acc',
                'mk_acc_diff': 'mk_acc',
                'os_acc_diff': 'os_acc',
                'sx_fp_diff': 'fp_sx',
                'dr_fp_diff': 'fp_dr',
               }

#index metrics by model name; if a name appears more than once, keep its first row
metrics_by_model = df_all.drop_duplicates(subset='model_name').set_index('model_name')

#fail early with a readable message if the baseline or any candidate is missing from df_all
expected_models = [baseline_name] + [spec['model_name'] for spec in candidate_models.values()]
missing_models = [name for name in expected_models if name not in metrics_by_model.index]
if missing_models:
    raise KeyError('models missing from df_all: ' + ', '.join(missing_models))

#calculate the difference in metrics between each model and the baseline
#NOTE(review): rebinding data_dict replaces the per-model metrics list built by earlier cells
data_dict = []
for mdl in candidate_models:
    model_name = candidate_models[mdl]['model_name']
    #store difference in dictionary
    tmp_dict = {'model_name':model_name}
    for diff_name, metric_name in diff_columns.items():
        #.at reads one scalar per column, so count columns are not mixed with float columns
        tmp_dict[diff_name] = (metrics_by_model.at[model_name, metric_name]
                               - metrics_by_model.at[baseline_name, metric_name])
    #append differences from latest model to the list
    data_dict.append(tmp_dict)

df_acc_diff = pd.DataFrame(data_dict)

#view the differences table
df_acc_diff

Unnamed: 0,model_name,accuracy_diff,precision_diff,tpr_diff,f1_diff,sx_acc_diff,dr_acc_diff,ed_acc_diff,sp_acc_diff,mk_acc_diff,os_acc_diff,sx_fp_diff,dr_fp_diff
0,svm_rbf,0.0081,0.0077,0.0081,0.008,-0.000743,0.003257,0.011494,0.022267,0.006437,0.011486,-6,-6
1,svm_lin,0.0077,0.0081,0.0077,0.0078,0.002974,0.013029,0.003831,0.012146,0.011118,0.009332,-3,-4
2,sgd_lin,0.0091,0.0068,0.0091,0.0083,0.005204,0.016287,0.015326,0.038462,-0.02165,0.020101,10,-4
3,ovo_lin,0.0059,0.0075,0.0059,0.0065,-0.005948,-0.009772,0.001277,0.01417,0.011703,0.012204,-9,-9
4,ovr_lin,0.0118,0.0095,0.0118,0.0111,0.0,0.003257,0.01788,0.044534,-0.014043,0.030151,-1,-6
5,ens_svm,0.0095,0.0083,0.0095,0.0092,-0.002974,-0.029316,0.014049,0.030364,-0.010532,0.022972,-5,-9
6,svm_key_lin,-0.0915,-0.0933,-0.0915,-0.0931,-0.021561,-0.039088,-0.163474,-0.12753,-0.136922,-0.114142,70,31


In [19]:
import altair as alt

#one point per model: x is the 'sx' accuracy, y is the 'dr' accuracy
points = alt.Chart(df_all).mark_circle(size=60).encode(
    x=alt.X('sx_acc:Q',scale=alt.Scale(domain=[.968,1])),
    y=alt.Y('dr_acc:Q',scale=alt.Scale(domain=[.88,1])),
    color = 'model_name'
).properties(width=700,
             height=400,
             title='Scatter Plot of Pornography and Drugs Accuracy by Model'
            )
#model name labels drawn to the right of each point
text = points.mark_text(
    align='left',
    baseline='middle',
    dx=7
).encode(
    text='model_name'
)

#configure_title returns a new chart instead of modifying `points`, so the title
#styling is applied to the layered chart that is actually displayed
(points + text).configure_title(fontSize=20,anchor='start')

# One-vs-One has the fewest pages misclassified as Pornography or Drugs

In [20]:
import altair as alt

#one point per model (keyword model excluded): x is fp_sx, y is fp_dr
points = alt.Chart(df_all[df_all.model_name != 'svm_key_lin']).mark_circle(size=60).encode(
    x=alt.X('fp_sx:Q',scale=alt.Scale(domain=[0,25])),
    y=alt.Y('fp_dr:Q',scale=alt.Scale(domain=[0,14])),
    color = 'model_name',
    tooltip = ['model_name','test_acc']
).properties(width=700,
             height=400,
             title='Scatter Plot of Count of Misclassifications of Pornography and Drugs to Webpages by Model'
            )
#model name labels drawn to the right of each point
text = points.mark_text(
    align='left',
    baseline='middle',
    dx=7
).encode(
    text='model_name'
)

#configure_title returns a new chart instead of modifying `points`, so the title
#styling is applied to the layered chart that is actually displayed
(points + text).configure_title(fontSize=20,anchor='start')