In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import warnings
import model_report as mr
import pickle
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
#load data... this data is the optimal 4100 features we identified
#NOTE: pickle can execute arbitrary code while loading; only load files produced by this project
#each file is opened in a with-block so its handle is closed as soon as the object is read
with open('tv_train_features_sub.pkl','rb') as fh:
    X_train = pickle.load(fh)
with open('tv_test_features_sub.pkl','rb') as fh:
    X_test = pickle.load(fh)
with open('tv_holdout_features_sub.pkl','rb') as fh:
    X_holdout = pickle.load(fh)

#load labels
with open('train_label.pkl','rb') as fh:
    y_train = pickle.load(fh)
with open('test_label.pkl','rb') as fh:
    y_test = pickle.load(fh)
with open('holdout_label.pkl','rb') as fh:
    y_holdout = pickle.load(fh)

In [3]:
from sklearn import metrics
def get_metrics(true_labels, predicted_labels):
    """Return overall accuracy, precision, recall and F1 for a set of predictions.

    Precision, recall and F1 are computed with ``average='weighted'``.
    Every score is rounded to 4 decimal places.

    Parameters
    ----------
    true_labels : array-like
        Ground-truth labels.
    predicted_labels : array-like
        Labels predicted by the model, aligned with ``true_labels``.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)``.
    """
    #scorers that share the same weighted-average call signature, in return order
    weighted_scorers = (metrics.precision_score, metrics.recall_score, metrics.f1_score)

    my_accuracy = np.round(metrics.accuracy_score(true_labels, predicted_labels), 4)
    my_precision, my_TPR, my_F1 = [
        np.round(scorer(true_labels, predicted_labels, average='weighted'), 4)
        for scorer in weighted_scorers
    ]

    return my_accuracy, my_precision, my_TPR, my_F1

In [4]:
#identify the correct predictions
def get_test_results(preds, test_labels):
    """Pair predictions with true labels and count wrong 'sx' / 'dr' predictions.

    Parameters
    ----------
    preds : iterable
        Predicted labels.
    test_labels : iterable
        True labels, paired position-by-position with ``preds``.

    Returns
    -------
    tuple
        ``(df_matches, fp_sx, fp_dr)``. ``df_matches`` has the columns
        'prediction', 'category' and 'match' (1 where the two agree, else 0).
        ``fp_sx`` / ``fp_dr`` are the summed per-category counts of rows that
        were predicted 'sx' / 'dr' (pornography / drugs) without matching.
    """
    paired = list(zip(preds, test_labels))
    df_matches = pd.DataFrame(paired, columns=['prediction','category'])
    is_match = df_matches['category'] == df_matches['prediction']
    df_matches['match'] = np.where(is_match, 1, 0)

    def _count_wrong_predictions(label):
        #rows predicted as `label` that did not match, tallied per true category and summed
        wrong = df_matches[(df_matches.prediction == label) & (df_matches.match == 0)]
        per_category = wrong.groupby('category').size().reset_index(name="count")
        return per_category['count'].sum()

    fp_sx = _count_wrong_predictions('sx')
    fp_dr = _count_wrong_predictions('dr')

    return df_matches, fp_sx, fp_dr

In [5]:

#candidate models to evaluate on the holdout data: id -> display name and pickled model file
candidate_models = {1: {'model_name':'svm_rbf','file_name':'svm_rbf_comp_nouns_v1.sav'},
                     2: {'model_name':'svm_lin','file_name':'svm_comp_nouns_v1.sav'},
                     3: {'model_name':'sgd_lin','file_name':'sgd_comp_nouns_v1.sav'},
                     4: {'model_name':'ovo_lin','file_name':'ovo_comp_nouns_v1.sav'},
                     5: {'model_name':'ovr_lin','file_name':'ovr_comp_nouns_v1.sav'},
                     6: {'model_name':'ens_svm','file_name':'ensemble_comp_nouns_v1.sav'},
                    }

#category codes for which a per-class accuracy column is reported
report_categories = ('sx','dr','ed','sp','mk','os')

#create an object to collect metrics for comparison (one dict per model)
data_dict = []

for mdl in candidate_models:
    model_name = candidate_models[mdl]['model_name']
    model_file = candidate_models[mdl]['file_name']
    #NOTE: pickle can execute arbitrary code on load; only unpickle trusted model files
    #the with-block closes the file handle as soon as the model has been read
    with open(model_file,'rb') as model_fh:
        model = pickle.load(model_fh)
    model_predictions = model.predict(X_holdout)

    #pair predictions with true labels and count wrong sx/dr predictions
    matches, missSx, missDr = get_test_results(model_predictions,y_holdout)
    rpt = mr.generate_report(matches)
    #per-class accuracies keyed as '<category>_acc'
    #(a dict avoids binding a variable named `os`, which would shadow the stdlib module name)
    class_acc = {cat + '_acc': rpt['byCategory'][cat]['accuracy'] for cat in report_categories}

    #get model metrics
    accuracy, precision, tpr, f1 = get_metrics(true_labels=y_holdout,predicted_labels=model_predictions)

    #store metrics in dictionary (key order defines the column order of the table)
    tmp_dict = {'model_name':model_name,
                'holdout_acc':accuracy,
                'holdout_precision': precision,
                'holdout tpr/recall': tpr,
                'holdout F1 Score': f1,
                **class_acc,
                'fp_sx': missSx,
                'fp_dr': missDr
               }

    #append metrics from latest model to the list of records
    data_dict.append(tmp_dict)

#create dataframe from the list of records
df_overall = pd.DataFrame(data_dict)

#view all metrics
df_overall

Unnamed: 0,model_name,holdout_acc,holdout_precision,holdout tpr/recall,holdout F1 Score,sx_acc,dr_acc,ed_acc,sp_acc,mk_acc,os_acc,fp_sx,fp_dr
0,svm_rbf,0.8682,0.8693,0.8682,0.8685,0.994012,0.944056,0.863402,0.860465,0.794151,0.797856,6,2
1,svm_lin,0.8649,0.8665,0.8649,0.8652,0.992515,0.965035,0.871134,0.837209,0.8009,0.779479,4,7
2,sgd_lin,0.8653,0.865,0.8653,0.8649,0.994012,0.972028,0.868557,0.891473,0.762655,0.797856,10,6
3,ovo_lin,0.866,0.8687,0.866,0.8669,0.989521,0.93007,0.871134,0.841085,0.804274,0.794793,1,1
4,ovr_lin,0.8644,0.8644,0.8644,0.8642,0.992515,0.937063,0.876289,0.891473,0.764904,0.800919,4,4
5,ens_svm,0.867,0.8678,0.867,0.8671,0.992515,0.909091,0.873711,0.864341,0.773903,0.807044,3,1


# Get Baseline for Comparison

In [6]:
#load data for baseline... this data is the optimal 3800 features we identified for baseline model
#NOTE: pickle can execute arbitrary code while loading; only load files produced by this project
#each file is opened in a with-block so its handle is closed as soon as the object is read
with open('tv_train_features_sub_baseline_v1.pkl','rb') as fh:
    X_train_b = pickle.load(fh)
with open('tv_test_features_sub_baseline_v1.pkl','rb') as fh:
    X_test_b = pickle.load(fh)
with open('tv_holdout_features_sub_baseline_v1.pkl','rb') as fh:
    X_holdout_b = pickle.load(fh)

#load labels
with open('train_label_baseline.pkl','rb') as fh:
    y_train_b = pickle.load(fh)
with open('test_label_baseline.pkl','rb') as fh:
    y_test_b = pickle.load(fh)
with open('holdout_label_baseline.pkl','rb') as fh:
    y_holdout_b = pickle.load(fh)

In [7]:
#load baseline model (with-block closes the file handle once the model is read)
#NOTE: pickle can execute arbitrary code on load; only unpickle trusted model files
with open('svm_baseline_v1.sav','rb') as model_fh:
    svm_lin_base = pickle.load(model_fh)

In [8]:
#get baseline model predictions on the holdout features (X_holdout_b)
svm_lin_base_predictions = svm_lin_base.predict(X_holdout_b)

In [9]:
#get accuracy by class for baseline model
matches, missSx, missDr = get_test_results(svm_lin_base_predictions,y_holdout_b)
rpt = mr.generate_report(matches)
#per-class accuracies keyed as '<category>_acc'
#(a dict avoids binding a variable named `os`, which would shadow the stdlib module name)
class_acc = {cat + '_acc': rpt['byCategory'][cat]['accuracy']
             for cat in ('sx','dr','ed','sp','mk','os')}

#get model metrics
accuracy, precision, tpr, f1 = get_metrics(true_labels=y_holdout_b,predicted_labels=svm_lin_base_predictions)

#store metrics in dictionary (key order defines the column order of the table)
tmp_dict = {'model_name':'svm_lin_base',
            'holdout_acc':accuracy,
            'holdout_precision': precision,
            'holdout tpr/recall': tpr,
            'holdout F1 Score': f1,
            **class_acc,
            'fp_sx': missSx,
            'fp_dr': missDr
           }
#drop any earlier record for this model first so re-running the cell does not
#leave a duplicate baseline row in data_dict, then append the fresh record
data_dict = [row for row in data_dict if row['model_name'] != tmp_dict['model_name']]
data_dict.append(tmp_dict)
#create dataframe that includes baseline
df_all = pd.DataFrame(data_dict)
#view all metrics
df_all

Unnamed: 0,model_name,holdout_acc,holdout_precision,holdout tpr/recall,holdout F1 Score,sx_acc,dr_acc,ed_acc,sp_acc,mk_acc,os_acc,fp_sx,fp_dr
0,svm_rbf,0.8682,0.8693,0.8682,0.8685,0.994012,0.944056,0.863402,0.860465,0.794151,0.797856,6,2
1,svm_lin,0.8649,0.8665,0.8649,0.8652,0.992515,0.965035,0.871134,0.837209,0.8009,0.779479,4,7
2,sgd_lin,0.8653,0.865,0.8653,0.8649,0.994012,0.972028,0.868557,0.891473,0.762655,0.797856,10,6
3,ovo_lin,0.866,0.8687,0.866,0.8669,0.989521,0.93007,0.871134,0.841085,0.804274,0.794793,1,1
4,ovr_lin,0.8644,0.8644,0.8644,0.8642,0.992515,0.937063,0.876289,0.891473,0.764904,0.800919,4,4
5,ens_svm,0.867,0.8678,0.867,0.8671,0.992515,0.909091,0.873711,0.864341,0.773903,0.807044,3,1
6,svm_lin_base,0.8598,0.8609,0.8598,0.86,0.994012,0.951049,0.850515,0.879845,0.789651,0.770291,12,9


# Get Keyword Model for Comparison

In [10]:
#load data for keywords... this data is the optimal 2000 features we identified for keyword model using only keywords from yake
#NOTE: pickle can execute arbitrary code while loading; only load files produced by this project
#each file is opened in a with-block so its handle is closed as soon as the object is read
with open('tv_train_features_sub_keywords_v1.pkl','rb') as fh:
    X_train_k = pickle.load(fh)
with open('tv_test_features_sub_keywords_v1.pkl','rb') as fh:
    X_test_k = pickle.load(fh)
with open('tv_holdout_features_sub_keywords_v1.pkl','rb') as fh:
    X_holdout_k = pickle.load(fh)

#load labels
with open('train_label_keywords.pkl','rb') as fh:
    y_train_k = pickle.load(fh)
with open('test_label_keywords.pkl','rb') as fh:
    y_test_k = pickle.load(fh)
with open('holdout_label_keywords.pkl','rb') as fh:
    y_holdout_k = pickle.load(fh)

In [11]:
#load keyword model (with-block closes the file handle once the model is read)
#NOTE: pickle can execute arbitrary code on load; only unpickle trusted model files
with open('svm_keywords_v1.sav','rb') as model_fh:
    svm_key_base = pickle.load(model_fh)

In [12]:
#get keyword model predictions on the holdout features (X_holdout_k)
svm_key_base_predictions = svm_key_base.predict(X_holdout_k)

In [13]:
#get accuracy by class for keyword model
matches, missSx, missDr = get_test_results(svm_key_base_predictions,y_holdout_k)
rpt = mr.generate_report(matches)
#per-class accuracies keyed as '<category>_acc'
#(a dict avoids binding a variable named `os`, which would shadow the stdlib module name)
class_acc = {cat + '_acc': rpt['byCategory'][cat]['accuracy']
             for cat in ('sx','dr','ed','sp','mk','os')}

#get model metrics
accuracy, precision, tpr, f1 = get_metrics(true_labels=y_holdout_k,predicted_labels=svm_key_base_predictions)

#store metrics in dictionary (key order defines the column order of the table)
tmp_dict = {'model_name':'svm_key_lin',
            'holdout_acc':accuracy,
            'holdout_precision': precision,
            'holdout tpr/recall': tpr,
            'holdout F1 Score': f1,
            **class_acc,
            'fp_sx': missSx,
            'fp_dr': missDr
           }
#drop any earlier record for this model first so re-running the cell does not
#leave a duplicate keyword-model row in data_dict, then append the fresh record
data_dict = [row for row in data_dict if row['model_name'] != tmp_dict['model_name']]
data_dict.append(tmp_dict)
#create dataframe that also includes the keyword model
df_all = pd.DataFrame(data_dict)


In [14]:
#view all metrics: one row per record in data_dict (df_all is built in the previous cell)
df_all

Unnamed: 0,model_name,holdout_acc,holdout_precision,holdout tpr/recall,holdout F1 Score,sx_acc,dr_acc,ed_acc,sp_acc,mk_acc,os_acc,fp_sx,fp_dr
0,svm_rbf,0.8682,0.8693,0.8682,0.8685,0.994012,0.944056,0.863402,0.860465,0.794151,0.797856,6,2
1,svm_lin,0.8649,0.8665,0.8649,0.8652,0.992515,0.965035,0.871134,0.837209,0.8009,0.779479,4,7
2,sgd_lin,0.8653,0.865,0.8653,0.8649,0.994012,0.972028,0.868557,0.891473,0.762655,0.797856,10,6
3,ovo_lin,0.866,0.8687,0.866,0.8669,0.989521,0.93007,0.871134,0.841085,0.804274,0.794793,1,1
4,ovr_lin,0.8644,0.8644,0.8644,0.8642,0.992515,0.937063,0.876289,0.891473,0.764904,0.800919,4,4
5,ens_svm,0.867,0.8678,0.867,0.8671,0.992515,0.909091,0.873711,0.864341,0.773903,0.807044,3,1
6,svm_lin_base,0.8598,0.8609,0.8598,0.86,0.994012,0.951049,0.850515,0.879845,0.789651,0.770291,12,9
7,svm_key_lin,0.772,0.7715,0.772,0.7705,0.980539,0.909091,0.724227,0.697674,0.665917,0.643185,30,24


In [15]:
#write results to disk
#NOTE: absolute, machine-specific path; adjust it before running on another machine
#raw string: the original literal contained the invalid escape "\D" (a warning on
#recent Python versions); the resulting path value is unchanged
file_out = r"C:\Users\blgai\OneDrive\Documents\School\SMU\Courses\Fall 2021\Capstone A\Data\model_comparison_holdout_v1.csv"
df_all.to_csv(file_out,index=False)

In [16]:
import altair as alt

#bar chart of holdout accuracy per model; sort='-y' orders the bars by descending accuracy
#the y domain is restricted to [.75,.89] so small differences between models stay visible
alt.Chart(df_all).mark_bar().encode(
    #the x channel is wrapped in alt.X (the original wrapped it in alt.Y)
    x=alt.X('model_name:N', sort='-y'),
    y=alt.Y('holdout_acc:Q',scale=alt.Scale(domain=[.75,.89])),
    color='model_name',
    tooltip=['model_name','holdout_acc']
).properties(width=700,height=400,title='Model Accuracy on Holdout Data').configure_title(fontSize=20,anchor='start')


In [17]:
import altair as alt

#bar chart of holdout precision per model; sort='-y' orders the bars by descending precision
#the y domain is restricted to [.75,.89] so small differences between models stay visible
alt.Chart(df_all).mark_bar().encode(
    #the x channel is wrapped in alt.X (the original wrapped it in alt.Y)
    x=alt.X('model_name:N', sort='-y'),
    y=alt.Y('holdout_precision:Q',scale=alt.Scale(domain=[.75,.89])),
    color='model_name',
    tooltip=['model_name','holdout_precision']
).properties(width=700,height=400,title='Model Precision on Holdout Data').configure_title(fontSize=20,anchor='start')

In [18]:
#models compared against the baseline: the six candidates plus the keyword model
candidate_models = {1: {'model_name':'svm_rbf','file_name':'svm_rbf_comp_nouns_v1.sav'},
                     2: {'model_name':'svm_lin','file_name':'svm_comp_nouns_v1.sav'},
                     3: {'model_name':'sgd_lin','file_name':'sgd_comp_nouns_v1.sav'},
                     4: {'model_name':'ovo_lin','file_name':'ovo_comp_nouns_v1.sav'},
                     5: {'model_name':'ovr_lin','file_name':'ovr_comp_nouns_v1.sav'},
                     6: {'model_name':'ens_svm','file_name':'ensemble_comp_nouns_v1.sav'},
                     7: {'model_name':'svm_key_lin','file_name':'svm_keywords_v1.sav'},
                    }

#df_all column -> name of its difference column, listed in output column order
diff_columns = {'holdout_acc':'accuracy_diff',
                'holdout_precision':'precision_diff',
                'holdout tpr/recall':'tpr_diff',
                'holdout F1 Score':'f1_diff',
                'sx_acc':'sx_acc_diff',
                'dr_acc':'dr_acc_diff',
                'ed_acc':'ed_acc_diff',
                'sp_acc':'sp_acc_diff',
                'mk_acc':'mk_acc_diff',
                'os_acc':'os_acc_diff',
                'fp_sx':'sx_fp_diff',
                'fp_dr':'dr_fp_diff',
               }

def lookup_metric(name, column):
    """Return the value of `column` from the first df_all row whose model_name is `name`."""
    return df_all.loc[df_all.model_name == name, column].values[0]

#grab baseline metrics
baseline = {column: lookup_metric('svm_lin_base', column) for column in diff_columns}

#calculate the difference in metrics between each model and the baseline (model minus baseline)
data_dict = []
for entry in candidate_models.values():
    name = entry['model_name']
    diff_row = {'model_name': name}
    for column, diff_name in diff_columns.items():
        diff_row[diff_name] = lookup_metric(name, column) - baseline[column]
    #append differences from latest model to the list of records
    data_dict.append(diff_row)

df_acc_diff = pd.DataFrame(data_dict)

#view the differences table
df_acc_diff

Unnamed: 0,model_name,accuracy_diff,precision_diff,tpr_diff,f1_diff,sx_acc_diff,dr_acc_diff,ed_acc_diff,sp_acc_diff,mk_acc_diff,os_acc_diff,sx_fp_diff,dr_fp_diff
0,svm_rbf,0.0084,0.0084,0.0084,0.0085,0.0,-0.006993,0.012887,-0.01938,0.004499,0.027565,-6,-7
1,svm_lin,0.0051,0.0056,0.0051,0.0052,-0.001497,0.013986,0.020619,-0.042636,0.011249,0.009188,-8,-2
2,sgd_lin,0.0055,0.0041,0.0055,0.0049,0.0,0.020979,0.018041,0.011628,-0.026997,0.027565,-2,-3
3,ovo_lin,0.0062,0.0078,0.0062,0.0069,-0.004491,-0.020979,0.020619,-0.03876,0.014623,0.024502,-11,-8
4,ovr_lin,0.0046,0.0035,0.0046,0.0042,-0.001497,-0.013986,0.025773,0.011628,-0.024747,0.030628,-8,-5
5,ens_svm,0.0072,0.0069,0.0072,0.0071,-0.001497,-0.041958,0.023196,-0.015504,-0.015748,0.036753,-9,-8
6,svm_key_lin,-0.0878,-0.0894,-0.0878,-0.0895,-0.013473,-0.041958,-0.126289,-0.182171,-0.123735,-0.127106,18,15


In [19]:
import altair as alt

#one point per model: sx (pornography) accuracy on x, dr (drugs) accuracy on y
points = alt.Chart(df_all).mark_circle(size=60).encode(
    x=alt.X('sx_acc:Q',scale=alt.Scale(domain=[.968,1])),
    y=alt.Y('dr_acc:Q',scale=alt.Scale(domain=[.88,1])),
    color = 'model_name'
).properties(width=700,
             height=400,
             title='Scatter Plot of Pornography and Drugs Accuracy by Model - Holdout Data'
            )
#label each point with its model name, shifted 7px to the right of the marker
text = points.mark_text(
    align='left',
    baseline='middle',
    dx=7
).encode(
    text='model_name'
)

#configure_* returns a new chart rather than modifying `points`, so the original
#call had no effect on what was displayed; apply the title config to the layered
#chart that is actually rendered
(points + text).configure_title(fontSize=20,anchor='start')

# One-vs-One (ovo_lin) has the fewest webpages misclassified as Pornography or Drugs

In [20]:
import altair as alt

#one point per model (keyword model excluded): count of wrong sx predictions on x,
#count of wrong dr predictions on y
points = alt.Chart(df_all[df_all.model_name != 'svm_key_lin']).mark_circle(size=60).encode(
    x=alt.X('fp_sx:Q',scale=alt.Scale(domain=[0,25])),
    y=alt.Y('fp_dr:Q',scale=alt.Scale(domain=[0,14])),
    color = 'model_name',
    tooltip = ['model_name','holdout_acc']
).properties(width=700,
             height=400,
             title='Scatter Plot of Count of Misclassifications of Pornography and Drugs to Webpages by Model - Holdout Data'
            )
#label each point with its model name, shifted 7px to the right of the marker
text = points.mark_text(
    align='left',
    baseline='middle',
    dx=7
).encode(
    text='model_name'
)

#configure_* returns a new chart rather than modifying `points`, so the original
#call had no effect on what was displayed; apply the title config to the layered
#chart that is actually rendered
(points + text).configure_title(fontSize=20,anchor='start')