In [32]:
import os
from importlib import reload

# Data Analysis
import pandas as pd
import numpy as np
from scipy import sparse

# Text Processing
from sklearn.feature_extraction import text as tx
from scipy import sparse
from nltk.corpus import words as nltk_words
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from langdetect import detect
import enchant

# Plotting
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
import matplotlib.pyplot as plt

# Modeling
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import roc_auc_score as auc, average_precision_score as ap


In [3]:
plt.style.use('seaborn-darkgrid')
pio.renderers.default = "browser"
%matplotlib inline


# Use Enchant English Dictionary
d = enchant.Dict("en_US")

In [38]:
def _load_features(matrix_name, df):
    """Return ``(X, y)`` for one split: the stored sparse matrix plus ``df``'s non-label columns."""
    y = df['label']
    X = sparse.load_npz(os.path.join("data", matrix_name))
    # Append the frame's remaining columns to the right of the stored matrix.
    # Building CSR directly avoids handing a COO matrix to the estimator.
    X = sparse.hstack((X, df.drop(columns=['label']).values), format='csr')
    return X, y


def train_val(train_name, val_name, df_trn, model, label, df_val=None):
    """Fit ``model`` on the training split and score it on both splits.

    Each split's feature matrix is the sparse matrix stored at
    ``data/<name>`` with every non-``'label'`` column of the matching
    DataFrame appended on the right.

    Parameters
    ----------
    train_name, val_name : str
        File names, inside the ``data`` directory, of the ``.npz`` sparse
        matrices for the training and validation splits.
    df_trn : pandas.DataFrame
        Training frame; must contain a ``'label'`` column. Its rows are
        assumed to be in the same order as the rows of the training matrix
        (nothing here re-aligns them).
    model : estimator
        Object exposing ``fit`` and ``predict_proba``. It is fitted in place.
    label : str
        Human-readable model name, stored under ``'model'`` in the result.
    df_val : pandas.DataFrame, optional
        Validation frame with the same layout as ``df_trn``. When omitted,
        the module-level ``df_val`` is used, which is what this function
        always did before the parameter existed. Prefer passing it
        explicitly so the result does not depend on notebook state.

    Returns
    -------
    dict
        ``{'model', 'data', 'auc', 'ap', 'auc_trn', 'ap_trn'}`` where
        ``'data'`` is ``val_name``, ``'auc'``/``'ap'`` are validation scores
        and the ``*_trn`` entries are the same metrics on the training data.

    Raises
    ------
    NameError
        If ``df_val`` is not passed and no global ``df_val`` exists.
    """
    if df_val is None:
        # Backward-compatible fallback to the notebook-level validation frame.
        try:
            df_val = globals()['df_val']
        except KeyError:
            raise NameError(
                "name 'df_val' is not defined; pass df_val explicitly"
            ) from None

    # Load both splits
    X_trn, y_trn = _load_features(train_name, df_trn)
    X_val, y_val = _load_features(val_name, df_val)

    # Train Model
    model = model.fit(X_trn, y_trn)

    # Column 1 of predict_proba is used as the positive-class score; this
    # assumes a binary target whose second class is the positive one.
    score_trn = model.predict_proba(X_trn)[:, 1]
    score_val = model.predict_proba(X_val)[:, 1]

    # Results
    return {'model': label, 'data': val_name,
            'auc': auc(y_val, score_val), 'ap': ap(y_val, score_val),
            'auc_trn': auc(y_trn, score_trn), 'ap_trn': ap(y_trn, score_trn)}
    
    

### Standard Datasets

In [28]:
# Accumulator for the metrics dicts returned by train_val; the loops below
# append to it, so re-running this cell discards everything collected so far.
results=[]

In [26]:
# Training and validation frames, both indexed by the 'ex_id' column.
df_trn, df_val = (
    pd.read_csv(os.path.join('data', csv_name), index_col=['ex_id'])
    for csv_name in ('df_train.csv', 'df_valid.csv')
)

In [27]:
# [train matrix, validation matrix] file-name pairs for the standard features.
datasets = [[f'{stem}_train.npz', f'{stem}_valid.npz']
            for stem in ('wordcount', 'tfidfnorm_trim')]

In [30]:
# Fit a default Bernoulli Naive Bayes on each (train, validation) matrix pair
# and collect the returned metrics dict in `results`.
for data in datasets:
    train_file, valid_file = data
    run_metrics = train_val(train_file, valid_file, df_trn,
                            BernoulliNB(), 'Naive Bayes (default)')
    results.append(run_metrics)

### Up-Sampled Datasets

In [35]:
# Swap in the up-sampled training frame for the runs in this section.
# NOTE(review): only df_trn is reloaded here. train_val takes the validation
# frame from the global df_val, which still holds df_valid.csv, while the
# validation matrices used below are the '*_valid_up.npz' files. Confirm
# that their rows line up with df_valid.csv.
df_trn = pd.read_csv(os.path.join('data', 'df_train_up.csv'), index_col=['ex_id'])


In [40]:
# [train matrix, validation matrix] file-name pairs for the up-sampled features.
datasets = [[f'{stem}_train_up.npz', f'{stem}_valid_up.npz']
            for stem in ('wordcount', 'tfidfnorm_trim')]

In [41]:
# Same default Bernoulli Naive Bayes run, now over the up-sampled matrix
# pairs; each metrics dict is added to `results`.
for data in datasets:
    train_file, valid_file = data
    run_metrics = train_val(train_file, valid_file, df_trn,
                            BernoulliNB(), 'Naive Bayes (default)')
    results.append(run_metrics)

### N-Gram Datasets

In [43]:
# Switch df_trn back from the up-sampled frame to df_train.csv (the same file
# used for the standard datasets) before the n-gram runs.
df_trn = pd.read_csv(os.path.join('data', 'df_train.csv'), index_col=['ex_id'])

In [44]:
# [train matrix, validation matrix] file-name pairs for the n-gram features.
datasets = [[f'{stem}_train.npz', f'{stem}_valid.npz']
            for stem in ('ngram_lem', 'ngram_snow_tfidf')]

In [45]:
# Same default Bernoulli Naive Bayes run over the n-gram matrix pairs; each
# metrics dict is added to `results`.
for data in datasets:
    train_file, valid_file = data
    run_metrics = train_val(train_file, valid_file, df_trn,
                            BernoulliNB(), 'Naive Bayes (default)')
    results.append(run_metrics)

In [46]:
# One row per train_val run; the columns are the keys of the metrics dicts.
results_df = pd.DataFrame(results)

In [47]:
# Structural check: number of runs collected, column dtypes and non-null counts.
results_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   model    6 non-null      object 
 1   data     6 non-null      object 
 2   auc      6 non-null      float64
 3   ap       6 non-null      float64
 4   auc_trn  6 non-null      float64
 5   ap_trn   6 non-null      float64
dtypes: float64(4), object(2)
memory usage: 416.0+ bytes


In [49]:
# Rank the runs by validation average precision, breaking ties on validation
# AUC (best first). The sorted frame is only displayed, not stored.
results_df.sort_values(['ap', 'auc'], ascending=False)

Unnamed: 0,model,data,auc,ap,auc_trn,ap_trn
5,Naive Bayes (default),ngram_snow_tfidf_valid.npz,0.703675,0.208643,0.887085,0.474335
4,Naive Bayes (default),ngram_lem_valid.npz,0.705168,0.207704,0.878633,0.449419
3,Naive Bayes (default),tfidfnorm_trim_valid_up.npz,0.675065,0.195291,0.72883,0.714939
1,Naive Bayes (default),tfidfnorm_trim_valid.npz,0.678018,0.195122,0.704599,0.224764
0,Naive Bayes (default),wordcount_valid.npz,0.666764,0.186743,0.693051,0.211223
2,Naive Bayes (default),wordcount_valid_up.npz,0.66504,0.185383,0.706379,0.686171
