In [1]:
import pandas as pd
import numpy as np

import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab
from torchtext.vocab import GloVe

from collections import Counter

from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.utils import resample

In [2]:
df = pd.read_pickle('initial_clean.pkl')

In [3]:
# Remove rows with no comments
df = df[df['violations_orig'].notna()]
df.head()

Unnamed: 0,id_orig,name,license,facility_type,date_orig,inspection_type_orig,results_orig,violations_orig,id_re,date_re,results_re,time_between
16,577275,ROYALTY,1306130,Restaurant,2011-04-18,Canvass,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,577343,2011-05-24,Pass,-36
38,1345428,PRET A MANGER,2138418,Restaurant,2013-08-06,Canvass,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,1345448,2013-08-13,Pass,-7
43,1114379,PRET A MANGER,2138418,Restaurant,2012-07-23,Complaint,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,1114384,2012-07-31,Pass,-8
65,343293,CHIPOTLE MEXICAN GRILL,1379435,Restaurant,2010-08-17,Canvass,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,343310,2010-08-26,Pass,-9
72,2484973,HILLTOP FAMILY RESTAURANT,2652370,Restaurant,2021-02-19,Canvass Re-Inspection,Fail,"44. UTENSILS, EQUIPMENT & LINENS: PROPERLY STO...",2485081,2021-02-23,Pass w/ Conditions,-4


In [4]:
# Change 'Pass w/ Conditions' to 'Pass', and change target to 0/1 numeric
results_dict = {'Pass': 0, 'Fail': 1}
df['results_re'] = df['results_re'].str.replace('Pass w/ Conditions', 'Pass')
df['results_re'] = df['results_re'].apply(lambda x: results_dict[x])

### Split data into feature & target, and then into train and test sets

In [5]:
# Sort dataframe by inspection date
# Training set will include initial 80% of inspections
sorted_df = df.sort_values(by=['date_orig'])

In [6]:
sorted_df.head()

Unnamed: 0,id_orig,name,license,facility_type,date_orig,inspection_type_orig,results_orig,violations_orig,id_re,date_re,results_re,time_between
494307,67738,MICHAEL'S ON MAIN CAFE,2008948,Restaurant,2010-01-04,License,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,124279,2010-01-19,0,-15
217726,104236,TEMPO CAFE,80916,Restaurant,2010-01-04,Canvass,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,104243,2010-01-12,0,-8
45936,80207,Delhi Darbar Kabob House,2013590,Restaurant,2010-01-05,License,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,72235,2010-01-12,0,-7
489892,98313,WHIPPLE STORE,2009114,Grocery Store,2010-01-05,License,Fail,"35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTR...",98371,2010-02-05,0,-31
491644,160209,CHICAGO SINAI CONGREGATION,75567,Daycare (2 - 6 Years),2010-01-05,License,Fail,32. FOOD AND NON-FOOD CONTACT SURFACES PROPERL...,160331,2010-04-12,0,-97


In [7]:
target_col = ['results_re']
target = sorted_df[target_col]

cols_to_exclude = ['name', 'id_orig', 'id_re', 'license']
feature_cols = [col for col in sorted_df.columns if (col not in cols_to_exclude and col not in target_col)]
features = sorted_df[feature_cols]

In [8]:
target.head()

Unnamed: 0,results_re
494307,0
217726,0
45936,0
489892,0
491644,0


In [9]:
features.head()

Unnamed: 0,facility_type,date_orig,inspection_type_orig,results_orig,violations_orig,date_re,time_between
494307,Restaurant,2010-01-04,License,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,2010-01-19,-15
217726,Restaurant,2010-01-04,Canvass,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,2010-01-12,-8
45936,Restaurant,2010-01-05,License,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,2010-01-12,-7
489892,Grocery Store,2010-01-05,License,Fail,"35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTR...",2010-02-05,-31
491644,Daycare (2 - 6 Years),2010-01-05,License,Fail,32. FOOD AND NON-FOOD CONTACT SURFACES PROPERL...,2010-04-12,-97


In [10]:
# Don't shuffle data before splitting
train_feat, test_feat, train_targ, test_targ = train_test_split(features, target, test_size=0.2,
                                                                shuffle=False)

In [11]:
train_feat.head()

Unnamed: 0,facility_type,date_orig,inspection_type_orig,results_orig,violations_orig,date_re,time_between
494307,Restaurant,2010-01-04,License,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,2010-01-19,-15
217726,Restaurant,2010-01-04,Canvass,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,2010-01-12,-8
45936,Restaurant,2010-01-05,License,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,2010-01-12,-7
489892,Grocery Store,2010-01-05,License,Fail,"35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTR...",2010-02-05,-31
491644,Daycare (2 - 6 Years),2010-01-05,License,Fail,32. FOOD AND NON-FOOD CONTACT SURFACES PROPERL...,2010-04-12,-97


In [12]:
pct_fail = len(train_targ[train_targ['results_re'] == 1]) / len(train_targ)
print(f'{pct_fail*100:.2f}% reinspections fail in training set')

8.77% reinspections fail in training set


### Split out the text features

In [13]:
text_col = ['violations_orig']
train_feat_txt = train_feat[text_col].astype(str)
test_feat_txt = test_feat[text_col].astype(str)

In [14]:
train_feat_txt.head()

Unnamed: 0,violations_orig
494307,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...
217726,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...
45936,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...
489892,"35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTR..."
491644,32. FOOD AND NON-FOOD CONTACT SURFACES PROPERL...


### Split the text into tokens

In [15]:
tokenizer = get_tokenizer('basic_english')
counter = Counter()

In [16]:
# Create vocab using training set only
for idx, text in train_feat_txt.itertuples():
    counter.update(tokenizer(text))

In [17]:
MIN_FREQ = 500
vocab = Vocab(counter, min_freq=MIN_FREQ)

In [18]:
len(vocab)

1081

### Create BOW features

In [19]:
def collate_into_bow(data, voc):
    '''
    Collate data into BOW features

    Inputs:
    data - Pandas dataframe
    voc - vocab object

    Returns:
    token counts - a vector of document lengths
    bow - a matrix of BOW embeddings
    '''
    bow = np.zeros((len(data), len(voc)))
    token_counts = np.zeros((len(data), 1))
    for i, (idx, text) in enumerate(data.itertuples()):
        counter = Counter()
        counter.update(tokenizer(text))
        line_vocab = Vocab(counter)
        tot_freqs = sum(line_vocab.freqs.values())
        token_counts[i] = tot_freqs
        for token in line_vocab.freqs:
            bow[i, voc.stoi[token]] = line_vocab.freqs[token] / tot_freqs  # Using relative frequencies
    return token_counts, bow

In [20]:
# Create training bow set
token_counts, bow = collate_into_bow(train_feat_txt, vocab)
bow_concat = np.concatenate((bow, token_counts), axis=1)
bow_df = pd.DataFrame(bow_concat, columns=vocab.itos + ['token_counts'])

In [21]:
# Create test bow set using vocab from training set
test_token_counts, test_bow = collate_into_bow(test_feat_txt, vocab)
test_bow_concat = np.concatenate((test_bow, test_token_counts), axis=1)
test_bow_df = pd.DataFrame(test_bow_concat, columns=vocab.itos + ['token_counts'])

### __Create CBOW features__

In [22]:
# You'll need to copy your vector cache to the folder with this notebook!
VECTORS_CACHE_DIR = './.vector_cache'
DIM_GLOVE = 300

glove = GloVe('6B',cache=VECTORS_CACHE_DIR)

In [23]:
def collate_into_cbow(data):
    '''
    Collate data into CBOW features

    Inputs:
    data - Pandas dataframe

    Returns:
    cbow - a matrix of CBOW embeddings
    '''
    cbow = np.zeros((len(data), DIM_GLOVE))
    for i, (idx, text) in enumerate(data.itertuples()):
        counter = Counter()
        counter.update(tokenizer(text))
        tokens = list(Vocab(counter).freqs)
        vecs = glove.get_vecs_by_tokens(tokens).numpy()
        cbow[i] = np.mean(vecs, axis=0)
    return cbow

In [24]:
# Create training cbow set
cbow = collate_into_cbow(train_feat_txt)
cbow_df = pd.DataFrame(cbow)

In [25]:
# Create test cbow set
test_cbow = collate_into_cbow(test_feat_txt)
test_cbow_df = pd.DataFrame(test_cbow)

### Create ngram features

In [26]:
# Using CountVectorizer to get ngrams
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
from sklearn.feature_extraction.text import CountVectorizer

In [27]:
def divide_rows_by_row_sum(arr):
        '''
        Divides the rows of a given numpy array by the sum, account for zeroes
        '''
    return np.nan_to_num(
            np.divide(arr, arr.sum(axis=1)[:, None]),
            nan=0  # Small number of ngram rows are all zeroes; don't divide row by 0 
    )

In [28]:
vectorizer = CountVectorizer(min_df=MIN_FREQ, ngram_range=(2,2))

In [29]:
corpus = train_feat_txt['violations_orig'].to_list()

In [30]:
# Create training ngram set
X = vectorizer.fit_transform(corpus)
ngram_arr = X.toarray()
ngram_arr = divide_rows_by_row_sum(ngram_arr)
ngram_arr = np.concatenate((ngram_arr, token_counts), axis=1)

In [31]:
# Create test ngram set

corpus_test = test_feat_txt['violations_orig'].to_list()
X_test = vectorizer.transform(corpus_test)
test_ngram_arr = X_test.toarray()
test_ngram_arr = divide_rows_by_row_sum(test_ngram_arr)
test_ngram_arr = np.concatenate((test_ngram_arr, test_token_counts), axis=1)

  np.divide(arr, arr.sum(axis=1)[:, None]),


In [32]:
test_ngram_arr.shape

(7311, 1905)

### Split train set into train and validation

In [33]:
# Don't need to split into validation set because we use cross validation instead
#train_bow, val_bow, train_targ_spl, val_targ = train_test_split(bow_df, train_targ, test_size=0.2,
#                                                                shuffle=False)

#train_cbow, val_cbow = train_test_split(cbow_df, test_size=0.2, shuffle=False)
#train_2gram, val_2gram = train_test_split(ngram_arr, test_size=0.2, shuffle=False)

train_bow, train_targ_spl = bow_df, train_targ
train_cbow = cbow_df
train_2gram = ngram_arr

In [34]:
#Oversample on target vector
train_targ_reset = train_targ_spl.reset_index().drop('index', axis=1)
train_targ_fail = train_targ_reset[train_targ_reset['results_re'] == 1]
size_diff = train_targ_reset.shape[0] - train_targ_fail.shape[0]
train_resample = resample(train_targ_fail, n_samples = size_diff, replace=True)
train_targ_all = pd.concat([train_targ_reset, train_resample])

In [35]:
#then resample feature matrices
train_bow_resample = train_bow.iloc[train_targ_all.index]
train_cbow_resample = train_cbow.iloc[train_targ_all.index]
train_2gram_resample = train_2gram[[train_targ_all.index]]

  train_2gram_resample = train_2gram[[train_targ_all.index]]


In [36]:
# Model expects 1d array as target
train_targ_1d = train_targ_all.iloc[:, 0].ravel()
test_targ_1d = test_targ.iloc[:, 0].ravel()

### Set model baseline

Baseline for model will be if we predict 'pass' every time. If model cannot beat these scores, then it is not good!

In [37]:
def print_clf_metrics(y_true, y_pred):
    '''
    Prints performance metrics for the given vectors

    Inputs:
        y_true - a vector; the true labels
        y_pred - a vector; the predicted labels
    
    Returns:
        None
    '''
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_pred)

    print(f"""
    Accuracy: {accuracy:.2f}
    Precision: {precision:.2f}
    Recall: {recall:.2f}
    F1 score: {f1:.2f}
    ROC AUC: {auc:.2f}
    """)

In [39]:
# create y_pred array that includes only Passes
results = [0] * test_targ.shape[0]
d = {'results_re': results}
baseline_df = pd.DataFrame(data=d)

In [41]:
print_clf_metrics(test_targ['results_re'], baseline_df['results_re'])


    Accuracy: 0.88
    Precision: 0.00
    Recall: 0.00
    F1 score: 0.00
    ROC AUC: 0.50
    


### Naive Bayes model

BOW model:

In [42]:
# Use multinomial NB b/c features are discrete (counts)
nb_bow = MultinomialNB()
nb_bow.fit(train_bow_resample, train_targ_1d)

y_pred_bow_test = nb_bow.predict(test_bow_df)
print_clf_metrics(test_targ_1d, y_pred_bow_test)


    Accuracy: 0.36
    Precision: 0.13
    Recall: 0.83
    F1 score: 0.23
    ROC AUC: 0.57
    


CBOW model:

In [43]:
# Use Gaussian NB b/c features now continuous
nb_cbow = GaussianNB()
nb_cbow.fit(train_cbow_resample, train_targ_1d)

y_pred_cbow_test = nb_cbow.predict(test_cbow_df)
print_clf_metrics(test_targ_1d, y_pred_cbow_test)


    Accuracy: 0.67
    Precision: 0.17
    Recall: 0.49
    F1 score: 0.26
    ROC AUC: 0.59
    


2-gram model:

In [44]:
# Use multinomial NB b/c features are discrete (counts)
nb_2gram = MultinomialNB()
nb_2gram.fit(train_2gram_resample, train_targ_1d)

y_pred_2gram_test = nb_2gram.predict(test_ngram_arr)
print_clf_metrics(test_targ_1d, y_pred_2gram_test)


    Accuracy: 0.44
    Precision: 0.14
    Recall: 0.78
    F1 score: 0.24
    ROC AUC: 0.58
    


### Logistic Regression model

BOW model:

In [45]:
log_reg_bow = LogisticRegression(max_iter=5000)
log_reg_bow.fit(train_bow_resample, train_targ_1d)

log_y_pred_bow = log_reg_bow.predict(test_bow_df)
print_clf_metrics(test_targ_1d, log_y_pred_bow)


    Accuracy: 0.54
    Precision: 0.15
    Recall: 0.64
    F1 score: 0.24
    ROC AUC: 0.58
    


In [46]:
def get_top_features(model, vocabulary):
    '''
    Get the features of a regression model ordered by magnitude.

    Inputs:
        model - a sklearn model object
        vocabulary - a vector-like object of the vocabulary used in the model
    
    Returns:
        a sorted Pandas dataframe of the coefficients
    '''
    features = np.array(vocabulary).reshape(-1, 1)
    features = np.append(features, ['len_document'])
    coefficients = model.coef_[0]
    
    ranked_coeffs = pd.DataFrame(features, coefficients).reset_index()
    ranked_coeffs['abs'] = [abs(x) for x in ranked_coeffs['index']]

    ranked_coeffs = ranked_coeffs.sort_values('abs', ascending=False)
    ranked_coeffs.columns = ['coefficient', 'feature', 'coefficient (absolute)']

    return ranked_coeffs

In [47]:
top_reg_bow = get_top_features(log_reg_bow, vocab.itos)
top_reg_bow.head(10)

Unnamed: 0,coefficient,feature,coefficient (absolute)
2,-0.051965,.,0.051965
3,-0.039181,",",0.039181
4,-0.032513,and,0.032513
5,-0.022961,the,0.022961
10,-0.018806,to,0.018806
9,-0.018098,-,0.018098
8,-0.018006,comments,0.018006
13,-0.015763,clean,0.015763
7,-0.015606,of,0.015606
12,-0.013573,food,0.013573


CBOW model:

In [48]:
log_reg_cbow = LogisticRegression(max_iter=1000)
log_reg_cbow.fit(train_cbow_resample, train_targ_1d)

log_y_pred_cbow = log_reg_cbow.predict(test_cbow_df)
print_clf_metrics(test_targ_1d, log_y_pred_cbow)


    Accuracy: 0.44
    Precision: 0.14
    Recall: 0.78
    F1 score: 0.24
    ROC AUC: 0.59
    


2-gram model:

In [49]:
log_reg_2gram = LogisticRegression(max_iter=5000)
log_reg_2gram.fit(train_2gram_resample, train_targ_1d)

log_y_pred_2gram = log_reg_2gram.predict(test_ngram_arr)
print_clf_metrics(test_targ_1d, log_y_pred_2gram)


    Accuracy: 0.54
    Precision: 0.15
    Recall: 0.64
    F1 score: 0.24
    ROC AUC: 0.59
    


In [50]:
top_reg_bow = get_top_features(log_reg_2gram, vectorizer.get_feature_names())
top_reg_bow.head(10)

Unnamed: 0,coefficient,feature,coefficient (absolute)
981,-0.014985,instructed to,0.014985
196,-0.010717,and maintain,0.010717
427,-0.01002,clean and,0.01002
496,-0.008892,comments observed,0.008892
852,-0.007172,good repair,0.007172
1300,-0.007068,per code,0.007068
528,-0.007067,constructed per,0.007067
1522,-0.006955,shall be,0.006955
764,-0.00685,food and,0.00685
456,-0.006842,cleaning methods,0.006842


### Grid Search on Logistic Regression and SVM

In [51]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV


In [52]:
parameters_log = {'C': [0.3, 0.5, 0.7]}
#scikit learn uses L2 penalty (ridge) by default, with C=1

scoring = {'Accuracy': 'accuracy', 'Precision': 'precision', 'Recall': 'recall'}

def grid_search_log(train_x, train_y, max_iter=5000):
    '''
    Run a logistic regression grid search

    Inputs:
        train_x - Pandas dataframe of features
        train_y - Pandas series of targets
        max_iter - max number of iterations on models
    
    Returns:
        a GridSearchCV object
    '''
    grid_log = GridSearchCV(LogisticRegression(max_iter=max_iter, random_state=0), param_grid=parameters_log, scoring=scoring,
                            cv=5, refit='Precision')
    grid_log.fit(train_x, train_y)
    logs_all = pd.DataFrame.from_dict(grid_log.cv_results_)
    print(logs_all.loc[:, ['param_C', 'mean_test_Accuracy', 'mean_test_Precision', 'mean_test_Recall', 'rank_test_Precision']])
    return grid_log

In [53]:
grid_log = grid_search_log(train_bow_resample, train_targ_1d)

  param_C  mean_test_Accuracy  mean_test_Precision  mean_test_Recall  \
0     0.3            0.603889             0.627263          0.607735   
1     0.5            0.603889             0.627263          0.607735   
2     0.7            0.603889             0.627263          0.607735   

   rank_test_Precision  
0                    1  
1                    1  
2                    1  


In [54]:
top_reg_bow = get_top_features(grid_log.best_estimator_, vocab.itos)
top_reg_bow.head(10)

Unnamed: 0,coefficient,feature,coefficient (absolute)
2,-0.051964,.,0.051964
3,-0.03918,",",0.03918
4,-0.032512,and,0.032512
5,-0.022961,the,0.022961
10,-0.018806,to,0.018806
9,-0.018098,-,0.018098
8,-0.018006,comments,0.018006
13,-0.015763,clean,0.015763
7,-0.015606,of,0.015606
12,-0.013573,food,0.013573


In [55]:
y_pred_bow_test = grid_log.best_estimator_.predict(test_bow_df)
print_clf_metrics(test_targ_1d, y_pred_bow_test)


    Accuracy: 0.54
    Precision: 0.15
    Recall: 0.64
    F1 score: 0.24
    ROC AUC: 0.58
    


In [56]:
grid_log = grid_search_log(train_cbow_resample, train_targ_1d)

  param_C  mean_test_Accuracy  mean_test_Precision  mean_test_Recall  \
0     0.3            0.627584             0.626112          0.753275   
1     0.5            0.629802             0.628720          0.751770   
2     0.7            0.631572             0.630384          0.752317   

   rank_test_Precision  
0                    3  
1                    2  
2                    1  


In [57]:
y_pred_cbow_test = grid_log.best_estimator_.predict(test_cbow_df)
print_clf_metrics(test_targ_1d, y_pred_cbow_test)


    Accuracy: 0.44
    Precision: 0.14
    Recall: 0.78
    F1 score: 0.24
    ROC AUC: 0.59
    


In [58]:
grid_log = grid_search_log(train_2gram_resample, train_targ_1d)

  param_C  mean_test_Accuracy  mean_test_Precision  mean_test_Recall  \
0     0.3            0.603746             0.627231          0.607222   
1     0.5            0.603746             0.627231          0.607222   
2     0.7            0.603746             0.627231          0.607222   

   rank_test_Precision  
0                    1  
1                    1  
2                    1  


In [59]:
top_reg_2gram = get_top_features(grid_log.best_estimator_, vectorizer.get_feature_names())
top_reg_2gram.head(10)

Unnamed: 0,coefficient,feature,coefficient (absolute)
981,-0.014985,instructed to,0.014985
196,-0.010717,and maintain,0.010717
427,-0.01002,clean and,0.01002
496,-0.008892,comments observed,0.008892
852,-0.007172,good repair,0.007172
1300,-0.007068,per code,0.007068
528,-0.007067,constructed per,0.007067
1522,-0.006955,shall be,0.006955
764,-0.00685,food and,0.00685
456,-0.006842,cleaning methods,0.006842


In [60]:
y_pred_2gram_test = grid_log.best_estimator_.predict(test_ngram_arr)
print_clf_metrics(test_targ_1d, y_pred_2gram_test)


    Accuracy: 0.54
    Precision: 0.15
    Recall: 0.64
    F1 score: 0.24
    ROC AUC: 0.59
    


In [61]:
parameters_svm = {'C': [0.1, 1, 10]}

scoring = {'Accuracy': 'accuracy', 'Precision': 'precision', 'Recall': 'recall'}

def grid_search_svm(train_x, train_y, max_iter=10000):
    '''
    Run a SVM grid search

    Inputs:
        train_x - Pandas dataframe of features
        train_y - Pandas series of targets
        max_iter - max number of iterations on models
    
    Returns:
        a GridSearchCV object
    '''
    grid_svm = GridSearchCV(LinearSVC(max_iter=max_iter, random_state=0), param_grid=parameters_log, scoring=scoring,
                            cv=3, refit='Precision')
    grid_svm.fit(train_x, train_y)
    svm_all = pd.DataFrame.from_dict(grid_svm.cv_results_)
    print(svm_all.loc[:, ['param_C', 'mean_test_Accuracy', 'mean_test_Precision', 'mean_test_Recall', 'rank_test_Precision']])
    return grid_svm

In [62]:
grid_svm = grid_search_svm(train_bow_resample, train_targ_1d)

  param_C  mean_test_Accuracy  mean_test_Precision  mean_test_Recall  \
0     0.3            0.542373             0.535141          0.950724   
1     0.5            0.476154             0.548495          0.010156   
2     0.7            0.515084             0.547193          0.619682   

   rank_test_Precision  
0                    3  
1                    1  
2                    2  


In [63]:
y_pred_bow_test = grid_svm.best_estimator_.predict(test_bow_df)
print_clf_metrics(test_targ_1d, y_pred_bow_test)


    Accuracy: 0.37
    Precision: 0.14
    Recall: 0.83
    F1 score: 0.23
    ROC AUC: 0.57
    


In [64]:
grid_svm_cbow = grid_search_svm(train_cbow_resample, train_targ_1d)

  param_C  mean_test_Accuracy  mean_test_Precision  mean_test_Recall  \
0     0.3            0.600223             0.598285          0.753069   
1     0.5            0.600581             0.598762          0.753035   
2     0.7            0.599794             0.598281          0.751599   

   rank_test_Precision  
0                    2  
1                    1  
2                    3  


In [65]:
y_pred_cbow_test = grid_svm_cbow.best_estimator_.predict(test_cbow_df)
print_clf_metrics(test_targ_1d, y_pred_cbow_test)


    Accuracy: 0.45
    Precision: 0.14
    Recall: 0.76
    F1 score: 0.24
    ROC AUC: 0.59
    


In [66]:
grid_svm_ngrams = grid_search_svm(train_2gram_resample, train_targ_1d)

  param_C  mean_test_Accuracy  mean_test_Precision  mean_test_Recall  \
0     0.3            0.473257             0.586346          0.199631   
1     0.5            0.477012             0.552989          0.012789   
2     0.7            0.495414             0.539898          0.337608   

   rank_test_Precision  
0                    1  
1                    2  
2                    3  


In [67]:
y_pred_ngram_test = grid_svm_ngrams.best_estimator_.predict(test_ngram_arr)
print_clf_metrics(test_targ_1d, y_pred_ngram_test)


    Accuracy: 0.88
    Precision: 0.00
    Recall: 0.00
    F1 score: 0.00
    ROC AUC: 0.50
    
