In [1]:
import pandas as pd
import numpy as np

import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab
from torchtext.vocab import GloVe

from collections import Counter

from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.utils import resample

__Notes:__
- If we want to keep our prediction binary, we should change all 'Pass w/ Conditions' in the `results_re` field to 'Pass'.
- We should also make this field numeric, with Fail as 1 and Pass as 0 (as 'Fail' is the event we are trying to predict). I think we can add both of these to the `data_prep` notebook.

__Questions to discuss:__
- __Shuffling during train/test split__: When we do our train/test split, should we sort our data by inspection date first? I know this matters for time series data (where we'd want to avoid leakage of future data into the past), but not sure how much it matters here, where the inspections are generally distinct establishments. Below I did sort the data, and didn't shuffle during the train/test split, but would like your thoughts on whether this matters.
- __Word frequency threshold__: In our HW 2, we only included in our vocab words that appeared at least 1000 times in the entire corpus. We'll need to decide what threshold we want to use. I used 1000 below, and also listed the vocab counts at different thresholds.
- __Pipeline operations__: What kinds of additional cleaning steps should we perform in our pipeline? Right now, we're just converting words to lower case and splitting them with a tokenizer. Other possible steps include removing stop words and lemmatizing (https://www.geeksforgeeks.org/python-lemmatization-approaches-with-examples/). I'd recommend we go to Amitabh's office hours to get his thoughts on an appropriate 'good practice' pipeline.
- __Relative vs. absolute frequencies for BOW__: In our HW, we implemented BOW with relative frequencies, but I think for our purposes absolute frequencies make more sense, as more words/longer comments can mean more violations. Do you think this makes sense?
- __Choice of n for ngrams__: I made an initial choice of 2, which seemed reasonable to me. Would like your thoughts on this, though.

In [2]:
df = pd.read_pickle('initial_clean.pkl')

In [3]:
df.head()

Unnamed: 0,id_orig,name,license,facility_type,date_orig,inspection_type_orig,results_orig,violations_orig,id_re,date_re,results_re,time_between
16,577275,ROYALTY,1306130,Restaurant,2011-04-18,Canvass,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,577343,2011-05-24,Pass,-36
38,1345428,PRET A MANGER,2138418,Restaurant,2013-08-06,Canvass,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,1345448,2013-08-13,Pass,-7
43,1114379,PRET A MANGER,2138418,Restaurant,2012-07-23,Complaint,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,1114384,2012-07-31,Pass,-8
65,343293,CHIPOTLE MEXICAN GRILL,1379435,Restaurant,2010-08-17,Canvass,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,343310,2010-08-26,Pass,-9
72,2484973,HILLTOP FAMILY RESTAURANT,2652370,Restaurant,2021-02-19,Canvass Re-Inspection,Fail,"44. UTENSILS, EQUIPMENT & LINENS: PROPERLY STO...",2485081,2021-02-23,Pass w/ Conditions,-4


In [4]:
# Change 'Pass w/ Conditions' to 'Pass', and change target to 0/1 numeric
results_dict = {'Pass': 0, 'Fail': 1}
df['results_re'] = df['results_re'].str.replace('Pass w/ Conditions', 'Pass')
df['results_re'] = df['results_re'].apply(lambda x: results_dict[x])

### Split data into feature & target, and then into train and test sets

In [5]:
# Sort dataframe by inspection date
# Training set will include initial 80% of inspections
sorted_df = df.sort_values(by=['date_orig'])

In [6]:
sorted_df.head()

Unnamed: 0,id_orig,name,license,facility_type,date_orig,inspection_type_orig,results_orig,violations_orig,id_re,date_re,results_re,time_between
217726,104236,TEMPO CAFE,80916,Restaurant,2010-01-04,Canvass,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,104243,2010-01-12,0,-8
494307,67738,MICHAEL'S ON MAIN CAFE,2008948,Restaurant,2010-01-04,License,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,124279,2010-01-19,0,-15
452398,67736,"MONTICELLO FOOD MART, INC",2013259,Grocery Store,2010-01-05,License,Fail,21. * CERTIFIED FOOD MANAGER ON SITE WHEN POTE...,54219,2010-01-08,0,-3
281119,67741,CITGO,2013296,Grocery Store,2010-01-05,License,Fail,2. FACILITIES TO MAINTAIN PROPER TEMPERATURE -...,176270,2010-02-17,0,-43
460847,67744,GOLDEN CROWN RESTAURANT,2013539,Restaurant,2010-01-05,License,Fail,32. FOOD AND NON-FOOD CONTACT SURFACES PROPERL...,48215,2010-01-11,1,-6


In [7]:
target_col = ['results_re']
target = sorted_df[target_col]

cols_to_exclude = ['name', 'id_orig', 'id_re', 'license']
feature_cols = [col for col in sorted_df.columns if (col not in cols_to_exclude and col not in target_col)]
features = sorted_df[feature_cols]

In [8]:
target.head()

Unnamed: 0,results_re
217726,0
494307,0
452398,0
281119,0
460847,1


In [9]:
features.head()

Unnamed: 0,facility_type,date_orig,inspection_type_orig,results_orig,violations_orig,date_re,time_between
217726,Restaurant,2010-01-04,Canvass,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,2010-01-12,-8
494307,Restaurant,2010-01-04,License,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,2010-01-19,-15
452398,Grocery Store,2010-01-05,License,Fail,21. * CERTIFIED FOOD MANAGER ON SITE WHEN POTE...,2010-01-08,-3
281119,Grocery Store,2010-01-05,License,Fail,2. FACILITIES TO MAINTAIN PROPER TEMPERATURE -...,2010-02-17,-43
460847,Restaurant,2010-01-05,License,Fail,32. FOOD AND NON-FOOD CONTACT SURFACES PROPERL...,2010-01-11,-6


In [10]:
# Don't shuffle data before splitting
train_feat, test_feat, train_targ, test_targ = train_test_split(features, target, test_size=0.2,
                                                                shuffle=False)

In [11]:
train_feat.head()

Unnamed: 0,facility_type,date_orig,inspection_type_orig,results_orig,violations_orig,date_re,time_between
217726,Restaurant,2010-01-04,Canvass,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,2010-01-12,-8
494307,Restaurant,2010-01-04,License,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,2010-01-19,-15
452398,Grocery Store,2010-01-05,License,Fail,21. * CERTIFIED FOOD MANAGER ON SITE WHEN POTE...,2010-01-08,-3
281119,Grocery Store,2010-01-05,License,Fail,2. FACILITIES TO MAINTAIN PROPER TEMPERATURE -...,2010-02-17,-43
460847,Restaurant,2010-01-05,License,Fail,32. FOOD AND NON-FOOD CONTACT SURFACES PROPERL...,2010-01-11,-6


In [12]:
pct_fail = len(train_targ[train_targ['results_re'] == 1]) / len(train_targ)
print(f'{pct_fail*100:.2f}% reinspections fail in training set')

8.80% reinspections fail in training set


### Split out the text features

In [13]:
text_col = ['violations_orig']
train_feat_txt = train_feat[text_col].astype(str)
test_feat_txt = test_feat[text_col].astype(str)

In [14]:
train_feat_txt.head()

Unnamed: 0,violations_orig
217726,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...
494307,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...
452398,21. * CERTIFIED FOOD MANAGER ON SITE WHEN POTE...
281119,2. FACILITIES TO MAINTAIN PROPER TEMPERATURE -...
460847,32. FOOD AND NON-FOOD CONTACT SURFACES PROPERL...


### Split the text into tokens

In [15]:
tokenizer = get_tokenizer('basic_english')
counter = Counter()

In [16]:
# Create vocab using training set only!!!!
for idx, text in train_feat_txt.itertuples():
    counter.update(tokenizer(text))

In [17]:
# Not sure what the min frequency should be
# Min freq = 1 -> 50768 vocab length
# Min freq = 50 -> 2942
# Min freq = 100 -> 2211
# Min freq = 250 -> 1481
# Min freq = 500 -> 1107
# Min freq = 1000 -> 806
MIN_FREQ = 500
vocab = Vocab(counter, min_freq=MIN_FREQ)

In [18]:
len(vocab)

1086

### Create BOW features

Note - I think we should use absolute rather than relative frequencies for the BOW vectors, as greater number of violations likely provides relevant information

In [19]:
def collate_into_bow(data, voc):
    bow = np.zeros((len(data), len(voc)))
    token_counts = np.zeros((len(data), 1))
    for i, (idx, text) in enumerate(data.itertuples()):
        counter = Counter()
        counter.update(tokenizer(text))
        line_vocab = Vocab(counter)
        tot_freqs = sum(line_vocab.freqs.values())
        token_counts[i] = tot_freqs
        for token in line_vocab.freqs:
            bow[i, voc.stoi[token]] = line_vocab.freqs[token] / tot_freqs  # Using relative frequencies
    return token_counts, bow

In [20]:
# Create training bow set
token_counts, bow = collate_into_bow(train_feat_txt, vocab)
bow_concat = np.concatenate((bow, token_counts), axis=1)
bow_df = pd.DataFrame(bow_concat, columns=vocab.itos + ['token_counts'])

In [21]:
# Create test bow set using vocab from training set
test_token_counts, test_bow = collate_into_bow(test_feat_txt, vocab)
test_bow_concat = np.concatenate((test_bow, test_token_counts), axis=1)
test_bow_df = pd.DataFrame(test_bow_concat, columns=vocab.itos + ['token_counts'])

### __Create CBOW features__

In [22]:
# You'll need to copy your vector cache to the folder with this notebook!
VECTORS_CACHE_DIR = './.vector_cache'
DIM_GLOVE = 300

glove = GloVe('6B',cache=VECTORS_CACHE_DIR)

In [23]:
def collate_into_cbow(data):
    cbow = np.zeros((len(data), DIM_GLOVE))
    for i, (idx, text) in enumerate(data.itertuples()):
        counter = Counter()
        counter.update(tokenizer(text))
        tokens = list(Vocab(counter).freqs)
        vecs = glove.get_vecs_by_tokens(tokens).numpy()
        cbow[i] = np.mean(vecs, axis=0)
    return cbow

In [24]:
# Create training cbow set
cbow = collate_into_cbow(train_feat_txt)
cbow_df = pd.DataFrame(cbow)

In [25]:
# Create test cbow set
test_cbow = collate_into_cbow(train_feat_txt)
test_cbow_df = pd.DataFrame(cbow)

### Create ngram features

In [26]:
# Using CountVectorizer to get ngrams (this was the most intuitive tool I could find...)
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
from sklearn.feature_extraction.text import CountVectorizer

In [27]:
def divide_rows_by_row_sum(arr):
    return np.nan_to_num(
            np.divide(arr, arr.sum(axis=1)[:, None]),
            nan=0  # Small number of ngram rows are all zeroes; don't divide row by 0 
    )

In [28]:
vectorizer = CountVectorizer(min_df=MIN_FREQ, ngram_range=(2,2))

In [29]:
corpus = train_feat_txt['violations_orig'].to_list()

In [30]:
# Create training ngram set
X = vectorizer.fit_transform(corpus)
ngram_arr = X.toarray()
ngram_arr = divide_rows_by_row_sum(ngram_arr)
ngram_arr = np.concatenate((ngram_arr, token_counts), axis=1)

  np.divide(arr, arr.sum(axis=1)[:, None]),


In [31]:
# Create test ngram set

# I'm pretty sure this is how it's done?
corpus_test = test_feat_txt['violations_orig'].to_list()
X_test = vectorizer.transform(corpus_test)
test_ngram_arr = X_test.toarray()
test_ngram_arr = divide_rows_by_row_sum(test_ngram_arr)
test_ngram_arr = np.concatenate((test_ngram_arr, test_token_counts), axis=1)

  np.divide(arr, arr.sum(axis=1)[:, None]),


In [32]:
# Should be same dimensionality as training array (1088)
test_ngram_arr.shape

(7442, 1907)

### Split train set into train and validation

In [33]:
# Don't shuffle data before splitting
train_bow, val_bow, train_targ_spl, val_targ = train_test_split(bow_df, train_targ, test_size=0.2,
                                                                shuffle=False)

train_cbow, val_cbow = train_test_split(cbow_df, test_size=0.2, shuffle=False)
train_2gram, val_2gram = train_test_split(ngram_arr, test_size=0.2, shuffle=False)

In [34]:
#Oversample on target vector
train_targ_reset = train_targ_spl.reset_index().drop('index', axis=1)
train_targ_fail = train_targ_reset[train_targ_reset['results_re'] == 1]
size_diff = train_targ_reset.shape[0] - train_targ_fail.shape[0]
train_resample = resample(train_targ_fail, n_samples = size_diff, replace=True)
train_targ_all = pd.concat([train_targ_reset, train_resample])

In [35]:
#then resample feature matrices
train_bow_resample = train_bow.iloc[train_targ_all.index]
train_cbow_resample = train_cbow.iloc[train_targ_all.index]
train_2gram_resample = train_2gram[[train_targ_all.index]]

  train_2gram_resample = train_2gram[[train_targ_all.index]]


In [36]:
# Model expects 1d array as target
#train_targ_1d = train_targ_spl.iloc[:, 0].ravel()
train_targ_1d = train_targ_all.iloc[:, 0].ravel()
val_targ_1d = val_targ.iloc[:, 0].ravel()
test_targ_1d = test_targ.iloc[:, 0].ravel()

### Set model baseline

Baseline for model will be if we predict 'pass' every time. If model cannot beat these scores, then it is not good!

In [37]:
def print_clf_metrics(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_pred)

    print(f"""
    Accuracy: {accuracy:.2f}
    Precision: {precision:.2f}
    Recall: {recall:.2f}
    F1 score: {f1:.2f}
    ROC AUC: {auc:.2f}
    """)

In [38]:
# create y_pred array that includes only Passes
results = [0] * val_targ.shape[0]
d = {'results_re': results}
baseline_df = pd.DataFrame(data=d)

In [39]:
print_clf_metrics(val_targ['results_re'], baseline_df['results_re'])


    Accuracy: 0.91
    Precision: 0.00
    Recall: 0.00
    F1 score: 0.00
    ROC AUC: 0.50
    


### Naive Bayes model

BOW model:

In [40]:
# Use multinomial NB b/c features are discrete (counts)
nb_bow = MultinomialNB()
nb_bow.fit(train_bow_resample, train_targ_1d)

y_pred_bow = nb_bow.predict(val_bow)
print_clf_metrics(val_targ_1d, y_pred_bow)


    Accuracy: 0.47
    Precision: 0.12
    Recall: 0.78
    F1 score: 0.22
    ROC AUC: 0.61
    


CBOW model:

In [41]:
# Use Gaussian NB b/c features now continuous
nb_cbow = GaussianNB()
nb_cbow.fit(train_cbow_resample, train_targ_1d)

y_pred_cbow = nb_cbow.predict(val_cbow)
print_clf_metrics(val_targ_1d, y_pred_cbow)


    Accuracy: 0.89
    Precision: 0.03
    Recall: 0.01
    F1 score: 0.01
    ROC AUC: 0.49
    


2-gram model:

In [42]:
# Use multinomial NB b/c features are discrete (counts)
nb_2gram = MultinomialNB()
nb_2gram.fit(train_2gram_resample, train_targ_1d)

y_pred_2gram = nb_2gram.predict(val_2gram)
print_clf_metrics(val_targ_1d, y_pred_2gram)


    Accuracy: 0.53
    Precision: 0.14
    Recall: 0.76
    F1 score: 0.23
    ROC AUC: 0.63
    


### Logistic Regression model

BOW model:

In [43]:
log_reg_bow = LogisticRegression(max_iter=5000)
log_reg_bow.fit(train_bow_resample, train_targ_1d)
log_y_pred_bow = log_reg_bow.predict(val_bow)
print_clf_metrics(val_targ_1d, log_y_pred_bow)


    Accuracy: 0.61
    Precision: 0.14
    Recall: 0.64
    F1 score: 0.23
    ROC AUC: 0.62
    


CBOW model:

In [44]:
log_reg_cbow = LogisticRegression(max_iter=1000)
log_reg_cbow.fit(train_cbow_resample, train_targ_1d)
log_y_pred_cbow = log_reg_cbow.predict(val_cbow)
print_clf_metrics(val_targ_1d, log_y_pred_cbow)


    Accuracy: 0.63
    Precision: 0.16
    Recall: 0.72
    F1 score: 0.26
    ROC AUC: 0.67
    


2-gram model:

In [45]:
log_reg_2gram = LogisticRegression(max_iter=5000)
log_reg_2gram.fit(train_2gram_resample, train_targ_1d)

log_y_pred_2gram = log_reg_2gram.predict(val_2gram)
print_clf_metrics(val_targ_1d, log_y_pred_2gram)


    Accuracy: 0.61
    Precision: 0.15
    Recall: 0.66
    F1 score: 0.24
    ROC AUC: 0.63
    


### Grid Search on Logistic Regression and SVM

In [38]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV


In [48]:
parameters_log = {'C': [0.3, 0.5, 0.7]}
#scikit learn uses L2 penalty (ridge) by default, with C=1

scoring = {'Accuracy': 'accuracy', 'Precision': 'precision', 'Recall': 'recall'}

def grid_search_log(train_x, train_y, max_iter=5000):
    grid_log = GridSearchCV(LogisticRegression(max_iter=max_iter, random_state=0), param_grid=parameters_log, scoring=scoring,
                            cv=5, refit='Precision')
    grid_log.fit(train_x, train_y)
    logs_all = pd.DataFrame.from_dict(grid_log.cv_results_)
    return(logs_all.loc[:, ['param_C', 'mean_test_Accuracy', 'mean_test_Precision', 'mean_test_Recall', 'rank_test_Precision']])

In [49]:
grid_search_log(train_bow_resample, train_targ_1d)

Unnamed: 0,param_C,mean_test_Accuracy,mean_test_Precision,mean_test_Recall,rank_test_Precision
0,0.3,0.591527,0.613999,0.625384,1
1,0.5,0.587861,0.610199,0.631809,2
2,0.7,0.584546,0.606823,0.635421,3


In [50]:
grid_search_log(train_cbow_resample, train_targ_1d)

Unnamed: 0,param_C,mean_test_Accuracy,mean_test_Precision,mean_test_Recall,rank_test_Precision
0,0.3,0.627966,0.623519,0.757549,3
1,0.5,0.630469,0.62599,0.756037,2
2,0.7,0.631435,0.627015,0.755072,1


In [51]:
grid_search_log(train_2gram_resample, train_targ_1d)

Unnamed: 0,param_C,mean_test_Accuracy,mean_test_Precision,mean_test_Recall,rank_test_Precision
0,0.3,0.60202,0.622411,0.618034,2
1,0.5,0.599868,0.619125,0.619672,3
2,0.7,0.603271,0.622957,0.621184,1


In [54]:
parameters_svm = {'C': [0.1, 1, 10]}

scoring = {'Accuracy': 'accuracy', 'Precision': 'precision', 'Recall': 'recall'}

def grid_search_svm(train_x, train_y, max_iter=10000):
    grid_svm = GridSearchCV(LinearSVC(max_iter=max_iter, random_state=0), param_grid=parameters_log, scoring=scoring,
                            cv=3, refit='Precision')
    grid_svm.fit(train_x, train_y)
    svm_all = pd.DataFrame.from_dict(grid_svm.cv_results_)
    return(svm_all.loc[:, ['param_C', 'mean_test_Accuracy', 'mean_test_Precision', 'mean_test_Recall', 'rank_test_Precision']])

In [53]:
grid_search_svm(train_bow_resample, train_targ_1d)



Unnamed: 0,param_C,mean_test_Accuracy,mean_test_Precision,mean_test_Recall,rank_test_Precision
0,0.3,0.525431,0.558997,0.574384,3
1,0.5,0.528438,0.571346,0.51459,2
2,0.7,0.504774,0.613656,0.410542,1


In [57]:
grid_search_svm(train_cbow_resample, train_targ_1d)

Unnamed: 0,param_C,mean_test_Accuracy,mean_test_Precision,mean_test_Recall,rank_test_Precision
0,0.3,0.578158,0.57939,0.752005,1
1,0.5,0.576358,0.578143,0.749234,3
2,0.7,0.577719,0.5792,0.750451,2


In [59]:
grid_search_svm(train_2gram_resample, train_targ_1d)



Unnamed: 0,param_C,mean_test_Accuracy,mean_test_Precision,mean_test_Recall,rank_test_Precision
0,0.3,0.528438,0.526754,0.963294,3
1,0.5,0.509714,0.567246,0.641174,2
2,0.7,0.510109,0.582666,0.632229,1
