In [1]:
import pandas as pd
import numpy as np

import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab
from torchtext.vocab import GloVe

from collections import Counter

from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.utils import resample

__Notes:__
- If we want to keep our prediction binary, we should change all 'Pass w/ Conditions' in the `results_re` field to 'Pass'.
- We should also make this field numeric, with Fail as 1 and Pass as 0 (as 'Fail' is the event we are trying to predict). I think we can add both of these to the `data_prep` notebook.

__Questions to discuss:__
- __Shuffling during train/test split__: When we do our train/test split, should we sort our data by inspection date first? I know this matters for time series data (where we'd want to avoid leakage of future data into the past), but not sure how much it matters here, where the inspections are generally distinct establishments. Below I did sort the data, and didn't shuffle during the train/test split, but would like your thoughts on whether this matters.
- __Word frequency threshold__: In our HW 2, we only included in our vocab words that appeared at least 1000 times in the entire corpus. We'll need to decide what threshold we want to use. I used 1000 below, and also listed the vocab counts at different thresholds.
- __Pipeline operations__: What kinds of additional cleaning steps should we perform in our pipeline? Right now, we're just converting words to lower case and splitting them with a tokenizer. Other possible steps include removing stop words and lemmatizing (https://www.geeksforgeeks.org/python-lemmatization-approaches-with-examples/). I'd recommend we go to Amitabh's office hours to get his thoughts on an appropriate 'good practice' pipeline.
- __Relative vs. absolute frequencies for BOW__: In our HW, we implemented BOW with relative frequencies, but I think for our purposes absolute frequencies make more sense, as more words/longer comments can mean more violations. Do you think this makes sense?
- __Choice of n for ngrams__: I made an initial choice of 2, which seemed reasonable to me. Would like your thoughts on this, though.

In [2]:
df = pd.read_pickle('initial_clean.pkl')

In [3]:
df.head()

Unnamed: 0,id_orig,name,license,facility_type,date_orig,inspection_type_orig,results_orig,violations_orig,id_re,date_re,results_re,time_between
10,1989768,VOLARE,2141813,Restaurant,2017-02-27,Canvass,Fail,"11. ADEQUATE NUMBER, CONVENIENT, ACCESSIBLE, D...",1989902,2017-03-01,Pass,-2
13,1632809,VOLARE,2141813,Restaurant,2016-03-08,Canvass,Fail,"16. FOOD PROTECTED DURING STORAGE, PREPARATION...",1734225,2016-03-15,Pass,-7
32,1448101,STREETERS TAVERN,8864,TAVERN,2015-12-01,Complaint,Fail,"16. FOOD PROTECTED DURING STORAGE, PREPARATION...",1448119,2015-12-11,Pass,-10
36,2453925,MISS SAIGON,2699550,Restaurant,2020-10-28,Canvass,Fail,58. ALLERGEN TRAINING AS REQUIRED - Comments: ...,2456264,2020-11-05,Pass,-8
39,2352289,MISS SAIGON,2699550,Restaurant,2019-11-25,License,Fail,5. PROCEDURES FOR RESPONDING TO VOMITING AND D...,2352403,2019-11-27,Pass w/ Conditions,-2


In [4]:
# Change 'Pass w/ Conditions' to 'Pass', and change target to 0/1 numeric
results_dict = {'Pass': 0, 'Fail': 1}
df['results_re'] = df['results_re'].str.replace('Pass w/ Conditions', 'Pass')
df['results_re'] = df['results_re'].apply(lambda x: results_dict[x])

### Split data into feature & target, and then into train and test sets

In [5]:
# Sort dataframe by inspection date
# Training set will include initial 80% of inspections
sorted_df = df.sort_values(by=['date_orig'])

In [6]:
sorted_df.head()

Unnamed: 0,id_orig,name,license,facility_type,date_orig,inspection_type_orig,results_orig,violations_orig,id_re,date_re,results_re,time_between
236038,104236,TEMPO CAFE,80916,Restaurant,2010-01-04,Canvass,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,104243,2010-01-12,0,-8
494289,67738,MICHAEL'S ON MAIN CAFE,2008948,Restaurant,2010-01-04,License,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,124279,2010-01-19,0,-15
492554,120271,IZUMI SUSHI BAR & RESTAURANT,1357260,Restaurant,2010-01-05,Complaint,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,120289,2010-01-13,0,-8
97666,118297,MAXWELL STREET DEPOT INC.,18135,Restaurant,2010-01-05,Complaint,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,118308,2010-01-12,0,-7
290030,67741,CITGO,2013296,Grocery Store,2010-01-05,License,Fail,2. FACILITIES TO MAINTAIN PROPER TEMPERATURE -...,176270,2010-02-17,0,-43


In [7]:
target_col = ['results_re']
target = sorted_df[target_col]

cols_to_exclude = ['name', 'id_orig', 'id_re', 'license']
feature_cols = [col for col in sorted_df.columns if (col not in cols_to_exclude and col not in target_col)]
features = sorted_df[feature_cols]

In [8]:
target.head()

Unnamed: 0,results_re
236038,0
494289,0
492554,0
97666,0
290030,0


In [9]:
features.head()

Unnamed: 0,facility_type,date_orig,inspection_type_orig,results_orig,violations_orig,date_re,time_between
236038,Restaurant,2010-01-04,Canvass,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,2010-01-12,-8
494289,Restaurant,2010-01-04,License,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,2010-01-19,-15
492554,Restaurant,2010-01-05,Complaint,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,2010-01-13,-8
97666,Restaurant,2010-01-05,Complaint,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,2010-01-12,-7
290030,Grocery Store,2010-01-05,License,Fail,2. FACILITIES TO MAINTAIN PROPER TEMPERATURE -...,2010-02-17,-43


In [10]:
# Don't shuffle data before splitting
train_feat, test_feat, train_targ, test_targ = train_test_split(features, target, test_size=0.2,
                                                                shuffle=False)

In [11]:
train_feat.head()

Unnamed: 0,facility_type,date_orig,inspection_type_orig,results_orig,violations_orig,date_re,time_between
236038,Restaurant,2010-01-04,Canvass,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,2010-01-12,-8
494289,Restaurant,2010-01-04,License,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,2010-01-19,-15
492554,Restaurant,2010-01-05,Complaint,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,2010-01-13,-8
97666,Restaurant,2010-01-05,Complaint,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,2010-01-12,-7
290030,Grocery Store,2010-01-05,License,Fail,2. FACILITIES TO MAINTAIN PROPER TEMPERATURE -...,2010-02-17,-43


In [12]:
pct_fail = len(train_targ[train_targ['results_re'] == 1]) / len(train_targ)
print(f'{pct_fail*100:.2f}% reinspections fail in training set')

8.27% reinspections fail in training set


### Split out the text features

In [13]:
text_col = ['violations_orig']
train_feat_txt = train_feat[text_col].astype(str)
test_feat_txt = test_feat[text_col].astype(str)

In [14]:
train_feat_txt.head()

Unnamed: 0,violations_orig
236038,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...
494289,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...
492554,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...
97666,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...
290030,2. FACILITIES TO MAINTAIN PROPER TEMPERATURE -...


### Split the text into tokens

In [15]:
tokenizer = get_tokenizer('basic_english')
counter = Counter()

In [16]:
# Create vocab using training set only!!!!
for idx, text in train_feat_txt.itertuples():
    counter.update(tokenizer(text))

In [17]:
# Not sure what the min frequency should be
# Min freq = 1 -> 50768 vocab length
# Min freq = 50 -> 2942
# Min freq = 100 -> 2211
# Min freq = 250 -> 1481
# Min freq = 500 -> 1107
# Min freq = 1000 -> 806
MIN_FREQ = 500
vocab = Vocab(counter, min_freq=MIN_FREQ)

In [18]:
len(vocab)

1108

### Create BOW features

Note - I think we should use absolute rather than relative frequencies for the BOW vectors, as greater number of violations likely provides relevant information

In [19]:
# I think we can convert numpy arrays to tensors, so I think returning a numpy array here shouldn't be a problem?
def collate_into_bow(data, voc):
    bow = np.zeros((len(data), len(voc)))
    for i, (idx, text) in enumerate(data.itertuples()):
        counter = Counter()
        counter.update(tokenizer(text))
        line_vocab = Vocab(counter)
        for token in line_vocab.freqs:
            bow[i, voc.stoi[token]] = line_vocab.freqs[token]
    return bow

In [20]:
# Create training bow set
bow = collate_into_bow(train_feat_txt, vocab)
bow_df = pd.DataFrame(bow, columns=vocab.itos)

In [21]:
# Create test bow set using vocab from training set
test_bow = collate_into_bow(test_feat_txt, vocab)
test_bow_df = pd.DataFrame(test_bow, columns=vocab.itos)

In [22]:
bow_df.shape

(31698, 1108)

In [23]:
bow_df.head()

Unnamed: 0,<unk>,<pad>,.,",",and,the,in,of,comments,-,...,gun,disposed,legs,non-toxic,sweep,trays,#7-38-020,toiletroom,holder,180f
0,1.0,0.0,53.0,43.0,48.0,22.0,21.0,29.0,11.0,11.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,10.0,10.0,5.0,6.0,5.0,2.0,5.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,27.0,31.0,30.0,32.0,12.0,18.0,7.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,46.0,45.0,44.0,17.0,14.0,20.0,10.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,17.0,22.0,19.0,8.0,4.0,10.0,5.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### __Create CBOW features__

In [24]:
# You'll need to copy your vector cache to the folder with this notebook!
VECTORS_CACHE_DIR = './.vector_cache'
DIM_GLOVE = 300

glove = GloVe('6B',cache=VECTORS_CACHE_DIR)

In [25]:
def collate_into_cbow(data):
    cbow = np.zeros((len(data), DIM_GLOVE))
    for i, (idx, text) in enumerate(data.itertuples()):
        counter = Counter()
        counter.update(tokenizer(text))
        tokens = list(Vocab(counter).freqs)
        vecs = glove.get_vecs_by_tokens(tokens).numpy()
        cbow[i] = np.mean(vecs, axis=0)
    return cbow

In [26]:
# Create training cbow set
cbow = collate_into_cbow(train_feat_txt)
cbow_df = pd.DataFrame(cbow)

In [27]:
# Create test cbow set
test_cbow = collate_into_cbow(train_feat_txt)
test_cbow_df = pd.DataFrame(cbow)

### Create ngram features

In [28]:
# Using CountVectorizer to get ngrams (this was the most intuitive tool I could find...)
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
from sklearn.feature_extraction.text import CountVectorizer

In [29]:
vectorizer = CountVectorizer(min_df=MIN_FREQ, ngram_range=(2,2))

In [30]:
corpus = train_feat_txt['violations_orig'].to_list()

In [31]:
# Create training ngram set
X = vectorizer.fit_transform(corpus)
ngram_arr = X.toarray()

In [32]:
# There are 1088 2-grams
ngram_arr.shape

(31698, 1982)

In [33]:
ngram_arr

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [34]:
#vectorizer.get_feature_names()

In [35]:
# Create test ngram set

# I'm pretty sure this is how it's done?
corpus_test = test_feat_txt['violations_orig'].to_list()
X_test = vectorizer.transform(corpus_test)
test_ngram_arr = X_test.toarray()

In [36]:
# Should be same dimensionality as training array (1088)
test_ngram_arr.shape

(7925, 1982)

### Split train set into train and validation

In [37]:
# Don't shuffle data before splitting
train_bow, val_bow, train_targ_spl, val_targ = train_test_split(bow_df, train_targ, test_size=0.2,
                                                                shuffle=False)

train_cbow, val_cbow = train_test_split(cbow_df, test_size=0.2, shuffle=False)
train_2gram, val_2gram = train_test_split(ngram_arr, test_size=0.2, shuffle=False)

In [38]:
#Oversample on target vector
train_targ_reset = train_targ_spl.reset_index().drop('index', axis=1)
train_targ_fail = train_targ_reset[train_targ_reset['results_re'] == 1]
size_diff = train_targ_reset.shape[0] - train_targ_fail.shape[0]
train_resample = resample(train_targ_fail, n_samples = size_diff, replace=True)
train_targ_all = pd.concat([train_targ_reset, train_resample])

In [39]:
#then resample feature matrices
train_bow_resample = train_bow.iloc[train_targ_all.index]
train_cbow_resample = train_cbow.iloc[train_targ_all.index]
train_2gram_resample = train_2gram[[train_targ_all.index]]

  train_2gram_resample = train_2gram[[train_targ_all.index]]


In [40]:
# Model expects 1d array as target
#train_targ_1d = train_targ_spl.iloc[:, 0].ravel()
train_targ_1d = train_targ_all.iloc[:, 0].ravel()
val_targ_1d = val_targ.iloc[:, 0].ravel()
test_targ_1d = test_targ.iloc[:, 0].ravel()

### Set model baseline

Baseline for model will be if we predict 'pass' every time. If model cannot beat these scores, then it is not good!

In [41]:
def print_clf_metrics(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_pred)

    print(f"""
    Accuracy: {accuracy:.2f}
    Precision: {precision:.2f}
    Recall: {recall:.2f}
    F1 score: {f1:.2f}
    ROC AUC: {auc:.2f}
    """)

In [42]:
# create y_pred array that includes only Passes
results = [0] * val_targ.shape[0]
d = {'results_re': results}
baseline_df = pd.DataFrame(data=d)

In [43]:
print_clf_metrics(val_targ['results_re'], baseline_df['results_re'])


    Accuracy: 0.91
    Precision: 0.00
    Recall: 0.00
    F1 score: 0.00
    ROC AUC: 0.50
    


### Naive Bayes model

BOW model:

In [44]:
# Use multinomial NB b/c features are discrete (counts)
nb_bow = MultinomialNB()
nb_bow.fit(train_bow_resample, train_targ_1d)

y_pred_bow = nb_bow.predict(val_bow)
print_clf_metrics(val_targ_1d, y_pred_bow)


    Accuracy: 0.78
    Precision: 0.18
    Recall: 0.43
    F1 score: 0.25
    ROC AUC: 0.62
    


CBOW model:

In [45]:
# Use Gaussian NB b/c features now continuous
nb_cbow = GaussianNB()
nb_cbow.fit(train_cbow_resample, train_targ_1d)

y_pred_cbow = nb_cbow.predict(val_cbow)
print_clf_metrics(val_targ_1d, y_pred_cbow)


    Accuracy: 0.89
    Precision: 0.02
    Recall: 0.01
    F1 score: 0.01
    ROC AUC: 0.49
    


2-gram model:

In [46]:
# Use multinomial NB b/c features are discrete (counts)
nb_2gram = MultinomialNB()
nb_2gram.fit(train_2gram_resample, train_targ_1d)

y_pred_2gram = nb_2gram.predict(val_2gram)
print_clf_metrics(val_targ_1d, y_pred_2gram)


    Accuracy: 0.79
    Precision: 0.16
    Recall: 0.34
    F1 score: 0.22
    ROC AUC: 0.59
    


### Logistic Regression model

BOW model:

In [50]:
log_reg_bow = LogisticRegression(max_iter=5000)
log_reg_bow.fit(train_bow_resample, train_targ_1d)
log_y_pred_bow = log_reg_bow.predict(val_bow)
print_clf_metrics(val_targ_1d, log_y_pred_bow)


    Accuracy: 0.70
    Precision: 0.14
    Recall: 0.51
    F1 score: 0.22
    ROC AUC: 0.61
    


CBOW model:

In [48]:
log_reg_cbow = LogisticRegression(max_iter=1000)
log_reg_cbow.fit(train_cbow_resample, train_targ_1d)
log_y_pred_cbow = log_reg_cbow.predict(val_cbow)
print_clf_metrics(val_targ_1d, log_y_pred_cbow)


    Accuracy: 0.66
    Precision: 0.16
    Recall: 0.70
    F1 score: 0.26
    ROC AUC: 0.68
    


2-gram model:

In [51]:
log_reg_2gram = LogisticRegression(max_iter=5000)
log_reg_2gram.fit(train_2gram_resample, train_targ_1d)

log_y_pred_2gram = log_reg_2gram.predict(val_2gram)
print_clf_metrics(val_targ_1d, y_pred_2gram)


    Accuracy: 0.79
    Precision: 0.16
    Recall: 0.34
    F1 score: 0.22
    ROC AUC: 0.59
    
