In [1]:
import pandas as pd
import numpy as np

import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab
from torchtext.vocab import GloVe

from collections import Counter

from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

__Notes:__
- If we want to keep our prediction binary, we should change all 'Pass w/ Conditions' in the `results_re` field to 'Pass'.
- We should also make this field numeric, with Fail as 1 and Pass as 0 (as 'Fail' is the event we are trying to predict). I think we can add both of these to the `data_prep` notebook.

__Questions to discuss:__
- __Shuffling during train/test split__: When we do our train/test split, should we sort our data by inspection date first? I know this matters for time series data (where we'd want to avoid leakage of future data into the past), but not sure how much it matters here, where the inspections are generally distinct establishments. Below I did sort the data, and didn't shuffle during the train/test split, but would like your thoughts on whether this matters.
- __Word frequency threshold__: In our HW 2, we only included in our vocab words that appeared at least 1000 times in the entire corpus. We'll need to decide what threshold we want to use. I used 1000 below, and also listed the vocab counts at different thresholds.
- __Pipeline operations__: What kinds of additional cleaning steps should we perform in our pipeline? Right now, we're just converting words to lower case and splitting them with a tokenizer. Other possible steps include removing stop words and lemmatizing (https://www.geeksforgeeks.org/python-lemmatization-approaches-with-examples/). I'd recommend we go to Amitabh's office hours to get his thoughts on an appropriate 'good practice' pipeline.
- __Relative vs. absolute frequencies for BOW__: In our HW, we implemented BOW with relative frequencies, but I think for our purposes absolute frequencies make more sense, as more words/longer comments can mean more violations. Do you think this makes sense?
- __Choice of n for ngrams__: I made an initial choice of 2, which seemed reasonable to me. Would like your thoughts on this, though.

In [2]:
df = pd.read_pickle('initial_clean.pkl')

In [3]:
df.head()

Unnamed: 0,id_orig,name,license,facility_type,date_orig,inspection_type_orig,results_orig,violations_orig,id_re,date_re,results_re,time_between
2,567573,312 CHICAGO,1803058,Restaurant,2011-08-22,Canvass,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,567588,2011-09-01,Pass,-10
17,1981882,312 CHICAGO,1803058,Restaurant,2017-01-23,Canvass,Fail,"16. FOOD PROTECTED DURING STORAGE, PREPARATION...",1982455,2017-02-01,Pass,-9
27,509340,VIDA SALUDABLE,2114784,Restaurant,2011-09-27,License,Fail,2. FACILITIES TO MAINTAIN PROPER TEMPERATURE -...,634796,2011-10-03,Pass,-6
32,634726,PICAZO'S TACO & FAST FOOD INC.,2120154,Restaurant,2011-09-26,License,Fail,12. HAND WASHING FACILITIES: WITH SOAP AND SAN...,634942,2011-10-13,Fail,-17
35,634942,PICAZO'S TACO & FAST FOOD INC.,2120154,Restaurant,2011-10-13,License Re-Inspection,Fail,24. DISH WASHING FACILITIES: PROPERLY DESIGNED...,634997,2011-10-18,Pass,-5


In [4]:
# Change 'Pass w/ Conditions' to 'Pass', and change target to 0/1 numeric
results_dict = {'Pass': 0, 'Fail': 1}
df['results_re'] = df['results_re'].str.replace('Pass w/ Conditions', 'Pass')
df['results_re'] = df['results_re'].apply(lambda x: results_dict[x])

### Split data into feature & target, and then into train and test sets

In [5]:
# Sort dataframe by inspection date
# Training set will include initial 80% of inspections
sorted_df = df.sort_values(by=['date_orig'])

In [6]:
sorted_df.head()

Unnamed: 0,id_orig,name,license,facility_type,date_orig,inspection_type_orig,results_orig,violations_orig,id_re,date_re,results_re,time_between
80173,104236,TEMPO CAFE,80916,Restaurant,2010-01-04,Canvass,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,104243,2010-01-12,0,-8
332950,67738,MICHAEL'S ON MAIN CAFE,2008948,Restaurant,2010-01-04,License,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,124279,2010-01-19,0,-15
320316,160209,CHICAGO SINAI CONGREGATION,75567,Daycare (2 - 6 Years),2010-01-05,License,Fail,32. FOOD AND NON-FOOD CONTACT SURFACES PROPERL...,160331,2010-04-12,0,-97
216314,67744,GOLDEN CROWN RESTAURANT,2013539,Restaurant,2010-01-05,License,Fail,32. FOOD AND NON-FOOD CONTACT SURFACES PROPERL...,48215,2010-01-11,1,-6
199041,98313,WHIPPLE STORE,2009114,Grocery Store,2010-01-05,License,Fail,"35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTR...",98371,2010-02-05,0,-31


In [7]:
target_col = ['results_re']
target = sorted_df[target_col]

cols_to_exclude = ['name', 'id_orig', 'id_re', 'license']
feature_cols = [col for col in sorted_df.columns if (col not in cols_to_exclude and col not in target_col)]
features = sorted_df[feature_cols]

In [8]:
target.head()

Unnamed: 0,results_re
80173,0
332950,0
320316,0
216314,1
199041,0


In [9]:
features.head()

Unnamed: 0,facility_type,date_orig,inspection_type_orig,results_orig,violations_orig,date_re,time_between
80173,Restaurant,2010-01-04,Canvass,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,2010-01-12,-8
332950,Restaurant,2010-01-04,License,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,2010-01-19,-15
320316,Daycare (2 - 6 Years),2010-01-05,License,Fail,32. FOOD AND NON-FOOD CONTACT SURFACES PROPERL...,2010-04-12,-97
216314,Restaurant,2010-01-05,License,Fail,32. FOOD AND NON-FOOD CONTACT SURFACES PROPERL...,2010-01-11,-6
199041,Grocery Store,2010-01-05,License,Fail,"35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTR...",2010-02-05,-31


In [10]:
# Don't shuffle data before splitting
train_feat, test_feat, train_targ, test_targ = train_test_split(features, target, test_size=0.2,
                                                                shuffle=False)

In [11]:
train_feat.head()

Unnamed: 0,facility_type,date_orig,inspection_type_orig,results_orig,violations_orig,date_re,time_between
80173,Restaurant,2010-01-04,Canvass,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,2010-01-12,-8
332950,Restaurant,2010-01-04,License,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,2010-01-19,-15
320316,Daycare (2 - 6 Years),2010-01-05,License,Fail,32. FOOD AND NON-FOOD CONTACT SURFACES PROPERL...,2010-04-12,-97
216314,Restaurant,2010-01-05,License,Fail,32. FOOD AND NON-FOOD CONTACT SURFACES PROPERL...,2010-01-11,-6
199041,Grocery Store,2010-01-05,License,Fail,"35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTR...",2010-02-05,-31


In [12]:
pct_fail = len(train_targ[train_targ['results_re'] == 1]) / len(train_targ)
print(f'{pct_fail*100:.2f}% reinspections fail in training set')

8.28% reinspections fail in training set


### Split out the text features

In [13]:
text_col = ['violations_orig']
train_feat_txt = train_feat[text_col].astype(str)
test_feat_txt = test_feat[text_col].astype(str)

In [14]:
train_feat_txt.head()

Unnamed: 0,violations_orig
80173,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...
332950,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...
320316,32. FOOD AND NON-FOOD CONTACT SURFACES PROPERL...
216314,32. FOOD AND NON-FOOD CONTACT SURFACES PROPERL...
199041,"35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTR..."


### Split the text into tokens

In [15]:
tokenizer = get_tokenizer('basic_english')
counter = Counter()

In [16]:
# Create vocab using training set only!!!!
for idx, text in train_feat_txt.itertuples():
    counter.update(tokenizer(text))

In [17]:
# Not sure what the min frequency should be
# Min freq = 1 -> 50768 vocab length
# Min freq = 50 -> 2942
# Min freq = 100 -> 2211
# Min freq = 250 -> 1481
# Min freq = 500 -> 1107
# Min freq = 1000 -> 806
MIN_FREQ = 1000
vocab = Vocab(counter, min_freq=MIN_FREQ)

In [18]:
len(vocab)

806

### Create BOW features

Note - I think we should use absolute rather than relative frequencies for the BOW vectors, as greater number of violations likely provides relevant information

In [19]:
# I think we can convert numpy arrays to tensors, so I think returning a numpy array here shouldn't be a problem?
def collate_into_bow(data, voc):
    bow = np.zeros((len(data), len(voc)))
    for i, (idx, text) in enumerate(data.itertuples()):
        counter = Counter()
        counter.update(tokenizer(text))
        line_vocab = Vocab(counter)
        for token in line_vocab.freqs:
            bow[i, voc.stoi[token]] = line_vocab.freqs[token]
    return bow

In [20]:
# Create training bow set
bow = collate_into_bow(train_feat_txt, vocab)
bow_df = pd.DataFrame(bow, columns=vocab.itos)

In [21]:
# Create test bow set using vocab from training set
test_bow = collate_into_bow(test_feat_txt, vocab)
test_bow_df = pd.DataFrame(test_bow, columns=vocab.itos)

In [22]:
bow_df.shape

(31700, 806)

In [23]:
bow_df.head()

Unnamed: 0,<unk>,<pad>,.,",",and,the,in,of,comments,-,...,bathroom,cock,old,pounds,slicer,close,encrusted,3rd,crevices,scrape
0,1.0,0.0,53.0,43.0,48.0,22.0,21.0,29.0,11.0,11.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,1.0,0.0,10.0,10.0,5.0,6.0,5.0,2.0,5.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,12.0,10.0,16.0,6.0,6.0,7.0,4.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,37.0,41.0,44.0,15.0,17.0,16.0,7.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2.0,0.0,6.0,6.0,1.0,0.0,1.0,3.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### __Create CBOW features__

In [24]:
# You'll need to copy your vector cache to the folder with this notebook!
VECTORS_CACHE_DIR = './.vector_cache'
DIM_GLOVE = 300

glove = GloVe('6B',cache=VECTORS_CACHE_DIR)

In [25]:
def collate_into_cbow(data):
    cbow = np.zeros((len(data), DIM_GLOVE))
    for i, (idx, text) in enumerate(data.itertuples()):
        counter = Counter()
        counter.update(tokenizer(text))
        tokens = list(Vocab(counter).freqs)
        vecs = glove.get_vecs_by_tokens(tokens).numpy()
        cbow[i] = np.mean(vecs, axis=0)
    return cbow

In [26]:
# Create training cbow set
cbow = collate_into_cbow(train_feat_txt)
cbow_df = pd.DataFrame(cbow)

In [27]:
# Create test cbow set
test_cbow = collate_into_cbow(train_feat_txt)
test_cbow_df = pd.DataFrame(cbow)

### Create ngram features

In [28]:
# Using CountVectorizer to get ngrams (this was the most intuitive tool I could find...)
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
from sklearn.feature_extraction.text import CountVectorizer

In [29]:
vectorizer = CountVectorizer(min_df=MIN_FREQ, ngram_range=(2,2))

In [30]:
corpus = train_feat_txt['violations_orig'].to_list()

In [31]:
# Create training ngram set
X = vectorizer.fit_transform(corpus)
ngram_arr = X.toarray()

In [32]:
# There are 1088 2-grams
ngram_arr.shape

(31700, 1088)

In [33]:
ngram_arr

array([[0, 0, 0, ..., 2, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [34]:
vectorizer.get_feature_names()

['005 18',
 '005 potentially',
 '020 21',
 '020 30',
 '020 32',
 '020 33',
 '020 citation',
 '030 18',
 '030 32',
 '090 32',
 '090 comments',
 '11 adequate',
 '12 hand',
 '16 food',
 '18 no',
 '19 outside',
 '1st floor',
 '21 certified',
 '22 dish',
 '24 dish',
 '26 adequate',
 '29 previous',
 '2nd floor',
 '30 food',
 '31 clean',
 '32 food',
 '33 food',
 '34 floors',
 '35 walls',
 '36 lighting',
 '37 toilet',
 '38 005',
 '38 005a',
 '38 012',
 '38 020',
 '38 030',
 '38 ventilation',
 '40 refrigeration',
 '40f or',
 '41 premises',
 '42 090',
 '42 appropriate',
 '43 food',
 '45 food',
 'able to',
 'above the',
 'abrasive detergents',
 'accessible designed',
 'accessible properly',
 'accessible to',
 'accordance with',
 'accurate thermometers',
 'additional pest',
 'adequate number',
 'adequately trapped',
 'advisory posted',
 'affected areas',
 'against broken',
 'air temperature',
 'all affected',
 'all areas',
 'all artificial',
 'all cold',
 'all containers',
 'all cooking',
 'all co

In [35]:
# Create test ngram set

# I'm pretty sure this is how it's done?
corpus_test = test_feat_txt['violations_orig'].to_list()
X_test = vectorizer.transform(corpus_test)
test_ngram_arr = X_test.toarray()

In [36]:
# Should be same dimensionality as training array (1088)
test_ngram_arr.shape

(7925, 1088)

### Split train set into train and validation

In [37]:
# Don't shuffle data before splitting
train_bow, val_bow, train_targ_spl, val_targ = train_test_split(bow_df, train_targ, test_size=0.2,
                                                                shuffle=False)

train_cbow, val_cbow = train_test_split(cbow_df, test_size=0.2, shuffle=False)
train_2gram, val_2gram = train_test_split(ngram_arr, test_size=0.2, shuffle=False)

In [38]:
# Model expects 1d array as target
train_targ_1d = train_targ_spl.iloc[:, 0].ravel()
val_targ_1d = val_targ.iloc[:, 0].ravel()
test_targ_1d = test_targ.iloc[:, 0].ravel()

### Set model baseline

Baseline for model will be if we predict 'pass' every time. If model cannot beat these scores, then it is not good!

In [39]:
def print_clf_metrics(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_pred)

    print(f"""
    Accuracy: {accuracy:.2f}
    Precision: {precision:.2f}
    Recall: {recall:.2f}
    F1 score: {f1:.2f}
    ROC AUC: {auc:.2f}
    """)

In [40]:
# create y_pred array that includes only Passes
results = [0] * 6340
d = {'results_re': results}
baseline_df = pd.DataFrame(data=d)

In [41]:
print_clf_metrics(val_targ['results_re'], baseline_df['results_re'])


    Accuracy: 0.91
    Precision: 0.00
    Recall: 0.00
    F1 score: 0.00
    ROC AUC: 0.50
    


### Naive Bayes model

BOW model:

In [42]:
# Use multinomial NB b/c features are discrete (counts)
nb_bow = MultinomialNB()
nb_bow.fit(train_bow, train_targ_1d)

y_pred_bow = nb_bow.predict(val_bow)
print_clf_metrics(val_targ_1d, y_pred_bow)


    Accuracy: 0.80
    Precision: 0.17
    Recall: 0.34
    F1 score: 0.23
    ROC AUC: 0.59
    


CBOW model:

In [43]:
# Use Gaussian NB b/c features now continuous
nb_cbow = GaussianNB()
nb_cbow.fit(train_cbow, train_targ_1d)

y_pred_cbow = nb_cbow.predict(val_cbow)
print_clf_metrics(val_targ_1d, y_pred_cbow)


    Accuracy: 0.89
    Precision: 0.02
    Recall: 0.01
    F1 score: 0.01
    ROC AUC: 0.49
    


2-gram model:

In [44]:
# Use multinomial NB b/c features are discrete (counts)
nb_2gram = MultinomialNB()
nb_2gram.fit(train_2gram, train_targ_1d)

y_pred_2gram = nb_2gram.predict(val_2gram)
print_clf_metrics(val_targ_1d, y_pred_2gram)


    Accuracy: 0.86
    Precision: 0.19
    Recall: 0.19
    F1 score: 0.19
    ROC AUC: 0.56
    
