In [2]:
import pandas as pd
import numpy as np
import random

import torch
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab
from torchtext.vocab import GloVe

from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.utils import resample

USE_CUDA = torch.cuda.is_available()

if USE_CUDA:
    DEVICE = torch.device('cuda')
    print("Using cuda.")
else:
    DEVICE = torch.device('cpu')
    print("Using cpu.")
    
random.seed(30255)
np.random.seed(30255)
torch.manual_seed(30255)
if USE_CUDA:
    torch.cuda.manual_seed(30255)
    
COLAB = False
#COLAB = True
if COLAB:
    from google.colab import drive 
    drive.mount('/content/gdrive')
    PATH = "gdrive/My Drive/advanced_ml/"

Using cpu.


In [3]:
!pip install pytorch-ignite

distutils: /Users/michaelfeldman/opt/anaconda3/include/python3.7m/UNKNOWN
sysconfig: /Users/michaelfeldman/opt/anaconda3/include/python3.7m[0m
user = False
home = None
root = None
prefix = None[0m
distutils: /Users/michaelfeldman/opt/anaconda3/include/python3.7m/UNKNOWN
sysconfig: /Users/michaelfeldman/opt/anaconda3/include/python3.7m[0m
user = False
home = None
root = None
prefix = None[0m


In [4]:
if COLAB:
    df = pd.read_pickle(PATH + 'initial_clean.pkl')
else:
    df = pd.read_pickle('initial_clean.pkl')

In [5]:
print(f'df currently has {df.shape[0]} rows')

# Remove rows with no comments
df = df[df['violations_orig'].notna()]
print(f'df now has {df.shape[0]} rows')

df currently has 37207 rows
df now has 36553 rows


In [6]:
# Change 'Pass w/ Conditions' to 'Pass' and change target to 0/1 numeric
results_dict = {'Pass': 0, 'Fail': 1}
df['results_re'] = df['results_re'].str.replace('Pass w/ Conditions', 'Pass')
df['results_re'] = df['results_re'].apply(lambda x: results_dict[x])

### Split data into feature & target, and then into train and test sets

In [9]:
# Sort dataframe by inspection date
# Training set will include initial 80% of inspections
sorted_df = df.sort_values(by=['date_orig'])

In [10]:
target_col = ['results_re']
target = sorted_df[target_col]

cols_to_exclude = ['name', 'id_orig', 'id_re', 'license']
feature_cols = [col for col in sorted_df.columns if (col not in cols_to_exclude and col not in target_col)]
features = sorted_df[feature_cols]

In [11]:
# Don't shuffle data before splitting
train_feat, test_feat, train_targ, test_targ = train_test_split(features, target, test_size=0.2,
                                                                shuffle=False)

In [12]:
pct_fail = len(train_targ[train_targ['results_re'] == 1]) / len(train_targ)
print(f'{pct_fail*100:.2f}% reinspections fail in training set')

8.76% reinspections fail in training set


### Split out the text features

In [13]:
text_col = ['violations_orig']
train_feat_txt = train_feat[text_col].astype(str)
test_feat_txt = test_feat[text_col].astype(str)

In [14]:
train_feat_txt.head()

Unnamed: 0,violations_orig
220538,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...
494292,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...
45775,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...
488807,"35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTR..."
455503,21. * CERTIFIED FOOD MANAGER ON SITE WHEN POTE...


### Split train set into train and validation

In [26]:
train_x, val_x, train_y, val_y = train_test_split(train_feat_txt, train_targ,
                                                    test_size=0.2, shuffle=False)

In [27]:
# Oversample on target vector
train_targ_reset = train_y.reset_index().drop('index', axis=1)
train_targ_fail = train_targ_reset[train_targ_reset['results_re'] == 1]
size_diff = train_targ_reset.shape[0] - train_targ_fail.shape[0]
train_resample = resample(train_targ_fail, n_samples = size_diff, replace=True)
train_targ_all = pd.concat([train_targ_reset, train_resample])

In [29]:
# then resample features
train_x_reset = train_x.reset_index().drop('index', axis=1)
train_x_resample = train_x_reset.iloc[train_targ_all.index]

In [30]:
# Combine text and label into 1 df
df_train = pd.concat([train_x_resample, train_targ_all], axis=1)
df_valid = pd.concat([val_x, val_y], axis=1)
df_test = pd.concat([test_feat_txt, test_targ], axis=1)

### CNN Data Prep

_Note: Data prep informed by [notebook](https://www.kaggle.com/swarnabha/pytorch-text-classification-torchtext-lstm) from Kaggle user swarnabha_

In [28]:
from torchtext.legacy.data import (Field, LabelField, TabularDataset,
                                   BucketIterator, Example, Dataset)

In [76]:
# create Field objects to process the text data
# these will include info for how to convert the text to tensors
TEXT = Field(tokenize='basic_english', lower=True, batch_first=True)
LABEL = LabelField(dtype=torch.float)

In [30]:
# Source:
# https://www.kaggle.com/swarnabha/pytorch-text-classification-torchtext-lstm
# Allows us to convert a pandas DataFrame to a Dataset object
class DataFrameDataset(Dataset):

    def __init__(self, df, fields, **kwargs):
        examples = []
        for i, row in df.iterrows():
            label = row.results_re
            text = row.violations_orig
            examples.append(Example.fromlist([text, label], fields))

        super().__init__(examples, fields)

    @staticmethod
    def sort_key(ex):
        return len(ex.violations_orig)

    @classmethod
    def splits(cls, fields, train_df, val_df=None, test_df=None, **kwargs):
        train_data, val_data, test_data = (None, None, None)
        data_field = fields

        if train_df is not None:
            train_data = cls(train_df.copy(), data_field, **kwargs)
        if val_df is not None:
            val_data = cls(val_df.copy(), data_field, **kwargs)
        if test_df is not None:
            test_data = cls(test_df.copy(), data_field, True, **kwargs)

        return tuple(d for d in (train_data, val_data, test_data) if d is not None)

In [77]:
# create Dataset objects
fields = [('violations_orig', TEXT), ('results_re', LABEL)]
train_ds, val_ds, test_ds = DataFrameDataset.splits(fields, train_df=df_train,
                                                    val_df=df_valid, test_df=df_test)

In [78]:
MIN_FREQ = 500

# Build vocab using training set
TEXT.build_vocab(train_ds, vectors='glove.6B.300d', min_freq=MIN_FREQ)

In [80]:
# Build vocab for labels
LABEL.build_vocab(train_ds)

defaultdict(None, {1: 0, 0: 1})

In [81]:
# Make sure labels aren't inadvertently swapped
from collections import defaultdict
d = defaultdict(None)
d[1] = 1
d[0] = 0
LABEL.vocab.stoi = d

In [35]:
# Build iterators
BATCH_SIZE=64
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_ds, val_ds, test_ds),
    batch_size = BATCH_SIZE,
    sort_key=lambda x: len(x.violations_orig),
    device = DEVICE)

### CNN Model Building, Training, & Evaluation

_Notes:_
- _Model architecture and training process informed by [tutorial](https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/4%20-%20Convolutional%20Sentiment%20Analysis.ipynb) from Ben Trevett_
- _Cells do not have output, as all model training was completed on Colab_

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        # This gives us n_filters number of filters for every filter size. So we get
        # n_filters * len(filter_sizes) total filters
        self.convs = nn.ModuleList([
                                    nn.Conv2d(in_channels = 1, 
                                              out_channels = n_filters, 
                                              kernel_size = (fs, embedding_dim)) 
                                    for fs in filter_sizes
                                    ])
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
                 
        embedded = self.embedding(text)
        
        embedded = embedded.unsqueeze(1)
        
        conved = [F.leaky_relu(conv(embedded)).squeeze(3) for conv in self.convs]
                
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        cat = self.dropout(torch.cat(pooled, dim = 1))

        return self.fc(cat)

In [None]:
import torch.optim as optim

# Hyperparams to tune
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 300
N_FILTERS = 10
FILTER_SIZES = [2, 3, 4, 5]
OUTPUT_DIM = 1
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
LR = 0.00005

model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES,
            OUTPUT_DIM, DROPOUT, PAD_IDX)

# Load pretrained embeddings
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

# Zero out the initial weights of the unknown and padding tokens
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

optimizer = optim.AdamW(model.parameters(), lr=LR)
criterion = nn.BCEWithLogitsLoss()

model = model.to(DEVICE)
criterion = criterion.to(DEVICE)

In [None]:
def get_accuracy(preds, y):
    """
    Returns accuracy for a given batch
    
    Inputs:
        preds: tensor with model predictions
        y: tensor with ground truth values
        
    Returns:
        Accuracy of the batch (float)
    """
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    return correct.sum() / len(correct)

In [None]:
log_interval = 100
def train(model, iterator, optimizer, criterion):
    '''
    Train the model for one epoch
    
    Inputs:
        model: a model object
        iterator: an iterator object
        optimizer: the optimizer for the model
        criterion: loss function
    
    Returns:
        Average loss and accuracy over the epoch (both floats)
    '''
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for i, batch in enumerate(iterator):
        optimizer.zero_grad()
        predictions = model(batch.violations_orig).squeeze(1)
        
        loss = criterion(predictions, batch.results_re)
        acc = get_accuracy(predictions, batch.results_re)
        
        loss.backward()
        optimizer.step()
        
        if i % log_interval == 0 and i > 0:
            print(f'At iteration {i} the training accuracy is {acc:.3f}.')
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
def evaluate(model, iterator, criterion):
    '''
    Evaluate the model over an epoch
    
    Inputs:
        model: a model object
        iterator: an iterator object
        criterion: loss function
    
    Returns:
        Average loss and accuracy over the epoch (both floats)
    '''
    epoch_loss = 0
    epoch_acc = 0

    model.eval()
    
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.violations_orig).squeeze(1)
            loss = criterion(predictions, batch.results_re)
            
            acc = get_accuracy(predictions, batch.results_re)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
import time

def epoch_time(start_time, end_time):
    '''
    Calculate the amount of time that has elapsed over an epoch
    
    Inputs:
        start_time: Float, time the epoch started
        end_time: Float, time the epoch ended
        
    Returns:
        Tuple, number of minutes and seconds that elapsed
        over the epoch
    '''
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [None]:
# Train the model

N_EPOCHS = 8
best_valid_loss = float('inf')

# reset/clear out model just in case
for layer in model.children():
    if hasattr(layer, 'reset_parameters'):
        layer.reset_parameters()

for epoch in range(N_EPOCHS):
    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator,
                                  optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator,
                                     criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # copy model if it's the best one so far
    if valid_loss < best_valid_loss:
        print('Best model yet is found! Saving it!')
        best_valid_loss = valid_loss
        model_copy = type(model)(INPUT_DIM, EMBEDDING_DIM, N_FILTERS,
                                 FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)
        model_copy.load_state_dict(model.state_dict())
        best_model = model_copy
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

In [None]:
from ignite.metrics import Precision
from ignite.metrics import Recall
from ignite.metrics import Accuracy

def get_test_metrics(iterator):
    '''
    Print out metrics for best model from predictions
    on a data iterator
    '''
    best_model.to(DEVICE)
    best_model.eval()
    precision = Precision()
    recall = Recall()
    accuracy = Accuracy()
    f1 = (precision * recall * 2 / (precision + recall)).mean()

    with torch.no_grad():
        for batch in iterator:
            preds = best_model(batch.violations_orig).squeeze(1)
            rounded_preds = torch.round(torch.sigmoid(preds))
            precision.update((rounded_preds, batch.results_re))
            recall.update((rounded_preds, batch.results_re))
            accuracy.update((rounded_preds, batch.results_re))
            f1.update((rounded_preds, batch.results_re))

    print('Precision: ', precision.compute())
    print('Recall: ', recall.compute())
    print('F1: ', f1.compute())
    print('Accuracy: ', accuracy.compute())

In [None]:
# Get metrics for best model on validation set
get_test_metrics(valid_iterator)

In [None]:
# Get metrics for best model on test set
get_test_metrics(test_iterator)