In [None]:
import pandas as pd
import numpy as np
import random

import torch
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab
from torchtext.vocab import GloVe

from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.utils import resample

# Pick the compute device once; later cells move the model and batches to DEVICE.
USE_CUDA = torch.cuda.is_available()

if USE_CUDA:
    DEVICE = torch.device('cuda')
    print("Using cuda.")
else:
    DEVICE = torch.device('cpu')
    print("Using cpu.")

# Single source of truth for the random seed used by every library below.
SEED = 30255
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if USE_CUDA:
    # Seed every visible GPU, not only the current one.
    torch.cuda.manual_seed_all(SEED)

# Set COLAB = True when running on Google Colab: the drive is mounted and
# PATH points at the project folder that holds the input files.
COLAB = False
if COLAB:
    from google.colab import drive
    drive.mount('/content/gdrive')
    PATH = "gdrive/My Drive/advanced_ml/"

In [None]:
!pip install pytorch-ignite

In [None]:
# Load the cleaned inspections table. On Colab the pickle sits under PATH on
# the mounted drive; otherwise it is read from the working directory.
# NOTE(review): read_pickle executes pickled code - only load trusted files.
df = pd.read_pickle((PATH + 'initial_clean.pkl') if COLAB else 'initial_clean.pkl')

In [None]:
print(f'df currently has {df.shape[0]} rows')

# Keep only the inspections that actually have violation comments
df = df.loc[df['violations_orig'].notna()]
print(f'df now has {df.shape[0]} rows')

In [None]:
# Collapse the re-inspection result into a binary target:
# 0 = 'Pass' (including 'Pass w/ Conditions'), 1 = 'Fail'.
results_dict = {'Pass': 0, 'Fail': 1}
label_to_target = {**results_dict, 'Pass w/ Conditions': results_dict['Pass']}

results_mapped = df['results_re'].map(label_to_target)

# Fail loudly on anything that is not one of the three known labels. This also
# catches an accidental second run of this cell, when the column is already 0/1.
unexpected_labels = df.loc[results_mapped.isna(), 'results_re'].unique()
if len(unexpected_labels) > 0:
    raise ValueError(
        f'Unexpected values in results_re (was this cell already run?): {unexpected_labels.tolist()}'
    )

# assign() returns a new frame, so no column is set on a filtered view of df
# (avoids SettingWithCopyWarning).
df = df.assign(results_re=results_mapped.astype('int64'))

### Split data into feature & target, and then into train and test sets

In [None]:
# Order inspections chronologically so the later (unshuffled) split puts the
# earliest 80% of inspections in the training set.
sorted_df = df.sort_values('date_orig')

In [None]:
target_col = ['results_re']
cols_to_exclude = ['name', 'id_orig', 'id_re', 'license']

# Features are every column that is neither excluded nor the target,
# kept in their original column order.
feature_cols = [col for col in sorted_df.columns if col not in (cols_to_exclude + target_col)]

target = sorted_df[target_col]
features = sorted_df[feature_cols]

In [None]:
# Chronological split: rows are already sorted by date, so shuffling stays off
# and the last 20% of rows become the test set.
train_feat, test_feat, train_targ, test_targ = train_test_split(
    features,
    target,
    test_size=0.2,
    shuffle=False,
)

In [None]:
# Share of training rows whose re-inspection failed (target == 1)
pct_fail = int((train_targ['results_re'] == 1).sum()) / len(train_targ)
print(f'{pct_fail*100:.2f}% reinspections fail in training set')

### Split out the text features

In [None]:
text_col = ['violations_orig']

# Keep only the free-text column of each split, cast to str
train_feat_txt, test_feat_txt = (
    split[text_col].astype(str) for split in (train_feat, test_feat)
)

In [None]:
# Preview the first rows of the training text as a sanity check
train_feat_txt.head()

### Split the text into tokens

In [None]:
# Token-frequency table; it is filled from the training text in the next cell
counter = Counter()

# torchtext's built-in 'basic_english' tokenizer, shared by all collate functions
tokenizer = get_tokenizer('basic_english')

In [None]:
# Create vocab using training set only!!!!
# Start from an empty table so that re-running this cell does not add every
# token count a second time (which would silently change what MIN_FREQ keeps).
counter.clear()
for text in train_feat_txt['violations_orig']:
    counter.update(tokenizer(text))

In [None]:
# MIN_FREQ is the frequency threshold passed to Vocab; `counter` holds token
# counts from the training text only. The right threshold is still an open
# question. Vocabulary lengths observed for candidate values:
#   Min freq = 1    -> 50768
#   Min freq = 50   -> 2942
#   Min freq = 100  -> 2211
#   Min freq = 250  -> 1481
#   Min freq = 500  -> 1107
#   Min freq = 1000 -> 806
MIN_FREQ = 500
vocab = Vocab(counter, min_freq=MIN_FREQ)

In [None]:
# Size of the vocabulary built with the current MIN_FREQ
len(vocab)

### Create BOW features

In [None]:
def collate_into_bow(data, voc=vocab):
    """Collate a batch of (label, text) pairs into bag-of-words features.

    Args:
        data: sequence of (label, text) pairs, as produced by InspectionsDataset.
        voc: vocabulary used to turn a token into a column index via ``voc.stoi``.

    Returns:
        (labels, bow) where ``labels`` has shape (len(data), 1) and ``bow`` has
        shape (len(data), len(voc) + 1). Columns 0..len(voc)-1 hold the relative
        frequency of each token in the document; the last column holds the
        document's total token count.
    """
    bow = torch.zeros((len(data), len(voc) + 1))
    labels = torch.zeros((len(data), 1))
    for i, (label, text) in enumerate(data):
        token_counts = Counter(tokenizer(text))
        n_tokens = sum(token_counts.values())
        labels[i] = label
        # Document length feature, written once per document
        bow[i, -1] = n_tokens
        for token, count in token_counts.items():
            # Accumulate instead of overwrite: several distinct tokens can resolve
            # to the same column (presumably every out-of-vocabulary token shares
            # the <unk> index), and their frequencies must add up.
            bow[i, voc.stoi[token]] += count / n_tokens  # Using relative frequencies

    return (labels, bow)

### __Create CBOW features__

In [None]:
VECTORS_CACHE_DIR = './.vector_cache'
DIM_GLOVE = 300

# Pass the dimension explicitly so the loaded vectors always agree with
# DIM_GLOVE, which collate_into_cbow uses to size its output tensor.
glove = GloVe('6B', dim=DIM_GLOVE, cache=VECTORS_CACHE_DIR)

In [None]:
def collate_into_cbow(data):
    """Collate a batch of (label, text) pairs into averaged GloVe features.

    Each document is represented by the mean GloVe vector of its distinct
    tokens (every distinct token counts once, regardless of how often it occurs).

    Args:
        data: sequence of (label, text) pairs, as produced by InspectionsDataset.

    Returns:
        (labels, cbow) where ``labels`` has shape (len(data), 1) and ``cbow``
        has shape (len(data), DIM_GLOVE).
    """
    cbow = torch.zeros((len(data), DIM_GLOVE))
    labels = torch.zeros((len(data), 1))
    for i, (label, text) in enumerate(data):
        labels[i] = label
        # Distinct tokens in order of first appearance
        unique_tokens = list(dict.fromkeys(tokenizer(text)))
        if not unique_tokens:
            # Nothing to average for an empty document: keep the all-zero row
            # instead of failing or producing NaN.
            continue
        vecs = glove.get_vecs_by_tokens(unique_tokens)
        cbow[i] = torch.mean(vecs, dim=0)
    return (labels, cbow)

### Create ngram features

In [None]:
# Using CountVectorizer to get ngrams (this was the most intuitive tool I could find...)
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
def divide_rows_by_row_sum(arr):
    """Normalize each row of a 2-D array so that it sums to 1.

    Args:
        arr: 2-D numpy array (e.g. n-gram counts, one row per document).

    Returns:
        Float array of the same shape with every element divided by its row
        sum. A small number of n-gram rows are all zeroes; those rows come
        back as all zeroes instead of NaN.
    """
    row_sums = arr.sum(axis=1)[:, None]
    # 0/0 on all-zero rows is expected here and mapped to 0 below, so silence
    # the floating-point warnings numpy would otherwise emit for it.
    with np.errstate(divide='ignore', invalid='ignore'):
        ratios = np.divide(arr, row_sums)
    return np.nan_to_num(ratios, nan=0)

In [None]:
# Fit the bigram vectorizer on the training text only. Note that vocab_ngrams
# is the fitted document-term matrix; downstream code only uses its column count.
corpus = train_feat_txt['violations_orig'].tolist()
vectorizer = CountVectorizer(ngram_range=(2, 2), min_df=MIN_FREQ)
vocab_ngrams = vectorizer.fit_transform(corpus)

In [None]:
# TODO: unlike collate_into_bow, this collate function does not include the
# document length as an extra feature. It is not yet clear how to add it here,
# since the BOW collate function had to be altered to work with the DataLoader.

def collate_into_ngrams(data, voc=vocab_ngrams):
    """Collate a batch of (label, text) pairs into relative n-gram frequencies.

    Args:
        data: sequence of (label, text) pairs, as produced by InspectionsDataset.
        voc: fitted document-term matrix; only ``voc.shape[1]`` (the number of
            n-gram columns) is used.

    Returns:
        (labels, ngrams) where ``labels`` has shape (len(data), 1) and
        ``ngrams`` has shape (len(data), voc.shape[1]); each row holds the
        document's n-gram counts divided by the row total.
    """
    ngrams = torch.zeros((len(data), voc.shape[1]))
    labels = torch.zeros((len(data), 1))
    if len(data) == 0:
        return (labels, ngrams)

    texts = []
    for i, (label, text) in enumerate(data):
        labels[i] = label
        texts.append(text)

    # Vectorize the whole batch in one call instead of once per document
    counts = vectorizer.transform(texts).toarray()
    ngrams[:] = torch.tensor(divide_rows_by_row_sum(counts))

    return (labels, ngrams)

### Split train set into train and validation

In [None]:
# Hold out the most recent 20% of the training period for validation
# (no shuffling, so the chronological order is preserved).
train_x, val_x, train_y, val_y = train_test_split(
    train_feat_txt,
    train_targ,
    test_size=0.2,
    shuffle=False,
)

In [None]:
#Oversample on target vector
# Work on a 0..n-1 index so that index labels double as row positions; the
# feature frame is later re-indexed with train_targ_all.index.
train_targ_reset = train_y.reset_index(drop=True)
train_targ_fail = train_targ_reset[train_targ_reset['results_re'] == 1]

# Number of extra failing rows needed for the two classes to be the same size.
# (Adding as many fails as there are passes would make fails the majority.)
n_fail = train_targ_fail.shape[0]
n_pass = train_targ_reset.shape[0] - n_fail
size_diff = max(n_pass - n_fail, 0)

if size_diff > 0 and n_fail > 0:
    train_resample = resample(train_targ_fail, n_samples=size_diff, replace=True)
else:
    # Nothing to add (already balanced, or there are no failing rows to draw from)
    train_resample = train_targ_fail.iloc[0:0]

train_targ_all = pd.concat([train_targ_reset, train_resample])

In [None]:
from torch.utils.data import Dataset, DataLoader

class InspectionsDataset(Dataset):
    """Map-style dataset pairing each violation text with its label.

    Items are addressed by position (``iloc``), so the index labels of the
    two frames are irrelevant as long as their rows are in the same order.

    NOTE: ``Dataset`` must be ``torch.utils.data.Dataset`` when this class is
    defined. A later cell imports a different ``Dataset`` from
    ``torchtext.legacy.data`` under the same name, so re-run the
    ``torch.utils.data`` import before re-running this cell.
    """

    def __init__(self, features_df, target_df):
        # features_df must have a 'violations_orig' column and target_df a
        # 'results_re' column.
        self.labels = target_df['results_re']
        self.features_text = features_df['violations_orig']

    def __len__(self):
        # The number of items is the number of texts
        return len(self.features_text)

    def __getitem__(self, idx):
        # (label, text) is the order the collate functions unpack
        return (self.labels.iloc[idx], self.features_text.iloc[idx])

In [None]:
#then resample features
# train_targ_all's index values are row positions in the reset target frame,
# so the same positions select the matching (possibly repeated) feature rows.
train_x_reset = train_x.reset_index().drop(columns='index')
train_x_resample = train_x_reset.iloc[train_targ_all.index]

#and convert to dataset object
inspections_train, inspections_val, inspections_test = (
    InspectionsDataset(text_part, label_part)
    for text_part, label_part in (
        (train_x_resample, train_targ_all),
        (val_x, val_y),
        (test_feat_txt, test_targ),
    )
)

#these can all now be fed into DataLoaders with the proper collate functions

In [None]:
# Put text and label side by side in one frame per split
df_train, df_valid, df_test = (
    pd.concat([text_part, label_part], axis=1)
    for text_part, label_part in (
        (train_x_resample, train_targ_all),
        (val_x, val_y),
        (test_feat_txt, test_targ),
    )
)

### CNN Data Prep

- Data prep informed by: https://www.kaggle.com/swarnabha/pytorch-text-classification-torchtext-lstm<br>
- Model building informed by: https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/4%20-%20Convolutional%20Sentiment%20Analysis.ipynb

In [None]:
from torchtext.legacy.data import Field, LabelField, TabularDataset, BucketIterator, Example, Dataset

In [None]:
# create Field objects to process the text data
# they will include info for how to convert the text to tensors
TEXT = Field(tokenize='basic_english', lower=True, batch_first=True)

# The labels are already numeric (0 = pass, 1 = fail), so use them as-is.
# With a label vocabulary the index of each label would presumably depend on
# label frequency in the training set, which can swap 0 and 1 once the data is
# oversampled and silently change what precision and recall refer to.
LABEL = LabelField(dtype=torch.float, use_vocab=False)

In [None]:
# source : https://gist.github.com/lextoumbourou/8f90313cbc3598ffbabeeaa1741a11c8
# to use DataFrame as a Data source
class DataFrameDataset(Dataset):
    """torchtext Dataset built from a DataFrame.

    The frame must have a 'violations_orig' (text) column and a 'results_re'
    (label) column. ``Dataset`` here is the torchtext.legacy.data Dataset.
    """

    def __init__(self, df, fields, is_test=False, **kwargs):
        # is_test and **kwargs are accepted for call compatibility but unused.
        # Each example is built as [text, label], matching the order of `fields`.
        examples = [
            Example.fromlist([text, label], fields)
            for text, label in zip(df['violations_orig'], df['results_re'])
        ]

        super().__init__(examples, fields)

    @staticmethod
    def sort_key(ex):
        # Examples are ordered by the length of their text field
        return len(ex.violations_orig)

    @classmethod
    def splits(cls, fields, train_df, val_df=None, test_df=None, **kwargs):
        """Build one dataset per frame that is not None.

        Returns a tuple in (train, val, test) order with missing splits left out.
        """
        datasets = []

        if train_df is not None:
            datasets.append(cls(train_df.copy(), fields, **kwargs))
        if val_df is not None:
            datasets.append(cls(val_df.copy(), fields, **kwargs))
        if test_df is not None:
            # The test split is the only one constructed with is_test=True
            datasets.append(cls(test_df.copy(), fields, True, **kwargs))

        return tuple(datasets)

In [None]:
# Wrap the train, validation and test dataframes as torchtext datasets.
# The field order must match the [text, label] order used by DataFrameDataset.
fields = [('violations_orig', TEXT), ('results_re', LABEL)]
train_ds, val_ds, test_ds = DataFrameDataset.splits(
    fields,
    df_train,
    val_df=df_valid,
    test_df=df_test,
)

In [None]:
# Build the CNN vocabulary from train_ds and attach the GloVe vectors.

# ***NOTE*** - train_ds comes from df_train, i.e. the oversampled training
# split with the validation rows already removed (couldn't get it to work
# otherwise). That is not the corpus the original `vocab` was counted on, so
# the same MIN_FREQ can keep a different set of tokens here.
TEXT.build_vocab(train_ds, vectors = 'glove.6B.300d', min_freq=MIN_FREQ)

In [None]:
# Compare the new vocab with the original vocab.
# Only the two lengths are printed; the tokens themselves are not compared.
print('Length of new vocab is...', len(TEXT.vocab.itos))
print('Length of original vocab is...', len(vocab))

In [None]:
# Build the label vocabulary from the labels in train_ds.
# NOTE(review): the label-to-index mapping appears to depend on label frequency;
# inspect LABEL.vocab.stoi to confirm that index 1 still means "Fail".
LABEL.build_vocab(train_ds)

In [None]:
# Build iterators
# sort_key orders examples by the length of their text field.
BATCH_SIZE = 64
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_ds, val_ds, test_ds),
    batch_size=BATCH_SIZE,
    sort_key=lambda example: len(example.violations_orig),
    device=DEVICE,
)

### CNN Model Build, Training, & Eval

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class CNN(nn.Module):
    """Convolutional text classifier over token embeddings.

    One convolution per entry in ``filter_sizes`` slides over the embedded
    sequence; each is max-pooled over time, the pooled features are
    concatenated, passed through dropout and a final linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        n_filters: output channels of every convolution.
        filter_sizes: sequence of window heights (in tokens), one per convolution.
        output_dim: size of the final linear layer's output.
        dropout: dropout probability applied to the concatenated features.
        pad_idx: index of the padding token in the vocabulary.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        
        super().__init__()

        # Plain attributes (not parameters or buffers), so saved state_dicts are
        # unaffected. forward() uses them to pad batches that are shorter than
        # the largest convolution window.
        self.pad_idx = pad_idx
        self.min_len = max(filter_sizes)
                
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.convs = nn.ModuleList([
                                    nn.Conv2d(in_channels = 1, 
                                              out_channels = n_filters, 
                                              kernel_size = (fs, embedding_dim)) 
                                    for fs in filter_sizes
                                    ])
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        """Return the raw (pre-sigmoid) scores for a batch of token indices.

        Args:
            text: integer tensor of shape [batch size, sent len].

        Returns:
            Tensor of shape [batch size, output_dim].
        """
        #text = [batch size, sent len]

        # A convolution cannot run on a sequence shorter than its window, so
        # right-pad short batches with the padding index up to the largest
        # filter size. Batches that are already long enough are left untouched.
        if text.shape[1] < self.min_len:
            fill_value = 0 if self.pad_idx is None else self.pad_idx
            filler = text.new_full((text.shape[0], self.min_len - text.shape[1]), fill_value)
            text = torch.cat([text, filler], dim = 1)
        
        embedded = self.embedding(text)
                
        #embedded = [batch size, sent len, emb dim]
        
        embedded = embedded.unsqueeze(1)
        
        #embedded = [batch size, 1, sent len, emb dim]
        
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
            
        #conved_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]
                
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        #pooled_n = [batch size, n_filters]
        
        cat = self.dropout(torch.cat(pooled, dim = 1))

        #cat = [batch size, n_filters * len(filter_sizes)]
            
        return self.fc(cat)

In [None]:
# Model hyperparameters
INPUT_DIM = len(TEXT.vocab)                # one embedding row per vocab entry
EMBEDDING_DIM = 300                        # must match the 300d GloVe vectors
N_FILTERS = 100
FILTER_SIZES = [3,4,5]
OUTPUT_DIM = 1                             # single score for the binary target
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = CNN(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    n_filters=N_FILTERS,
    filter_sizes=FILTER_SIZES,
    output_dim=OUTPUT_DIM,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
)

In [None]:
# Load pretrained embeddings
# TEXT.vocab.vectors holds the vectors attached by TEXT.build_vocab(..., vectors=...)
pretrained_embeddings = TEXT.vocab.vectors

# Overwrite the embedding layer's weights in place with the pretrained vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

In [None]:
# The unknown and padding tokens start from all-zero embeddings
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

for special_idx in (UNK_IDX, PAD_IDX):
    model.embedding.weight.data[special_idx] = torch.zeros(EMBEDDING_DIM)

In [None]:
import torch.optim as optim

# Adam with its default settings
optimizer = optim.Adam(model.parameters())

# BCEWithLogitsLoss takes the model's raw scores, so the model has no final
# sigmoid; model and loss both live on DEVICE.
model = model.to(DEVICE)
criterion = nn.BCEWithLogitsLoss().to(DEVICE)

In [None]:
def binary_accuracy(preds, y):
    """
    Fraction of correct predictions in a batch (8 out of 10 right gives 0.8, not 8).

    `preds` are raw scores; they go through a sigmoid and are rounded to 0/1
    before being compared with the targets `y`. Returns a 0-dim tensor.
    """
    predicted_labels = torch.sigmoid(preds).round()
    hits = predicted_labels.eq(y).float()  # float so the division below is not integer
    return hits.sum() / len(hits)

In [None]:
log_interval = 100
def train(model, iterator, optimizer, criterion):
    """Run one training epoch over `iterator`.

    Every batch must expose `violations_orig` (model input) and `results_re`
    (targets). The batch accuracy is printed every `log_interval` iterations.

    Returns:
        (mean batch loss, mean batch accuracy) over the epoch.
    """
    model.train()

    running_loss = 0
    running_acc = 0

    for step, batch in enumerate(iterator):
        targets = batch.results_re

        optimizer.zero_grad()
        scores = model(batch.violations_orig).squeeze(1)

        loss = criterion(scores, targets)
        acc = binary_accuracy(scores, targets)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        running_acc += acc.item()

        if step > 0 and step % log_interval == 0:
            print(f'At iteration {step} the training accuracy is {acc:.3f}.')

    return running_loss / len(iterator), running_acc / len(iterator)

In [None]:
def evaluate(model, iterator, criterion):
    """Score `model` on `iterator` without updating any weights.

    Every batch must expose `violations_orig` (model input) and `results_re`
    (targets).

    Returns:
        (mean batch loss, mean batch accuracy) over the iterator.
    """
    model.eval()

    running_loss = 0
    running_acc = 0

    # No gradients are needed for evaluation
    with torch.no_grad():
        for batch in iterator:
            targets = batch.results_re
            scores = model(batch.violations_orig).squeeze(1)

            running_loss += criterion(scores, targets).item()
            running_acc += binary_accuracy(scores, targets).item()

    return running_loss / len(iterator), running_acc / len(iterator)

In [None]:
import time

def epoch_time(start_time, end_time):
    """Split the time between two timestamps (in seconds) into whole
    minutes and whole leftover seconds.

    Returns:
        (minutes, seconds) as ints.
    """
    duration = end_time - start_time
    minutes = int(duration / 60)
    seconds = int(duration - (minutes * 60))
    return minutes, seconds

In [None]:
N_EPOCHS = 5

# Lowest validation loss seen so far; any first epoch beats infinity
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Checkpoint whenever the validation loss improves
    if valid_loss < best_valid_loss:
        torch.save(model.state_dict(), 'tut4-model.pt')
        best_valid_loss = valid_loss

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

In [None]:
# Calculate metrics on validation set
from ignite.metrics import Precision
from ignite.metrics import Recall
from ignite.metrics import Accuracy

# The training loop saves the weights with the lowest validation loss to
# 'tut4-model.pt'. Restore them so the metrics below describe that checkpoint
# rather than whatever the last epoch happened to produce.
model.load_state_dict(torch.load('tut4-model.pt', map_location=DEVICE))

model.eval()
precision = Precision()
recall = Recall()
accuracy = Accuracy()

with torch.no_grad():
    for batch in valid_iterator:
        preds = model(batch.violations_orig).squeeze(1)
        # Raw scores -> probabilities -> hard 0/1 predictions
        rounded_preds = torch.round(torch.sigmoid(preds))
        for metric in (precision, recall, accuracy):
            metric.update((rounded_preds, batch.results_re))
        
print('Precision: ', precision.compute())
print('Recall: ', recall.compute())
print('Accuracy: ', accuracy.compute())

### Continuous Bag of Words

In [None]:
#create dataloaders
# Only the training loader shuffles. NOTE: the Ngrams section further down
# rebinds these same three names, so use them before running that cell.
train_dataloader, val_dataloader, test_dataloader = (
    DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=is_train,
               collate_fn=collate_into_cbow)
    for dataset, is_train in (
        (inspections_train, True),
        (inspections_val, False),
        (inspections_test, False),
    )
)

### Ngrams

In [None]:
#create dataloaders
# Only the training loader shuffles. NOTE: this rebinds the three names that
# the Continuous Bag of Words section assigned earlier.
train_dataloader, val_dataloader, test_dataloader = (
    DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=is_train,
               collate_fn=collate_into_ngrams)
    for dataset, is_train in (
        (inspections_train, True),
        (inspections_val, False),
        (inspections_test, False),
    )
)