In [1]:
import pandas as pd
import numpy as np

import torch
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab
from torchtext.vocab import GloVe

from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.utils import resample

In [2]:
print(torchtext.__version__)

0.9.1


In [3]:
! pip install torchtext

distutils: /Users/michaelfeldman/opt/anaconda3/include/python3.7m/UNKNOWN
sysconfig: /Users/michaelfeldman/opt/anaconda3/include/python3.7m[0m
user = False
home = None
root = None
prefix = None[0m
distutils: /Users/michaelfeldman/opt/anaconda3/include/python3.7m/UNKNOWN
sysconfig: /Users/michaelfeldman/opt/anaconda3/include/python3.7m[0m
user = False
home = None
root = None
prefix = None[0m


In [4]:
df = pd.read_pickle('initial_clean.pkl')

In [5]:
df.head()

Unnamed: 0,id_orig,name,license,facility_type,date_orig,inspection_type_orig,results_orig,violations_orig,id_re,date_re,results_re,time_between
8,2362749,DANIEL WILLIAM HALE,66311,School,2020-02-24,Canvass,Fail,"38. INSECTS, RODENTS, & ANIMALS NOT PRESENT - ...",2363154,2020-03-03,Pass,-8
21,2009301,DANIEL WILLIAM HALE,66311,School,2017-03-27,Canvass,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,2009714,2017-04-04,Pass,-8
34,1734408,DANIEL WILLIAM HALE,66311,School,2016-03-18,Canvass,Fail,13. NO EVIDENCE OF RODENT OR INSECT INFESTATIO...,1734665,2016-03-23,Pass,-5
39,1386146,DANIEL WILLIAM HALE,66311,School,2015-05-20,Canvass,Fail,13. NO EVIDENCE OF RODENT OR INSECT INFESTATIO...,1386157,2015-05-22,Pass,-2
65,2463620,DAYGLOW,2749526,Restaurant,2020-12-10,License,Fail,2. CITY OF CHICAGO FOOD SERVICE SANITATION CER...,2463913,2020-12-17,Pass w/ Conditions,-7


In [6]:
# Change 'Pass w/ Conditions' to 'Pass', and change target to 0/1 numeric
results_dict = {'Pass': 0, 'Fail': 1}
df['results_re'] = df['results_re'].str.replace('Pass w/ Conditions', 'Pass')
df['results_re'] = df['results_re'].apply(lambda x: results_dict[x])

### Split data into feature & target, and then into train and test sets

In [7]:
# Sort dataframe by inspection date
# Training set will include initial 80% of inspections
sorted_df = df.sort_values(by=['date_orig'])

In [8]:
sorted_df.head()

Unnamed: 0,id_orig,name,license,facility_type,date_orig,inspection_type_orig,results_orig,violations_orig,id_re,date_re,results_re,time_between
220538,104236,TEMPO CAFE,80916,Restaurant,2010-01-04,Canvass,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,104243,2010-01-12,0,-8
494292,67738,MICHAEL'S ON MAIN CAFE,2008948,Restaurant,2010-01-04,License,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,124279,2010-01-19,0,-15
379705,118297,MAXWELL STREET DEPOT INC.,18135,Restaurant,2010-01-05,Complaint,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,118308,2010-01-12,0,-7
491825,80208,Dunkin Donuts,2013340,Restaurant,2010-01-05,License,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,80233,2010-02-05,1,-31
263231,67741,CITGO,2013296,Grocery Store,2010-01-05,License,Fail,2. FACILITIES TO MAINTAIN PROPER TEMPERATURE -...,176270,2010-02-17,0,-43


In [9]:
target_col = ['results_re']
target = sorted_df[target_col]

cols_to_exclude = ['name', 'id_orig', 'id_re', 'license']
feature_cols = [col for col in sorted_df.columns if (col not in cols_to_exclude and col not in target_col)]
features = sorted_df[feature_cols]

In [10]:
target.head()

Unnamed: 0,results_re
220538,0
494292,0
379705,0
491825,1
263231,0


In [11]:
features.head()

Unnamed: 0,facility_type,date_orig,inspection_type_orig,results_orig,violations_orig,date_re,time_between
220538,Restaurant,2010-01-04,Canvass,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,2010-01-12,-8
494292,Restaurant,2010-01-04,License,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,2010-01-19,-15
379705,Restaurant,2010-01-05,Complaint,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,2010-01-12,-7
491825,Restaurant,2010-01-05,License,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,2010-02-05,-31
263231,Grocery Store,2010-01-05,License,Fail,2. FACILITIES TO MAINTAIN PROPER TEMPERATURE -...,2010-02-17,-43


In [12]:
# Don't shuffle data before splitting
train_feat, test_feat, train_targ, test_targ = train_test_split(features, target, test_size=0.2,
                                                                shuffle=False)

In [13]:
train_feat.head()

Unnamed: 0,facility_type,date_orig,inspection_type_orig,results_orig,violations_orig,date_re,time_between
220538,Restaurant,2010-01-04,Canvass,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,2010-01-12,-8
494292,Restaurant,2010-01-04,License,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,2010-01-19,-15
379705,Restaurant,2010-01-05,Complaint,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,2010-01-12,-7
491825,Restaurant,2010-01-05,License,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,2010-02-05,-31
263231,Grocery Store,2010-01-05,License,Fail,2. FACILITIES TO MAINTAIN PROPER TEMPERATURE -...,2010-02-17,-43


In [14]:
pct_fail = len(train_targ[train_targ['results_re'] == 1]) / len(train_targ)
print(f'{pct_fail*100:.2f}% reinspections fail in training set')

8.79% reinspections fail in training set


### Split out the text features

In [15]:
text_col = ['violations_orig']
train_feat_txt = train_feat[text_col].astype(str)
test_feat_txt = test_feat[text_col].astype(str)

In [16]:
train_feat_txt.head()

Unnamed: 0,violations_orig
220538,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...
494292,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...
379705,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...
491825,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...
263231,2. FACILITIES TO MAINTAIN PROPER TEMPERATURE -...


### Split the text into tokens

In [17]:
tokenizer = get_tokenizer('basic_english')
counter = Counter()

In [18]:
# Create vocab using training set only!!!!
for idx, text in train_feat_txt.itertuples():
    counter.update(tokenizer(text))

In [19]:
tokenizer('I like the')

['i', 'like', 'the']

In [20]:
# Not sure what the min frequency should be
# Min freq = 1 -> 50768 vocab length
# Min freq = 50 -> 2942
# Min freq = 100 -> 2211
# Min freq = 250 -> 1481
# Min freq = 500 -> 1107
# Min freq = 1000 -> 806
MIN_FREQ = 500
vocab = Vocab(counter, min_freq=MIN_FREQ)

In [21]:
len(vocab)

1086

### Create BOW features

In [22]:
def collate_into_bow(data, voc=vocab):
    bow = torch.zeros((len(data), len(voc) + 1))
    labels = torch.zeros((len(data), 1))
    for i, (label, text) in enumerate(data):
        counter = Counter()
        counter.update(tokenizer(text))
        line_vocab = Vocab(counter)
        tot_freqs = sum(line_vocab.freqs.values())
        labels[i] = label
        for token in line_vocab.freqs:
            bow[i, voc.stoi[token]] = line_vocab.freqs[token] / tot_freqs  # Using relative frequencies
            bow[i, -1] = tot_freqs
    
    return (labels, bow)

### __Create CBOW features__

In [23]:
VECTORS_CACHE_DIR = './.vector_cache'
DIM_GLOVE = 300

glove = GloVe('6B',cache=VECTORS_CACHE_DIR)

In [24]:
def collate_into_cbow(data):
    cbow = torch.zeros((len(data), DIM_GLOVE))
    labels = torch.zeros((len(data), 1))
    for i, (label, text) in enumerate(data):
        counter = Counter()
        counter.update(tokenizer(text))
        tokens = list(Vocab(counter).freqs)
        vecs = glove.get_vecs_by_tokens(tokens)
        cbow[i] = torch.mean(vecs, axis=0)
        labels[i] = label
    return (labels, cbow)


### Create ngram features

In [25]:
# Using CountVectorizer to get ngrams (this was the most intuitive tool I could find...)
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
from sklearn.feature_extraction.text import CountVectorizer

In [26]:
def divide_rows_by_row_sum(arr):
    return np.nan_to_num(
            np.divide(arr, arr.sum(axis=1)[:, None]),
            nan=0  # Small number of ngram rows are all zeroes; don't divide row by 0 
    )

In [27]:
vectorizer = CountVectorizer(min_df=MIN_FREQ, ngram_range=(2,2))
corpus = train_feat_txt['violations_orig'].to_list()
vocab_ngrams = vectorizer.fit_transform(corpus)

In [28]:
#I'm not sure how to get this to include document length, since
#I had to alter the BOW collate function so it would work
#with the dataloader

def collate_into_ngrams(data, voc=vocab_ngrams):
    ngrams = torch.zeros((len(data), voc.shape[1]))
    labels = torch.zeros((len(data), 1))
    for i, (label, text) in enumerate(data):
        X_batch = vectorizer.transform([text])
        ngram_arr = X_batch.toarray()
        ngram_arr = divide_rows_by_row_sum(ngram_arr)
        ngrams[i, :] = torch.tensor(ngram_arr)
        labels[i] = label
    
    return (labels, ngrams)

### Split train set into train and validation

In [29]:
train_x, val_x, train_y, val_y = train_test_split(train_feat_txt, train_targ,
                                                    test_size=0.2, shuffle=False)

In [30]:
#Oversample on target vector
train_targ_reset = train_y.reset_index().drop('index', axis=1)
train_targ_fail = train_targ_reset[train_targ_reset['results_re'] == 1]
size_diff = train_targ_reset.shape[0] - train_targ_fail.shape[0]
train_resample = resample(train_targ_fail, n_samples = size_diff, replace=True)
train_targ_all = pd.concat([train_targ_reset, train_resample])

In [31]:
from torch.utils.data import Dataset, DataLoader

class InspectionsDataset(Dataset):
    def __init__(self, features_df, target_df):

        self.features_text = features_df['violations_orig']
        self.labels = target_df['results_re']

    def __len__(self):
        return len(self.features_text)

    def __getitem__(self, idx):
        text = self.features_text.iloc[idx]
        label = self.labels.iloc[idx]

        return (label, text)

In [32]:
#then resample features
train_x_reset = train_x.reset_index().drop('index', axis=1)
train_x_resample = train_x_reset.iloc[train_targ_all.index]

#and convert to dataset object
inspections_train = InspectionsDataset(train_x_resample, train_targ_all)
inspections_val = InspectionsDataset(val_x, val_y)
inspections_test = InspectionsDataset(test_feat_txt, test_targ)

#these can all now be fed into DataLoaders with the proper collate functions

In [33]:
train_feat_txt

Unnamed: 0,violations_orig
220538,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...
494292,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...
379705,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...
491825,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...
263231,2. FACILITIES TO MAINTAIN PROPER TEMPERATURE -...
...,...
298536,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E..."
345820,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E..."
4686,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E..."
274648,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E..."


In [34]:
train_targ

Unnamed: 0,results_re
220538,0
494292,0
379705,0
491825,1
263231,0
...,...
298536,0
345820,1
4686,0
274648,0


In [35]:
# Combine text and label in 1 file
df_train = pd.concat([train_x_resample, train_targ_all], axis=1)
df_valid = pd.concat([val_x, val_y], axis=1)
df_test = pd.concat([test_feat_txt, test_targ], axis=1)

# Create training file to build vocab (includes train+valid sets)
train_vocab = pd.concat([train_feat_txt, train_targ], axis=1)

# df_train.to_csv('train.csv', index=False)
# df_valid.to_csv('valid.csv', index=False)
# df_test.to_csv('test.csv', index=False)

### Set CUDA settings

In [36]:
import random

USE_CUDA = torch.cuda.is_available()

if USE_CUDA:
    DEVICE = torch.device('cuda')
    print("Using cuda.")
else:
    DEVICE = torch.device('cpu')
    print("Using cpu.")
    
random.seed(30255)
np.random.seed(30255)
torch.manual_seed(30255)
if USE_CUDA:
    torch.cuda.manual_seed(30255)
    
COLAB = False
#COLAB = True
if COLAB:
    from google.colab import drive 
    drive.mount('/content/gdrive')
    PATH = "gdrive/My Drive/advanced_ml/"

Using cpu.


### CNN Prep

Assisted by: https://towardsdatascience.com/lstm-text-classification-using-pytorch-2c6c657f8fc0<br>
https://www.kaggle.com/swarnabha/pytorch-text-classification-torchtext-lstm<br>
https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/4%20-%20Convolutional%20Sentiment%20Analysis.ipynb

In [65]:
from torchtext.legacy.data import Field, LabelField, TabularDataset, BucketIterator, Example, Dataset

In [66]:
# create Field objects to process the text data
# they will include info for how to convert the text to tensors
TEXT = Field(tokenize='basic_english', lower=True, batch_first=True)
LABEL = LabelField(dtype=torch.float)

In [67]:
# source : https://gist.github.com/lextoumbourou/8f90313cbc3598ffbabeeaa1741a11c8
# to use DataFrame as a Data source
class DataFrameDataset(Dataset):

    def __init__(self, df, fields, is_test=False, **kwargs):
        examples = []
        for i, row in df.iterrows():
            label = row.results_re if not is_test else None
            text = row.violations_orig
            examples.append(Example.fromlist([text, label], fields))

        super().__init__(examples, fields)

    @staticmethod
    def sort_key(ex):
        return len(ex.violations_orig)

    @classmethod
    def splits(cls, fields, train_df, val_df=None, test_df=None, **kwargs):
        train_data, val_data, test_data = (None, None, None)
        data_field = fields

        if train_df is not None:
            train_data = cls(train_df.copy(), data_field, **kwargs)
        if val_df is not None:
            val_data = cls(val_df.copy(), data_field, **kwargs)
        if test_df is not None:
            test_data = cls(test_df.copy(), data_field, True, **kwargs)

        return tuple(d for d in (train_data, val_data, test_data) if d is not None)

In [68]:
# create Dataset objects using the train and validation dataframes
fields = [('violations_orig', TEXT), ('results_re', LABEL)]
train_ds, val_ds = DataFrameDataset.splits(fields, train_df=df_train, val_df=df_valid)

# Create Dataset object on original train df (that includes train and validation
# sets), so we can reproduce original vocab
# train_vocab_ds = DataFrameDataset.splits(fields, train_df=train_vocab)[0]

In [180]:
# Build vocab using original training set
# TEXT.build_vocab(train_vocab_ds, vectors = 'glove.6B.300d', min_freq=MIN_FREQ)
TEXT.build_vocab(train_ds, vectors = 'glove.6B.300d', min_freq=MIN_FREQ)

In [42]:
# Make sure new vocab is the same as original vocab
print('Length of new vocab is...', len(TEXT.vocab.itos))
print('Length of original vocab is...', len(vocab))

Length of new vocab is... 1086
Length of original vocab is... 1086


In [183]:
LABEL.build_vocab(train_ds)

In [185]:
BATCH_SIZE=64
train_iterator, valid_iterator = BucketIterator.splits(
    (train_ds, val_ds),
    batch_size = BATCH_SIZE,
    sort_key=lambda x: len(x.violations_orig),
#     sort_within_batch = True,
    device = DEVICE)

In [187]:
import torch.nn as nn
import torch.nn.functional as F

class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        
        super().__init__()
                
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.convs = nn.ModuleList([
                                    nn.Conv2d(in_channels = 1, 
                                              out_channels = n_filters, 
                                              kernel_size = (fs, embedding_dim)) 
                                    for fs in filter_sizes
                                    ])
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
                
        #text = [batch size, sent len]
        
        embedded = self.embedding(text)
                
        #embedded = [batch size, sent len, emb dim]
        
        embedded = embedded.unsqueeze(1)
        
        #embedded = [batch size, 1, sent len, emb dim]
        
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
            
        #conved_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]
                
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        #pooled_n = [batch size, n_filters]
        
        cat = self.dropout(torch.cat(pooled, dim = 1))

        #cat = [batch size, n_filters * len(filter_sizes)]
            
        return self.fc(cat)

In [206]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 300
N_FILTERS = 100
FILTER_SIZES = [3,4,5]
OUTPUT_DIM = 1
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)

In [207]:
# Load pretrained embeddings
pretrained_embeddings = TEXT.vocab.vectors

model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.1256,  0.0136,  0.1031,  ..., -0.3422, -0.0224,  0.1368],
        ...,
        [ 0.2763,  0.2984,  0.0830,  ..., -0.1555, -0.3882, -0.7194],
        [ 0.3941, -0.0404,  0.4790,  ...,  0.0949, -0.4145,  0.2473],
        [-0.3772, -0.1181,  0.0712,  ..., -0.2140, -0.2401,  0.6279]])

In [208]:
# Zero out the initial weights of the unknown and padding tokens
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [209]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

criterion = nn.BCEWithLogitsLoss()

model = model.to(DEVICE)
criterion = criterion.to(DEVICE)

In [192]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [210]:
log_interval = 50
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for i, batch in enumerate(iterator):
        optimizer.zero_grad()
        predictions = model(batch.violations_orig).squeeze(1)
        
        loss = criterion(predictions, batch.results_re)
        acc = binary_accuracy(predictions, batch.results_re)
        
        loss.backward()
        optimizer.step()
        
        if i % log_interval == 0 and i > 0:
            print(f'At iteration {i} the training accuracy is {acc:.3f}.')
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [194]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
        for batch in iterator:

            predictions = model(batch.violations_orig).squeeze(1)
            loss = criterion(predictions, batch.results_re)
            
            acc = binary_accuracy(predictions, batch.results_re)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [195]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [211]:
N_EPOCHS = 2

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut4-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

KeyboardInterrupt: 

In [108]:
import torch
import torch.nn as nn


class RNNLM(nn.Module):
    """ Container module with an linear encoder/embedding, an RNN module, and a linear decoder.
    """

    def __init__(self, rnn_type, vocab_size, embedding_dim, hidden_dim, num_layers, 
                 output_dim=1, dropout=0.5):
        ''' Initialize model parameters corresponding to ---
            - embedding layer
            - recurrent neural network layer---one of LSTM, GRU, or RNN---with 
              optionally more than one layer
            - linear layer to map from hidden vector to the vocabulary
            - optionally, dropout layers.  Dropout layers can be placed after 
              the embedding layer or/and after the RNN layer. Dropout within
              an RNN is only applied when there are two or more num_layers.
            - optionally, initialize the model parameters.
            
            The arguments are:
            
            rnn_type: One of 'LSTM', 'GRU', 'RNN_TANH', 'RNN_RELU'
            vocab_size: size of vocabulary
            embedding_dim: size of an embedding vector
            hidden_dim: size of hidden/state vector in RNN
            num_layers: number of layers in RNN
            dropout: dropout probability.
            
        '''
        super(RNNLM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        if rnn_type == 'LSTM':
            self.lstm = nn.LSTM(embedding_dim, hidden_dim,
                                num_layers=num_layers, dropout=dropout)
        self.linear = nn.Linear(hidden_dim, output_dim)
        self.out = nn.Sigmoid
        

    def forward(self, batch, hidden0=None):
        ''' 
        Run forward propagation for a given minibatch of inputs using
        hidden0 as the initial hidden state.

        In LSTMs hidden0 = (h_0, c_0). 

        The output of the RNN includes the hidden vector hiddenn = (h_n, c_n).
        Return this as well so that it can be used to initialize the next
        batch.
        
        Unlike previous homework sets do not apply softmax or logsoftmax here, since we'll use
        the more efficient CrossEntropyLoss.  See 
        https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html.
        '''
        emb = self.embedding(batch)
        output, hidden = self.lstm(emb, hidden0)
        return self.out(nn.self.linear(output))  # not outputting hidden state; each example is a different inspection


In [164]:
GRAD_CLIP = 1
loss_function = torch.nn.BCELoss()

def train_an_epoch(dataloader):
    model.train() # Sets the module in training mode.
    log_interval = 500

    for (_, ((text, label), _)) in enumerate(dataloader):
        print('text is...', text)
        model.zero_grad()
        probs = model(text)
        loss = loss_function(probs, label)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
        optimizer.step()
#         if idx % log_interval == 0 and idx > 0:
#             print(f'At iteration {idx} the loss is {loss:.3f}.')

In [146]:
def get_accuracy(dataloader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for idx, (label, text) in enumerate(dataloader):
            outputs = model(text)
            predicted = torch.round(outputs)
            #predicted = torch.max(outputs.data, 1)[1]
            total += label.size(0)
            correct += (predicted == label).sum().item()
    
    print("Total: {}, correct: {}".format(total, correct))
    return(correct/total)

In [165]:
import matplotlib.pyplot as plt
%matplotlib inline
import time

num_labels = 2
vocab_size = len(vocab) + 1
embedding_size = 50

model = RNNLM("LSTM", vocab_size, embedding_size, embedding_size, num_layers=2,
              dropout=0.5)
EPOCHS = 3 # epoch
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001)

#reset/clear out model just in case
for layer in model.children():
    if hasattr(layer, 'reset_parameters'):
        layer.reset_parameters()


accuracies=[]
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train_an_epoch(train_dataloader)
    accuracy = get_accuracy(valid_dataloader)
    accuracies.append(accuracy)
    print()
    print(f'After epoch {epoch} the validation accuracy is {accuracy:.3f}.')
    print()
    
plt.plot(range(1, EPOCHS+1), accuracies)

text is... tensor([[1103, 1103, 1103, 1103, 1103, 1103, 1103, 1103, 1103, 1103, 1103, 1103,
         1103, 1103, 1103, 1103, 1103, 1103, 1103, 1103, 1103, 1103, 1103, 1103,
         1103, 1103, 1103, 1103, 1103, 1103, 1103, 1103, 1103, 1103, 1103, 1103,
         1103, 1103, 1103, 1103, 1103, 1103, 1103, 1103, 1103, 1103, 1103, 1103,
         1103, 1103, 1103, 1103, 1103, 1103, 1103, 1103, 1103, 1103, 1103, 1103,
         1103, 1103, 1103, 1103]])


IndexError: index out of range in self

### Continuous Bag of Words

In [142]:
#create dataloaders
train_dataloader = DataLoader(inspections_train, batch_size=BATCH_SIZE, shuffle=True, 
                                collate_fn=collate_into_cbow)
val_dataloader = DataLoader(inspections_val, batch_size=BATCH_SIZE, shuffle=False, 
                                collate_fn=collate_into_cbow)
test_dataloader = DataLoader(inspections_test, batch_size=BATCH_SIZE, shuffle=False, 
                                collate_fn=collate_into_cbow)

### Ngrams

In [143]:
#create dataloaders
train_dataloader = DataLoader(inspections_train, batch_size=BATCH_SIZE, shuffle=True, 
                                collate_fn=collate_into_ngrams)
val_dataloader = DataLoader(inspections_val, batch_size=BATCH_SIZE, shuffle=False, 
                                collate_fn=collate_into_ngrams)
test_dataloader = DataLoader(inspections_test, batch_size=BATCH_SIZE, shuffle=False, 
                                collate_fn=collate_into_ngrams)