In [224]:
import pandas as pd
import copy
import torch
import re
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchtext import data
import random 
SEED = 1234

In [272]:
df = pd.read_csv('training_with_text.csv')
df = df.loc[df.category.notna()]
df['text_content'] = df.text_content.str.lower()
df['text_content'] = df.text_content.str.replace('\\n', ' ')
df['text_content'] = df.text_content.str.replace("\\'", '')
df['text_content'] = df.text_content.str.replace("[0-9]", '')
df['category'] = df.category.str.lower()
category_label = {}
for i, cat in enumerate(df.category.unique()):
    df.loc[df.category == cat, 'category'] = i
    category_label[i] = cat

df = df[['text_content', 'category']]
df.to_csv('plain_text.csv', index=False)

In [273]:
TEXT = data.Field(tokenize = 'spacy')
LABEL = data.LabelField()

complaints = data.TabularDataset(
    path='plain_text.csv', format='csv',
    fields=[('text', TEXT),
            ('labels', LABEL)],
    skip_header=True)

train, valid = complaints.split(split_ratio=0.7)

MAX_VOCAB_SIZE = 25000

TEXT.build_vocab(train,
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)
LABEL.build_vocab(train)

In [274]:
print(TEXT.vocab.itos[:100])

['<unk>', '<pad>', '.', ' ', 'the', ',', ':', '?', 'to', 'of', 'and', '-', 'a', 'that', 'i', 'in', 'you', ')', '(', 'or', 'no', 'was', 'on', '#', 'he', 'not', 'be', '/', 'at', 'by', '  ', 'for', 'police', 'with', 'this', 'officer', '"', 'date', 'as', 'is', 'accused', ';', 'unit', '~', 'his', 'time', 'from', 'report', '..', 'if', 'an', 'did', 'your', '_', 'him', 'department', 'it', 'subject', 'of?cer', 'were', 'have', 'a.', 'investigation', 'chicago', 'name', 'any', 'will', 'complainant', 'stated', 'are', 'complaint', 'her', 'star', 'attachment', 'q.', 'court', 'me', 'other', 'member', 'allegation', 'my', 'm', 'right', 'had', 'allegations', 'officers', 'counsel', 'incident', 'she', 'against', 'r', 'statement', 'all', '!', 'yes', '[', 'page', 'standards', 'sustained', 'professional']


In [275]:
print(LABEL.vocab.itos)

['0', '3', '2', '5', '4', '6', '1', '10', '7', '8', '11', '13', '12']


In [276]:
BATCH_SIZE = 50

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator = data.BucketIterator.splits(
    (train, val), 
    batch_size = BATCH_SIZE,
    device = device,
    sort_key = lambda x: len(x.text),
    sort_within_batch=False)

In [277]:
def calculate_accuracy(preds, y):
    """
    Return accuracy per batch
    """
    pred_label = []
    idxs = torch.argmax(preds, 1)
    for i in idxs:
        pred_label.append(int(LABEL.vocab.itos[i]))
    pred_label = torch.tensor(pred_label)
    print('predicted: ', pred_label)
    print('actual: ', y)

    correct = (torch.tensor(pred_label) == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [278]:
class WordEmbAvg(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, pad_idx):
        
        super().__init__()
        
        # Define an embedding layer, a couple of linear layers, and 

        self.embedding = nn.Embedding(input_dim, embedding_dim, pad_idx)
        self.linear1 = nn.Linear(embedding_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU(inplace=False)        
        
        
    def forward(self, text):
        embeds = self.embedding(text).mean(0)
        out = self.linear1(embeds)
        out = self.relu(input=out)

        return self.linear2(out)

In [279]:
class Training_module( ):

    def __init__(self, model):
       self.model = model
       self.loss_fn = nn.CrossEntropyLoss()
       self.optimizer = optim.Adam(self.model.parameters())
    
    def train_epoch(self, iterator):
        '''
        Train the model for one epoch. For this repeat the following, 
        going through all training examples.
        1. Get the next batch of inputs from the iterator.
        2. Determine the predictions using a forward pass.
        3. Compute the loss.
        4. Compute gradients using a backward pass.
        5. Execute one step of the optimizer to update the model paramters.
        '''
        epoch_loss = 0
        epoch_acc = 0
    
        for batch in iterator:        
            self.optimizer.zero_grad()
            predictions = self.model.forward(batch.text)
            loss = self.loss_fn(predictions, batch.labels)
            accuracy = calculate_accuracy(predictions, batch.labels)
        
            loss.backward()
            self.optimizer.step()
            epoch_loss += loss.item()
            epoch_acc += accuracy.item()
        
        return epoch_loss / len(iterator), epoch_acc / len(iterator)
    
    def train_model(self, train_iterator, dev_iterator, epochs):
        """
        Train the model for multiple epochs, and after each evaluate on the
        development set.  Return the best performing model.
        """  
        dev_accs = [0.]
        for epoch in range(epochs):
            self.train_epoch(train_iterator)
            dev_acc = self.evaluate(dev_iterator)
            print(f"Epoch {epoch}: Dev Accuracy: {dev_acc[1]} Dev Loss:{dev_acc[0]}")
            if dev_acc[1] > max(dev_accs):
                best_model = copy.deepcopy(self)
            dev_accs.append(dev_acc[1])
        return best_model.model
                
    def evaluate(self, iterator):
        '''
        Evaluate the performance of the model on the given examples.
        '''
        epoch_loss = 0
        epoch_acc = 0
    
        with torch.no_grad():
    
            for batch in iterator:
                predictions = self.model(batch.text)
                loss = self.loss_fn(predictions, batch.labels)
                acc = calculate_accuracy(predictions, batch.labels)
        
                epoch_loss += loss.item()
                epoch_acc += acc.item()
        
        return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [280]:
INPUT_DIM = len(TEXT.vocab)
#You can try many different embedding dimensions. Common values are 20, 32, 64, 100, 128, 512
EMBEDDING_DIM = 100
HIDDEN_DIM = 50
OUTPUT_DIM = len(LABEL.vocab)
#Get the index of the pad token using the stoi function
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]


model = WordEmbAvg(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, PAD_IDX)

# If you want to use pre-trained embeddings uncomment following
# pretrained_embeddings = TEXT.vocab.vectors
# model.embedding.weight.data.copy_(pretrained_embeddings)
# UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
# model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
# model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [281]:
model = model.to(device)
tm = Training_module(model)

#Training the model
best_model = tm.train_model(train_iterator, valid_iterator, 5)

predicted:  tensor([ 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
         2, 12,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
         2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2])
actual:  tensor([2, 4, 0, 1, 2, 2, 4, 1, 0, 0, 0, 6, 6, 3, 2, 0, 5, 0, 1, 0, 2, 1, 0, 0,
        2, 2, 2, 7, 2, 2, 1, 5, 2, 9, 0, 1, 1, 1, 2, 1, 0, 0, 1, 0, 1, 0, 3, 4,
        1, 5])


  del sys.path[0]


predicted:  tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2])
actual:  tensor([ 0,  0,  2,  4,  4,  0,  5,  1,  2,  1,  0,  1,  0,  0,  0,  7,  1, 11,
         1,  6,  1,  3,  1,  2,  0,  2,  1,  0,  2,  1,  0,  2,  1,  2,  0,  1,
         1,  2,  4,  0,  3,  0,  0,  8,  4,  8,  0,  1,  3,  4])
predicted:  tensor([ 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
         2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2, 12,  2,  2,  2,  2,
         2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2])
actual:  tensor([ 1,  2,  0,  1,  0,  0,  2,  0,  2,  4,  3,  0,  0,  2,  5,  8,  0,  4,
         6,  2,  2,  2,  0,  3,  2,  1,  2,  0,  0,  0,  1, 10,  1, 11,  0,  5,
         0,  1,  2,  0,  0,  3,  2,  1,  1,  0,  1,  3,  0,  0])
predicted:  tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2,

AttributeError: 'Example' object has no attribute 'comment_text'

In [None]:
# Determine accuracy of best model. You should obtain a test accuracy 
# well above 80%.
tm.model = best_model
test_loss, test_acc = tm.evaluate(test_iterator)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')