<a href="https://colab.research.google.com/github/andresnigenda/cpd_complaints_nlp/blob/krista/embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import torch
import re
from torchtext import data
import random
import copy
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import spacy
import sys
import csv

csv.field_size_limit(sys.maxsize)
SEED = 1234

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# We are using 'spacy' tokenizer. You can also write your own tokenizer. You can 
# download spacy from https://spacy.io/usage
TEXT = data.Field(tokenize = 'spacy')
LABEL = data.LabelField(dtype = torch.float)

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
df = pd.read_csv('training_with_text.csv')
df['text_content'] = df.text_content.str.lower()
df['text_content'] = df.text_content.str.replace('\\n', ' ')
df['text_content'] = df.text_content.str.replace("\\'", '')
df['text_content'] = df['text_content'].fillna('')

In [0]:
#Use 'tasers_baton_aggressive_physical_touch_gun' as binary category
df['tasers_baton_aggressive_physical_touch_gun'] = df.apply(lambda x: 1 if x['tasers_baton_aggressive_physical_touch_gun'] == 1 else 0, axis=1)

In [0]:
from string import digits

def remove_digits(row):
    remove_digits = str.maketrans('', '', digits)
    return row['text_content'].translate(remove_digits)

df['text_content'] = df.apply(remove_digits, axis=1)

In [0]:
df = df[['text_content', 'tasers_baton_aggressive_physical_touch_gun']]
df.to_csv('plain_text.csv', index=False)

In [0]:
TEXT = data.Field(tokenize = 'spacy')
LABEL = data.LabelField()

In [0]:
complaints = data.TabularDataset(
    path='plain_text.csv', format='csv',
    fields=[('text', TEXT),
            ('labels', LABEL)],
    skip_header=True)
train, val = complaints.split(split_ratio=0.7)

MAX_VOCAB_SIZE = 25000

TEXT.build_vocab(train,
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)
LABEL.build_vocab(train)

In [68]:
print(TEXT.vocab.itos[:100])

['<unk>', '<pad>', '.', ' ', 'the', ',', ':', '?', 'of', 'to', 'and', '-', 'a', 'that', 'i', 'in', 'you', 'or', ')', 'no', 'on', 'was', '(', 'at', 'he', 'not', '#', 'be', 'by', 'police', '/', 'for', '  ', 'with', 'this', 'is', '"', 'date', '~', 'officer', 'as', 'accused', 'time', 'his', 'unit', ';', 'report', 'did', 'from', 'your', 'if', 'an', 'department', '..', 'it', 'of?cer', 'have', 'him', 'any', 'chicago', 'were', 'name', 'investigation', 'are', 'will', 'a.', '_', 'subject', 'complaint', 'complainant', 'star', 'statement', 'her', 'member', 'page', 'my', 'other', 'yes', 'had', 'me', 'stated', 'counsel', 'court', 'attachment', 'against', 'q.', 'allegation', 'right', 'r', 'she', 'allegations', 'incident', 'all', 'when', 'been', '!', 'm', 'which', 'what', 'hours']


In [69]:
print(LABEL.vocab.itos)

['0', '1']


In [0]:
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator = data.BucketIterator.splits(
    (train, val), 
    batch_size = BATCH_SIZE,
    device = device)

In [0]:
def binary_accuracy(preds, y):
    """
    Return accuracy per batch
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [0]:
class WordEmbAvg(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, pad_idx):
        
        super().__init__()
        
        # Define an embedding layer, a couple of linear layers, and 
        # the ReLU non-linearity.

        ##YOUR CODE HERE##
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.linear1 = nn.Linear(pad_idx * embedding_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, output_dim)                                 
        
        
    def forward(self, text):

        ##YOUR CODE HERE##
        embeds = self.embedding(text)
        embeds = self.linear1(embeds.mean(0))
        out = F.relu(embeds)
        return self.linear2(out)

In [0]:
class Training_module( ):

    def __init__(self, model, epochs):
       self.model = model
       self.loss_fn = nn.BCEWithLogitsLoss()
       
       ##YOUR CODE HERE##
       # Choose an optimizer. optim.Adam is a popular choice
       self.optimizer = optim.Adam(model.parameters())
       self.epochs = epochs
    
    def train_epoch(self, iterator):
        '''
        Train the model for one epoch. For this repeat the following, 
        going through all training examples.
        1. Get the next batch of inputs from the iterator.
        2. Determine the predictions using a forward pass.
        3. Compute the loss.
        4. Compute gradients using a backward pass.
        5. Execute one step of the optimizer to update the model paramters.
        '''
        epoch_loss = 0
        epoch_acc = 0
    
        for batch in iterator:
          # batch.text has the texts and batch.label has the labels.
        
            self.optimizer.zero_grad()
                
            ##YOUR CODE HERE##
            predictions = model(batch.text).squeeze(1)
            loss = self.loss_fn(predictions, batch.labels)
            accuracy = binary_accuracy(predictions, batch.labels)
        
            loss.backward()
            self.optimizer.step()
            epoch_loss += loss.item()
            epoch_acc += accuracy.item()
        
        return epoch_loss / len(iterator), epoch_acc / len(iterator)
    
    def train_model(self, train_iterator, dev_iterator):
        """
        Train the model for multiple epochs, and after each evaluate on the
        development set.  Return the best performing model.
        """  
        dev_accs = [0.]
        for epoch in range(self.epochs):
            self.train_epoch(train_iterator)
            dev_acc = self.evaluate(dev_iterator)
            print(f"Epoch {epoch}: Dev Accuracy: {dev_acc[1]} Dev Loss:{dev_acc[0]}")
            if dev_acc[1] > max(dev_accs):
                best_model = copy.deepcopy(self)
            dev_accs.append(dev_acc[1])
        return best_model.model
                
    def evaluate(self, iterator):
        '''
        Evaluate the performance of the model on the given examples.
        '''
        epoch_loss = 0
        epoch_acc = 0
    
        with torch.no_grad():
    
            for batch in iterator:

                predictions = model(batch.text).squeeze(1)
            
                loss = self.loss_fn(predictions, batch.label)
                acc = binary_accuracy(predictions, batch.label)
                epoch_loss += loss.item()
                epoch_acc += acc.item()
        
        return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [0]:
INPUT_DIM = len(TEXT.vocab)
#You can try many different embedding dimensions. Common values are 20, 32, 64, 100, 128, 512
EMBEDDING_DIM = 100
HIDDEN_DIM = 50
OUTPUT_DIM = 1
#Get the index of the pad token using the stoi function
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]


model = WordEmbAvg(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, PAD_IDX)

In [0]:
model = model.to(device)
tm = Training_module(model, 5)

In [73]:
#Training the model
best_model = tm.train_model(train_iterator, valid_iterator)

RuntimeError: ignored

In [0]:
# Determine accuracy of best model. You should obtain a test accuracy 
# well above 80%.
tm.model = best_model
test_loss, test_acc = tm.evaluate(test_iterator)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')