# Insult Detector
Have you ever wondered whether someone is insulting you online? Well, now you can find out with one simple neural network!

In [1]:
import numpy as np
import scipy as scp
import json
import torch
from torch import nn

### Reformatting the vocabulary file
Python is very efficient with dictionaries, and we need to look up the embedding vector of each word in the vocabulary. So, this converts the raw GloVe text file into a JSON mapping from word -> vector.

In [3]:
# Take the original dictionary of words and index it
def reformat_vocab(src='glove.42B.300d.txt', dst='glove.json'):
    """Convert a GloVe text file into a JSON object mapping word -> vector.

    Each input line is expected to hold a word followed by its
    whitespace-separated vector components. The output is a single JSON
    object whose keys are the words and whose values are arrays of the
    components, so that it can be read back with ``json.load``.

    Args:
        src: Path of the GloVe text file to read.
        dst: Path of the JSON file to write (overwritten if it exists).
    """
    with open(src, 'r', encoding='utf-8') as fp, \
            open(dst, 'w', encoding='utf-8') as out:
        out.write('{\n')
        # Entries are streamed one line at a time instead of json-dumping a
        # whole dict, because the source file is several gigabytes.
        separator = ''
        for line in fp:
            tok = line.split()
            if not tok:
                continue  # ignore blank lines
            word, vec = tok[0], tok[1:]
            # json.dumps quotes and escapes the key; the separator is written
            # *before* every entry but the first so there is no trailing
            # comma, which JSON forbids.
            out.write('%s\t%s: [%s]' % (separator, json.dumps(word), ','.join(vec)))
            separator = ',\n'
        out.write('\n}')



### Data Import and Formatting

We need to import the training text and labels, as well as construct the input matrices for each datum.

In [4]:
import string

def load_vocab():
    """Parse ``glove.json`` from the current working directory and return it."""
    with open("glove.json") as handle:
        return json.load(handle)

def comment_parse(comment, vocab, embedding_dim=300):
    """Encode a comment as a matrix with one embedding row per word.

    The comment is lower-cased, ASCII punctuation is stripped, and the
    result is split on whitespace. Row ``i`` of the returned matrix holds
    the vector ``vocab`` stores for the i-th word; words that are not in
    ``vocab`` keep an all-zero row.

    Args:
        comment: Raw comment text.
        vocab: Mapping from word to a sequence of ``embedding_dim`` values
            (numbers or numeric strings).
        embedding_dim: Length of each embedding vector.

    Returns:
        A float ``numpy`` array of shape ``(n_words, embedding_dim)``.
    """
    cleaned = comment.lower().translate(str.maketrans('', '', string.punctuation))
    words = cleaned.split()

    glove = np.zeros((len(words), embedding_dim))

    # Iterate over the words (not the characters of the string) and keep the
    # row index in step with the word position.
    for i, word in enumerate(words):
        vec = vocab.get(word)
        if vec is not None:
            glove[i] = np.asarray([float(val) for val in vec])

    return glove

def load_train(filename, vocab=None):
    """Load the training CSV and build zero-padded embedding matrices.

    Labels are read from the first column of ``filename`` (header row
    skipped). The comment text of each row is taken to be whatever sits
    between the first pair of triple double-quotes on the line; it is
    encoded with ``comment_parse`` and padded with zero rows up to the
    length of the longest comment, measured in words.

    Args:
        filename: Path to the training CSV file.
        vocab: Optional word -> vector mapping. When omitted it is loaded
            with ``load_vocab()``.

    Returns:
        A tuple ``(train, labels)`` where ``train`` has shape
        ``(n_comments, longest_comment_in_words, embedding_dim)`` and
        ``labels`` is a 1-D array of the first-column values.

    Raises:
        ValueError: If a non-blank data line has no triple-quoted comment.
    """
    # atleast_1d: genfromtxt returns a 0-d array for a single data row,
    # which could not be indexed per comment later on.
    labels = np.atleast_1d(np.genfromtxt(filename,
                                         delimiter=",",
                                         dtype=None,
                                         skip_header=1,
                                         usecols=0))

    if vocab is None:
        vocab = load_vocab()

    encodings = []
    with open(filename, 'r') as fp:
        fp.readline()  # skip the first line (column labels)
        for line_no, line in enumerate(fp, start=2):
            if not line.strip():
                continue  # genfromtxt skips blank lines too
            tok = line.split('"""')
            if len(tok) < 2:
                raise ValueError(
                    'line %d of %s has no triple-quoted comment' % (line_no, filename))
            encodings.append(comment_parse(tok[1], vocab))

    # Padding length is counted in words (matrix rows), not characters.
    longest = max((enc.shape[0] for enc in encodings), default=0)
    dim = encodings[0].shape[1] if encodings else 300

    train = np.zeros((len(encodings), longest, dim))
    for i, enc in enumerate(encodings):
        # Copy into the leading rows; the remaining rows stay zero padding.
        train[i, :enc.shape[0]] = enc

    return train, labels

In [None]:
# Quick check: encode a short sample comment with the vocabulary read from glove.json.
comment_parse("Hi how are you", load_vocab())

In [None]:
def get_device():
    """Return the torch device to run on, preferring CUDA when available.

    Prints which kind of device was selected.

    Returns:
        ``torch.device('cuda')`` if CUDA is available, otherwise
        ``torch.device('cpu')``.
    """
    # The availability check lives in the torch.cuda submodule.
    if torch.cuda.is_available():
        print('GPU used')
        return torch.device('cuda')

    print('CPU used')
    return torch.device('cpu')

In [6]:
class Model(nn.Module):
    """Thin wrapper around ``nn.RNN`` whose hidden size is the output size.

    Args:
        input_size: Number of features per time step.
        output_size: Hidden size of the RNN, i.e. the width of each output.
        n_layers: Number of stacked recurrent layers.
    """

    def __init__(self, input_size, output_size, n_layers):
        super(Model, self).__init__()

        # Stored because init_hidden() needs both values to size the state.
        self.output_size = output_size
        self.n_layers = n_layers

        self.rnn = nn.RNN(input_size, output_size, n_layers, batch_first=True)

    def forward(self, x):
        """Run the RNN over a batch, starting from a zero hidden state.

        Args:
            x: Tensor of shape ``(batch, seq_len, input_size)``.

        Returns:
            ``(out, hidden)`` exactly as returned by ``nn.RNN``.
        """
        batch_size = x.size(0)

        # Match the input's device and dtype so the model also works after
        # being moved to a GPU or cast to another precision.
        hidden = self.init_hidden(batch_size).to(device=x.device, dtype=x.dtype)

        out, hidden = self.rnn(x, hidden)

        return out, hidden

    def init_hidden(self, size):
        """Return a zero hidden state for a batch of ``size`` sequences."""
        return torch.zeros(self.n_layers, size, self.output_size)

In [5]:
def train(model, train_data, labels, n_epochs=10, learning_rate=0.01):
    """Train ``model`` on the encoded comments, one comment per step.

    Args:
        model: Module to train. When ``None`` a fresh ``Model(300, 1, 1)``
            is created.
        train_data: Array of shape ``(n_comments, seq_len, n_features)``.
        labels: Per-comment class labels, indexable by comment position.
        n_epochs: Number of passes over the data.
        learning_rate: Learning rate for the Adam optimizer.

    Returns:
        The trained model.
    """
    if model is None:
        model = Model(300, 1, 1)

    n = train_data.shape[0]

    device = get_device()
    model.to(device)

    # Loss function & optimization. A single-logit model is trained as a
    # binary classifier; CrossEntropyLoss needs one logit per class and is
    # used only when the model emits more than one.
    binary_criterion = nn.BCEWithLogitsLoss()
    multiclass_criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(1, n_epochs + 1):
        epoch_loss = 0.0

        for j in range(n):
            # Add a batch dimension of 1, cast to float32 to match the model
            # weights, and keep the tensor returned by .to() (it is not
            # an in-place operation for tensors).
            comment_tensor = (torch.from_numpy(train_data[j])
                              .float()
                              .unsqueeze(0)
                              .to(device))
            # labels[j] may be a numpy scalar, which from_numpy rejects.
            label_tensor = torch.from_numpy(
                np.asarray(labels[j]).reshape(-1)).to(device)

            # Clear gradients every step so they do not accumulate.
            optimizer.zero_grad()
            output, hidden = model(comment_tensor)

            # Classify from the output at the final time step.
            # NOTE(review): for padded inputs this step sits on zero padding,
            # and the logits are raw RNN activations; confirm this is the
            # intended read-out.
            logits = output[:, -1, :]
            if logits.size(-1) == 1:
                loss = binary_criterion(logits.view(-1), label_tensor.float())
            else:
                loss = multiclass_criterion(logits, label_tensor.long())

            loss.backward()  # Do backprop
            optimizer.step()  # Update weights
            epoch_loss += loss.item()

        if epoch % 10 == 0:
            mean_loss = epoch_loss / n if n else float('nan')
            print('Epoch: %d/%d.............' % (epoch, n_epochs), end=' ')
            print('Loss: {:.4f}'.format(mean_loss))

    return model