# Train on Davidson

## Import stuff

In [1]:
import os
import sys
import ipdb
from tqdm import tqdm
import numpy as np
import torch.optim as optim
from torchtext.data import TabularDataset, BucketIterator, Field
sys.path.extend(['/Users/zeerakw/Documents/PhD/projects/Generalisable_abuse'])

import gen.shared.types as t
from gen.neural import LSTMClassifier
from gen.shared.data import BatchGenerator
from gen.shared.clean import Cleaner
from gen.shared.train import compute_unigram_liwc

## Setup variables

In [2]:
text_label = Field(sequential = True,
                   include_lengths = False,
                   use_vocab = True,
                   pad_token = "<PAD>",
                   unk_token = "<UNK>")

int_label = Field(sequential = False,
                  include_lengths = False,
                  use_vocab = False,
                  pad_token = None,
                  unk_token = None)

In [3]:
device = 'cpu'
data_dir = '/Users/zeerakw/Documents/PhD/projects/Generalisable_abuse/data/'
data_file = 'davidson_test.csv'
path = os.path.join(data_dir, data_file)
file_format = 'csv'
cleaners = ['lower', 'url', 'hashtag', 'username']
clean = Cleaner(cleaners)

# Set fields
text_field = text_label
label_field = int_label

# Update training field
#setattr(text_field, 'tokenize', clean.tokenize)
#setattr(text_field, 'preprocessing', compute_unigram_liwc)
fields = [('', None), ('CF_count', None), ('hate_speech', None), ('offensive', None), ('neither', None),
          ('label', label_field), ('text', text_field)]

## Load the data

In [4]:
data = TabularDataset(path, format = file_format, fields = fields, skip_header = True)
train, test = data.split(split_ratio = 0.8, stratified = True)
loaded = (train, test)
text_field.build_vocab(train)

In [5]:
print(len(text_field.vocab))
for doc in train:
    print(len(doc.text), doc.text)
    break

1113
19 ['"@FloKid88:', 'As', 'long', 'as', 'the', 'Lakers', 'trash', 'from', 'now', 'on,', 'I', 'could', 'careless.', 'And', "that's", 'real.".', 'CC:', '@BENBALLER', 'hahaha']


In [6]:
batch_sizes = (64, 64)
tmp_train, tmp_test = BucketIterator.splits(loaded, batch_sizes = batch_sizes, sort_key = lambda x: len(x.text),
                                            device = device, shuffle = True, repeat = False)

In [7]:
train_batches = BatchGenerator(tmp_train, 'text', 'label')
test_batches = BatchGenerator(tmp_test, 'text', 'label')
next(iter(train_batches))

(tensor([[ 267,    2,  281,  247,  266,    2,  314,    2,  126,  303,  273,  261,
           338,  308,  258,  317,  265,  304,    2,  323],
         [ 365,  931,   75,  157,   76,   50,  516,   45,  139,  384,    6,  352,
           383,  135,  804,  361,  438,    6,    4,  416],
         [  27,    3,   84,   31,  730, 1107,   16,   11,  551,  134,  614,   18,
           130,   36,   93,   20,  809, 1071,  205,  181],
         [ 444,  921,  160, 1039,   14, 1041,  706,   20,  521, 1112,  846,   64,
           204,    7,   93,  998,   96,   48, 1081,  181],
         [ 464,   33,    7,   33, 1019,   57,   16,    5,  112,  451,   74,  351,
           214,   85,   18,  683,  959,  211,    2,  591],
         [  15,    8,  115,    7,  818,  701,  630,  199,    4,  864,  102,   88,
           158,   72,  179,  467,   96,   34,    1,  952],
         [1072,   22,  104,  620,  174,    7, 1003, 1029,   39,  924,    1,    9,
            14,  776,  996,  173,  891,   13,    1,  793],
         [   

## Define our model

In [8]:
import torch.nn as nn

In [35]:
class LSTMClassifier(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, no_classes, no_layers):
        super(LSTMClassifier, self).__init__()
        self.emb = nn.Embedding(input_dim, embedding_dim)
        self.linear = nn.Linear(embedding_dim, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.to_output = nn.Linear(hidden_dim, no_classes)
        self.softmax = nn.LogSoftmax(dim = 1)
    
    def forward(self, sequence):
        out = self.emb(sequence)
        out, last = self.lstm(out)
        out = self.to_output(out)
        out = self.softmax(out)
        return out

In [36]:
model = LSTMClassifier(len(text_field.vocab), embedding_dim = 64, hidden_dim = 128, no_classes = 3, no_layers = 1)
optimizer = optim.Adam(model.parameters(), lr = 0.01)
loss = nn.NLLLoss()

def train(model, epochs, batches, loss_func, optimizer):
    for epoch in range(epochs):
        epoch_loss = []
        model.zero_grad()
        for X, y in batches:
            scores = model(X)
            print(X[0])
            print(X.shape, y.shape)
            loss = loss_func(scores, y)
            epoch_loss.append(float(loss))
            loss.backward()
            optimizer.step()
            break
        break
        print("Epoch {0}: Mean Loss: {1}".format(epoch, np.mean(epoch_loss)))

In [37]:
train(model, 10, train_batches, loss, optimizer)

tensor([304, 243, 273,   2,  94, 297, 248, 279, 250, 247,   2,   2, 277, 325,
         94, 328,   2, 341,   2, 265, 322, 261, 278, 292,   2,   2,   2,   2,
        268,   2,   2, 294, 291,   2,   2,   2, 289,   2, 129,   2, 259, 240,
        331, 126, 262,   2, 336, 319, 335, 300, 128, 333, 282, 305, 310,   2,
          2, 253, 264, 244, 263, 320, 127, 255])
torch.Size([28, 64]) torch.Size([64])


ValueError: Expected input batch_size (28) to match target batch_size (64).