In [2]:
import sys
import torch.nn as nn
import torch.optim as optim
# Put the project checkout on the import path so the `gen` package below can be
# imported. NOTE(review): this absolute path is specific to one machine.
sys.path.append('/Users/zeerakw/Documents/PhD/projects/active/Generalisable_abuse')

from gen.shared.data import GeneralDataset, Batch, BatchExtractor
from gen.shared.base import Field
from gen.neural import RNNClassifier
from gen.shared.clean import Cleaner
from gen.shared.train import train_pytorch_model, evaluate_pytorch_model
from sklearn.metrics import accuracy_score

Here we define the fields, which correspond to the individual columns of the CSV file or the keys of the JSON file. The choice between JSON and CSV is made through the `ftype` argument of the `GeneralDataset` class. In the `Field` objects, the `cname` attribute should match the corresponding JSON key.

NOTE: This only works with top-level JSON keys.

In [3]:
# One Field per column of the Davidson CSV. The `ix` values given here equal
# the position each field takes in `davidson_fields` below (label at 5, text
# at 6); the single `ignore_field` object is reused for the first five slots.
text_field = Field('text', cname='text', train=True, label=False, ignore=False, ix=6)
label_field = Field('label', cname='label', train=False, label=True, ignore=False, ix=5)
ignore_field = Field('ignore', cname='ignore', train=False, label=False, ignore=True)

davidson_fields = [ignore_field] * 5 + [label_field, text_field]

# Only a training file is configured (dev and test are None); the held-out
# splits are carved out of the loaded training data in a later cell.
dataset = GeneralDataset(
    data_dir='~/PhD/projects/active/Generalisable_abuse/data/',
    ftype='csv',
    sep=',',
    fields=davidson_fields,
    train='davidson_train.csv',
    dev=None,
    test=None,
    train_labels=None,
    tokenizer=lambda x: x.split(),  # plain whitespace tokenisation
    lower=True,
    preprocessor=None,
    transformations=None,
    label_processor=None,
)
dataset.load('train')

We then split our dataset and build the vocabulary and label set on the training split only.
Next, we encode the documents into one-hot tensors and process the labels.

In [4]:
# Split the loaded data three ways; the list is presumably the
# train/dev/test proportions (80/10/10) -- confirm against dataset.split.
train, dev, test = dataset.split(dataset.data, [0.8, 0.1, 0.1])
# Token and label vocabularies are built from the training split only.
dataset.build_token_vocab(train)
dataset.build_label_vocab(train)
dataset.process_labels(train)

dataset.process_labels(dev)

# Encode each split; every name is rebound to whatever encode() returns.
# NOTE(review): the recorded output of this cell shows the second
# "Encoding data" pass iterating over 0 items -- verify that `dev` is
# actually non-empty after the split.
train = dataset.encode(train, onehot = True)
dev = dataset.encode(dev, onehot = True)
test = dataset.encode(test, onehot = True)
# NOTE(review): unlike train and dev, the test labels are processed *after*
# encoding -- confirm that process_labels is order-independent w.r.t. encode.
dataset.process_labels(test)

Building vocabulary: 100%|██████████| 709/709 [00:00<00:00, 113567.37it/s]
Encoding data: 100%|██████████| 709/709 [00:00<00:00, 1049.03it/s]
Encoding data: 0it [00:00, ?it/s]
Encoding data: 100%|██████████| 88/88 [00:00<00:00, 886.38it/s]


We finally batch the dataset and access the encoded data and processed labels.

In [5]:
def _build_batches(batch_size, documents):
    """Create a Batch over `documents`, build its batches, and wrap it.

    Returns the Batch object together with a BatchExtractor configured with
    'encoded' and 'label', in that order.
    """
    batcher = Batch(batch_size, documents)
    batcher.create_batches()
    return batcher, BatchExtractor('encoded', 'label', batcher)


# Train and dev use a first argument of 64; test passes len(test) instead.
# `batched` is rebound on every call, so it ends up holding the test Batch.
batched, batched_train = _build_batches(64, train)
batched, batched_dev = _build_batches(64, dev)
batched, batched_test = _build_batches(len(test), test)

Finally, we initialise the model, optimiser and loss, and then train and evaluate the model.

In [6]:
# Model sized by the token vocabulary, with an Adam optimiser and NLL loss.
model = RNNClassifier(len(dataset.stoi), hidden_dim=128, output_dim=3, batch_first=True)
optimizer = optim.Adam(model.parameters(), lr=0.01)
loss = nn.NLLLoss()
print(type(optimizer), type(loss), sep='\n')

# NOTE(review): the recorded run of this cell stopped with
# "TypeError: 'NoneType' object is not callable" right after the progress bar
# appeared. The cause is not visible in this notebook -- check the positional
# arguments below against the signature of train_pytorch_model.
train_pytorch_model(model, 5, batched_train, loss, optimizer, text_field, batched_dev)
evaluate_pytorch_model(model, batched_test, loss, accuracy_score, "accuracy")

  0%|          | 0/5 [00:00<?, ?it/s]

<class 'torch.optim.adam.Adam'>
<class 'torch.nn.modules.loss.NLLLoss'>





TypeError: 'NoneType' object is not callable