- GloVe **Twitter** 200d from **Stanford** (trained on 2B tweets):

    - Trivial Data:
        - Importing **trivial** data
        - Models:
            - LSTM
                Loss = 0.336, Accuracy = 85.23%
            - CNN
                Loss = 0.356, Accuracy = 84.11%

In [5]:
from functions.nn_functions import *
from functions.classical_ML_functions import *

In [6]:
# Reproducibility: fix the global PyTorch seed and ask cuDNN for deterministic kernels.
torch.manual_seed(1997)
torch.backends.cudnn.deterministic = True

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Sizes shared by the cells below:
#   vocab_size          - cap handed to build_vocab(max_size=...)
#   batch_size          - mini-batch size handed to the bucket iterators
#   embedding_dimension - 200, the width of the glove.twitter.27B.200d vectors
vocab_size, batch_size, embedding_dimension = 20000, 64, 200

# Stanford Glove twitter 200d

# Trivial Data

###### LSTM

In [7]:
# Pre-trained Stanford GloVe Twitter vectors (200-d). Kept as a notebook-level
# name because the CNN cell further down reuses `vectors`.
vectors = Vectors(name='embeddings/stanford_glove/glove.twitter.27B.200d.txt')

# Column definitions for the CSV files. The LSTM variant asks the text field
# for sequence lengths as well (include_lengths=True).
# NOTE(review): `data` / `Vectors` arrive through the star imports above and
# look like torchtext's legacy API - confirm.
text = data.Field(
    tokenize='spacy',
    batch_first=True,
    include_lengths=True,
)
label = data.LabelField(dtype=torch.float)
fields = [('text', text), ('label', label)]

# Train / validation splits of the "trivial" data set (header row skipped).
training, validation = data.TabularDataset.splits(
    path='trivial_data',
    train='train_trivial.csv',
    validation="val_trivial.csv",
    format='csv',
    fields=fields,
    skip_header=True,
)

# Vocabularies are built from the training split only. The text vocabulary is
# capped at `vocab_size` and paired with the GloVe vectors; `unk_init` is the
# initialiser passed for tokens without a pre-trained vector.
label.build_vocab(training)
text.build_vocab(
    training,
    max_size=vocab_size,
    vectors=vectors,
    unk_init=torch.Tensor.normal_,
)

# Batch iterators over both splits, placed on the selected device.
batch_bucket, batch_valid_bucket = data.BucketIterator.splits(
    (training, validation),
    batch_size=batch_size,
    device=device,
    sort=False,
)

# Bidirectional LSTM whose embedding layer is overwritten in place with the
# vectors attached to the text vocabulary.
model_trivial = BI_LSTM(
    vocabulary_size=len(text.vocab),
    embedding_dimension=embedding_dimension,
)
model_trivial.embedding.weight.data.copy_(text.vocab.vectors)

# Binary classification on raw logits, optimised with Adam at its defaults.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model_trivial.parameters())



In [None]:
# Fit the LSTM on the trivial data set, validating on the held-out split.
# NOTE(review): the literal 3 is presumably the number of epochs - confirm
# against train() in functions.nn_functions.
train(
    model_trivial,
    3,
    batch_bucket,
    batch_valid_bucket,
    optimizer,
    criterion,
)

In [None]:
# Only the third value returned by load_data (the test set) is needed here;
# the first two are discarded.
_, _, test_trivial = load_data(
    'trivial_data/trivial_positive.txt',
    'trivial_data/trivial_negative.txt',
    'trivial_data/trivial_test.txt',
)

# Run the trained LSTM over the test set; the second return value is what gets
# written to the submission file.
_, lstm_submission = predict(text, model_trivial, test_trivial)

create_csv_submission(
    lstm_submission,
    'Advanced_ML_results/stanford_glove/trivial/submission_lstm_model_sg_t.csv',
)

In [None]:
import os

# Persist the trained LSTM weights (state_dict only, not the whole module).
# torch.save does not create missing parent directories, so make sure the
# target folder exists first; otherwise a fresh checkout would lose the
# trained model to a FileNotFoundError after the whole training run.
checkpoint_path = 'nn_models/lstm_model_sg_t.pt'
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
torch.save(model_trivial.state_dict(), checkpoint_path)

###### CNN

In [None]:
# Hyper-parameters forwarded to D1_CNN in the next cell.
# NOTE(review): `filters` presumably lists the convolution kernel widths and
# `num_filters` the number of feature maps per width - confirm in D1_CNN.
filters = list(range(2, 6))  # [2, 3, 4, 5]
num_filters = 128

In [None]:
# Fresh field objects for the CNN pipeline; unlike the LSTM cell, the text
# field is created without include_lengths.
text = data.Field(
    tokenize='spacy',
    batch_first=True,
)
label = data.LabelField(dtype=torch.float)
fields = [('text', text), ('label', label)]

# Same train / validation CSV files as the LSTM experiment (header row skipped).
training, validation = data.TabularDataset.splits(
    path='trivial_data',
    train='train_trivial.csv',
    validation="val_trivial.csv",
    format='csv',
    fields=fields,
    skip_header=True,
)

# Vocabularies come from the training split only.
# NOTE: `vectors` is not defined in this cell - it is the GloVe object created
# in the LSTM cell, so that cell must have been executed first.
label.build_vocab(training)
text.build_vocab(
    training,
    max_size=vocab_size,
    vectors=vectors,
    unk_init=torch.Tensor.normal_,
)

# Batch iterators over both splits, placed on the selected device.
batch_bucket, batch_valid_bucket = data.BucketIterator.splits(
    (training, validation),
    batch_size=batch_size,
    device=device,
    sort=False,
)

# 1-D CNN whose embedding layer is overwritten in place with the vectors
# attached to the text vocabulary.
model = D1_CNN(
    vocabulary_size=len(text.vocab),
    embedding_dimension=embedding_dimension,
    num_filters=num_filters,
    filters=filters,
)
model.embedding.weight.data.copy_(text.vocab.vectors)

# Binary classification on raw logits, optimised with Adam at its defaults.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters())

In [None]:
# Fit the CNN on the trivial data set, validating on the held-out split.
# NOTE(review): the literal 3 is presumably the number of epochs - confirm
# against train_cnn() in functions.nn_functions.
train_cnn(
    model,
    3,
    batch_bucket,
    batch_valid_bucket,
    optimizer,
    criterion,
)

In [None]:
# Only the third value returned by load_data (the test set) is needed here;
# the first two are discarded.
_, _, test = load_data(
    'trivial_data/trivial_positive.txt',
    'trivial_data/trivial_negative.txt',
    'trivial_data/trivial_test.txt',
)

# The largest entry of `filters` is passed along with the model.
# NOTE(review): presumably the minimum length inputs are padded to so the
# widest convolution still fits - confirm in predict_cnn.
_, cnn_submission = predict_cnn(text, model, test, max(filters))

create_csv_submission(
    cnn_submission,
    'Advanced_ML_results/stanford_glove/trivial/submission_cnn_model_sg_t.csv',
)

In [None]:
import os

# Persist the trained CNN weights (state_dict only, not the whole module).
# torch.save does not create missing parent directories, so make sure the
# target folder exists first; otherwise a fresh checkout would lose the
# trained model to a FileNotFoundError after the whole training run.
checkpoint_path = 'nn_models/cnn_model_sg_t.pt'
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
torch.save(model.state_dict(), checkpoint_path)