- GloVe 200d (**home-made**, trained on 2.5M tweets):

    - Trivial Data:
        - Importing **trivial** data
        - Models:
            - LSTM
                Loss = 0.338, Accuracy = 84.81%
            - CNN
                Loss = 0.365, Accuracy = 83.51%

In [1]:
import os

from functions.nn_functions import *
from functions.classical_ML_functions import *

In [2]:
# --- Reproducibility ---------------------------------------------------------
# Fix the PyTorch RNG seed and request deterministic cuDNN kernels so that
# repeated runs start from the same random state.
torch.manual_seed(1997)
torch.backends.cudnn.deterministic = True

# --- Hardware ----------------------------------------------------------------
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# --- Hyper-parameters shared by the models below -----------------------------
embedding_dimension = 200  # width of the GloVe vectors (glove_200d)
batch_size = 64            # examples per mini-batch
vocab_size = 20000         # passed as max_size when the vocabulary is built

# Glove 200d

# Trivial Data

###### LSTM

In [3]:
# Field definitions. Tweets are tokenized with spaCy and batched batch-first;
# include_lengths=True makes torchtext return the sequence lengths together
# with the padded batch.
text = data.Field(
    tokenize='spacy',
    batch_first=True,
    include_lengths=True,
)
label = data.LabelField(dtype=torch.float)
fields = [('text', text), ('label', label)]

# Home-made 200-d GloVe vectors. The CNN section further down reuses this
# `vectors` object, so this cell has to run before it.
vectors = Vectors(name='embeddings/glove/glove_200d.txt')

# Train / validation splits read from CSV files (header row skipped).
training, validation = data.TabularDataset.splits(
    path='trivial_data',
    train='train_trivial.csv',
    validation="val_trivial.csv",
    format='csv',
    fields=fields,
    skip_header=True,
)

# The vocabulary is built from the training split only, capped at vocab_size.
# Tokens without a pre-trained vector get one filled in by torch.Tensor.normal_.
text.build_vocab(
    training,
    max_size=vocab_size,
    vectors=vectors,
    unk_init=torch.Tensor.normal_,
)
label.build_vocab(training)

# Mini-batch iterators that place their tensors on `device`.
batch_bucket, batch_valid_bucket = data.BucketIterator.splits(
    (training, validation),
    batch_size=batch_size,
    device=device,
    sort=False,
)

# BI_LSTM is a project model (presumably a bidirectional LSTM classifier).
# Its embedding weights are overwritten with the vocabulary-aligned GloVe
# vectors before training starts.
model_trivial = BI_LSTM(
    vocabulary_size=len(text.vocab),
    embedding_dimension=embedding_dimension,
)
# NOTE(review): the model is not moved to `device` in this cell - presumably
# BI_LSTM or train() takes care of it; confirm.
model_trivial.embedding.weight.data.copy_(text.vocab.vectors)

# Adam with its default settings; the loss takes raw logits.
optimizer = optim.Adam(model_trivial.parameters())
criterion = nn.BCEWithLogitsLoss()

In [4]:
# Number of passes over the training set; the log below prints one
# train/validation summary per epoch.
lstm_epochs = 3
train(model_trivial, lstm_epochs, batch_bucket, batch_valid_bucket, optimizer, criterion)

                                                                    
Epoch 1
	Train :      Loss = 0.386, Accuracy = 82.06%
	Validation : Loss = 0.35, Accuracy = 84.11%
-----------------------------------------------------
                                                                    
Epoch 2
	Train :      Loss = 0.353, Accuracy = 84.05%
	Validation : Loss = 0.342, Accuracy = 84.58%
-----------------------------------------------------
                                                                    
Epoch 3
	Train :      Loss = 0.344, Accuracy = 84.56%
	Validation : Loss = 0.338, Accuracy = 84.81%
-----------------------------------------------------


In [5]:
# Only the third value returned by load_data (the test set) is needed here;
# the first two (presumably the positive/negative tweets) are discarded.
_, _, test_trivial = load_data(
    'trivial_data/trivial_positive.txt',
    'trivial_data/trivial_negative.txt',
    'trivial_data/trivial_test.txt',
)

# Run the trained LSTM on the test tweets, using the same `text` field (and
# therefore the same vocabulary) that was used for training.
_, lstm_submission = predict(text, model_trivial, test_trivial)

# Write the predictions in submission format.
create_csv_submission(
    lstm_submission,
    'Advanced_ML_results/glove/trivial/submission_lstm_model_g_t.csv',
)

HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




In [6]:
# Persist only the learned weights (state_dict) of the LSTM model.
# torch.save does not create missing parent directories, so make sure the
# target folder exists first; otherwise a fresh checkout would fail here,
# after training has already completed.
os.makedirs('nn_models', exist_ok=True)
torch.save(model_trivial.state_dict(), 'nn_models/lstm_model_g_t.pt')

###### CNN

In [7]:
# CNN hyper-parameters, forwarded to D1_CNN below.
# `filters` is presumably the list of convolution kernel widths (in tokens)
# and `num_filters` the number of feature maps per width - confirm in D1_CNN.
filters = [2, 3, 4, 5]
num_filters = 128

In [8]:
# Fresh field definitions for the CNN. Unlike the LSTM section,
# include_lengths is not requested here.
# NOTE: this rebinds `text` / `label`, replacing the fields of the LSTM section.
text = data.Field(
    tokenize='spacy',
    batch_first=True,
)
label = data.LabelField(dtype=torch.float)
fields = [('text', text), ('label', label)]

# Train / validation splits read from CSV files (header row skipped).
training, validation = data.TabularDataset.splits(
    path='trivial_data',
    train='train_trivial.csv',
    validation="val_trivial.csv",
    format='csv',
    fields=fields,
    skip_header=True,
)

# `vectors` is the GloVe object loaded in the LSTM section above.
# Tokens without a pre-trained vector get one filled in by torch.Tensor.normal_.
text.build_vocab(
    training,
    max_size=vocab_size,
    vectors=vectors,
    unk_init=torch.Tensor.normal_,
)
label.build_vocab(training)

# Mini-batch iterators that place their tensors on `device`.
batch_bucket, batch_valid_bucket = data.BucketIterator.splits(
    (training, validation),
    batch_size=batch_size,
    device=device,
    sort=False,
)

# D1_CNN is a project model (presumably a 1-D convolutional classifier).
# Its embedding weights are overwritten with the vocabulary-aligned GloVe
# vectors before training starts.
model = D1_CNN(
    vocabulary_size=len(text.vocab),
    embedding_dimension=embedding_dimension,
    num_filters=num_filters,
    filters=filters,
)
# NOTE(review): the model is not moved to `device` in this cell - presumably
# D1_CNN or train_cnn() takes care of it; confirm.
model.embedding.weight.data.copy_(text.vocab.vectors)

# Adam with its default settings; the loss takes raw logits.
optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()

In [9]:
# Number of passes over the training set; the log below prints one
# train/validation summary per epoch.
cnn_epochs = 3
train_cnn(model, cnn_epochs, batch_bucket, batch_valid_bucket, optimizer, criterion)

                                                                    
Epoch 1
	Train :      Loss = 0.404, Accuracy = 81.34%
	Validation : Loss = 0.371, Accuracy = 83.08%
-----------------------------------------------------
                                                                    
Epoch 2
	Train :      Loss = 0.378, Accuracy = 82.95%
	Validation : Loss = 0.366, Accuracy = 83.5%
-----------------------------------------------------
                                                                    
Epoch 3
	Train :      Loss = 0.367, Accuracy = 83.57%
	Validation : Loss = 0.365, Accuracy = 83.51%
-----------------------------------------------------


In [10]:
# Only the third value returned by load_data (the test set) is needed here;
# the first two (presumably the positive/negative tweets) are discarded.
_, _, test = load_data(
    'trivial_data/trivial_positive.txt',
    'trivial_data/trivial_negative.txt',
    'trivial_data/trivial_test.txt',
)

# The widest filter size is forwarded to predict_cnn - presumably so that
# short tweets can be padded to at least that many tokens; confirm.
_, cnn_submission = predict_cnn(text, model, test, max(filters))

# Write the predictions in submission format.
create_csv_submission(
    cnn_submission,
    'Advanced_ML_results/glove/trivial/submission_cnn_model_g_t.csv',
)

HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




In [11]:
# Persist only the learned weights (state_dict) of the CNN model.
# torch.save does not create missing parent directories, so make sure the
# target folder exists first; otherwise a fresh checkout would fail here,
# after training has already completed.
os.makedirs('nn_models', exist_ok=True)
torch.save(model.state_dict(), 'nn_models/cnn_model_g_t.pt')