In [11]:
import torch
import torch.utils.data
import torch.nn as nn
import torch.nn.functional as F
import re
import numpy as np
import os as os
import pandas as pd
import pickle
import random

In [3]:
#load data and labels 
with open('twitter_data/tweet_embeddings25k.pkl', 'rb') as f:
    dataset = pickle.load(f)
    
with open('twitter_data/labels25k.pkl', 'rb') as f:
    labels = pickle.load(f)
#converting to tensor/adding labels
dataset = torch.stack(dataset)
labels = torch.tensor(labels.tolist())
dataset = torch.utils.data.TensorDataset(labels, dataset)

In [4]:
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

train_labels = train_dataset[:][0]
train_data = train_dataset[:][1]

test_labels = test_dataset[:][0]
test_data = test_dataset[:][1]

# check balance of dataset
# TODO: check in 'Training Model' if I actually need to 
# convert to labels to list or leave as tensor
train_labels = train_labels.tolist()
test_labels = test_labels.tolist()
print(train_labels.count(0))
print(train_labels.count(1))
print(test_labels.count(0))
print(test_labels.count(1))


9948
10052
2551
2449


# Model Definition

In [5]:
class CLSTM(nn.Module):
    def __init__(self):
        super(CLSTM, self).__init__()
        # expects input tensor of [1, 1, 768]
        # 1 input channel, 20 output channels, 1x5 convolution with padding
        self.conv1 = nn.Conv1d(1, 20, 5, stride=2, padding=2)
        # 20 input channel, 10 output channels, 1x5 convolution
        self.conv2 = nn.Conv1d(20, 10, 5, stride=2, padding=2)
        # end up with tensors [1, 10, 192]
        # these turn into tensors [1, 10, 48] after pooling which 
        # are turned into a list of 10 [1, 48] tensors to pass into the lstm
        # each of these 10 will be taken in as token representations
        
        self.lstm = nn.LSTM(input_size=48, hidden_size=48, num_layers=2, bidirectional=True)
        # the lstm will output tuple of ([4, 1, 48],[4, 1, 48]) where the first entry is
        # the hidden state for classification and the second entry is the cell state

        self.fc1 = nn.Linear(4*48, 120)
        self.fc2 = nn.Linear(120, 40)
        self.fc3 = nn.Linear(40, 1)
    
    def forward(self, x):
        #set initial hidden state and cell state for lstm
        # 2 is for the number of layers and the other 2 is because of bidirection
        hidden = torch.randn(2*2, 1, 48)
        cell = torch.randn(2*2, 1, 48)
        
        x = F.max_pool1d(F.relu(self.conv1(x)), kernel_size=2, stride=2)
        x = F.max_pool1d(F.relu(self.conv2(x)), kernel_size=2, stride=2)
        # x is now a tensor of shape [1, 10, 48] because we did the pooling
        
        # now put the "tokens" into a list
        tweet_representation = []
        for i in range(len(x[0])):
            tweet_representation.append(x[0][i].unsqueeze(0))
        # tweet_representation is a list of the ten [1, 48] tensors to
        # be passed into the lstm
        for j in tweet_representation:
            _, hidden_tuple = self.lstm(j.view(1, 1, -1),(hidden, cell)) 

        # hidden state tensor [4, 1, 48]
        hidden_state = hidden_tuple[0]
        # flatten the hidden_state to tensor [192]
        x = hidden_state.reshape(-1)
        x = F.relu(self.fc1(x))
        # x is now 1 x 120
        x = F.relu(self.fc2(x))
        # x is now 1 x 30
        x = self.fc3(x)
        # x is now a single predicted class
        return torch.sigmoid(x)
model = CLSTM()
print(model)

CLSTM(
  (conv1): Conv1d(1, 20, kernel_size=(5,), stride=(2,), padding=(2,))
  (conv2): Conv1d(20, 10, kernel_size=(5,), stride=(2,), padding=(2,))
  (lstm): LSTM(48, 48, num_layers=2, bidirectional=True)
  (fc1): Linear(in_features=192, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=40, bias=True)
  (fc3): Linear(in_features=40, out_features=1, bias=True)
)


In [6]:
# test input
inp = torch.randn(1, 1, 768)
out = model(inp)
print(out)

tensor([0.5252], grad_fn=<SigmoidBackward>)


# Training Model

In [10]:
# criterion = nn.BCELoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=.0001)
# total_step = len(train_data)
# num_epochs = 1
# for epoch in range(num_epochs):
#     running_loss = 0.0
#     for i, data in enumerate(train_data, 0):
#         inputs = data  # inputs.shape is [1, 1, 768]
#         label = train_labels[i]
#         label = torch.tensor([float(label)])
#         optimizer.zero_grad()
#         inputs = inputs.unsqueeze(0).unsqueeze(0)
#         output = model(inputs)
#         loss = criterion(output, label)
#         loss.backward()
#         optimizer.step()
        
#         #print stats
#         running_loss += loss.item()
#         if (i+1) % 100 == 0:
#             print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Output [{}], True label [{}]' 
#                    .format(epoch+1, num_epochs, i+1, total_step, loss.item(), output.item(), label.item()))
#             running_loss = 0.0
# print('finished training')

Epoch [1/1], Step [100/20000], Loss: 0.0549, Output [0.9465402364730835], True label [1.0]
Epoch [1/1], Step [200/20000], Loss: 0.0403, Output [0.03951290249824524], True label [0.0]
Epoch [1/1], Step [300/20000], Loss: 0.2669, Output [0.23421761393547058], True label [0.0]
Epoch [1/1], Step [400/20000], Loss: 0.5438, Output [0.5805485248565674], True label [1.0]
Epoch [1/1], Step [500/20000], Loss: 0.0004, Output [0.9996104836463928], True label [1.0]
Epoch [1/1], Step [600/20000], Loss: 0.0012, Output [0.9987780451774597], True label [1.0]
Epoch [1/1], Step [700/20000], Loss: 0.0157, Output [0.9844262599945068], True label [1.0]
Epoch [1/1], Step [800/20000], Loss: 0.0011, Output [0.9989068508148193], True label [1.0]
Epoch [1/1], Step [900/20000], Loss: 0.1269, Output [0.1192060261964798], True label [0.0]
Epoch [1/1], Step [1000/20000], Loss: 0.1506, Output [0.13984382152557373], True label [0.0]
Epoch [1/1], Step [1100/20000], Loss: 0.0043, Output [0.995680570602417], True label [

Epoch [1/1], Step [9000/20000], Loss: 0.0010, Output [0.9990373849868774], True label [1.0]
Epoch [1/1], Step [9100/20000], Loss: 0.0132, Output [0.01311872061342001], True label [0.0]
Epoch [1/1], Step [9200/20000], Loss: 0.0226, Output [0.9776995182037354], True label [1.0]
Epoch [1/1], Step [9300/20000], Loss: 0.4391, Output [0.644634485244751], True label [1.0]
Epoch [1/1], Step [9400/20000], Loss: 0.0904, Output [0.0864422470331192], True label [0.0]
Epoch [1/1], Step [9500/20000], Loss: 0.0727, Output [0.07013227045536041], True label [0.0]
Epoch [1/1], Step [9600/20000], Loss: 0.0044, Output [0.004400345031172037], True label [0.0]
Epoch [1/1], Step [9700/20000], Loss: 0.0600, Output [0.05824108421802521], True label [0.0]
Epoch [1/1], Step [9800/20000], Loss: 0.0057, Output [0.005710248369723558], True label [0.0]
Epoch [1/1], Step [9900/20000], Loss: 0.0796, Output [0.923453688621521], True label [1.0]
Epoch [1/1], Step [10000/20000], Loss: 0.0012, Output [0.9987521171569824],

Epoch [1/1], Step [17800/20000], Loss: 0.3422, Output [0.28979170322418213], True label [0.0]
Epoch [1/1], Step [17900/20000], Loss: 0.2088, Output [0.8115971684455872], True label [1.0]
Epoch [1/1], Step [18000/20000], Loss: 0.0066, Output [0.006576733198016882], True label [0.0]
Epoch [1/1], Step [18100/20000], Loss: 0.0676, Output [0.06535021960735321], True label [0.0]
Epoch [1/1], Step [18200/20000], Loss: 0.0008, Output [0.9991543292999268], True label [1.0]
Epoch [1/1], Step [18300/20000], Loss: 0.2784, Output [0.7569639682769775], True label [1.0]
Epoch [1/1], Step [18400/20000], Loss: 0.0167, Output [0.016578640788793564], True label [0.0]
Epoch [1/1], Step [18500/20000], Loss: 0.0076, Output [0.9924752116203308], True label [1.0]
Epoch [1/1], Step [18600/20000], Loss: 0.0679, Output [0.06561397761106491], True label [0.0]
Epoch [1/1], Step [18700/20000], Loss: 3.0705, Output [0.04639894887804985], True label [1.0]
Epoch [1/1], Step [18800/20000], Loss: 0.1037, Output [0.90149

# Testing Model

In [None]:
correct = 0
total = 0
with torch.no_grad():
    for i, data in enumerate(test_data,0):
        inputs = data
        label = test_labels[i]
        label = torch.tensor([float(label)])
        inputs = inputs.unsqueeze(0).unsqueeze(0)
        output = model(inputs)
        if(output >= .5):
            output = 1
        else:
            output = 0
        total += 1
        correct += int(output == label)
        
print('Accuracy of the model on 5000 test tweets: %d %%' % (100*correct/total))