In [10]:
import torch
import torch.utils.data
import torch.nn as nn
import torch.nn.functional as F
import re
import numpy as np
import os as os
import pandas as pd
import pickle
import random

from operator import itemgetter
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [11]:
#load data and labels 
with open('tweet_embeddings25k.pkl', 'rb') as f:
    dataset = pickle.load(f)
print(len(dataset))

with open('labels25k.pkl', 'rb') as f:
    labels = pickle.load(f)
print(len(labels))

#getting the indices for splitting
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset,[train_size,test_size])
# train_dataset and test_dataset are the random indices for splitting
#splitting the data randomly


idx_train = train_dataset.indices
idx_test = test_dataset.indices

train_dataset = list(itemgetter(*idx_train)(dataset))
train_labels = list(itemgetter(*idx_train)(labels))
test_dataset = list(itemgetter(*idx_test)(dataset))
test_labels = list(itemgetter(*idx_test)(labels))

#check balance of dataset
print(train_labels.count(0))
print(train_labels.count(1))

25000
25000
10026
9974


# Model Definition

In [12]:
class CLSTM(nn.Module):
    def __init__(self):
        super(CLSTM,self).__init__()
        # expects input tensor of [1,1,768]
        # 1 input channel, 20 output channels, 1x5 convolution with padding
        self.conv1 = nn.Conv1d(1,20,5,stride = 2,padding = 2)
        # 20 input channel, 10 output channels, 1x5 convolution
        self.conv2 = nn.Conv1d(20,10,5,stride = 2,padding = 2)
        # end up with tensors [1,10,192]
        # these turn into tensors [1,10,48] after pooling which 
        # are turned into a list of 10 [1,48] tensors to pass into the lstm
        # each of these 10 will be taken in as token representations
        
        self.lstm = nn.LSTM(
            input_size = 48,hidden_size = 48,num_layers = 2,bidirectional = True)
        # the lstm will output tuple of ([4,1,48],[4,1,48]) where the first entry is
        # the hidden state for classification and the second entry is the cell state

        self.fc1 = nn.Linear(4*48,120)
        self.fc2 = nn.Linear(120,40)
        self.fc3 = nn.Linear(40,1)
    
    def forward(self,x):
        #set initial hidden state and cell state for lstm
        # 2 is for the number of layers and the other 2 is because of bidirection
        hidden = torch.randn(2*2,1,48)
        cell = torch.randn(2*2,1,48)
        
        x = F.max_pool1d(F.relu(self.conv1(x)),kernel_size = 2,stride = 2)
        x = F.max_pool1d(F.relu(self.conv2(x)),kernel_size = 2,stride = 2)
        # x is now a tensor of shape [1,10,48] because we did the pooling
        
        # now put the "tokens" into a list
        tweet_representation = []
        for i in range(len(x[0])):
            tweet_representation.append(x[0][i].unsqueeze(0))
        # tweet_representation is a list of the ten [1,48] tensors to
        # be passed into the lstm
        for j in tweet_representation:
            _, hidden_tuple = self.lstm(j.view(1,1,-1),(hidden,cell)) 

        # hidden state tensor [4,1,48]
        hidden_state = hidden_tuple[0]
        # flatten the hidden_state to tensor [192]
        x = hidden_state.reshape(-1)
        x = F.relu(self.fc1(x))
        # x is now 1 x 120
        x = F.relu(self.fc2(x))
        # x is now 1 x 30
        x = self.fc3(x)
        # x is now a single predicted class
        return torch.sigmoid(x)
model = CLSTM()
print(model)

CLSTM(
  (conv1): Conv1d(1, 20, kernel_size=(5,), stride=(2,), padding=(2,))
  (conv2): Conv1d(20, 10, kernel_size=(5,), stride=(2,), padding=(2,))
  (lstm): LSTM(48, 48, num_layers=2, bidirectional=True)
  (fc1): Linear(in_features=192, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=40, bias=True)
  (fc3): Linear(in_features=40, out_features=1, bias=True)
)


In [4]:
# small test of the model---------------
input = torch.randn(1,1,768)
out = model(input)
print(out)

tensor([0.5107], grad_fn=<SigmoidBackward>)


In [5]:
# This cell formats the input data 
train_data = torch.stack(train_dataset)
train_data.shape

torch.Size([20000, 768])

# Training Model

In [6]:
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(),lr = .0005)

#10 is the number of epochs
total_step = len(train_data)
for epoch in range(5):
    running_loss = 0.0
    for i, data in enumerate(train_data,0):
        #inputs.shape is [1,1,768]
        inputs = data
        #label is int (1 or 0)
        label = train_labels[i]
        label = torch.tensor([float(label)])
        optimizer.zero_grad()
        inputs = inputs.unsqueeze(0).unsqueeze(0)
        output = model(inputs)
        loss = criterion(output,label)
        loss.backward()
        optimizer.step()
        
        #print stats
        running_loss +=loss.item()
        if (i+1) % 100 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Output [{}], True label [{}]' 
                   .format(epoch+1, 100, i+1, total_step, loss.item(),output.item(),label.item()))
            running_loss = 0.0
print('finished training')

Epoch [1/100], Step [100/20000], Loss: 0.6547, Output [0.5195958018302917], True label [1.0]
Epoch [1/100], Step [200/20000], Loss: 0.6551, Output [0.5193886756896973], True label [1.0]
Epoch [1/100], Step [300/20000], Loss: 0.7018, Output [0.5043042898178101], True label [0.0]
Epoch [1/100], Step [400/20000], Loss: 0.7136, Output [0.5101428031921387], True label [0.0]
Epoch [1/100], Step [500/20000], Loss: 0.7290, Output [0.48241353034973145], True label [1.0]
Epoch [1/100], Step [600/20000], Loss: 0.6922, Output [0.49955064058303833], True label [0.0]
Epoch [1/100], Step [700/20000], Loss: 0.7100, Output [0.5083400011062622], True label [0.0]
Epoch [1/100], Step [800/20000], Loss: 0.6701, Output [0.4883502423763275], True label [0.0]
Epoch [1/100], Step [900/20000], Loss: 0.6920, Output [0.4994107186794281], True label [0.0]
Epoch [1/100], Step [1000/20000], Loss: 0.6715, Output [0.5109560489654541], True label [1.0]
Epoch [1/100], Step [1100/20000], Loss: 0.6500, Output [0.522067666

Epoch [1/100], Step [8900/20000], Loss: 0.1698, Output [0.8438082337379456], True label [1.0]
Epoch [1/100], Step [9000/20000], Loss: 0.5490, Output [0.4224635064601898], True label [0.0]
Epoch [1/100], Step [9100/20000], Loss: 0.2095, Output [0.811008095741272], True label [1.0]
Epoch [1/100], Step [9200/20000], Loss: 1.0072, Output [0.3652304410934448], True label [1.0]
Epoch [1/100], Step [9300/20000], Loss: 0.1182, Output [0.11146822571754456], True label [0.0]
Epoch [1/100], Step [9400/20000], Loss: 0.0126, Output [0.987466037273407], True label [1.0]
Epoch [1/100], Step [9500/20000], Loss: 1.1906, Output [0.6959583163261414], True label [0.0]
Epoch [1/100], Step [9600/20000], Loss: 0.0063, Output [0.9937593936920166], True label [1.0]
Epoch [1/100], Step [9700/20000], Loss: 0.0863, Output [0.9173100590705872], True label [1.0]
Epoch [1/100], Step [9800/20000], Loss: 0.0015, Output [0.9985495209693909], True label [1.0]
Epoch [1/100], Step [9900/20000], Loss: 0.2066, Output [0.186

Epoch [1/100], Step [17600/20000], Loss: 0.2824, Output [0.24599215388298035], True label [0.0]
Epoch [1/100], Step [17700/20000], Loss: 0.3970, Output [0.6723387837409973], True label [1.0]
Epoch [1/100], Step [17800/20000], Loss: 0.2601, Output [0.22901105880737305], True label [0.0]
Epoch [1/100], Step [17900/20000], Loss: 0.0020, Output [0.9979919195175171], True label [1.0]
Epoch [1/100], Step [18000/20000], Loss: 0.0190, Output [0.01881624199450016], True label [0.0]
Epoch [1/100], Step [18100/20000], Loss: 0.5449, Output [0.5799121856689453], True label [1.0]
Epoch [1/100], Step [18200/20000], Loss: 0.0707, Output [0.06828828155994415], True label [0.0]
Epoch [1/100], Step [18300/20000], Loss: 1.8675, Output [0.15450556576251984], True label [1.0]
Epoch [1/100], Step [18400/20000], Loss: 0.1140, Output [0.10773863643407822], True label [0.0]
Epoch [1/100], Step [18500/20000], Loss: 0.0070, Output [0.9930684566497803], True label [1.0]
Epoch [1/100], Step [18600/20000], Loss: 0.0

Epoch [2/100], Step [6300/20000], Loss: 0.0811, Output [0.07792235910892487], True label [0.0]
Epoch [2/100], Step [6400/20000], Loss: 0.0321, Output [0.03156811371445656], True label [0.0]
Epoch [2/100], Step [6500/20000], Loss: 0.0183, Output [0.018114257603883743], True label [0.0]
Epoch [2/100], Step [6600/20000], Loss: 0.4554, Output [0.6341958045959473], True label [1.0]
Epoch [2/100], Step [6700/20000], Loss: 0.0815, Output [0.07828905433416367], True label [0.0]
Epoch [2/100], Step [6800/20000], Loss: 0.4945, Output [0.6098706722259521], True label [1.0]
Epoch [2/100], Step [6900/20000], Loss: 0.2396, Output [0.7869529128074646], True label [1.0]
Epoch [2/100], Step [7000/20000], Loss: 0.1475, Output [0.862885057926178], True label [1.0]
Epoch [2/100], Step [7100/20000], Loss: 3.4434, Output [0.03195711597800255], True label [1.0]
Epoch [2/100], Step [7200/20000], Loss: 0.0184, Output [0.018203796818852425], True label [0.0]
Epoch [2/100], Step [7300/20000], Loss: 0.0794, Outpu

Epoch [2/100], Step [15000/20000], Loss: 0.0021, Output [0.9978669881820679], True label [1.0]
Epoch [2/100], Step [15100/20000], Loss: 0.1400, Output [0.8693951964378357], True label [1.0]
Epoch [2/100], Step [15200/20000], Loss: 0.0175, Output [0.01737183891236782], True label [0.0]
Epoch [2/100], Step [15300/20000], Loss: 0.9902, Output [0.3715190887451172], True label [1.0]
Epoch [2/100], Step [15400/20000], Loss: 0.7723, Output [0.5380613207817078], True label [0.0]
Epoch [2/100], Step [15500/20000], Loss: 0.0448, Output [0.9561833739280701], True label [1.0]
Epoch [2/100], Step [15600/20000], Loss: 0.0647, Output [0.06261946260929108], True label [0.0]
Epoch [2/100], Step [15700/20000], Loss: 0.1391, Output [0.12986840307712555], True label [0.0]
Epoch [2/100], Step [15800/20000], Loss: 0.6009, Output [0.4516942799091339], True label [0.0]
Epoch [2/100], Step [15900/20000], Loss: 0.5874, Output [0.4442179799079895], True label [0.0]
Epoch [2/100], Step [16000/20000], Loss: 0.1784

Epoch [3/100], Step [3700/20000], Loss: 4.2501, Output [0.9857368469238281], True label [0.0]
Epoch [3/100], Step [3800/20000], Loss: 0.0568, Output [0.9447932839393616], True label [1.0]
Epoch [3/100], Step [3900/20000], Loss: 0.0096, Output [0.990408182144165], True label [1.0]
Epoch [3/100], Step [4000/20000], Loss: 0.3860, Output [0.32020485401153564], True label [0.0]
Epoch [3/100], Step [4100/20000], Loss: 0.0274, Output [0.9729799628257751], True label [1.0]
Epoch [3/100], Step [4200/20000], Loss: 1.1763, Output [0.3084242343902588], True label [1.0]
Epoch [3/100], Step [4300/20000], Loss: 0.0349, Output [0.03430630639195442], True label [0.0]
Epoch [3/100], Step [4400/20000], Loss: 0.0711, Output [0.9314007759094238], True label [1.0]
Epoch [3/100], Step [4500/20000], Loss: 0.0176, Output [0.017466846853494644], True label [0.0]
Epoch [3/100], Step [4600/20000], Loss: 0.0209, Output [0.02065722458064556], True label [0.0]
Epoch [3/100], Step [4700/20000], Loss: 0.0379, Output [

Epoch [3/100], Step [12400/20000], Loss: 0.0007, Output [0.9992559552192688], True label [1.0]
Epoch [3/100], Step [12500/20000], Loss: 0.0090, Output [0.9910446405410767], True label [1.0]
Epoch [3/100], Step [12600/20000], Loss: 0.0040, Output [0.996037483215332], True label [1.0]
Epoch [3/100], Step [12700/20000], Loss: 0.0464, Output [0.04533683881163597], True label [0.0]
Epoch [3/100], Step [12800/20000], Loss: 0.5354, Output [0.414536714553833], True label [0.0]
Epoch [3/100], Step [12900/20000], Loss: 0.0394, Output [0.03863022103905678], True label [0.0]
Epoch [3/100], Step [13000/20000], Loss: 0.4162, Output [0.34048065543174744], True label [0.0]
Epoch [3/100], Step [13100/20000], Loss: 0.2103, Output [0.18963567912578583], True label [0.0]
Epoch [3/100], Step [13200/20000], Loss: 0.0337, Output [0.033102843910455704], True label [0.0]
Epoch [3/100], Step [13300/20000], Loss: 0.1186, Output [0.11187651008367538], True label [0.0]
Epoch [3/100], Step [13400/20000], Loss: 0.16

Epoch [4/100], Step [1000/20000], Loss: 0.0062, Output [0.9937763214111328], True label [1.0]
Epoch [4/100], Step [1100/20000], Loss: 0.3114, Output [0.732429027557373], True label [1.0]
Epoch [4/100], Step [1200/20000], Loss: 0.2669, Output [0.2342180609703064], True label [0.0]
Epoch [4/100], Step [1300/20000], Loss: 0.1779, Output [0.8369870185852051], True label [1.0]
Epoch [4/100], Step [1400/20000], Loss: 0.0086, Output [0.9913914203643799], True label [1.0]
Epoch [4/100], Step [1500/20000], Loss: 0.0139, Output [0.01381584070622921], True label [0.0]
Epoch [4/100], Step [1600/20000], Loss: 0.0181, Output [0.017960885539650917], True label [0.0]
Epoch [4/100], Step [1700/20000], Loss: 0.0819, Output [0.07861167192459106], True label [0.0]
Epoch [4/100], Step [1800/20000], Loss: 0.0877, Output [0.08397344499826431], True label [0.0]
Epoch [4/100], Step [1900/20000], Loss: 0.0463, Output [0.045273005962371826], True label [0.0]
Epoch [4/100], Step [2000/20000], Loss: 0.1442, Output

Epoch [4/100], Step [9700/20000], Loss: 0.0087, Output [0.9913249015808105], True label [1.0]
Epoch [4/100], Step [9800/20000], Loss: 0.0001, Output [0.9998549222946167], True label [1.0]
Epoch [4/100], Step [9900/20000], Loss: 0.1840, Output [0.1680440902709961], True label [0.0]
Epoch [4/100], Step [10000/20000], Loss: 0.0511, Output [0.04977668449282646], True label [0.0]
Epoch [4/100], Step [10100/20000], Loss: 0.2336, Output [0.20836026966571808], True label [0.0]
Epoch [4/100], Step [10200/20000], Loss: 0.0001, Output [0.9998511075973511], True label [1.0]
Epoch [4/100], Step [10300/20000], Loss: 0.0092, Output [0.9908186793327332], True label [1.0]
Epoch [4/100], Step [10400/20000], Loss: 0.0008, Output [0.9992165565490723], True label [1.0]
Epoch [4/100], Step [10500/20000], Loss: 0.0000, Output [0.9999872446060181], True label [1.0]
Epoch [4/100], Step [10600/20000], Loss: 0.0895, Output [0.914363443851471], True label [1.0]
Epoch [4/100], Step [10700/20000], Loss: 0.0000, Out

Epoch [4/100], Step [18300/20000], Loss: 0.5056, Output [0.6031233072280884], True label [1.0]
Epoch [4/100], Step [18400/20000], Loss: 0.3203, Output [0.27410510182380676], True label [0.0]
Epoch [4/100], Step [18500/20000], Loss: 0.0033, Output [0.9966829419136047], True label [1.0]
Epoch [4/100], Step [18600/20000], Loss: 0.0213, Output [0.021069055423140526], True label [0.0]
Epoch [4/100], Step [18700/20000], Loss: 0.0131, Output [0.986936628818512], True label [1.0]
Epoch [4/100], Step [18800/20000], Loss: 0.1998, Output [0.18107154965400696], True label [0.0]
Epoch [4/100], Step [18900/20000], Loss: 0.0029, Output [0.9971045851707458], True label [1.0]
Epoch [4/100], Step [19000/20000], Loss: 0.0068, Output [0.9932228326797485], True label [1.0]
Epoch [4/100], Step [19100/20000], Loss: 0.3101, Output [0.7333626747131348], True label [1.0]
Epoch [4/100], Step [19200/20000], Loss: 1.4519, Output [0.7658823728561401], True label [0.0]
Epoch [4/100], Step [19300/20000], Loss: 0.2650

Epoch [5/100], Step [7000/20000], Loss: 0.0608, Output [0.941002368927002], True label [1.0]
Epoch [5/100], Step [7100/20000], Loss: 2.7491, Output [0.06398602575063705], True label [1.0]
Epoch [5/100], Step [7200/20000], Loss: 0.0090, Output [0.008947568945586681], True label [0.0]
Epoch [5/100], Step [7300/20000], Loss: 0.1328, Output [0.12434623390436172], True label [0.0]
Epoch [5/100], Step [7400/20000], Loss: 0.0169, Output [0.01672566868364811], True label [0.0]
Epoch [5/100], Step [7500/20000], Loss: 0.3860, Output [0.3202190697193146], True label [0.0]
Epoch [5/100], Step [7600/20000], Loss: 0.0216, Output [0.02132013440132141], True label [0.0]
Epoch [5/100], Step [7700/20000], Loss: 0.3107, Output [0.2670917212963104], True label [0.0]
Epoch [5/100], Step [7800/20000], Loss: 0.0637, Output [0.061710186302661896], True label [0.0]
Epoch [5/100], Step [7900/20000], Loss: 0.0845, Output [0.919002890586853], True label [1.0]
Epoch [5/100], Step [8000/20000], Loss: 0.0286, Output

Epoch [5/100], Step [15700/20000], Loss: 0.1409, Output [0.13143178820610046], True label [0.0]
Epoch [5/100], Step [15800/20000], Loss: 0.4271, Output [0.3476143181324005], True label [0.0]
Epoch [5/100], Step [15900/20000], Loss: 0.3778, Output [0.31464147567749023], True label [0.0]
Epoch [5/100], Step [16000/20000], Loss: 0.2241, Output [0.7992240190505981], True label [1.0]
Epoch [5/100], Step [16100/20000], Loss: 0.4591, Output [0.6318541169166565], True label [1.0]
Epoch [5/100], Step [16200/20000], Loss: 0.3051, Output [0.26294490694999695], True label [0.0]
Epoch [5/100], Step [16300/20000], Loss: 0.0358, Output [0.035125795751810074], True label [0.0]
Epoch [5/100], Step [16400/20000], Loss: 0.0485, Output [0.04732256010174751], True label [0.0]
Epoch [5/100], Step [16500/20000], Loss: 0.0290, Output [0.9713807106018066], True label [1.0]
Epoch [5/100], Step [16600/20000], Loss: 0.7418, Output [0.5237246155738831], True label [0.0]
Epoch [5/100], Step [16700/20000], Loss: 0.0

# Testing Model

In [7]:
correct = 0
total = 0
with torch.no_grad():
    for i, data in enumerate(test_dataset,0):
        #inputs.shape is [1,1,768]
        inputs = data
        #label is int (1 or 0)
        label = test_labels[i]
        label = torch.tensor([float(label)])
        inputs = inputs.unsqueeze(0).unsqueeze(0)
        output = model(inputs)
        if(output >=.5):
            output = 1
        else:
            output = 0
        total+=1
        correct+= int(output == label)
        
print('Accuracy of the model on 5000 test tweets: %d %%' % (100*correct/total))


Accuracy of the model on 5000 test tweets: 87 %
