In [1]:
import pandas as pd
import numpy as np
import json
from pprint import pprint
import torch
import matplotlib.pyplot as plt
import torchtext
from sklearn.model_selection import train_test_split
from torchtext import data
import spacy
from torchtext import datasets
import random
import re
from torchtext import vocab
import torch.nn as nn
import torch.optim as optim
import time

In [None]:
# --- Notebook configuration -------------------------------------------------
# Scalar hyper-parameters first, then the filesystem locations derived from them.

# Dimensionality of the GloVe vectors; it also selects which GloVe file is read.
glove_dim = 100
# Threshold used when drawing the random train/validation mask.
cross_val_rat = 0.2

# Base name and directory for log output.
log_filename = "logs"
log_path = "/Users/tunaberkalmaci/Downloads/twitter_sentiment_analysis/src/data/logs"

# Pre-processed tweets (CSV) and the GloVe text file matching `glove_dim`.
input_path = "/Users/tunaberkalmaci/Downloads/twitter_sentiment_analysis/src/data/processed/processed.csv"
glove_path = f"/Users/tunaberkalmaci/Downloads/twitter_sentiment_analysis/src/.vector_cache/glove.twitter.27B.{glove_dim}.txt"

In [None]:
# Load the pre-trained GloVe table into a {token: float32 vector} dictionary.
embeddings = {}
# Be explicit about the encoding: the vocabulary contains non-ASCII tokens and
# the platform default encoding is not guaranteed to be UTF-8.
with open(glove_path, "r", encoding="utf-8") as f:
    for line in f:
        # Split on the first literal space only. A bare `split()` strips any
        # whitespace, so a token that is itself a whitespace character would be
        # dropped and the first coefficient would be mistaken for the word.
        word, _, coefs = line.rstrip("\n").partition(" ")
        coefs = np.fromstring(coefs, "f", sep=" ")
        # Skip malformed rows so every stored vector has exactly `glove_dim`
        # entries; a single odd-sized vector would break stacking later on.
        if coefs.shape[0] != glove_dim:
            continue
        embeddings[word] = coefs

In [None]:
# Read the pre-processed tweets and tokenise each one on whitespace.
# `read_csv` has no `index` keyword (that belongs to `to_csv`); passing it
# raises a TypeError, so it is omitted here.
df = (
    pd.read_csv(input_path)
    # Empty cells are parsed as NaN (a float), which cannot be split into
    # tokens, so rows without tweet text are dropped.
    .dropna(subset=["tweet"])
    # After this step each "tweet" entry is a list of token strings.
    .assign(tweet=lambda frame: frame["tweet"].astype(str).str.split())
)

In [None]:
def text_embed(words, embedding_table=None, dim=None):
    """Turn a list of tokens into a 2-D array of word vectors.

    Tokens found in the embedding table are replaced by their vector. Tokens
    that are not found are replaced by the mean of the known vectors of the
    same tweet, or by a zero vector when no token of the tweet is known.

    Args:
        words: Sequence of token strings. It is not modified.
        embedding_table: Mapping from token to 1-D vector. Defaults to the
            notebook-level `embeddings` dictionary.
        dim: Length of each vector. Defaults to the notebook-level `glove_dim`.

    Returns:
        Array of shape (len(words), dim); shape (0, dim) for an empty input.
    """
    if embedding_table is None:
        embedding_table = embeddings
    if dim is None:
        dim = glove_dim

    if len(words) == 0:
        return np.zeros((0, dim))

    # A membership test per token; the previous `for words[i] in embeddings`
    # iterated over the entire table and flagged every token as unknown.
    vectors = [embedding_table.get(word) for word in words]
    known = [vector for vector in vectors if vector is not None]

    # Guard the all-unknown case, which used to divide by zero.
    fallback = np.mean(known, axis=0) if known else np.zeros(dim)

    return np.array([fallback if vector is None else vector for vector in vectors])

In [None]:
# Replace each token list by the array of word vectors that text_embed returns.
df["tweet"] = df["tweet"].apply(text_embed)

In [None]:
# Random train/validation partition: a row goes to the training set when its
# uniform draw exceeds `cross_val_rat`, otherwise to the validation set.
# The generator is seeded so the partition, and every metric computed from it,
# is identical after a kernel restart.
split_seed = 42
split_rng = np.random.default_rng(split_seed)
mask = split_rng.random(len(df)) > cross_val_rat
train_df = df[mask]
val_df = df[~mask]

In [None]:
# Pull the embedded tweets and their labels out of each split as plain arrays
# so the training loop can index them by position.
train_tweets, train_labels = train_df["tweet"].values, train_df["labels"].values
val_tweets, val_labels = val_df["tweet"].values, val_df["labels"].values

In [2]:
class BiLSTM(nn.Module):
    """(Bi)directional LSTM that maps one embedded tweet to a probability.

    Args:
        input_size: Length of each word vector. Defaults to the notebook-level
            `glove_dim`.
        hidden_size: Hidden units per LSTM direction.
        num_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM also reads the tweet right-to-left.
        dropout: Dropout probability applied to the LSTM summary before the
            final linear layer.
    """

    def __init__(self, input_size=None, hidden_size=100, num_layers=10,
                 bidirectional=True, dropout=0.2):
        super().__init__()
        self.hidden_size = hidden_size
        self.input_size = glove_dim if input_size is None else input_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.num_directions = 2 if bidirectional else 1
        self.dropout1 = nn.Dropout(p=dropout)

        self.lstm = nn.LSTM(self.input_size, self.hidden_size, self.num_layers, bidirectional=self.bidirectional)
        self.fc = nn.Linear(self.hidden_size*self.num_directions, 1)

    def forward(self, tweet):
        """Return the sigmoid score for a single tweet.

        Args:
            tweet: Float tensor with one row per token; it is reshaped to
                (len(tweet), 1, -1), i.e. a batch containing one sequence.

        Returns:
            Tensor of shape (1,) with a value in [0, 1].
        """
        seq_len = len(tweet)
        lstm_out, _ = self.lstm(tweet.view(seq_len, 1, -1))
        lstm_out = lstm_out.view(seq_len, -1)

        if self.bidirectional:
            # The two directions finish at opposite ends of the sequence: the
            # forward pass has read every token at the last step, the backward
            # pass at the first step. Taking the last step for both would give
            # the backward half a view of only the final token.
            forward_final = lstm_out[-1, :self.hidden_size]
            backward_final = lstm_out[0, self.hidden_size:]
            summary = torch.cat((forward_final, backward_final))
        else:
            summary = lstm_out[-1]

        x = self.dropout1(summary)
        pred = torch.sigmoid(self.fc(x))
        return pred

In [None]:
# Build the model, move it to the GPU when one is present, and create the
# loss, optimizer and learning-rate schedule used by the training loop.
bilstm = BiLSTM()
if torch.cuda.is_available():
    bilstm.to(torch.device("cuda:0"))
# Binary cross-entropy on probabilities; the model already applies a sigmoid.
loss_func = nn.BCELoss()
# nn.Module exposes its weights through `parameters()`; there is no
# `model_parameters()` method, so the previous call raised AttributeError.
optimizer = torch.optim.Adam(bilstm.parameters(), lr=0.001)
# Multiplies the learning rate by `gamma` on every scheduler.step() call.
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.1)

In [3]:
# Per-epoch history, consumed by the plotting cell.
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []

# Weight of the parameter-norm penalty added to every training loss.
l2_lambda = 0.001
# NOTE(review): predictions are multiplied by 0.7 before being compared with
# 0.5, i.e. the effective decision threshold is about 0.714. Kept as-is because
# it may be deliberate; confirm with the author.
pred_scale = 0.7
# Run on whichever device the model's weights already live on.
device = next(bilstm.parameters()).device


def _as_tensors(tweet, label, device):
    """Convert one embedded tweet and its label to float tensors on `device`.

    The label is wrapped in a list so it has shape (1,), matching the model
    output that BCELoss compares it with. Passing a bare integer to the legacy
    `torch.FloatTensor` constructor does not build a tensor holding that
    value, which is what the validation branch used to do.
    """
    tweet_tensor = torch.tensor(tweet, dtype=torch.float32, device=device)
    label_tensor = torch.tensor([label], dtype=torch.float32, device=device)
    return tweet_tensor, label_tensor


def _hard_label(pred):
    """Threshold a model output into a 0/1 class (see `pred_scale`)."""
    return 1 if pred.item() * pred_scale > 0.5 else 0


for epoch in range(4):
    # Needed by the duration print at the end of the epoch.
    epoch_start_time = time.time()
    print("Epoch: {}".format(epoch+1))

    # ---- training pass: one optimizer step per tweet ----
    train_loss = 0
    correct = 0
    bilstm.train()
    for i in range(len(train_tweets)):
        tweet, label = _as_tensors(train_tweets[i], train_labels[i], device)

        optimizer.zero_grad()
        pred = bilstm(tweet)
        loss = loss_func(pred, label)

        # Penalise the sum of the (unsquared) L2 norms of all parameter tensors.
        l2_reg = sum(torch.norm(param) for param in bilstm.parameters())
        loss = loss + l2_lambda * l2_reg

        if _hard_label(pred) == int(label.item()):
            correct += 1

        # The logged training loss includes the regularisation term.
        train_loss += loss.item()

        loss.backward()
        optimizer.step()

        if (i+1)%1000 == 0:
            print("Processed {} tweets out of {}".format(i+1, len(train_tweets)))
    train_losses.append(train_loss/len(train_tweets))
    train_accuracies.append(correct / len(train_tweets))

    # ---- validation pass: no gradients, no regularisation term ----
    val_loss = 0
    correct = 0
    bilstm.eval()
    with torch.no_grad():
        for i in range(len(val_tweets)):
            tweet, label = _as_tensors(val_tweets[i], val_labels[i], device)

            pred = bilstm(tweet)

            loss = loss_func(pred, label)
            val_loss += loss.item()

            if _hard_label(pred) == int(label.item()):
                correct += 1
    val_losses.append(val_loss/len(val_tweets))
    val_accuracies.append(correct / len(val_tweets))

    print("Epoch summary")
    print(f'Train Loss: {train_losses[-1]:7.2f}  Train Accuracy: {train_accuracies[-1]*100:6.3f}%')
    print(f'Validation Loss: {val_losses[-1]:7.2f}  Validation Accuracy: {val_accuracies[-1]*100:6.3f}%')
    print(f'Duration: {time.time() - epoch_start_time:.0f} seconds')
    print('')

    # Decay the learning rate once per epoch.
    scheduler.step()

In [4]:
# Epoch numbers (1-based) shared by both figures.
x_axis = list(range(1, len(train_losses) + 1))

# One figure per metric: (training curve, its legend label,
# validation curve, its legend label, figure title).
for train_curve, train_label, val_curve, val_label, figure_title in (
    (train_losses, 'training loss', val_losses, 'validation loss', 'Loss for each epoch'),
    (train_accuracies, 'training accuracy', val_accuracies, 'validation accuracy', 'Accuracy for each epoch'),
):
    plt.plot(x_axis, train_curve, label=train_label)
    plt.plot(x_axis, val_curve, label=val_label)
    plt.title(figure_title)
    plt.legend();
    plt.show()