In [None]:
import os
import argparse
import numpy as np
import multiprocessing
from typing import List
import pandas as pd

import sys
# Make the sibling `preprocess` directory importable (presumably where the
# `tokenizer` module imported below lives).
# `__file__` is not defined inside a Jupyter kernel, so fall back to the current
# working directory when this code runs as a notebook cell rather than a script.
_this_dir = os.path.dirname(os.path.abspath(__file__)) if "__file__" in globals() else os.getcwd()
# `filename` keeps its original meaning: the parent of the directory that
# contains this file.
filename = os.path.dirname(_this_dir)
sys.path.append(os.path.join(filename, 'preprocess'))

from tokenizer import Tokenizer
from torchtext.vocab import GloVe
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import Counter
from torch.utils.data import TensorDataset, DataLoader

# Pick the compute device once and report the choice.
# PyTorch names its GPU backend "cuda"; torch.device("gpu") is not a valid
# device string and raises as soon as a GPU is actually present.
if torch.cuda.is_available():
    print("GPU is available")
    device = torch.device("cuda")
else:
    print("GPU is NOT available")
    device = torch.device("cpu")

: 

In [None]:
# Root of the data directory. The default is the original author's local
# checkout; set the TWITTER_SA_DATA_DIR environment variable to run the notebook
# on another machine without editing this cell.
data_dir = os.environ.get(
    "TWITTER_SA_DATA_DIR",
    "/Users/tunaberkalmaci/Downloads/twitter_sentiment_analysis/src/data",
)
# CSV read by the data-loading cell.
input_path = os.path.join(data_dir, "processed", "processed.csv")
# Log location and base name (not used by any cell visible here).
log_path = os.path.join(data_dir, "logs")
log_filename = "logs"

: 

In [None]:
# Label for this dataset variant (presumably the list of preprocessing steps
# that produced the CSV — confirm against the preprocessing code).
data_type_name = "lowercase_slangconv_nouser_nourl_hashtagsegmented_noextraspace_nostop_softlem_nonumbers"
# Load the CSV and discard every row that has a missing value in any column.
train_df = pd.read_csv(input_path).dropna()
# Copy the text and label columns out of the frame as standalone NumPy arrays.
tweets, labels = (np.array(train_df[column].values) for column in ("tweet", "labels"))

: 

In [None]:
# Experiment grid (presumably GloVe embedding sizes and model names to try).
glove_dims = [100, 200]
model_types = ["bilstm"]
# vocab_size caps the number of most-frequent words kept per fold;
# batch_size is the DataLoader batch size.
vocab_size, batch_size = 25_000, 64

: 

In [None]:
# Five stratified folds; shuffling uses a fixed seed so the splits are reproducible.
skf = StratifiedKFold(
    random_state=42,
    shuffle=True,
    n_splits=5,
)
# Empty collector for accuracy values (presumably one entry per fold).
aggregated_acc = list()

: 

In [None]:
class BiLSTM(nn.Module):
    """Sequence classifier: embedding -> LSTM -> dropout -> linear -> sigmoid.

    NOTE(review): despite the class name, the LSTM is built without
    ``bidirectional=True`` and therefore runs in a single direction. That is
    left unchanged here because enabling it would alter the hidden-state shapes
    that callers of ``init_hidden`` / ``forward`` see.

    Args:
        num_layers: Number of stacked LSTM layers.
        vocab_size: Number of rows in the embedding table.
        hidden_dim: Size of the LSTM hidden state.
        embedding_dim: Size of each embedding vector.
        output_dim: Number of output units of the final linear layer.
        drop_prob: Dropout probability applied to the LSTM outputs.
    """

    def __init__(self, num_layers, vocab_size, hidden_dim, embedding_dim, output_dim, drop_prob=0.2):
        super().__init__()
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim

        self.num_layers = num_layers
        self.vocab_size = vocab_size

        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=self.hidden_dim, num_layers=self.num_layers, batch_first=True)

        self.dropout = nn.Dropout(drop_prob)

        self.fc = nn.Linear(self.hidden_dim, output_dim)
        self.sig = nn.Sigmoid()

    def forward(self, x, hidden=None):
        """Run a batch of token-index sequences through the network.

        Args:
            x: Integer tensor of token indices with shape (batch, seq_len).
            hidden: Optional ``(h0, c0)`` tuple such as the one returned by
                ``init_hidden``. ``None`` lets the LSTM start from zeros.

        Returns:
            A tuple ``(sig_out, hidden)``. ``sig_out`` has shape (batch,) and
            holds the sigmoid activation of the last output unit at the final
            time step; ``hidden`` is the LSTM's final ``(h_n, c_n)`` state.
        """
        # Use the size of *this* batch. The previous code stored it under a
        # misspelled name and then read the notebook-level batch_size instead,
        # which is wrong for any batch of a different size (e.g. the last one).
        batch_size = x.shape[0]
        embedding = self.embedding(x)
        lstm_out, hidden = self.lstm(embedding, hidden)
        # Flatten (batch, seq_len, hidden) so the linear layer sees every step.
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        out = self.dropout(lstm_out)
        out = self.fc(out)
        sig_out = self.sig(out)
        # Regroup to one row per sample, then keep only the final prediction.
        # The reshape result was previously assigned to a misspelled variable,
        # so the slice below operated on the flattened tensor.
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1]
        return sig_out, hidden

    def init_hidden(self, batch_size):
        """Return zero-filled ``(h0, c0)`` states for a batch of ``batch_size``.

        Each tensor has shape (num_layers, batch_size, hidden_dim) and is
        created on the same device and with the same dtype as the model's own
        weights, so it does not rely on a module-level ``device`` variable.
        """
        weight = self.embedding.weight
        shape = (self.num_layers, batch_size, self.hidden_dim)
        h0 = torch.zeros(shape, device=weight.device, dtype=weight.dtype)
        c0 = torch.zeros(shape, device=weight.device, dtype=weight.dtype)
        hidden = (h0, c0)
        return hidden

In [None]:
def _encode_tweets(raw_tweets, word_to_index):
    """Turn each tweet into the list of indices of its in-vocabulary words.

    Words missing from ``word_to_index`` are dropped, so an encoded tweet may be
    shorter than the original (possibly empty).
    """
    return [
        [word_to_index[word] for word in tweet.split() if word in word_to_index]
        for tweet in raw_tweets
    ]


def _pad_sequences(encoded_tweets, seq_len):
    """Right-align encoded tweets in a zero-filled (n_tweets, seq_len) int array.

    Shorter tweets are padded with zeros on the left; tweets longer than
    ``seq_len`` keep only their first ``seq_len`` indices. Empty tweets stay
    all-zero.
    """
    padded = np.zeros((len(encoded_tweets), seq_len), dtype=int)
    for row, tokens in enumerate(encoded_tweets):
        if len(tokens) != 0:
            padded[row, -len(tokens):] = np.array(tokens)[:seq_len]
    return padded


# For every stratified fold: build a vocabulary from the training split only,
# encode and pad both splits, and wrap them in DataLoaders.
for i, (train_indices, val_indices) in enumerate(skf.split(tweets, labels)):
    training_tweets = tweets[train_indices]
    val_tweets = tweets[val_indices]

    training_labels = labels[train_indices]
    val_labels = labels[val_indices]

    # Placeholders; nothing in the code visible here assigns them a real value.
    accuracy = None
    model = None

    # Split each training tweet once, collecting all words and the longest length.
    word_list = []
    max_len = 0
    for tweet in training_tweets:
        words = tweet.split()
        max_len = max(max_len, len(words))
        word_list.extend(words)

    print(f"Max length of a tweet is {max_len}")

    # Keep the `vocab_size` most frequent training words. Indices start at 1
    # because 0 is the padding value used by _pad_sequences.
    corpus = Counter(word_list)
    corpus_ = sorted(corpus, key=corpus.get, reverse=True)[:vocab_size]
    onehot_dict = {word: rank + 1 for rank, word in enumerate(corpus_)}

    final_training = _encode_tweets(training_tweets, onehot_dict)
    final_val = _encode_tweets(val_tweets, onehot_dict)

    # Both splits are padded to the longest *training* tweet.
    train_padded = _pad_sequences(final_training, max_len)
    val_padded = _pad_sequences(final_val, max_len)

    train_data = TensorDataset(torch.from_numpy(train_padded), torch.from_numpy(training_labels))
    val_data = TensorDataset(torch.from_numpy(val_padded), torch.from_numpy(val_labels))

    train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
    # Validation batches are not shuffled: order has no effect on evaluation
    # metrics, and a fixed order keeps predictions aligned with val_labels.
    val_loader = DataLoader(val_data, shuffle=False, batch_size=batch_size)

