In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
from torch.utils.data.dataloader import DataLoader
import itertools
from torchtext.vocab import Vocab
from torch.utils.data.dataset import Dataset, TensorDataset
from pathlib import Path
from collections import Counter
from langdetect import detect

from torchtext import data   

UNKNOWN_TOKEN = "<unk>"
PAD_TOKEN = "<pad>"
SPECIAL_TOKENS = [UNKNOWN_TOKEN, PAD_TOKEN]


In [2]:
df = pd.read_csv('subjectivity_train.csv')

In [3]:
splitted_words = df['text'].apply(lambda x:x.split())
lengths = splitted_words.apply(lambda x:len(x))
splitted_words = sum(splitted_words, [])

In [4]:
vocab = list(set(splitted_words))
words_map = {word:i for i,word in enumerate(vocab)}
words_map[UNKNOWN_TOKEN] = len(words_map)
words_map[PAD_TOKEN] = len(words_map)

In [5]:
max_length = max(lengths)

In [6]:
class DataReader:
    # Read input file and get sentence
    def __init__(self, file_path, words_dict):
        self.file_path = file_path
        self.words_dict = words_dict
        self.get_sentences()
    
    def sentence_to_list_of_nums(self, sentence):
        words = sentence.split()
        new_sentence = [self.words_dict[x] if x in self.words_dict else self.words_dict[UNKNOWN_TOKEN] for x in words]
        return new_sentence
    
    def get_sentences(self):
        df = pd.read_csv(self.file_path)
        df['text'] = df['text'].apply(lambda x:self.sentence_to_list_of_nums(x))
        self.labels = list(df['label'])
        self.sentences = list(df['text'])
        

In [7]:
class parseDataset(Dataset):
    def __init__(self, word_dict, file, padding=False, word_embeddings=None):
        super().__init__()
        self.file = file
        self.datareader = DataReader(self.file, word_dict)
        self.word_vocab_size = len(self.datareader.words_dict)
  
        if word_embeddings:
            self.word_idx_mappings, self.idx_word_mappings, self.word_vectors = word_embeddings
        else:
            self.word_idx_mappings, self.idx_word_mappings, self.word_vectors = self.init_word_embeddings(word_dict)

        #self.unknown_idx = self.word_idx_mappings.get(UNKNOWN_TOKEN)
        self.word_vector_dim = self.word_vectors.size(-1)
        
        self.sentence_lens = [len(sentence) for sentence in self.datareader.sentences]
        self.max_seq_len = max(self.sentence_lens)
        self.sentences_dataset = self.convert_sentences_to_dataset(padding)

    def __len__(self):
        return len(self.sentences_dataset)
    
    def __getitem__(self, index):
        word_embed_idx,label = self.sentences_dataset[index]
        return word_embed_idx, label

    @staticmethod
    def init_word_embeddings(word_dict):
        glove = Vocab(Counter(word_dict), vectors="glove.6B.100d", specials=SPECIAL_TOKENS)
        return glove.stoi, glove.itos, glove.vectors
    
    def get_word_embeddings(self):
        return self.word_idx_mappings, self.idx_word_mappings, self.word_vectors

    def convert_sentences_to_dataset(self, padding):
        sentence_word_idx_list = []
        sentence_len_list = []
        labels = []
        for i in range(len(self.datareader.sentences)):
            words_idx_list = self.datareader.sentences[i]
            label = self.datareader.labels[i]
            sentence_len = len(words_idx_list)
            sentence_word_idx_list.append(torch.tensor(words_idx_list, dtype=torch.long, requires_grad=False))
            labels.append(torch.tensor(label, dtype=torch.float, requires_grad=False))
            sentence_len_list.append(sentence_len)
    
        return {i: sample_tuple for i, sample_tuple in enumerate(zip(sentence_word_idx_list,
                                                                     labels))}

In [8]:
class LSTM(nn.Module):
    def __init__(self, word_embedding_dim, hidden_dim, word_vocab_size):
        super(LSTM, self).__init__()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.word_embedding = nn.Embedding(word_vocab_size,word_embedding_dim)
        self.emb_dim = word_embedding_dim
        self.lstm = nn.LSTM(input_size=word_embedding_dim,bidirectional=False, hidden_size=hidden_dim, num_layers=1,batch_first=True )
        self.fc = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, sentence):
        word_idx_tensor = sentence
        word_embeds = self.word_embedding(word_idx_tensor.to(self.device))
        output, (hidden,_) = self.lstm(word_embeds)
        hidden = hidden.view(-1)
        out = self.fc(hidden)
        out =  self.sigmoid(out)
        return out

In [9]:

WORD_EMBEDDING_DIM = 20
HIDDEN_DIM = 50
train = parseDataset(words_map,'subjectivity_train.csv')
train_dataloader = DataLoader(train, shuffle=True)
word_vocab_size = train.word_vocab_size

In [10]:
test = parseDataset(words_map,'subjectivity_test.csv')
test_dataloader = DataLoader(test, shuffle=False)

In [11]:
import pickle
with open('words_dict_subjectivity3.pkl', 'wb') as f:
    pickle.dump(words_map, f)


In [12]:
model = LSTM(WORD_EMBEDDING_DIM, HIDDEN_DIM, word_vocab_size)
device = torch.device("cpu")

In [13]:
from sklearn.metrics import accuracy_score
model.train()
optimizer = optim.Adam(model.parameters(), lr=0.01)
acumulate_grad_steps = 50 
accuracy_list = []
loss_list = []
epochs = 5
loss = 0
loss_function = nn.BCELoss()
epoch_loss = []
epoch_accuracy = []

for epoch in range(epochs):
    acc = 0 #to keep track of accuracy
    loss = 0 # To keep track of the loss value
    i = 0
    total_loss = []
    scores = []
    true = []
    for sentence,label in train_dataloader:
        i += 1
        
        score = model(sentence)
        loss = loss_function(score, label)
        loss = loss / acumulate_grad_steps
        loss.backward()
        if i % acumulate_grad_steps == 0:
            optimizer.step()
            model.zero_grad()
        total_loss.append(loss.item()) 
        scores.append(score)
        true.append(label)
    
    predictions = [1 if score>0.5 else 0 for score in scores]
    accuracy = accuracy_score(predictions,true)
    epoch_accuracy.append(accuracy)
    mean_loss = sum(total_loss)/len(total_loss)
    epoch_loss.append(mean_loss)
    print(accuracy)

0.7157762413576367
0.9024512884978001
0.9637963544940289
0.9891891891891892
0.995223130106851


In [14]:
model.eval()

for sentence,label in test_dataloader:
    score = model(sentence)
    scores.append(score)
    true.append(label)
predictions = [1 if score>0.5 else 0 for score in scores]
accuracy = accuracy_score(predictions,true)

print(accuracy)

0.9942174732872407


In [15]:
torch.save(model.state_dict(), 'subjectivity_model3')