In [1]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import re
import spacy
import jovian
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import string
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.metrics import mean_squared_error

  from .autonotebook import tqdm as notebook_tqdm


<IPython.core.display.Javascript object>

In [2]:
np.random.seed(112)
df = pd.read_csv("file_name.csv")
df['length'] = df['text'].apply(lambda x: len(x.split()))
df.head()
print( np.mean(df['length']))

25.73388029088197


In [3]:
tok = spacy.load('en_core_web_sm')
def tokenize (text):
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]') # remove punctuation and numbers
    nopunct = regex.sub(" ", text.lower())
    return [token.text for token in tok.tokenizer(nopunct)]

counts = Counter()
for index, row in df.iterrows():
    counts.update(tokenize(row['text']))
    
print("num_words before:",len(counts.keys()))
for word in list(counts):
    if counts[word] < 2:
        del counts[word]
print("num_words after:",len(counts.keys()))

num_words before: 43255
num_words after: 17548


In [4]:
vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)
    
def encode_sentence(text, vocab2index, N=70):
    tokenized = tokenize(text)
    encoded = np.zeros(N, dtype=int)
    enc1 = np.array([vocab2index.get(word, vocab2index["UNK"]) for word in tokenized])
    length = min(N, len(enc1))
    encoded[:length] = enc1[:length]
    return encoded, length

df['encoded'] = df['text'].apply(lambda x: np.array(encode_sentence(x,vocab2index ), dtype=object))
df.head()
Counter(df['labels'])

Counter({1: 17308, 0: 9507})

In [5]:
X = list(df['encoded'])
y = list(df['labels'])
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)

class ReviewsDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.y = Y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return torch.from_numpy(self.X[idx][0].astype(np.int32)), self.y[idx], self.X[idx][1]

In [6]:
train_ds = ReviewsDataset(X_train, y_train)
valid_ds = ReviewsDataset(X_valid, y_valid)

batch_size = 5000
vocab_size = len(words)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(valid_ds, batch_size=batch_size)

In [11]:
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, accuracy_score
def train_model(model, epochs=10, lr=0.001):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for i in range(epochs +1 ):
        model.train()
        sum_loss = 0.0
        train_acc = 0
        total = 0
        for x, y, l in train_dl:
            x = x.long()
            y = y.long()
            y_pred = model(x, l)
            loss = F.cross_entropy(y_pred, y)
            _, pred = torch.max(y_pred.data, 1)
            train_acc += (pred == y).sum().item()
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc, val_rmse = validation_metrics(model, val_dl, i)
        if i % 5 == 1:
            print("train loss %.3f, val loss %.3f, val accuracy %.3f, and val rmse %.3f" % (sum_loss/total, 100 * val_loss, val_acc, val_rmse))
        tra_accur = 100 * train_acc / len(train_ds)
        if i % 1 == 0:
            print(f'Epoch: {i+1}, training accuracy: {tra_accur}')

def conf_matrix_table(cf_matrix):
    group_names = ['TN','FP','FN','TP']
    group_percentages = ['{0:.2%}'.format(value) for value in cf_matrix.flatten()/np.sum(cf_matrix)]
    labels = [f'{v1}\n{v2}\n{v3}' for v1, v2, v3 in zip(group_names, cf_matrix.flatten(), group_percentages)]
    labels = np.asarray(labels).reshape(2,2)
    sns.heatmap(cf_matrix, annot=labels, fmt="", cmap='Blues')
    plt.show()

def validation_metrics (model, valid_dl, i):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    sum_rmse = 0.0
    
    y_pred = []
    y_true= []
    for x, y, l in valid_dl:
        x = x.long()
        y = y.long()
        y_hat = model(x, l)
        loss = F.cross_entropy(y_hat, y)
        pred = torch.max(y_hat, 1)[1]
        y_pred.extend(pred.data.numpy())
        y_true.extend(y.data.numpy())
        correct += (pred == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
        sum_rmse += np.sqrt(mean_squared_error(pred, y.unsqueeze(-1)))*y.shape[0]
    cf_matrix = confusion_matrix(y_true, y_pred, labels=[0,1])
    conf_matrix_table(cf_matrix=cf_matrix)
    if i == 30:
        print("conf matrix: \n",cf_matrix )

    return sum_loss/total, correct/total, sum_rmse/total


In [12]:
class LSTM_fixed_len(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 25)
        self.linear2 = nn.Linear(25, 5)
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, x, l):
        x = self.embeddings(x)
        x = self.dropout(x)
        lstm_out, (ht, ct) = self.lstm(x)
        return self.linear2(self.linear(ht[-1]))
    
model_fixed =  LSTM_fixed_len(vocab_size, 50, 50)

train_model(model_fixed, epochs=30, lr=0.01)


Epoch: 1, training accuracy: 49.664366958791724
train loss 0.765, val loss 69.550, val accuracy 0.399, and val rmse 0.775
Epoch: 2, training accuracy: 57.08092485549133
Epoch: 3, training accuracy: 58.99683013238859
Epoch: 4, training accuracy: 64.60469886257691
Epoch: 5, training accuracy: 64.60003729256013
Epoch: 6, training accuracy: 64.64199142271117
train loss 0.646, val loss 64.547, val accuracy 0.642, and val rmse 0.598
Epoch: 7, training accuracy: 64.65131456274473
Epoch: 8, training accuracy: 64.64665299272795
Epoch: 9, training accuracy: 64.64665299272795
Epoch: 10, training accuracy: 64.66529927279508
Epoch: 11, training accuracy: 64.66529927279508
train loss 0.617, val loss 58.978, val accuracy 0.689, and val rmse 0.558
Epoch: 12, training accuracy: 64.6606377027783
Epoch: 13, training accuracy: 69.14972962893903
Epoch: 14, training accuracy: 72.31027410031699
Epoch: 15, training accuracy: 76.24930076449748
Epoch: 16, training accuracy: 79.19541301510348
train loss 0.358, v