In [None]:
import numpy as np 
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from nltk.corpus import stopwords 
from collections import Counter
import string
import re
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from google.colab import drive
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import f1_score

In [None]:
# Fetch the NLTK corpora used for stop-word filtering and lemmatization.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# Mount Google Drive in the Colab runtime.
drive.mount('/content/gdrive')

# Location where the best model checkpoint is written during training.
path = '/content/sentiment_analysis.pt'


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
Mounted at /content/gdrive


In [None]:
# Pick the compute device once. Fall back to the CPU when no GPU is visible so
# that `device` is always defined for the cells that call `.to(device)`.
is_cuda = torch.cuda.is_available()
if is_cuda:
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

GPU is available


In [None]:
# Read the three splits; each CSV provides a 'text' and a 'label' column.
train_dataset = pd.read_csv('/content/Train.csv')
test_dataset = pd.read_csv('/content/Test.csv')
valid_dataset = pd.read_csv('/content/Valid.csv')

# Raw review texts (pandas Series).
train_x = train_dataset['text']
test_x = test_dataset['text']
valid_x = valid_dataset['text']

# Labels as NumPy arrays.
train_y = train_dataset['label'].values
test_y = test_dataset['label'].values
valid_y = valid_dataset['label'].values
def preprocess(s):
    """Clean a single token.

    Removes, in this order: every character that is neither a word character
    nor whitespace, then all whitespace, then all digits.

    Returns the cleaned string (possibly empty).
    """
    for pattern in (r"[^\w\s]", r"\s+", r"\d"):
        s = re.sub(pattern, '', s)
    return s


In [None]:
def tockenize(x_train,x_val,x_test):
    """Build a frequency-ranked vocabulary from the training texts and encode all splits.

    The vocabulary is built from `x_train` only: each text is lower-cased and
    split on whitespace, each token is cleaned with `preprocess`, stop words
    and empty tokens are dropped, and the remaining tokens are lemmatized.
    Lemmas are ranked by descending frequency and numbered from 1, leaving
    id 0 free for padding.

    Args:
        x_train: iterable of raw training text strings.
        x_val: iterable of raw validation text strings.
        x_test: iterable of raw test text strings.

    Returns:
        Tuple `(train, test, valid, vocab)` -- note that the test split comes
        BEFORE the validation split. Each split is a 1-D object array holding
        one list of token ids per text; `vocab` maps word -> id.
    """
    stop_words = set(stopwords.words('english'))
    # One lemmatizer instance is enough; creating it per word is wasted work.
    lemmatizer = WordNetLemmatizer()

    word_list = []
    for sentence in x_train:
        for word in sentence.lower().split():
            word = preprocess(word)
            if word and word not in stop_words:
                word_list.append(lemmatizer.lemmatize(word))

    Count = Counter(word_list)
    corpus = sorted(Count,key=Count.get,reverse=True)
    freq_words = {w:i+1 for i,w in enumerate(corpus)}

    def _encode(texts):
        """Map every text to the list of vocabulary ids of its known tokens."""
        # Texts have different lengths, so the result is ragged. An explicit
        # object array avoids the ValueError newer NumPy versions raise when
        # asked to build an array from ragged nested lists.
        encoded = np.empty(len(texts), dtype=object)
        for row, text in enumerate(texts):
            tokens = (preprocess(word) for word in text.lower().split())
            # NOTE(review): vocabulary keys are lemmas, but the lookup uses the
            # un-lemmatized token, so tokens whose lemma differs from their
            # surface form are skipped. Kept as-is so encodings stay unchanged
            # (predict_text looks tokens up the same way) -- confirm intent.
            encoded[row] = [freq_words[token] for token in tokens if token in freq_words]
        return encoded

    return _encode(x_train),_encode(x_test),_encode(x_val),freq_words


In [None]:
# tockenize returns (train, test, valid, vocab): the test split precedes the
# validation split, hence the unpacking order on the left-hand side.
x_train, x_test, x_valid, vocab = tockenize(x_train=train_x, x_val=valid_x, x_test=test_x)

KeyboardInterrupt: ignored

In [None]:
def padding(sentences, seq_len):
    """Left-pad (with zeros) or truncate each token-id sequence to `seq_len`.

    Args:
        sentences: sequence of token-id sequences.
        seq_len: width of the returned matrix.

    Returns:
        Integer array of shape (len(sentences), seq_len). Short sequences are
        right-aligned; long sequences keep their first `seq_len` ids; empty
        sequences yield an all-zero row.
    """
    padded = np.zeros((len(sentences), seq_len), dtype=int)
    for row, tokens in enumerate(sentences):
        n_tokens = len(tokens)
        if n_tokens == 0:
            continue
        padded[row, -n_tokens:] = np.array(tokens)[:seq_len]
    return padded

In [None]:
# Bring every encoded training / validation review to exactly 500 token ids.
x_train_pad, x_valid_pad = (padding(split, 500) for split in (x_train, x_valid))


In [None]:
# Make sure both label vectors are NumPy arrays before wrapping them in tensors.
train_y, valid_y = (np.array(labels) for labels in (train_y, valid_y))

In [None]:
batch_size = 50

# Pair each padded id matrix with its label vector.
train_data = TensorDataset(
    torch.from_numpy(x_train_pad),
    torch.from_numpy(train_y),
)
valid_data = TensorDataset(
    torch.from_numpy(x_valid_pad),
    torch.from_numpy(valid_y),
)

# training and validation dataloaders (both shuffled)
train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(dataset=valid_data, batch_size=batch_size, shuffle=True)


In [None]:
# Pull a single batch to inspect. DataLoader iterators follow the standard
# iterator protocol, so use the built-in next(); the `.next()` method is not
# available on current PyTorch iterator objects.
dataiter = iter(train_loader)
sample_x, sample_y = next(dataiter)

In [None]:
class LSTM(nn.Module):
    """Stacked-LSTM classifier: embedding -> LSTM -> dropout -> linear -> sigmoid.

    Args:
        no_layers: number of stacked LSTM layers.
        vocab_size: number of rows in the embedding table.
        hidden_dim: size of the LSTM hidden state.
        embedding_dim: size of each token embedding.
        drop_prob: dropout probability applied to the LSTM output before the
            linear layer.
        output_dim: number of outputs of the linear layer (default 1).
    """

    def __init__(self,no_layers,vocab_size,hidden_dim,embedding_dim,drop_prob=0.5,output_dim=1):
        super().__init__()

        # `output_dim` is an explicit argument so the class does not depend on
        # a module-level variable being defined before it is instantiated.
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim

        self.no_layers = no_layers
        self.vocab_size = vocab_size

        # embedding
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # lstm (inputs are batch-first: (batch, seq, feature))
        self.lstm = nn.LSTM(input_size=embedding_dim,hidden_size=self.hidden_dim,
                           num_layers=no_layers, batch_first=True)

        # dropout layer / regularization, driven by the constructor argument
        self.dropout = nn.Dropout(drop_prob)

        # linear and sigmoid layer
        self.lin = nn.Linear(self.hidden_dim, output_dim)
        self.sig = nn.Sigmoid()

    def forward(self,x,hidden):
        """Run one batch through the network.

        Args:
            x: integer tensor of token ids, shape (batch, seq_len).
            hidden: `(h0, c0)` tuple as produced by `init_hidden`.

        Returns:
            `(sig_out, hidden)`: `sig_out` has shape (batch,) and holds the
            sigmoid of the last output unit at the last time step; `hidden`
            is the LSTM's final `(h, c)` state.
        """
        embeds = self.embedding(x)

        lstm_out, hidden = self.lstm(embeds, hidden)

        # Only the final time step is returned, so apply the head to that
        # step alone instead of to every step and discarding the rest.
        last_step = lstm_out[:, -1, :]

        out = self.dropout(last_step)
        out = self.lin(out)
        sig_out = self.sig(out)

        # Keep the last output unit -> shape (batch,)
        sig_out = sig_out[:, -1]

        return sig_out, hidden

    def init_hidden(self, batch_size):
        """Return zero-initialised `(h0, c0)`, each (no_layers, batch_size, hidden_dim).

        The tensors are created on the same device as the model parameters.
        """
        model_device = next(self.parameters()).device
        h0 = torch.zeros((self.no_layers,batch_size,self.hidden_dim), device=model_device)
        c0 = torch.zeros((self.no_layers,batch_size,self.hidden_dim), device=model_device)
        hidden = (h0,c0)
        return hidden

    

In [None]:
# Model hyper-parameters.
no_layers, embedding_dim, hidden_dim, output_dim = 3, 64, 128, 1
# Vocabulary ids start at 1, so one extra embedding row is needed for id 0.
vocab_size = 1 + len(vocab)

model = LSTM(
    no_layers=no_layers,
    vocab_size=vocab_size,
    hidden_dim=hidden_dim,
    embedding_dim=embedding_dim,
    drop_prob=0.5,
)
model.to(device)

print(model)

LSTM(
  (embedding): Embedding(145331, 64)
  (lstm): LSTM(64, 128, num_layers=3, batch_first=True)
  (dropout): Dropout(p=0.4, inplace=False)
  (lin): Linear(in_features=128, out_features=1, bias=True)
  (sig): Sigmoid()
)


In [None]:
# Learning rate for Adam.
lr = 1e-3

# Binary cross-entropy on the model's sigmoid outputs.
criterion = nn.BCELoss()

optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)

def acc(pred,label):
    """Count how many rounded predictions equal their labels.

    Both arguments are squeezed before comparison; the result is the number
    of matches as a Python number (a count, not a ratio).
    """
    hard_pred = torch.round(pred.squeeze())
    target = label.squeeze()
    n_correct = torch.sum(hard_pred == target)
    return n_correct.item()

In [None]:
clip = 5      # maximum gradient norm used for clipping
epochs = 5
valid_loss_min = np.inf   # best validation loss seen so far

epoch_tr_loss,epoch_vl_loss = [],[]
epoch_tr_acc,epoch_vl_acc = [],[]

for epoch in range(epochs):
    # ---------------- training ----------------
    train_losses = []
    train_acc = 0.0
    # Predictions and labels are collected together, in the order the
    # shuffled loader yields them, so the F1 score compares aligned pairs.
    train_pre, train_true = [], []
    model.train()

    for inputs, labels in train_loader:

        inputs, labels = inputs.to(device), labels.to(device)
        # Reviews are independent samples: start every batch from a fresh
        # zero state sized to the actual batch (the last one may be smaller).
        h = model.init_hidden(inputs.size(0))

        model.zero_grad()
        output,h = model(inputs,h)
        loss = criterion(output.view(-1), labels.float().view(-1))
        loss.backward()
        # Clip before the optimizer step to guard against exploding gradients.
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        train_losses.append(loss.item())
        train_acc += acc(output,labels)
        train_pre.extend((output.detach().view(-1) > 0.5).long().tolist())
        train_true.extend(labels.view(-1).tolist())

    # ---------------- validation ----------------
    val_losses = []
    val_acc = 0.0
    val_pre, val_true = [], []
    model.eval()
    # No gradients are needed while evaluating.
    with torch.no_grad():
        for inputs, labels in valid_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            val_h = model.init_hidden(inputs.size(0))

            output, val_h = model(inputs, val_h)
            val_loss = criterion(output.view(-1), labels.float().view(-1))

            val_losses.append(val_loss.item())
            val_acc += acc(output,labels)
            val_pre.extend((output.view(-1) > 0.5).long().tolist())
            val_true.extend(labels.view(-1).tolist())

    # ---------------- epoch summary ----------------
    epoch_train_loss = np.mean(train_losses)
    epoch_val_loss = np.mean(val_losses)

    epoch_train_acc = train_acc/len(train_loader.dataset)
    epoch_val_acc = val_acc/len(valid_loader.dataset)

    epoch_tr_loss.append(epoch_train_loss)
    epoch_vl_loss.append(epoch_val_loss)

    epoch_tr_acc.append(epoch_train_acc)
    epoch_vl_acc.append(epoch_val_acc)

    print(f'Epoch {epoch+1}') 
    print(f'train_loss : {epoch_train_loss} val_loss : {epoch_val_loss} ')
    print(f'train_accuracy : {epoch_train_acc*100} val_accuracy : {epoch_val_acc*100}')
    print(f'traing F score: {f1_score(train_true,train_pre)} val F score: {f1_score(val_true,val_pre)}')
    # Checkpoint whenever the validation loss does not get worse.
    if epoch_val_loss <= valid_loss_min:
        torch.save(model.state_dict(), path)
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,epoch_val_loss))
        valid_loss_min = epoch_val_loss
    print(50*'==')

Epoch 1
train_loss : 0.5529609617963434 val_loss : 0.3798975059390068 
train_accuracy : 70.085 val_accuracy : 83.6
Validation loss decreased (inf --> 0.379898).  Saving model ...
Epoch 2
train_loss : 0.31443127994425596 val_loss : 0.30401674456894395 
train_accuracy : 87.165 val_accuracy : 87.18
Validation loss decreased (0.379898 --> 0.304017).  Saving model ...
Epoch 3
train_loss : 0.21648646645247938 val_loss : 0.3018786977976561 
train_accuracy : 91.78500000000001 val_accuracy : 88.5
Validation loss decreased (0.304017 --> 0.301879).  Saving model ...
Epoch 4
train_loss : 0.15825813881820067 val_loss : 0.295968798995018 
train_accuracy : 94.325 val_accuracy : 88.75999999999999
Validation loss decreased (0.301879 --> 0.295969).  Saving model ...
Epoch 5
train_loss : 0.10925160888698883 val_loss : 0.3120652002096176 
train_accuracy : 96.45 val_accuracy : 88.44


In [None]:
def predict_text(text):
    """Return the model's sigmoid output for one raw text string.

    The text is encoded like the training data: lower-cased, split on
    whitespace, each token cleaned with `preprocess` and looked up in `vocab`
    (unknown tokens are skipped), then padded/truncated to 500 ids.

    Returns:
        Tensor of shape (1,) on `device` holding the model output.
    """
    # Lower-case first: the vocabulary was built from lower-cased text, so
    # capitalised words would otherwise never be found.
    tokens = (preprocess(word) for word in text.lower().split())
    word_seq = np.array([vocab[token] for token in tokens if token in vocab])
    word_seq = np.expand_dims(word_seq,axis=0)
    pad = torch.from_numpy(padding(word_seq,500))
    inputs = pad.to(device)
    batch_size = 1
    # Inference only: disable dropout and skip gradient tracking.
    model.eval()
    with torch.no_grad():
        h = model.init_hidden(batch_size)
        output, h = model(inputs, h)
    return output

In [None]:
def get_classification_metric(testy, probs):
    """Return the best F-score over all precision/recall curve thresholds.

    Args:
        testy: true binary labels.
        probs: predicted scores / probabilities for the positive class.

    Returns:
        The largest value of 2*P*R / (P + R) along the curve. Points where
        precision and recall are both zero count as an F-score of 0.
    """
    from sklearn.metrics import precision_recall_curve
    precision, recall, _ = precision_recall_curve(testy, probs)
    # convert to f score; guard the 0/0 case, which would otherwise yield NaN
    # and be picked by argmax as the "largest" value
    denom = precision + recall
    fscore = np.divide(2 * precision * recall, denom,
                       out=np.zeros_like(denom, dtype=float), where=denom > 0)
    # locate the index of the largest f score
    ix = np.argmax(fscore)
    return fscore[ix]

In [None]:
# Score every test review one at a time and accumulate accuracy / F1.
pre_acc=0.0   # running count of correct predictions
pred=[]       # hard 0/1 prediction per review, in test-set order
print(len(test_x),len(test_y))
for text, label in zip(test_x, test_y):
    prediction = predict_text(text)
    pre_acc += acc(prediction, label)
    # Threshold the sigmoid output at 0.5 to get the predicted class id.
    pred.append(int(prediction.item() > 0.5))

f1=f1_score(test_y,pred)
ac=pre_acc/len(test_x)
print(ac)
print(f1*100)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
1 2180.0 1 negative
----------
1 2181.0 1 negative
----------
0 2182.0 0 positive
----------
1 2182.0 0 positive
----------
0 2183.0 0 positive
----------
0 2184.0 0 positive
----------
1 2185.0 1 negative
----------
0 2186.0 0 positive
----------
0 2187.0 0 positive
----------
1 2187.0 0 positive
----------
1 2188.0 1 negative
----------
1 2189.0 1 negative
----------
1 2190.0 1 negative
----------
1 2191.0 1 negative
----------
1 2192.0 1 negative
----------
1 2193.0 1 negative
----------
0 2194.0 0 positive
----------
1 2195.0 1 negative
----------
1 2196.0 1 negative
----------
1 2197.0 1 negative
----------
1 2198.0 1 negative
----------
1 2199.0 1 negative
----------
0 2200.0 0 positive
----------
0 2201.0 0 positive
----------
0 2202.0 0 positive
----------
1 2203.0 1 negative
----------
0 2204.0 0 positive
----------
1 2205.0 1 negative
----------
1 2206.0 1 negative
----------
1 2207.0 1 negative
----------
1 220

In [None]:
# Final summary of the test-set metrics, framed by separator lines.
_rule = "=" * 50
print(_rule)
print("Test accuracy: ", ac * 100)
print("F1 Score:", f1 * 100)
print(_rule)

Test accuracy:  87.33999999999999
F1 Score: 87.28149487643158
