In [1]:
import torch
import torch.nn as nn
import torch.functional as F
import torch.optim as optim
import torchvision
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader,Dataset, random_split
import matplotlib.pyplot as plt
import random 
import time
from tqdm import tqdm_notebook as tq
import warnings
import pickle as pkl
warnings.filterwarnings("ignore")
import string
import sys
from nltk.corpus import stopwords
from torch.autograd import Variable
plt.ion()

In [2]:
BATCH_SIZE = 32
epochs = 10
num_class = 2
embed_dim = 100

In [3]:
device = torch.device("gpu" if torch.cuda.is_available() else "cpu")

In [4]:
class YelpReviewsSentimentAnalysis(nn.Module):
    
    def __init__(self,vocab_size,embed_dim,num_class):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()
        
    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()
        
    def forward(self, text, offsets):w
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)

In [5]:
class YelpDataset(Dataset):
    
    def __init__(self,json_file,threshold=3):
        
        self.raw_data = pd.read_json(json_file,lines=True)
    
        self.raw_data['label'] = self.raw_data.stars.apply(lambda x : 1 if x>=3 else 0)
        
        self.raw_data = self.raw_data[["label","text"]].iloc[:100000,]
        
        self.word2idx = {}
        
        self.idx2word = {}
        
        self.word2freq = {}
        self.word_count = 0
        
        self.maxLen = 0
        self.__init__preprocess()
        
        self.data = self.raw_data.to_numpy()
   
        
       
        
    def __len__(self):
        return self.data.shape[0]
    
    
    def __getitem__(self,idx):
        
        
        sample = self.data[idx,:]
            
        return sample
    
    def __init__preprocess(self):
        
        
        def clean(text):
            text = text.lower()
        
            text = [ch for ch in text if ch not in string.punctuation]


            text = "".join(text)

            text = [c for c in text if c == " " or c.isalnum()]

            text = "".join(text)

            stop_words = set(stopwords.words("english"))

            text = text.split(" ")

            text = [word for word in text if word not in stop_words]
        
        
            text = " ".join(text)
            
            return text
          
            
        def build_vocab(text):
            
            
            text = text.split(" ")
            
            text_token = []
            
            for word in text:
                
                if word not in self.word2idx:
                    
                    self.word2idx[word] = self.word_count
                    
                    self.idx2word[self.word_count] = word
                    
                    self.word_count+=1
                    
                    
                text_token.append(self.word2idx[word])
             
            self.maxLen = max(self.maxLen,len(text_token))
            return text_token
        
        self.raw_data['text'] = self.raw_data.text.apply(lambda x : clean(x))
        self.raw_data['text'] = self.raw_data.text.apply(lambda x : build_vocab(x))

In [6]:
def collate_offset(batch):
    
    text = [torch.tensor(x[1]) for x in batch]
    
    label = torch.tensor([x[0] for x in batch])
    offsets = [0] + [len(x) for x in text]
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    
    text = torch.cat(text)
    
    return text, offsets, label

In [7]:
def train_func(train):
    
    train_data = DataLoader(train,batch_size=BATCH_SIZE,shuffle=True,collate_fn=collate_offset,num_workers=4)
    
    train_loss = 0
    train_acc = 0
    for i, (text, offsets, label) in enumerate(train_data):
        optimizer.zero_grad()
        text, offsets, label = text.to(device), offsets.to(device), label.to(device)
        
    
        output = model(text, offsets)
        
        loss = criterion(output, label)
        
        loss = Variable(loss,requires_grad=True)
        
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        train_acc += (output.argmax(1) == label).sum().item()

    
    scheduler.step()
    
    return train_loss / len(train), train_acc / len(train)
        

In [8]:
def test_func(test):
    
    test_data = DataLoader(test,batch_size=BATCH_SIZE,shuffle=True,collate_fn=collate_offset,num_workers=4)
    
    test_loss = 0
    test_acc = 0
    
    for i, (text, offsets, label) in enumerate(test_data):
        
        text, offsets, label = text.to(device), offsets.to(device), label.to(device)

        with torch.no_grad():
            
            output = model(text, offsets)

            loss = criterion(output, label)
            
            loss = Variable(loss,requires_grad=True)
            
            test_loss += loss.item()
            loss.backward()
            optimizer.step()
            test_acc += (output.argmax(1) == label).sum().item()   
    
    return test_loss / len(test), test_acc / len(test)

In [18]:
if __name__ == "__main__":
    
    
    
    start_time = time.time()
    yelp_dataset = YelpDataset(json_file = "~/data/yelp/review.json")
    
    print("Time Taken to for dataset preprocessing is {} minutes...".format((time.time()-start_time)/60))
    
    
    train_len = int(len(yelp_dataset)*0.8)
    
    valid_len = int(len(yelp_dataset)*0.1)
    
    test_len = len(yelp_dataset) - train_len -valid_len
    
    train,valid,test = random_split(yelp_dataset,[train_len,valid_len,test_len])
    
    
    
    print("Length of training dataset is {}".format(train_len))
          
    print("Length of validation dataset is {}".format(valid_len))
          
      
    model = YelpReviewsSentimentAnalysis(yelp_dataset.word_count,embed_dim, num_class)
    
    
    criterion = torch.nn.CrossEntropyLoss().to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=4.0)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)
     
    tl_,ta_ = [],[]
    vl_,va_ = [],[]
    
    for epch in range(epochs):
          
        start_time = time.time()
        train_loss, train_acc = train_func(train)
        valid_loss, valid_acc = test_func(valid)

        tl_.append(train_loss)
        ta_.append(train_acc)
        
        
        vl_.append(valid_loss)
        va_.append(valid_acc)
        
        
        secs = int(time.time() - start_time)
        mins = secs / 60
        
        
        
        name = "model_"+str(epch)+".pkl"
        with open(name,"wb") as file:
            pkl.dump(model,file)
            
        print("{} saved..",name)
        print("Time Taken to complete epoch is {} minutes...".format(mins))

Time Taken to for dataset preprocessing is 2.8071921269098916 minutes...
Length of training dataset is 80000
Length of validation dataset is 10000
{} saved.. model_0.pkl
Time Taken to complete epoch is 0.13333333333333333 minutes...
{} saved.. model_1.pkl
Time Taken to complete epoch is 0.13333333333333333 minutes...
{} saved.. model_2.pkl
Time Taken to complete epoch is 0.13333333333333333 minutes...
{} saved.. model_3.pkl
Time Taken to complete epoch is 0.13333333333333333 minutes...
{} saved.. model_4.pkl
Time Taken to complete epoch is 0.13333333333333333 minutes...
{} saved.. model_5.pkl
Time Taken to complete epoch is 0.13333333333333333 minutes...
{} saved.. model_6.pkl
Time Taken to complete epoch is 0.13333333333333333 minutes...
{} saved.. model_7.pkl
Time Taken to complete epoch is 0.13333333333333333 minutes...
{} saved.. model_8.pkl
Time Taken to complete epoch is 0.13333333333333333 minutes...
{} saved.. model_9.pkl
Time Taken to complete epoch is 0.13333333333333333 minu

plt.figure(figsize=(10,8))
plt.plot(training_loss)

In [11]:
training_acc

[0.505725,
 0.505725,
 0.505725,
 0.505725,
 0.505725,
 0.505725,
 0.505725,
 0.505725,
 0.505725,
 0.505725]

In [12]:
validation_acc

[0.5098,
 0.5098,
 0.5098,
 0.5098,
 0.5098,
 0.5098,
 0.5098,
 0.5098,
 0.5098,
 0.5098]

In [13]:
yelp_dataset.word_count

156315

In [17]:
model.fc.weight

Parameter containing:
tensor([[-4.7731e-01,  3.7013e-01, -7.6838e-02, -4.5141e-01,  1.9208e-01,
          3.9211e-01,  4.8415e-02,  1.7409e-01,  4.2344e-01,  8.9563e-03,
         -2.9832e-01, -1.1177e-01, -4.4683e-01, -1.8006e-01,  7.3528e-02,
         -4.3666e-01,  3.7348e-01,  1.5231e-01, -9.3723e-02, -1.8647e-01,
          2.7397e-01,  2.6460e-02,  4.5881e-01,  3.3134e-01,  4.4123e-01,
         -2.7587e-01, -3.9694e-01,  2.6389e-01,  2.1557e-01,  2.1427e-01,
          4.8920e-01,  2.9288e-01,  2.3015e-01,  1.3968e-01, -3.8236e-01,
          2.7120e-01, -3.7601e-01,  9.5279e-02,  2.0113e-01, -2.3488e-01,
          2.9296e-01, -4.9180e-02, -3.6096e-01, -1.6072e-03, -2.0295e-02,
         -3.5595e-01, -2.2731e-01, -9.5978e-03, -1.2998e-01, -3.7972e-01,
         -2.4652e-02,  5.9012e-02, -2.2972e-01,  3.5491e-01, -2.8106e-01,
         -2.6579e-01, -2.1652e-01, -4.3964e-01, -3.4342e-01, -4.7103e-01,
          9.5100e-02,  4.5640e-01,  2.1777e-01,  2.4502e-01,  2.0616e-01,
          1.8029

In [19]:
with open("model_1.pkl","rb") as file:
    model_1 = pkl.load(file)

In [29]:
with open("model_9.pkl","rb") as file:
    model_9 = pkl.load(file)

In [30]:
model_1.fc.weight

Parameter containing:
tensor([[-1.3565e-01,  3.0212e-02,  2.9023e-01,  3.5425e-01, -3.6518e-01,
          2.1126e-01,  3.4472e-01,  4.5807e-01,  2.2894e-01, -2.3293e-01,
         -1.0985e-01,  1.6811e-01, -2.5006e-01,  4.1443e-01, -1.6038e-01,
         -3.8092e-01,  2.6662e-01,  1.5123e-01, -8.5601e-02, -3.4376e-01,
         -2.9098e-01,  2.1467e-01,  1.1533e-01,  3.5420e-01, -5.0050e-04,
         -1.9033e-01, -3.0156e-01,  4.3343e-01,  2.4082e-01, -3.8623e-01,
         -2.7676e-01,  8.2076e-02, -1.6928e-01, -3.8063e-01, -4.3473e-01,
         -3.2350e-02, -4.1604e-05,  4.6637e-01, -1.1415e-01,  3.0355e-01,
          1.5025e-01,  1.0685e-01,  2.7617e-01, -7.3167e-03,  3.0006e-02,
          3.5998e-01,  1.7222e-01, -1.3075e-01,  3.9354e-01, -2.7977e-01,
          5.0323e-02, -2.4725e-01,  3.3777e-01, -3.7121e-01, -1.1962e-01,
         -3.5043e-01,  1.6030e-01,  4.9341e-01,  1.1727e-02, -2.6584e-01,
         -2.1087e-01,  3.0885e-01, -3.5393e-02,  4.8703e-01,  7.8680e-02,
         -4.2900

In [31]:
model_9.fc.weight.data

tensor([[-1.3565e-01,  3.0212e-02,  2.9023e-01,  3.5425e-01, -3.6518e-01,
          2.1126e-01,  3.4472e-01,  4.5807e-01,  2.2894e-01, -2.3293e-01,
         -1.0985e-01,  1.6811e-01, -2.5006e-01,  4.1443e-01, -1.6038e-01,
         -3.8092e-01,  2.6662e-01,  1.5123e-01, -8.5601e-02, -3.4376e-01,
         -2.9098e-01,  2.1467e-01,  1.1533e-01,  3.5420e-01, -5.0050e-04,
         -1.9033e-01, -3.0156e-01,  4.3343e-01,  2.4082e-01, -3.8623e-01,
         -2.7676e-01,  8.2076e-02, -1.6928e-01, -3.8063e-01, -4.3473e-01,
         -3.2350e-02, -4.1604e-05,  4.6637e-01, -1.1415e-01,  3.0355e-01,
          1.5025e-01,  1.0685e-01,  2.7617e-01, -7.3167e-03,  3.0006e-02,
          3.5998e-01,  1.7222e-01, -1.3075e-01,  3.9354e-01, -2.7977e-01,
          5.0323e-02, -2.4725e-01,  3.3777e-01, -3.7121e-01, -1.1962e-01,
         -3.5043e-01,  1.6030e-01,  4.9341e-01,  1.1727e-02, -2.6584e-01,
         -2.1087e-01,  3.0885e-01, -3.5393e-02,  4.8703e-01,  7.8680e-02,
         -4.2900e-01,  3.3733e-01,  1.

In [32]:
w1 = model_1.fc.weight

In [33]:
w2 = model_2.fc.weight

In [34]:
w9 = model_9.fc.weight

In [36]:
torch.all(torch.eq(w1, w9))

tensor(True)