In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext import data
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
import torch.nn.utils.rnn as utils
import torch.utils.data as data_utils
from sklearn.model_selection import train_test_split
from keras.preprocessing import sequence

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
df=pd.read_csv('C:/Users/ajit/Anaconda3/Scripts/files/reviews.csv', encoding="ISO-8859-1")

In [3]:
df=df[~df['Comments / Feedback'].isnull()]

In [4]:
df['rating']=df['Please rate the overall quality of the repair.'].apply(lambda x: 1 if x>3 else 0) #Converting the rating to a binary outcome

In [5]:
df['rating'].value_counts()

1    904
0    309
Name: rating, dtype: int64

In [6]:
tokenizer=Tokenizer(num_words=2000)

In [7]:
tokenizer.fit_on_texts(df['Comments / Feedback'])

In [8]:
word_index=tokenizer.word_index

In [9]:
X=tokenizer.texts_to_sequences(df['Comments / Feedback']) #tokenizing the input sequences. Keras Tokenizer also takes care of Punctuations

In [10]:
X=[sequence[:200] for sequence in X] #only considering first 200 words of the sequence

In [11]:
file=open("C:/Users/ajit/Downloads/Compressed/glove50d.txt",encoding="utf8") #importing Glove vectors

In [12]:
embeddings_index={}
for line in file:
    values=line.split()
    word=values[0]
    embeddings=torch.from_numpy(np.asarray(values[1:],dtype='float32')).view(1,-1).float()
    embeddings_index[word]=embeddings
file.close()

In [13]:
embeddings_matrix={}
for word in word_index.keys():
    if word in embeddings_index.keys():
        embeddings_matrix[word_index[word]]=embeddings_index[word]
    else:
        embeddings_matrix[word_index[word]]=torch.zeros(1,50,dtype=torch.float32)

In [14]:
for sequence in X:
    for i in range(len(sequence)):
        sequence[i]=embeddings_matrix[sequence[i]] #replacing the tokens with corrsponding dense vector from Glove

In [15]:
X=[torch.cat(sequence,dim=0) for sequence in X]

In [16]:
y=[float(y) for y in df['rating']]

In [17]:
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=5)

<p style="font-size:14px; color:black;">
  <span style="color:red;font-weight:bold;">Customdatasets</span> need to subclass "Dataset" class in utils module. All subclasess of <a href="https://pytorch.org/docs/stable/_modules/torch/utils/data/dataset.html#Dataset"><span style="color: orange;font-weight:bold;">Dataset</span></a> should override __len__ and __getitem__ methods of the parent class. </p>

In [18]:
class customDataset(data_utils.Dataset):
    
    def __init__(self,x,y):
        self.x=x
        self.y=y
        self.lengths=[len(sequence) for sequence in self.sorting()]
        self.padded_x=self.padding()
    def padding(self):
        return utils.pad_sequence(self.sorting(),batch_first=True)
    def sorting(self):
        return sorted(self.x,key=len,reverse=True)
    def __len__(self):
        return len(self.x)
    def __getitem__(self,id):
        return (self.padded_x[id],self.y[id],self.lengths[id])

In [19]:
new_dataset=customDataset(X_train,torch.tensor(y_train).view(-1,1)) #creating an object of the customedataset

In [20]:
trainloader=data_utils.DataLoader(new_dataset,batch_size=5,drop_last=True) #creating a generator for batch processing data

In [28]:
test_dataset=customDataset(X_test,torch.tensor(y_test).view(-1,1))

In [29]:
testloader=data_utils.DataLoader(test_dataset,batch_size=5,drop_last=True)

In [21]:
len(trainloader)

181

I am using a deep bidirectional LSTM with 3 layers followed by a dense layer and output layer with sigmoid activation

In [22]:
class DeepLSTM(nn.Module):
    
    def __init__(self,input_dim,hidden_dim,num_layers,dropout):
        super().__init__()
        self.hidden_dim=hidden_dim
        self.num_layers=num_layers
        self.lstm=nn.LSTM(input_dim,hidden_dim,num_layers,bias=True,batch_first=True,dropout=dropout,bidirectional=True)
        self.fc1=nn.Linear(hidden_dim*2,10)
        self.fc2=nn.Linear(10,1)
        self.batchnorm=nn.BatchNorm1d(10)
        
    def forward(self,seq,lengths):
        bs=seq.size(0)
        self.h=self.init_hidden(bs)
        inputs=utils.pack_padded_sequence(seq,lengths,batch_first=True) #to pack the variable lengths in the batch to feed it to LSTM
        output,last_layer=self.lstm(inputs,self.h)
        output,lengths=utils.pad_packed_sequence(output,batch_first=True) #collecting the output from the LSTM and padding it
        output=self.fc1(output[:,-1,:])
        output=F.dropout(F.relu(self.batchnorm(output)),p=0.1,training=True)
        output=self.fc2(output)
        output=F.sigmoid(output)
        return output
    
    def init_hidden(self,batch_size): #function for hidden state initializations
        return (torch.zeros(self.num_layers*2, batch_size,self.hidden_dim),
                torch.zeros(self.num_layers*2,batch_size, self.hidden_dim))
        

In [23]:
model=DeepLSTM(50,40,3,0.1)

In [24]:
criteria=nn.BCELoss()
optimizer=torch.optim.Adam(model.parameters(),lr=0.001)

In [26]:
for epoch in range(10):
    running_loss=0.0
    for i,data in enumerate(trainloader):
        inputs,labels,lengths=data
        outputs=model(inputs,lengths)
        loss=criteria(outputs,labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        running_loss+=loss
        if i %8==7:
            print('[%d, %5d] loss: %.3f' % (epoch+1,i+1,running_loss/8))
            running_loss=0.0
print('Finished training')

[1,     8] loss: 0.711
[1,    16] loss: 0.674
[1,    24] loss: 0.668
[1,    32] loss: 0.654
[1,    40] loss: 0.650
[1,    48] loss: 0.686
[1,    56] loss: 0.662
[1,    64] loss: 0.700
[1,    72] loss: 0.622
[1,    80] loss: 0.708
[1,    88] loss: 0.686
[1,    96] loss: 0.602
[1,   104] loss: 0.598
[1,   112] loss: 0.657
[1,   120] loss: 0.637
[1,   128] loss: 0.625
[1,   136] loss: 0.636
[1,   144] loss: 0.598
[1,   152] loss: 0.604
[1,   160] loss: 0.607
[1,   168] loss: 0.580
[1,   176] loss: 0.623
[2,     8] loss: 0.633
[2,    16] loss: 0.602
[2,    24] loss: 0.578
[2,    32] loss: 0.576
[2,    40] loss: 0.534
[2,    48] loss: 0.624
[2,    56] loss: 0.651
[2,    64] loss: 0.577
[2,    72] loss: 0.635
[2,    80] loss: 0.616
[2,    88] loss: 0.700
[2,    96] loss: 0.488
[2,   104] loss: 0.484
[2,   112] loss: 0.616
[2,   120] loss: 0.620
[2,   128] loss: 0.553
[2,   136] loss: 0.597
[2,   144] loss: 0.582
[2,   152] loss: 0.587
[2,   160] loss: 0.591
[2,   168] loss: 0.507
[2,   176] 

In [53]:
with torch.no_grad():
    predictions=[]
    for data in testloader:
        inputs,labels,lengths=data
        prediction=model(inputs,lengths)
        predictions.extend(prediction)