In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from string import punctuation
from sklearn.metrics import accuracy_score
import torch
from torch import nn, optim
import torch.nn.functional as F

data = pd.read_csv("amazon_reviews.txt", sep="\t", header=None)
reviews = data.iloc[:, 0].str.lower()
sentiment = data.iloc[:,1].values

In [22]:
sentiment

array([0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0,
       1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
       0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0,

In [8]:
#Creting the vocabulary of entire set.
#Creating a dictionary that maps a vocabulary to an integer (like ids), key = word, value=int

words = ' '.join(reviews)
words = words.split()
vocabulary = set(words)
len(vocabulary) #1905 words

indexer = {word : index for (index, word) in enumerate(vocabulary)}
indexer


{'let': 0,
 'mins': 1,
 'for': 2,
 'keys': 3,
 'buyerbe': 4,
 'wellwell': 5,
 'killer': 6,
 'upgrade': 7,
 'saggy': 8,
 'given': 9,
 'market': 10,
 'into': 11,
 '45': 12,
 'me': 13,
 'complaint': 14,
 '18': 15,
 'lg': 16,
 'tools': 17,
 'everywhere': 18,
 'encourage': 19,
 'cellular': 20,
 'owned': 21,
 'applifies': 22,
 'glad': 23,
 'people': 24,
 'quality': 25,
 'lately': 26,
 'earpieces': 27,
 'recommended': 28,
 'infatuated': 29,
 'screens': 30,
 'dirty': 31,
 'sucked': 32,
 'scratch': 33,
 'override': 34,
 'leopard': 35,
 'holding': 36,
 'wonder': 37,
 'program': 38,
 'ripped': 39,
 'smoking': 40,
 'fraction': 41,
 'performance': 42,
 'overnight': 43,
 'carried': 44,
 '325': 45,
 'tricky': 46,
 'bluetoooth': 47,
 'joy': 48,
 '23': 49,
 'numerous': 50,
 '700w': 51,
 'feel': 52,
 'cost': 53,
 'bluetooth': 54,
 'eargels': 55,
 'exactly': 56,
 'lightly': 57,
 'experience': 58,
 'continue': 59,
 'dustpan': 60,
 'verizons': 61,
 'minutesmajor': 62,
 'functions': 63,
 '42': 64,
 'conveni

In [11]:
indexed_reviews = []
for review in reviews:
    indexed_reviews.append([indexer[word] for word in review.split()])
    
indexed_reviews[5]

[109,
 696,
 1796,
 1156,
 300,
 817,
 1796,
 858,
 1353,
 1796,
 1450,
 1477,
 1765,
 1796,
 858,
 1424,
 369]

In [14]:
#LSTM 
class LSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size, n_layers):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_size, n_layers, batch_first=True)
        self.output = nn.Linear(hidden_size, 1)
        
    def forward(self,x):
        out = self.embedding(x)
        out, _ = self.lstm(out)
        out = out.contiguous().view(-1, self.hidden_size)
        out = self.output(out)
        out = out[-1, 0]
        out = torch.sigmoid(out).unsqueeze(0)
        
        return out

In [15]:
#creating model
#LSTM(vocab_size, embedded dimensions=64, hidden neurons in each layer=128, n_layers=3)
model = LSTM(len(vocabulary), 64, 128, 3)
model

LSTM(
  (embedding): Embedding(1905, 64)
  (lstm): LSTM(64, 128, num_layers=3, batch_first=True)
  (output): Linear(in_features=128, out_features=1, bias=True)
)

In [16]:
#Loss function as Binary-Cross Entropy loss
loss_function = nn.BCELoss()
#Adam optimizer with learning rate = 0.001
optimizer = optim.Adam(model.parameters(), lr=0.001)
#Epochs
epochs = 10

In [26]:
#Perform a prediction for each review, calculate the loss function, and update the parameters of network.
losses = []
acc = []
for e in range(1, epochs+1):
    single_loss = []
    preds = []
    targets = []
    #i is index, r is the review
    for i, r in enumerate(indexed_reviews):
        if len(r) <= 1:
            continue
            
        x = torch.Tensor([r]).long()
        y = torch.Tensor([sentiment[i]])
        
        pred = model(x)
        loss = loss_function(pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        final_pred = np.round(pred.detach().numpy())
        preds.append(final_pred)
        targets.append(y)
        single_loss.append(loss.item())
        
        
    losses.append(np.mean(single_loss))
    print("targets: ", len(targets), "| predictions: ", len(preds))
    accuracy = accuracy_score(targets, preds)
    acc.append(accuracy)
    
    if e%1 == 0:
        print("Epoch: ", e, "| Loss function: ", losses[-1], "| Accuracy: ", acc[-1])

targets:  994 | predictions:  994


  return array(a, dtype, copy=False, order=order)
  y = np.array(y, dtype=object)
  return array(a, dtype, copy=False, order=order)


ValueError: Classification metrics can't handle a mix of unknown and binary targets

In [29]:
print(type(targets[2]), "|", type(preds[2]))

<class 'torch.Tensor'> | <class 'numpy.ndarray'>
