In [1]:
import numpy as np
import pandas as pd
import re
import nltk
import sklearn
import warnings
from platform import python_version
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.linear_model import Perceptron
from sklearn.svm import LinearSVC
print(python_version())

3.9.5


### Read Data:

In [2]:
# Could not load it directly from the url:
ratings_df = pd.read_csv("./data/amazon_reviews_us_Kitchen_v1_00.tsv", sep="\t",
                         error_bad_lines=False, warn_bad_lines=False)

In [3]:
ratings_df.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,37000337,R3DT59XH7HXR9K,B00303FI0G,529320574,Arthur Court Paper Towel Holder,Kitchen,5.0,0.0,0.0,N,Y,Beautiful. Looks great on counter,Beautiful. Looks great on counter.,2015-08-31
1,US,15272914,R1LFS11BNASSU8,B00JCZKZN6,274237558,Olde Thompson Bavaria Glass Salt and Pepper Mi...,Kitchen,5.0,0.0,1.0,N,Y,Awesome & Self-ness,I personally have 5 days sets and have also bo...,2015-08-31
2,US,36137863,R296RT05AG0AF6,B00JLIKA5C,544675303,Progressive International PL8 Professional Man...,Kitchen,5.0,0.0,0.0,N,Y,Fabulous and worth every penny,Fabulous and worth every penny. Used for clean...,2015-08-31
3,US,43311049,R3V37XDZ7ZCI3L,B000GBNB8G,491599489,Zyliss Jumbo Garlic Press,Kitchen,5.0,0.0,1.0,N,Y,Five Stars,A must if you love garlic on tomato marinara s...,2015-08-31
4,US,13763148,R14GU232NQFYX2,B00VJ5KX9S,353790155,"1 X Premier Pizza Cutter - Stainless Steel 14""...",Kitchen,5.0,0.0,0.0,N,Y,Better than sex,Worth every penny! Buy one now and be a pizza ...,2015-08-31


In [4]:
# Simplify Dataset: Ensure all have reviews
ratings_df = ratings_df.loc[:, ["review_body", "star_rating"]]
ratings_df = ratings_df[ratings_df["review_body"].notnull()]
ratings_df.head()

Unnamed: 0,review_body,star_rating
0,Beautiful. Looks great on counter.,5.0
1,I personally have 5 days sets and have also bo...,5.0
2,Fabulous and worth every penny. Used for clean...,5.0
3,A must if you love garlic on tomato marinara s...,5.0
4,Worth every penny! Buy one now and be a pizza ...,5.0


In [5]:
# Gather 50k of each rating through random selection:
rating_1 = ratings_df[ratings_df["star_rating"] == 1].sample(50000)
rating_2 = ratings_df[ratings_df["star_rating"] == 2].sample(50000)
rating_3 = ratings_df[ratings_df["star_rating"] == 3].sample(50000)
rating_4 = ratings_df[ratings_df["star_rating"] == 4].sample(50000)
rating_5 = ratings_df[ratings_df["star_rating"] == 5].sample(50000)

ratings_sampled_df = pd.concat([rating_1, rating_2, rating_3, rating_4, rating_5])
ratings_sampled_df = ratings_sampled_df.sample(frac=1)
ratings_sampled_df.reset_index(drop=True, inplace=True)
ratings_sampled_df.head()

Unnamed: 0,review_body,star_rating
0,The Moderno panels were even better than I had...,5.0
1,I just have trouble believing that you can't u...,2.0
2,Be careful when using this coffee maker on a g...,4.0
3,These little guys work very well. They keep o...,5.0
4,Prompt delivery and quality as promised. boug...,4.0


In [6]:
# Map star rating to sentiment rating: We will assign -1 as neutral
d_ = {4:1, 5:1, 1:0, 2:0, 3:2}
ratings_sampled_df["sentiment"] = ratings_sampled_df["star_rating"].map(d_)
ratings_sampled_df.head()

Unnamed: 0,review_body,star_rating,sentiment
0,The Moderno panels were even better than I had...,5.0,1
1,I just have trouble believing that you can't u...,2.0,0
2,Be careful when using this coffee maker on a g...,4.0,1
3,These little guys work very well. They keep o...,5.0,1
4,Prompt delivery and quality as promised. boug...,4.0,1


In [7]:
# Train-test Split
train, test = train_test_split(ratings_sampled_df, test_size=0.2)
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

### Word Embedding:

In [8]:
# Load in the word2vec:
import gensim
word2vec = gensim.models.KeyedVectors.load_word2vec_format('./data/GoogleNews-vectors-negative300.bin', binary=True)

In [9]:
#Check Smeantic Similarities: (close enough!)
result_vec = word2vec["king"] - word2vec["man"] + word2vec["woman"]
word2vec.most_similar(positive=result_vec, topn=3)

[('king', 0.844939112663269),
 ('queen', 0.7300516366958618),
 ('monarch', 0.6454660296440125)]

In [10]:
word2vec.most_similar(positive="excellent", topn=3)

[('terrific', 0.7409728765487671),
 ('superb', 0.7062715888023376),
 ('exceptional', 0.681470513343811)]

In [11]:
# Train a Word2Vec model using own dataset: embedding_size=300, window_size=11, min_word_cnt=10
# Clean data:
train["review_body"] = train["review_body"].str.lower()
test["review_body"] = test["review_body"].str.lower()

train["review_body"] = train["review_body"].replace(r'http\S+|www.\S+', '', regex=True)
test["review_body"] = test["review_body"].replace(r'http\S+|www.\S+', '', regex=True)


train["review_body"] = train["review_body"].replace(r'[^a-z|\s]', '', regex=True)
test["review_body"] = test["review_body"].replace(r'[^a-z|\s]', '', regex=True)

train["review_body"] = train["review_body"].replace(r'\s\s+', ' ', regex=True)
test["review_body"] = test["review_body"].replace(r'\s\s+', ' ', regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["review_body"] = train["review_body"].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["review_body"] = test["review_body"].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["review_body"] = train["review_body"].replace(r'http\S+|www.\S+', '', regex=True)
A value

In [12]:
stop_words = set(stopwords.words('english'))

def remove_stopwords(s):
    tokens = word_tokenize(s)
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return filtered_tokens

train["review_body"] = train["review_body"].apply(remove_stopwords)
test["review_body"] = test["review_body"].apply(remove_stopwords)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["review_body"] = train["review_body"].apply(remove_stopwords)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["review_body"] = test["review_body"].apply(remove_stopwords)


In [13]:
train_sentences = np.array(train["review_body"])
test_sentences = np.array(test["review_body"])

In [65]:
model = gensim.models.Word2Vec(train_sentences, min_count=10, vector_size=200, window=11)

In [15]:
# Test semantic similarities:
result_vec = model.wv["king"] - model.wv["man"] + model.wv["woman"]
model.wv.similar_by_vector(result_vec)

[('king', 0.6305949687957764),
 ('rsvp', 0.4951222240924835),
 ('mm', 0.4950771629810333),
 ('shallower', 0.48441171646118164),
 ('standard', 0.4726628065109253),
 ('cm', 0.46526360511779785),
 ('multiclad', 0.4493022859096527),
 ('norpro', 0.44640490412712097),
 ('matfer', 0.43968307971954346),
 ('quot', 0.43900465965270996)]

In [16]:
model.wv.similar_by_word("excellent")

[('terrific', 0.7809708118438721),
 ('outstanding', 0.7695194482803345),
 ('fantastic', 0.7127118706703186),
 ('exceptional', 0.6841062903404236),
 ('wonderful', 0.6829191446304321),
 ('superb', 0.6584928035736084),
 ('great', 0.6146989464759827),
 ('fabulous', 0.5876462459564209),
 ('exceeds', 0.5675843954086304),
 ('good', 0.5586900115013123)]

From the Semantic Similarities above we can see that our model actually performs relatively well given the results of the top 10 closest words to Excellent. However we can see the problem emerges that the reviews do not necessarily contain all the words we might want such as "Queen", which may or may not be in the data based on the randomness involved in selecting reviews. One would prefer to utilize the all encompassing Google word2vec to help with unorthodox words, and we can trust that the performance is better. Yet, the create w2v does produce results faster.

### Simple Models:

#### Personally Trained Word2Vec

In [17]:
# Need to make binary model at this point:
train_binary = train[~(train["sentiment"] == 2)]
test_binary = test[~(test["sentiment"] == 2)]

In [18]:
X_train = train_binary["review_body"]
Y_train = train_binary["sentiment"].values

X_test = test_binary["review_body"]
Y_test = test_binary["sentiment"].values

In [19]:
def avg_vector_mine(review):
    tot_ = 0
    for word_ in review:
        if word_ in model.wv:
            tot_ += model.wv[word_]
        else:
            continue
    if len(review) == 0:
        return 0
    return tot_/len(review)

In [20]:
# Vectorize X_train:
X_train_avg = X_train.apply(avg_vector_mine)
X_train_df = pd.DataFrame.from_dict(dict(zip(X_train_avg.index, X_train_avg.values))).T
X_train_vectorized_mine = X_train_df.values

In [21]:
# Vectorize X_test:
X_test_avg = X_test.apply(avg_vector_mine)
X_test_df = pd.DataFrame.from_dict(dict(zip(X_test_avg.index, X_test_avg.values))).T
X_test_vectorized_mine = X_test_df.values

In [22]:
perceptron = Perceptron(tol=1e-3, random_state=42)
perceptron.fit(X_train_vectorized_mine, Y_train)
print("Perceptron Training Accuracy:", perceptron.score(X_train_vectorized_mine, Y_train))

Perceptron Training Accuracy: 0.773316588252201


In [23]:
print("Perceptron Test Accuracy:", perceptron.score(X_test_vectorized_mine, Y_test))

Perceptron Test Accuracy: 0.7759367256526418


In [24]:
svm = LinearSVC(max_iter=1000)
svm.fit(X_train_vectorized_mine, Y_train)
print("SVM Training Accuracy:", svm.score(X_train_vectorized_mine, Y_train))

SVM Training Accuracy: 0.8586727648753179




In [25]:
print("SVM Test Accuracy:", svm.score(X_test_vectorized_mine, Y_test))

SVM Test Accuracy: 0.8592095712462143


#### Pretrained Word2Vec

In [26]:
def avg_vector_google(review):
    tot_ = 0
    for word_ in review:
        if word_ in word2vec:
            tot_ += word2vec[word_]
        else:
            continue
    if len(review) == 0:
        return 0
    return tot_/len(review)

In [27]:
# Vectorize X_train:
X_train_avg = X_train.apply(avg_vector_google)
X_train_df = pd.DataFrame.from_dict(dict(zip(X_train_avg.index, X_train_avg.values))).T
X_train_vectorized_google = X_train_df.values

In [28]:
# Vectorize X_test:
X_test_avg = X_test.apply(avg_vector_google)
X_test_df = pd.DataFrame.from_dict(dict(zip(X_test_avg.index, X_test_avg.values))).T
X_test_vectorized_google = X_test_df.values

In [29]:
perceptron = Perceptron(tol=1e-3, random_state=42)
perceptron.fit(X_train_vectorized_google, Y_train)
print("Perceptron Training Accuracy:", perceptron.score(X_train_vectorized_google, Y_train))

Perceptron Training Accuracy: 0.7919048779421045


In [30]:
print("Perceptron Test Accuracy:", perceptron.score(X_test_vectorized_google, Y_test))

Perceptron Test Accuracy: 0.7948089004580382


In [31]:
svm = LinearSVC(max_iter=1000)
svm.fit(X_train_vectorized_google, Y_train)
print("SVM Training Accuracy:", svm.score(X_train_vectorized_google, Y_train))

SVM Training Accuracy: 0.820708916755703


In [32]:
print("SVM Test Accuracy:", svm.score(X_test_vectorized_google, Y_test))

SVM Test Accuracy: 0.8229419567992391


Perceptron Results TF.IDF: Test-Accuracy = .8569 <br>
SVM Results TF.IDF: Test-Accuracy = 0.898025 <br>
<br>
We can see from the above results that the models are better for TF.IDF, then personally trained Word2Vec, and lastly for the Google pre-trained Word2Vec. One likely explanation for this could be that the TF.IDF and personally train Word2Vec might be more tailored to the specific review set at hand. In addition, the averaging of the word vectors might be a very inefficient way to combine words in a review.

### Feedforward Neural Networks

#### Avg Word2Vec Vectors - Personally Trained Word2Vec:

In [33]:
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from torch import nn

In [34]:
class MLP(nn.Module):

    def __init__(self):
        super().__init__()

        input_dim = 200
        hidden_1 = 50
        hidden_2 = 10
        
        self.fc1 = nn.Linear(input_dim, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        self.fc3 = nn.Linear(hidden_2, 2)
        self.dropout = nn.Dropout(0.2)
        #self.act3 = nn.Softmax(dim=1)

    def forward(self, x):
        x = x.view(-1, 200)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x

In [35]:
# Formatting the data for DataLoader (coverts to tensors):
train_data = []
for i in range(len(X_train_vectorized_mine)):
    train_data.append([X_train_vectorized_mine[i], Y_train[i]])
    
test_data = []
for i in range(len(X_test_vectorized_mine)):
    test_data.append([X_test_vectorized_mine[i], Y_test[i]])

In [36]:
n_epochs = 10
trainloader = torch.utils.data.DataLoader(train_data, batch_size=20, num_workers=1)
testloader = torch.utils.data.DataLoader(test_data, batch_size=20, num_workers=1)
mlp = MLP()
criterion = nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(mlp.parameters(), lr=1e-4)
optimizer = torch.optim.SGD(mlp.parameters(), lr=0.01)

test_loss_min = 5

for epoch in range(n_epochs):
    train_loss = 0
    test_loss = 0
    
    mlp.train()
    for data, target in trainloader:
        # Zero the gradients
        optimizer.zero_grad()

        # Perform forward pass
        output = mlp(data.float())

        # Compute loss
        loss = criterion(output, target)

        # Perform backward pass
        loss.backward()

        # Perform optimization
        optimizer.step()

        # Print statistics
        train_loss += loss.item()*data.size(0)
        
    with torch.no_grad():
        for data, target in testloader:
            output = mlp(data.float())
            loss = criterion(output, target)
            test_loss += loss.item()*data.size(0)
        
    train_loss = train_loss/len(trainloader.dataset)
    test_loss = test_loss/len(testloader.dataset)
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tTest Loss: {:.6f}'.format(
        epoch+1, 
        train_loss,
        test_loss
        ))
    
    if test_loss <= test_loss_min:
        torch.save(mlp.state_dict(), 'model.pt')
        test_loss_min = test_loss

  # Process is complete.
print('All done.')

Epoch: 1 	Training Loss: 0.394658 	Test Loss: 0.350803
Epoch: 2 	Training Loss: 0.340230 	Test Loss: 0.342475
Epoch: 3 	Training Loss: 0.331398 	Test Loss: 0.335663
Epoch: 4 	Training Loss: 0.326384 	Test Loss: 0.334857
Epoch: 5 	Training Loss: 0.321979 	Test Loss: 0.330810
Epoch: 6 	Training Loss: 0.319132 	Test Loss: 0.328701
Epoch: 7 	Training Loss: 0.316832 	Test Loss: 0.329605
Epoch: 8 	Training Loss: 0.314467 	Test Loss: 0.329483
Epoch: 9 	Training Loss: 0.312335 	Test Loss: 0.328389
Epoch: 10 	Training Loss: 0.310825 	Test Loss: 0.328902
All done.


In [37]:
mlp.load_state_dict(torch.load('model.pt'))
testloader = torch.utils.data.DataLoader(test_data, batch_size=1, num_workers=1)

In [57]:
# Calculate Accuracy from trained model:
def predict(model, dataloader):
    prediction_list = []
    with torch.no_grad():
        for data, target in dataloader:
            output = model(data.float())
            _, predicted = torch.max(output.data, 1) 
            prediction_list.append(predicted)
    return prediction_list

In [39]:
predictions = predict(mlp, testloader)
predictions = np.array(predictions)

In [58]:
def accuracy(y_true, y_pred):
    score = sum(y_true == y_pred)/len(y_pred)
    return score

In [41]:
print("Test Accuracy:", accuracy(Y_test, predictions))

Test Accuracy: 0.8642596206775759


In [61]:
# Formatting process for 3 class:
X_train_3 = train["review_body"]
Y_train_3 = train["sentiment"].values

X_test_3 = test["review_body"]
Y_test_3 = test["sentiment"].values

In [43]:
# Vectorize X_train_3:
X_train_avg_3 = X_train_3.apply(avg_vector_mine)
X_train_df_3 = pd.DataFrame.from_dict(dict(zip(X_train_avg_3.index, X_train_avg_3.values))).T
X_train_vectorized_mine_3 = X_train_df_3.values

In [44]:
# Vectorize X_test_3:
X_test_avg_3 = X_test_3.apply(avg_vector_mine)
X_test_df_3 = pd.DataFrame.from_dict(dict(zip(X_test_avg_3.index, X_test_avg_3.values))).T
X_test_vectorized_mine_3 = X_test_df_3.values

In [45]:
# Formatting the data for DataLoader (coverts to tensors):
train_data_3 = []
for i in range(len(X_train_vectorized_mine_3)):
    train_data_3.append([X_train_vectorized_mine_3[i], Y_train_3[i]])
    
test_data_3 = []
for i in range(len(X_test_vectorized_mine_3)):
    test_data_3.append([X_test_vectorized_mine_3[i], Y_test_3[i]])

In [46]:
class MLP_3(nn.Module):

    def __init__(self):
        super().__init__()

        input_dim = 200
        hidden_1 = 50
        hidden_2 = 10
        
        self.fc1 = nn.Linear(input_dim, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        self.fc3 = nn.Linear(hidden_2, 3)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        x = x.view(-1, 200)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x

In [47]:
n_epochs = 10
trainloader = torch.utils.data.DataLoader(train_data_3, batch_size=20, num_workers=1)
testloader = torch.utils.data.DataLoader(test_data_3, batch_size=20, num_workers=1)
mlp_3 = MLP_3()
criterion = nn.CrossEntropyLoss()
#optimizer = torch.optim.Adam(mlp.parameters(), lr=.0001)
optimizer = torch.optim.SGD(mlp_3.parameters(), lr=0.01)

test_loss_min = 5

for epoch in range(n_epochs):
    train_loss = 0
    test_loss = 0
    
    mlp_3.train()
    for data, target in trainloader:
        # Zero the gradients
        optimizer.zero_grad()

        # Perform forward pass
        output = mlp_3(data.float())

        # Compute loss
        loss = criterion(output, target)

        # Perform backward pass
        loss.backward()

        # Perform optimization
        optimizer.step()

        # Print statistics
        train_loss += loss.item()*data.size(0)
        
    with torch.no_grad():
        for data, target in testloader:
            output = mlp_3(data.float())
            loss = criterion(output, target)
            test_loss += loss.item()*data.size(0)
        
    train_loss = train_loss/len(trainloader.dataset)
    test_loss = test_loss/len(testloader.dataset)
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tTest Loss: {:.6f}'.format(
        epoch+1, 
        train_loss,
        test_loss
        ))
    
    if test_loss <= test_loss_min:
        torch.save(mlp_3.state_dict(), 'model_3.pt')
        test_loss_min = test_loss

  # Process is complete.
print('All done.')

Epoch: 1 	Training Loss: 0.788616 	Test Loss: 0.746170
Epoch: 2 	Training Loss: 0.734794 	Test Loss: 0.732704
Epoch: 3 	Training Loss: 0.724323 	Test Loss: 0.726283
Epoch: 4 	Training Loss: 0.718909 	Test Loss: 0.721847
Epoch: 5 	Training Loss: 0.713667 	Test Loss: 0.720896
Epoch: 6 	Training Loss: 0.709951 	Test Loss: 0.718727
Epoch: 7 	Training Loss: 0.704529 	Test Loss: 0.714739
Epoch: 8 	Training Loss: 0.702430 	Test Loss: 0.717705
Epoch: 9 	Training Loss: 0.701406 	Test Loss: 0.715015
Epoch: 10 	Training Loss: 0.699552 	Test Loss: 0.714167
All done.


In [48]:
mlp_3.load_state_dict(torch.load('model_3.pt'))
testloader = torch.utils.data.DataLoader(test_data_3, batch_size=1, num_workers=1)

In [49]:
predictions = predict(mlp_3, testloader)
predictions = np.array(predictions)

In [50]:
print("Test Accuracy:", accuracy(Y_test_3, predictions))

Test Accuracy: 0.69822


#### Avg Word2Vec Vectors - Google Trained Word2Vec:

In [51]:
# Formatting the data for DataLoader (coverts to tensors):
train_data = []
for i in range(len(X_train_vectorized_google)):
    train_data.append([X_train_vectorized_google[i], Y_train[i]])
    
test_data = []
for i in range(len(X_test_vectorized_google)):
    test_data.append([X_test_vectorized_google[i], Y_test[i]])

In [52]:
class MLP_Google(nn.Module):

    def __init__(self):
        super().__init__()

        input_dim = 300
        hidden_1 = 50
        hidden_2 = 10
        
        self.fc1 = nn.Linear(input_dim, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        self.fc3 = nn.Linear(hidden_2, 2)
        self.dropout = nn.Dropout(0.2)
        #self.act3 = nn.Softmax(dim=1)

    def forward(self, x):
        x = x.view(-1, 300)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x

In [53]:
n_epochs = 10
trainloader = torch.utils.data.DataLoader(train_data, batch_size=20, num_workers=1)
testloader = torch.utils.data.DataLoader(test_data, batch_size=20, num_workers=1)
mlp_g = MLP_Google()
criterion = nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(mlp_g.parameters(), lr=1e-4)
optimizer = torch.optim.SGD(mlp_g.parameters(), lr=0.01)

test_loss_min = 5

for epoch in range(n_epochs):
    train_loss = 0
    test_loss = 0
    
    mlp_g.train()
    for data, target in trainloader:
        # Zero the gradients
        optimizer.zero_grad()

        # Perform forward pass
        output = mlp_g(data.float())

        # Compute loss
        loss = criterion(output, target)

        # Perform backward pass
        loss.backward()

        # Perform optimization
        optimizer.step()

        # Print statistics
        train_loss += loss.item()*data.size(0)
        
    with torch.no_grad():
        for data, target in testloader:
            output = mlp_g(data.float())
            loss = criterion(output, target)
            test_loss += loss.item()*data.size(0)
        
    train_loss = train_loss/len(trainloader.dataset)
    test_loss = test_loss/len(testloader.dataset)
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tTest Loss: {:.6f}'.format(
        epoch+1, 
        train_loss,
        test_loss
        ))
    
    if test_loss <= test_loss_min:
        torch.save(mlp_g.state_dict(), 'model_google.pt')
        test_loss_min = test_loss

  # Process is complete.
print('All done.')

Epoch: 1 	Training Loss: 0.603884 	Test Loss: 0.474237
Epoch: 2 	Training Loss: 0.447847 	Test Loss: 0.439263
Epoch: 3 	Training Loss: 0.426689 	Test Loss: 0.423203
Epoch: 4 	Training Loss: 0.418082 	Test Loss: 0.416761
Epoch: 5 	Training Loss: 0.411910 	Test Loss: 0.415010
Epoch: 6 	Training Loss: 0.408220 	Test Loss: 0.415638
Epoch: 7 	Training Loss: 0.403407 	Test Loss: 0.411095
Epoch: 8 	Training Loss: 0.401081 	Test Loss: 0.407462
Epoch: 9 	Training Loss: 0.399308 	Test Loss: 0.404369
Epoch: 10 	Training Loss: 0.394773 	Test Loss: 0.404496
All done.


In [54]:
mlp_g.load_state_dict(torch.load('model_google.pt'))
testloader = torch.utils.data.DataLoader(test_data, batch_size=1, num_workers=1)

In [55]:
predictions = predict(mlp_g, testloader)
predictions = np.array(predictions)

In [56]:
print("Test Accuracy:", accuracy(Y_test, predictions))

Test Accuracy: 0.8277786118200471


In [57]:
# Vectorize X_train_3:
X_train_avg_3 = X_train_3.apply(avg_vector_google)
X_train_df_3 = pd.DataFrame.from_dict(dict(zip(X_train_avg_3.index, X_train_avg_3.values))).T
X_train_vectorized_google_3 = X_train_df_3.values

In [58]:
# Vectorize X_test_3:
X_test_avg_3 = X_test_3.apply(avg_vector_google)
X_test_df_3 = pd.DataFrame.from_dict(dict(zip(X_test_avg_3.index, X_test_avg_3.values))).T
X_test_vectorized_google_3 = X_test_df_3.values

In [59]:
# Formatting the data for DataLoader (coverts to tensors):
train_data = []
for i in range(len(X_train_vectorized_google_3)):
    train_data.append([X_train_vectorized_google_3[i], Y_train_3[i]])
    
test_data = []
for i in range(len(X_test_vectorized_google_3)):
    test_data.append([X_test_vectorized_google_3[i], Y_test_3[i]])

In [60]:
class MLP_Google_3(nn.Module):

    def __init__(self):
        super().__init__()

        input_dim = 300
        hidden_1 = 50
        hidden_2 = 10
        
        self.fc1 = nn.Linear(input_dim, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        self.fc3 = nn.Linear(hidden_2, 3)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        x = x.view(-1, 300)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x

In [77]:
n_epochs = 10
trainloader = torch.utils.data.DataLoader(train_data, batch_size=20, num_workers=1)
testloader = torch.utils.data.DataLoader(test_data, batch_size=20, num_workers=1)
mlp_g_3 = MLP_Google_3()
criterion = nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(mlp_g.parameters(), lr=1e-4)
optimizer = torch.optim.SGD(mlp_g_3.parameters(), lr=0.01)

test_loss_min = 5

for epoch in range(n_epochs):
    train_loss = 0
    test_loss = 0
    
    mlp_g_3.train()
    for data, target in trainloader:
        # Zero the gradients
        optimizer.zero_grad()

        # Perform forward pass
        output = mlp_g_3(data.float())

        # Compute loss
        loss = criterion(output, target)

        # Perform backward pass
        loss.backward()

        # Perform optimization
        optimizer.step()

        # Print statistics
        train_loss += loss.item()*data.size(0)
        
    with torch.no_grad():
        for data, target in testloader:
            output = mlp_g_3(data.float())
            loss = criterion(output, target)
            test_loss += loss.item()*data.size(0)
        
    train_loss = train_loss/len(trainloader.dataset)
    test_loss = test_loss/len(testloader.dataset)
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tTest Loss: {:.6f}'.format(
        epoch+1, 
        train_loss,
        test_loss
        ))
    
    if test_loss <= test_loss_min:
        torch.save(mlp_g_3.state_dict(), 'model_google_3.pt')
        test_loss_min = test_loss

  # Process is complete.
print('All done.')

Epoch: 1 	Training Loss: 1.001519 	Test Loss: 0.884704
Epoch: 2 	Training Loss: 0.853755 	Test Loss: 0.839135
Epoch: 3 	Training Loss: 0.826782 	Test Loss: 0.825397
Epoch: 4 	Training Loss: 0.817725 	Test Loss: 0.817663
Epoch: 5 	Training Loss: 0.809638 	Test Loss: 0.811981
Epoch: 6 	Training Loss: 0.802460 	Test Loss: 0.804927
Epoch: 7 	Training Loss: 0.796096 	Test Loss: 0.798836
Epoch: 8 	Training Loss: 0.791201 	Test Loss: 0.796966
Epoch: 9 	Training Loss: 0.785747 	Test Loss: 0.791367
Epoch: 10 	Training Loss: 0.782573 	Test Loss: 0.787759
All done.


In [79]:
mlp_g_3.load_state_dict(torch.load('model_google_3.pt'))
testloader = torch.utils.data.DataLoader(test_data, batch_size=1, num_workers=1)

In [80]:
predictions = predict(mlp_g_3, testloader)
predictions = np.array(predictions)

In [81]:
print("Test Accuracy:", accuracy(Y_test_3, predictions))

Test Accuracy: 0.66382


### Binary & Ternary Models For First 10 W2V Vectors Models:

In [63]:
def first_10_w2v_mine(review):
    features = np.zeros((10,200))
    for i, word in enumerate(review):
        if i >= 10:
            break
        elif word in model.wv:
            features[i] = model.wv[word]
        else:
            continue
    return features            

In [35]:
# Vectorize X_train, X_test:
X_train_10 = X_train.apply(first_10_w2v_mine)
X_test_10 = X_test.apply(first_10_w2v_mine)

In [36]:
# Formatting the data for DataLoader (coverts to tensors):
train_data = []
for i in range(len(X_train_10)):
    train_data.append([X_train_10.iloc[i], Y_train[i]])

In [37]:
test_data = []
for i in range(len(X_test_10)):
    test_data.append([X_test_10.iloc[i], Y_test[i]])

In [51]:
class MLP_Mine_Binary_10(nn.Module):

    def __init__(self):
        super().__init__()

        vocab_size = 10
        embedding_size = 200
        hidden_1 = 50
        hidden_2 = 10
        
#         self.embeddings = nn.Embedding(vocab_size, embedding_size)
        self.fc1 = nn.Linear(vocab_size*embedding_size, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        self.fc3 = nn.Linear(hidden_2, 2)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        x = x.view(-1, 2000)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x

In [54]:
n_epochs = 5
trainloader = torch.utils.data.DataLoader(train_data, batch_size=20, num_workers=1)
testloader = torch.utils.data.DataLoader(test_data, batch_size=20, num_workers=1)
mlp_mine_binary_10 = MLP_Mine_Binary_10()
criterion = nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(mlp_g.parameters(), lr=1e-4)
optimizer = torch.optim.SGD(mlp_mine_binary_10.parameters(), lr=0.01)

test_loss_min = 5

for epoch in range(n_epochs):
    train_loss = 0
    test_loss = 0
    
    mlp_mine_binary_10.train()
    for data, target in trainloader:
        # Zero the gradients
        optimizer.zero_grad()

        # Perform forward pass
        output = mlp_mine_binary_10(data.float())

        # Compute loss
        loss = criterion(output, target)

        # Perform backward pass
        loss.backward()

        # Perform optimization
        optimizer.step()

        # Print statistics
        train_loss += loss.item()*data.size(0)
        
    with torch.no_grad():
        for data, target in testloader:
            output = mlp_mine_binary_10(data.float())
            loss = criterion(output, target)
            test_loss += loss.item()*data.size(0)
        
    train_loss = train_loss/len(trainloader.dataset)
    test_loss = test_loss/len(testloader.dataset)
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tTest Loss: {:.6f}'.format(
        epoch+1, 
        train_loss,
        test_loss
        ))
    
    if test_loss <= test_loss_min:
        torch.save(mlp_mine_binary_10.state_dict(), 'model_mine_binary_10.pt')
        test_loss_min = test_loss

  # Process is complete.
print('All done.')

Epoch: 1 	Training Loss: 0.471934 	Test Loss: 0.445850
Epoch: 2 	Training Loss: 0.429089 	Test Loss: 0.440019
Epoch: 3 	Training Loss: 0.413497 	Test Loss: 0.442179
Epoch: 4 	Training Loss: 0.399391 	Test Loss: 0.443434
Epoch: 5 	Training Loss: 0.385732 	Test Loss: 0.448577
All done.


In [55]:
mlp_mine_binary_10.load_state_dict(torch.load('model_mine_binary_10.pt'))
testloader = torch.utils.data.DataLoader(test_data, batch_size=1, num_workers=1)

In [59]:
predictions = predict(mlp_mine_binary_10, testloader)
predictions = np.array(predictions)

In [60]:
print("Test Accuracy:", accuracy(Y_test, predictions))

Test Accuracy: 0.79851325307236


In [66]:
# Vectorize X_train_3, X_test_3:
X_train_3_10 = X_train_3.apply(first_10_w2v_mine)
X_test_3_10 = X_test_3.apply(first_10_w2v_mine)

In [77]:
# Formatting the data for DataLoader (coverts to tensors):
train_data_3 = []
for i in range(len(X_train_3_10)):
    train_data_3.append([X_train_3_10.iloc[i], Y_train_3[i]])
    
test_data_3 = []
for i in range(len(X_test_3_10 )):
    test_data_3.append([X_test_3_10.iloc[i], Y_test_3[i]])

In [78]:
class MLP_Mine_Ternary_10(nn.Module):

    def __init__(self):
        super().__init__()

        vocab_size = 10
        embedding_size = 200
        hidden_1 = 50
        hidden_2 = 10
        
#         self.embeddings = nn.Embedding(vocab_size, embedding_size)
        self.fc1 = nn.Linear(vocab_size*embedding_size, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        self.fc3 = nn.Linear(hidden_2, 3)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        x = x.view(-1, 2000)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x

In [79]:
n_epochs = 5
trainloader = torch.utils.data.DataLoader(train_data_3, batch_size=20, num_workers=1)
testloader = torch.utils.data.DataLoader(test_data_3, batch_size=20, num_workers=1)
mlp_mine_ternary_10 = MLP_Mine_Ternary_10()
criterion = nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(mlp_g.parameters(), lr=1e-4)
optimizer = torch.optim.SGD(mlp_mine_ternary_10.parameters(), lr=0.01)

test_loss_min = 5

for epoch in range(n_epochs):
    train_loss = 0
    test_loss = 0
    
    mlp_mine_ternary_10.train()
    for data, target in trainloader:
        # Zero the gradients
        optimizer.zero_grad()

        # Perform forward pass
        output = mlp_mine_ternary_10(data.float())

        # Compute loss
        loss = criterion(output, target)

        # Perform backward pass
        loss.backward()

        # Perform optimization
        optimizer.step()

        # Print statistics
        train_loss += loss.item()*data.size(0)
        
    with torch.no_grad():
        for data, target in testloader:
            output = mlp_mine_ternary_10(data.float())
            loss = criterion(output, target)
            test_loss += loss.item()*data.size(0)
        
    train_loss = train_loss/len(trainloader.dataset)
    test_loss = test_loss/len(testloader.dataset)
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tTest Loss: {:.6f}'.format(
        epoch+1, 
        train_loss,
        test_loss
        ))
    
    if test_loss <= test_loss_min:
        torch.save(mlp_mine_ternary_10.state_dict(), 'model_mine_ternary_10.pt')
        test_loss_min = test_loss

  # Process is complete.
print('All done.')

Epoch: 1 	Training Loss: 0.852706 	Test Loss: 0.828688
Epoch: 2 	Training Loss: 0.813003 	Test Loss: 0.822266
Epoch: 3 	Training Loss: 0.797715 	Test Loss: 0.822398
Epoch: 4 	Training Loss: 0.785272 	Test Loss: 0.825645
Epoch: 5 	Training Loss: 0.773386 	Test Loss: 0.828790
All done.


In [80]:
mlp_mine_ternary_10.load_state_dict(torch.load('model_mine_ternary_10.pt'))
testloader = torch.utils.data.DataLoader(test_data_3, batch_size=1, num_workers=1)

In [81]:
predictions = predict(mlp_mine_ternary_10, testloader)
predictions = np.array(predictions)

In [82]:
print("Test Accuracy:", accuracy(Y_test_3, predictions))

Test Accuracy: 0.64138


#### First 10 Features - Google Model:

In [85]:
def first_10_w2v_google(review):
    features = np.zeros((10,300))
    for i, word in enumerate(review):
        if i >= 10:
            break
        elif word in word2vec:
            features[i] = word2vec[word]
        else:
            continue
    return features  

In [86]:
# Vectorize X_train, X_test:
X_train_10 = X_train.apply(first_10_w2v_google)
X_test_10 = X_test.apply(first_10_w2v_google)

In [87]:
# Formatting the data for DataLoader (coverts to tensors):
train_data = []
for i in range(len(X_train_10)):
    train_data.append([X_train_10.iloc[i], Y_train[i]])

In [88]:
test_data = []
for i in range(len(X_test_10)):
    test_data.append([X_test_10.iloc[i], Y_test[i]])

In [89]:
class MLP_Google_Binary_10(nn.Module):

    def __init__(self):
        super().__init__()

        vocab_size = 10
        embedding_size = 300
        hidden_1 = 50
        hidden_2 = 10
        
#         self.embeddings = nn.Embedding(vocab_size, embedding_size)
        self.fc1 = nn.Linear(vocab_size*embedding_size, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        self.fc3 = nn.Linear(hidden_2, 2)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        x = x.view(-1, 3000)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x

In [90]:
n_epochs = 5
trainloader = torch.utils.data.DataLoader(train_data, batch_size=20, num_workers=1)
testloader = torch.utils.data.DataLoader(test_data, batch_size=20, num_workers=1)
mlp_google_binary_10 = MLP_Google_Binary_10()
criterion = nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(mlp_g.parameters(), lr=1e-4)
optimizer = torch.optim.SGD(mlp_google_binary_10.parameters(), lr=0.01)

test_loss_min = 5

for epoch in range(n_epochs):
    train_loss = 0
    test_loss = 0
    
    mlp_google_binary_10.train()
    for data, target in trainloader:
        # Zero the gradients
        optimizer.zero_grad()

        # Perform forward pass
        output = mlp_google_binary_10(data.float())

        # Compute loss
        loss = criterion(output, target)

        # Perform backward pass
        loss.backward()

        # Perform optimization
        optimizer.step()

        # Print statistics
        train_loss += loss.item()*data.size(0)
        
    with torch.no_grad():
        for data, target in testloader:
            output = mlp_google_binary_10(data.float())
            loss = criterion(output, target)
            test_loss += loss.item()*data.size(0)
        
    train_loss = train_loss/len(trainloader.dataset)
    test_loss = test_loss/len(testloader.dataset)
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tTest Loss: {:.6f}'.format(
        epoch+1, 
        train_loss,
        test_loss
        ))
    
    if test_loss <= test_loss_min:
        torch.save(mlp_google_binary_10.state_dict(), 'model_google_binary_10.pt')
        test_loss_min = test_loss

  # Process is complete.
print('All done.')

Epoch: 1 	Training Loss: 0.556440 	Test Loss: 0.497817
Epoch: 2 	Training Loss: 0.483974 	Test Loss: 0.482728
Epoch: 3 	Training Loss: 0.464027 	Test Loss: 0.476726
Epoch: 4 	Training Loss: 0.446072 	Test Loss: 0.474405
Epoch: 5 	Training Loss: 0.429112 	Test Loss: 0.476100
All done.


In [91]:
mlp_google_binary_10.load_state_dict(torch.load('model_google_binary_10.pt'))
testloader = torch.utils.data.DataLoader(test_data, batch_size=1, num_workers=1)

In [92]:
predictions = predict(mlp_google_binary_10, testloader)
predictions = np.array(predictions)

In [93]:
print("Test Accuracy:", accuracy(Y_test, predictions))

Test Accuracy: 0.7742096963932621


In [94]:
# Vectorize X_train_3, X_test_3:
X_train_3_10 = X_train_3.apply(first_10_w2v_google)
X_test_3_10 = X_test_3.apply(first_10_w2v_google)

In [95]:
# Formatting the data for DataLoader (coverts to tensors):
train_data_3 = []
for i in range(len(X_train_3_10)):
    train_data_3.append([X_train_3_10.iloc[i], Y_train_3[i]])
    
test_data_3 = []
for i in range(len(X_test_3_10 )):
    test_data_3.append([X_test_3_10.iloc[i], Y_test_3[i]])

In [96]:
class MLP_Google_Ternary_10(nn.Module):

    def __init__(self):
        super().__init__()

        vocab_size = 10
        embedding_size = 300
        hidden_1 = 50
        hidden_2 = 10
        
#         self.embeddings = nn.Embedding(vocab_size, embedding_size)
        self.fc1 = nn.Linear(vocab_size*embedding_size, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        self.fc3 = nn.Linear(hidden_2, 3)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        x = x.view(-1, 3000)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x

In [97]:
n_epochs = 5
trainloader = torch.utils.data.DataLoader(train_data_3, batch_size=20, num_workers=1)
testloader = torch.utils.data.DataLoader(test_data_3, batch_size=20, num_workers=1)
mlp_google_ternary_10 = MLP_Google_Ternary_10()
criterion = nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(mlp_g.parameters(), lr=1e-4)
optimizer = torch.optim.SGD(mlp_google_ternary_10.parameters(), lr=0.01)

test_loss_min = 5

for epoch in range(n_epochs):
    train_loss = 0
    test_loss = 0
    
    mlp_google_ternary_10.train()
    for data, target in trainloader:
        # Zero the gradients
        optimizer.zero_grad()

        # Perform forward pass
        output = mlp_google_ternary_10(data.float())

        # Compute loss
        loss = criterion(output, target)

        # Perform backward pass
        loss.backward()

        # Perform optimization
        optimizer.step()

        # Print statistics
        train_loss += loss.item()*data.size(0)
        
    with torch.no_grad():
        for data, target in testloader:
            output = mlp_google_ternary_10(data.float())
            loss = criterion(output, target)
            test_loss += loss.item()*data.size(0)
        
    train_loss = train_loss/len(trainloader.dataset)
    test_loss = test_loss/len(testloader.dataset)
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tTest Loss: {:.6f}'.format(
        epoch+1, 
        train_loss,
        test_loss
        ))
    
    if test_loss <= test_loss_min:
        torch.save(mlp_google_ternary_10.state_dict(), 'model_google_ternary_10.pt')
        test_loss_min = test_loss

  # Process is complete.
print('All done.')

Epoch: 1 	Training Loss: 0.931125 	Test Loss: 0.883479
Epoch: 2 	Training Loss: 0.868061 	Test Loss: 0.869124
Epoch: 3 	Training Loss: 0.848297 	Test Loss: 0.861928
Epoch: 4 	Training Loss: 0.832560 	Test Loss: 0.859491
Epoch: 5 	Training Loss: 0.817644 	Test Loss: 0.858458
All done.


In [98]:
mlp_google_ternary_10.load_state_dict(torch.load('model_google_ternary_10.pt'))
testloader = torch.utils.data.DataLoader(test_data_3, batch_size=1, num_workers=1)

In [99]:
predictions = predict(mlp_google_ternary_10, testloader)
predictions = np.array(predictions)

In [100]:
print("Test Accuracy:", accuracy(Y_test_3, predictions))

Test Accuracy: 0.62066


### 5. RNN