In [64]:
import numpy as np
import pandas as pd
import re
import nltk
import sklearn
import warnings
from platform import python_version
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.linear_model import Perceptron
from sklearn.svm import LinearSVC
print(python_version())

3.9.5


### Read Data:

In [65]:
# Could not load it directly from the url:
ratings_df = pd.read_csv("./data/amazon_reviews_us_Kitchen_v1_00.tsv", sep="\t",
                         error_bad_lines=False, warn_bad_lines=False)

In [66]:
ratings_df.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,37000337,R3DT59XH7HXR9K,B00303FI0G,529320574,Arthur Court Paper Towel Holder,Kitchen,5.0,0.0,0.0,N,Y,Beautiful. Looks great on counter,Beautiful. Looks great on counter.,2015-08-31
1,US,15272914,R1LFS11BNASSU8,B00JCZKZN6,274237558,Olde Thompson Bavaria Glass Salt and Pepper Mi...,Kitchen,5.0,0.0,1.0,N,Y,Awesome & Self-ness,I personally have 5 days sets and have also bo...,2015-08-31
2,US,36137863,R296RT05AG0AF6,B00JLIKA5C,544675303,Progressive International PL8 Professional Man...,Kitchen,5.0,0.0,0.0,N,Y,Fabulous and worth every penny,Fabulous and worth every penny. Used for clean...,2015-08-31
3,US,43311049,R3V37XDZ7ZCI3L,B000GBNB8G,491599489,Zyliss Jumbo Garlic Press,Kitchen,5.0,0.0,1.0,N,Y,Five Stars,A must if you love garlic on tomato marinara s...,2015-08-31
4,US,13763148,R14GU232NQFYX2,B00VJ5KX9S,353790155,"1 X Premier Pizza Cutter - Stainless Steel 14""...",Kitchen,5.0,0.0,0.0,N,Y,Better than sex,Worth every penny! Buy one now and be a pizza ...,2015-08-31


In [67]:
# Simplify Dataset: Ensure all have reviews
ratings_df = ratings_df.loc[:, ["review_body", "star_rating"]]
ratings_df = ratings_df[ratings_df["review_body"].notnull()]
ratings_df.head()

Unnamed: 0,review_body,star_rating
0,Beautiful. Looks great on counter.,5.0
1,I personally have 5 days sets and have also bo...,5.0
2,Fabulous and worth every penny. Used for clean...,5.0
3,A must if you love garlic on tomato marinara s...,5.0
4,Worth every penny! Buy one now and be a pizza ...,5.0


In [68]:
# Gather 50k of each rating through random selection:
rating_1 = ratings_df[ratings_df["star_rating"] == 1].sample(50000)
rating_2 = ratings_df[ratings_df["star_rating"] == 2].sample(50000)
rating_3 = ratings_df[ratings_df["star_rating"] == 3].sample(50000)
rating_4 = ratings_df[ratings_df["star_rating"] == 4].sample(50000)
rating_5 = ratings_df[ratings_df["star_rating"] == 5].sample(50000)

ratings_sampled_df = pd.concat([rating_1, rating_2, rating_3, rating_4, rating_5])
ratings_sampled_df = ratings_sampled_df.sample(frac=1)
ratings_sampled_df.reset_index(drop=True, inplace=True)
ratings_sampled_df.head()

Unnamed: 0,review_body,star_rating
0,I was wondering why the coffee started tasting...,1.0
1,I have had this pan for 10 years. It is well-...,5.0
2,good seal,4.0
3,"Variable grind and a decent size bean hopper, ...",3.0
4,Great pan for small pizzas.,4.0


In [69]:
# Map star rating to sentiment rating: We will assign -1 as neutral
d_ = {4:1, 5:1, 1:0, 2:0, 3:2}
ratings_sampled_df["sentiment"] = ratings_sampled_df["star_rating"].map(d_)
ratings_sampled_df.head()

Unnamed: 0,review_body,star_rating,sentiment
0,I was wondering why the coffee started tasting...,1.0,0
1,I have had this pan for 10 years. It is well-...,5.0,1
2,good seal,4.0,1
3,"Variable grind and a decent size bean hopper, ...",3.0,2
4,Great pan for small pizzas.,4.0,1


In [70]:
# Train-test Split
train, test = train_test_split(ratings_sampled_df, test_size=0.2)
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

### Word Embedding:

In [71]:
# Load in the word2vec:
import gensim
word2vec = gensim.models.KeyedVectors.load_word2vec_format('./data/GoogleNews-vectors-negative300.bin', binary=True)

In [72]:
#Check Smeantic Similarities: (close enough!)
result_vec = word2vec["king"] - word2vec["man"] + word2vec["woman"]
word2vec.most_similar(positive=result_vec, topn=3)

[('king', 0.844939112663269),
 ('queen', 0.7300516366958618),
 ('monarch', 0.6454660296440125)]

In [73]:
word2vec.most_similar(positive="excellent", topn=3)

[('terrific', 0.7409728765487671),
 ('superb', 0.7062715888023376),
 ('exceptional', 0.681470513343811)]

In [74]:
# Train a Word2Vec model using own dataset: embedding_size=300, window_size=11, min_word_cnt=10
# Clean data:
train["review_body"] = train["review_body"].str.lower()
test["review_body"] = test["review_body"].str.lower()

train["review_body"] = train["review_body"].replace(r'http\S+|www.\S+', '', regex=True)
test["review_body"] = test["review_body"].replace(r'http\S+|www.\S+', '', regex=True)


train["review_body"] = train["review_body"].replace(r'[^a-z|\s]', '', regex=True)
test["review_body"] = test["review_body"].replace(r'[^a-z|\s]', '', regex=True)

train["review_body"] = train["review_body"].replace(r'\s\s+', ' ', regex=True)
test["review_body"] = test["review_body"].replace(r'\s\s+', ' ', regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["review_body"] = train["review_body"].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["review_body"] = test["review_body"].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["review_body"] = train["review_body"].replace(r'http\S+|www.\S+', '', regex=True)
A value

In [75]:
stop_words = set(stopwords.words('english'))

def remove_stopwords(s):
    tokens = word_tokenize(s)
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return filtered_tokens

train["review_body"] = train["review_body"].apply(remove_stopwords)
test["review_body"] = test["review_body"].apply(remove_stopwords)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["review_body"] = train["review_body"].apply(remove_stopwords)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["review_body"] = test["review_body"].apply(remove_stopwords)


In [76]:
train_sentences = np.array(train["review_body"])
test_sentences = np.array(test["review_body"])

In [77]:
model = gensim.models.Word2Vec(train_sentences, min_count=10, vector_size=200, window=11)

In [78]:
# Test semantic similarities:
result_vec = model.wv["king"] - model.wv["man"] + model.wv["woman"]
model.wv.similar_by_vector(result_vec)

[('king', 0.668415367603302),
 ('deluxe', 0.4885542392730713),
 ('compatible', 0.47892406582832336),
 ('stainlesssteel', 0.47053804993629456),
 ('ii', 0.4653341472148895),
 ('xx', 0.45887693762779236),
 ('differences', 0.44889411330223083),
 ('burnished', 0.44834545254707336),
 ('identical', 0.4480072557926178),
 ('catalog', 0.4453065097332001)]

In [79]:
model.wv.similar_by_word("excellent")

[('outstanding', 0.7804235219955444),
 ('fantastic', 0.7426757216453552),
 ('exceptional', 0.6960605382919312),
 ('superb', 0.6956508159637451),
 ('wonderful', 0.6925870180130005),
 ('terrific', 0.6809049844741821),
 ('great', 0.6170509457588196),
 ('superior', 0.5734264254570007),
 ('fabulous', 0.5658342838287354),
 ('amazing', 0.5618612766265869)]

From the Semantic Similarities above we can see that our model actually performs relatively well given the results of the top 10 closest words to Excellent. However we can see the problem emerges that the reviews do not necessarily contain all the words we might want such as "Queen", which may or may not be in the data based on the randomness involved in selecting reviews. One would prefer to utilize the all encompassing Google word2vec to help with unorthodox words, and we can trust that the performance is better. Yet, the create w2v does produce results faster.

### Simple Models:

#### Personally Trained Word2Vec

In [17]:
# Need to make binary model at this point:
train_binary = train[~(train["sentiment"] == 2)]
test_binary = test[~(test["sentiment"] == 2)]

In [18]:
X_train = train_binary["review_body"]
Y_train = train_binary["sentiment"].values

X_test = test_binary["review_body"]
Y_test = test_binary["sentiment"].values

In [19]:
def avg_vector_mine(review):
    tot_ = 0
    for word_ in review:
        if word_ in model.wv:
            tot_ += model.wv[word_]
        else:
            continue
    if len(review) == 0:
        return 0
    return tot_/len(review)

In [20]:
# Vectorize X_train:
X_train_avg = X_train.apply(avg_vector_mine)
X_train_df = pd.DataFrame.from_dict(dict(zip(X_train_avg.index, X_train_avg.values))).T
X_train_vectorized_mine = X_train_df.values

In [21]:
# Vectorize X_test:
X_test_avg = X_test.apply(avg_vector_mine)
X_test_df = pd.DataFrame.from_dict(dict(zip(X_test_avg.index, X_test_avg.values))).T
X_test_vectorized_mine = X_test_df.values

In [22]:
perceptron = Perceptron(tol=1e-3, random_state=42)
perceptron.fit(X_train_vectorized_mine, Y_train)
print("Perceptron Training Accuracy:", perceptron.score(X_train_vectorized_mine, Y_train))

Perceptron Training Accuracy: 0.7960728471857903


In [23]:
print("Perceptron Test Accuracy:", perceptron.score(X_test_vectorized_mine, Y_test))

Perceptron Test Accuracy: 0.7959582790091264


In [24]:
svm = LinearSVC(max_iter=1000)
svm.fit(X_train_vectorized_mine, Y_train)
print("SVM Training Accuracy:", svm.score(X_train_vectorized_mine, Y_train))

SVM Training Accuracy: 0.8586837043143721




In [25]:
print("SVM Test Accuracy:", svm.score(X_test_vectorized_mine, Y_test))

SVM Test Accuracy: 0.8573864206197974


#### Pretrained Word2Vec

In [26]:
def avg_vector_google(review):
    tot_ = 0
    for word_ in review:
        if word_ in word2vec:
            tot_ += word2vec[word_]
        else:
            continue
    if len(review) == 0:
        return 0
    return tot_/len(review)

In [27]:
# Vectorize X_train:
X_train_avg = X_train.apply(avg_vector_google)
X_train_df = pd.DataFrame.from_dict(dict(zip(X_train_avg.index, X_train_avg.values))).T
X_train_vectorized_google = X_train_df.values

In [28]:
# Vectorize X_test:
X_test_avg = X_test.apply(avg_vector_google)
X_test_df = pd.DataFrame.from_dict(dict(zip(X_test_avg.index, X_test_avg.values))).T
X_test_vectorized_google = X_test_df.values

In [29]:
perceptron = Perceptron(tol=1e-3, random_state=42)
perceptron.fit(X_train_vectorized_google, Y_train)
print("Perceptron Training Accuracy:", perceptron.score(X_train_vectorized_google, Y_train))

Perceptron Training Accuracy: 0.8067401134177721


In [30]:
print("Perceptron Test Accuracy:", perceptron.score(X_test_vectorized_google, Y_test))

Perceptron Test Accuracy: 0.8041570554608364


In [31]:
svm = LinearSVC(max_iter=1000)
svm.fit(X_train_vectorized_google, Y_train)
print("SVM Training Accuracy:", svm.score(X_train_vectorized_google, Y_train))

SVM Training Accuracy: 0.8208236528516826


In [32]:
print("SVM Test Accuracy:", svm.score(X_test_vectorized_google, Y_test))

SVM Test Accuracy: 0.817420519506569


Perceptron Results TF.IDF: Test-Accuracy = .8569 <br>
SVM Results TF.IDF: Test-Accuracy = 0.898025 <br>
<br>
We can see from the above results that the models are better for TF.IDF, then personally trained Word2Vec, and lastly for the Google pre-trained Word2Vec. One likely explanation for this could be that the TF.IDF and personally train Word2Vec might be more tailored to the specific review set at hand. In addition, the averaging of the word vectors might be a very inefficient way to combine words in a review.

### Feedforward Neural Networks

#### Avg Word2Vec Vectors - Personally Trained Word2Vec:

In [33]:
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from torch import nn

In [44]:
class MLP(nn.Module):

    def __init__(self):
        super().__init__()

        input_dim = 200
        hidden_1 = 50
        hidden_2 = 10
        
        self.fc1 = nn.Linear(input_dim, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        self.fc3 = nn.Linear(hidden_2, 2)
        self.dropout = nn.Dropout(0.2)
        #self.act3 = nn.Softmax(dim=1)

    def forward(self, x):
        x = x.view(-1, 200)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x

In [35]:
# Formatting the data for DataLoader (coverts to tensors):
train_data = []
for i in range(len(X_train_vectorized_mine)):
    train_data.append([X_train_vectorized_mine[i], Y_train[i]])
    
test_data = []
for i in range(len(X_test_vectorized_mine)):
    test_data.append([X_test_vectorized_mine[i], Y_test[i]])

In [45]:
n_epochs = 10
trainloader = torch.utils.data.DataLoader(train_data, batch_size=20, num_workers=1)
testloader = torch.utils.data.DataLoader(test_data, batch_size=20, num_workers=1)
mlp = MLP()
criterion = nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(mlp.parameters(), lr=1e-4)
optimizer = torch.optim.SGD(mlp.parameters(), lr=0.01)

test_loss_min = 5

for epoch in range(n_epochs):
    train_loss = 0
    test_loss = 0
    
    mlp.train()
    for data, target in trainloader:
        # Zero the gradients
        optimizer.zero_grad()

        # Perform forward pass
        output = mlp(data.float())

        # Compute loss
        loss = criterion(output, target)

        # Perform backward pass
        loss.backward()

        # Perform optimization
        optimizer.step()

        # Print statistics
        train_loss += loss.item()*data.size(0)
        
    with torch.no_grad():
        for data, target in testloader:
            output = mlp(data.float())
            loss = criterion(output, target)
            test_loss += loss.item()*data.size(0)
        
    train_loss = train_loss/len(trainloader.dataset)
    test_loss = test_loss/len(testloader.dataset)
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tTest Loss: {:.6f}'.format(
        epoch+1, 
        train_loss,
        test_loss
        ))
    
    if test_loss <= test_loss_min:
        torch.save(mlp.state_dict(), 'model.pt')
        test_loss_min = test_loss

  # Process is complete.
print('All done.')

Epoch: 1 	Training Loss: 0.389826 	Test Loss: 0.351972
Epoch: 2 	Training Loss: 0.345981 	Test Loss: 0.344050
Epoch: 3 	Training Loss: 0.337811 	Test Loss: 0.335044
Epoch: 4 	Training Loss: 0.330687 	Test Loss: 0.332909
Epoch: 5 	Training Loss: 0.327175 	Test Loss: 0.327461
Epoch: 6 	Training Loss: 0.323126 	Test Loss: 0.325748
Epoch: 7 	Training Loss: 0.320272 	Test Loss: 0.326852
Epoch: 8 	Training Loss: 0.318014 	Test Loss: 0.323711
Epoch: 9 	Training Loss: 0.315877 	Test Loss: 0.323428
Epoch: 10 	Training Loss: 0.312822 	Test Loss: 0.322309
All done.


In [46]:
mlp.load_state_dict(torch.load('model.pt'))
testloader = torch.utils.data.DataLoader(test_data, batch_size=1, num_workers=1)

In [47]:
# Calculate Accuracy from trained model:
def predict(model, dataloader):
    prediction_list = []
    with torch.no_grad():
        for data, target in dataloader:
            output = model(data.float())
            _, predicted = torch.max(output.data, 1) 
            prediction_list.append(predicted)
    return prediction_list

In [48]:
predictions = predict(mlp, testloader)
predictions = np.array(predictions)

In [40]:
def accuracy(y_true, y_pred):
    score = sum(y_true == y_pred)/len(y_pred)
    return score

In [49]:
print("Test Accuracy:", accuracy(Y_test, predictions))

Test Accuracy: 0.8674907230969813


In [80]:
# Formatting process for 3 class:
X_train_3 = train["review_body"]
Y_train_3 = train["sentiment"].values

X_test_3 = test["review_body"]
Y_test_3 = test["sentiment"].values

In [82]:
# Vectorize X_train_3:
X_train_avg_3 = X_train_3.apply(avg_vector_mine)
X_train_df_3 = pd.DataFrame.from_dict(dict(zip(X_train_avg_3.index, X_train_avg_3.values))).T
X_train_vectorized_mine_3 = X_train_df_3.values

In [83]:
# Vectorize X_test_3:
X_test_avg_3 = X_test_3.apply(avg_vector_mine)
X_test_df_3 = pd.DataFrame.from_dict(dict(zip(X_test_avg_3.index, X_test_avg_3.values))).T
X_test_vectorized_mine_3 = X_test_df_3.values

In [84]:
# Formatting the data for DataLoader (coverts to tensors):
train_data_3 = []
for i in range(len(X_train_vectorized_mine_3)):
    train_data_3.append([X_train_vectorized_mine_3[i], Y_train_3[i]])
    
test_data_3 = []
for i in range(len(X_test_vectorized_mine_3)):
    test_data_3.append([X_test_vectorized_mine_3[i], Y_test_3[i]])

In [85]:
class MLP_3(nn.Module):

    def __init__(self):
        super().__init__()

        input_dim = 200
        hidden_1 = 50
        hidden_2 = 10
        
        self.fc1 = nn.Linear(input_dim, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        self.fc3 = nn.Linear(hidden_2, 3)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        x = x.view(-1, 200)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x

In [109]:
n_epochs = 10
trainloader = torch.utils.data.DataLoader(train_data_3, batch_size=20, num_workers=1)
testloader = torch.utils.data.DataLoader(test_data_3, batch_size=20, num_workers=1)
mlp_3 = MLP_3()
criterion = nn.CrossEntropyLoss()
#optimizer = torch.optim.Adam(mlp.parameters(), lr=.0001)
optimizer = torch.optim.SGD(mlp_3.parameters(), lr=0.01)

test_loss_min = 5

for epoch in range(n_epochs):
    train_loss = 0
    test_loss = 0
    
    mlp_3.train()
    for data, target in trainloader:
        # Zero the gradients
        optimizer.zero_grad()

        # Perform forward pass
        output = mlp_3(data.float())

        # Compute loss
        loss = criterion(output, target)

        # Perform backward pass
        loss.backward()

        # Perform optimization
        optimizer.step()

        # Print statistics
        train_loss += loss.item()*data.size(0)
        
    with torch.no_grad():
        for data, target in testloader:
            output = mlp_3(data.float())
            loss = criterion(output, target)
            test_loss += loss.item()*data.size(0)
        
    train_loss = train_loss/len(trainloader.dataset)
    test_loss = test_loss/len(testloader.dataset)
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tTest Loss: {:.6f}'.format(
        epoch+1, 
        train_loss,
        test_loss
        ))
    
    if test_loss <= test_loss_min:
        torch.save(mlp_3.state_dict(), 'model_3.pt')
        test_loss_min = test_loss

  # Process is complete.
print('All done.')

Epoch: 1 	Training Loss: 0.796609 	Test Loss: 0.747008
Epoch: 2 	Training Loss: 0.735844 	Test Loss: 0.735304
Epoch: 3 	Training Loss: 0.725607 	Test Loss: 0.727630
Epoch: 4 	Training Loss: 0.718142 	Test Loss: 0.724393
Epoch: 5 	Training Loss: 0.714715 	Test Loss: 0.722115
Epoch: 6 	Training Loss: 0.710518 	Test Loss: 0.720172
Epoch: 7 	Training Loss: 0.708163 	Test Loss: 0.716973
Epoch: 8 	Training Loss: 0.705964 	Test Loss: 0.713493
Epoch: 9 	Training Loss: 0.701447 	Test Loss: 0.713009
Epoch: 10 	Training Loss: 0.698704 	Test Loss: 0.711250
All done.


In [110]:
mlp_3.load_state_dict(torch.load('model_3.pt'))
testloader = torch.utils.data.DataLoader(test_data_3, batch_size=1, num_workers=1)

In [111]:
predictions = predict(mlp_3, testloader)
predictions = np.array(predictions)

In [112]:
print("Test Accuracy:", accuracy(Y_test_3, predictions))

Test Accuracy: 0.6971


#### Avg Word2Vec Vectors - Google Trained Word2Vec:

In [113]:
# Formatting the data for DataLoader (coverts to tensors):
train_data = []
for i in range(len(X_train_vectorized_google)):
    train_data.append([X_train_vectorized_google[i], Y_train[i]])
    
test_data = []
for i in range(len(X_test_vectorized_google)):
    test_data.append([X_test_vectorized_google[i], Y_test[i]])

In [117]:
class MLP_Google(nn.Module):

    def __init__(self):
        super().__init__()

        input_dim = 300
        hidden_1 = 50
        hidden_2 = 10
        
        self.fc1 = nn.Linear(input_dim, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        self.fc3 = nn.Linear(hidden_2, 2)
        self.dropout = nn.Dropout(0.2)
        #self.act3 = nn.Softmax(dim=1)

    def forward(self, x):
        x = x.view(-1, 300)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x

In [118]:
n_epochs = 10
trainloader = torch.utils.data.DataLoader(train_data, batch_size=20, num_workers=1)
testloader = torch.utils.data.DataLoader(test_data, batch_size=20, num_workers=1)
mlp_g = MLP_Google()
criterion = nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(mlp_g.parameters(), lr=1e-4)
optimizer = torch.optim.SGD(mlp_g.parameters(), lr=0.01)

test_loss_min = 5

for epoch in range(n_epochs):
    train_loss = 0
    test_loss = 0
    
    mlp_g.train()
    for data, target in trainloader:
        # Zero the gradients
        optimizer.zero_grad()

        # Perform forward pass
        output = mlp_g(data.float())

        # Compute loss
        loss = criterion(output, target)

        # Perform backward pass
        loss.backward()

        # Perform optimization
        optimizer.step()

        # Print statistics
        train_loss += loss.item()*data.size(0)
        
    with torch.no_grad():
        for data, target in testloader:
            output = mlp_g(data.float())
            loss = criterion(output, target)
            test_loss += loss.item()*data.size(0)
        
    train_loss = train_loss/len(trainloader.dataset)
    test_loss = test_loss/len(testloader.dataset)
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tTest Loss: {:.6f}'.format(
        epoch+1, 
        train_loss,
        test_loss
        ))
    
    if test_loss <= test_loss_min:
        torch.save(mlp_g.state_dict(), 'model_google.pt')
        test_loss_min = test_loss

  # Process is complete.
print('All done.')

Epoch: 1 	Training Loss: 0.657763 	Test Loss: 0.530660
Epoch: 2 	Training Loss: 0.482147 	Test Loss: 0.460093
Epoch: 3 	Training Loss: 0.452953 	Test Loss: 0.448825
Epoch: 4 	Training Loss: 0.440136 	Test Loss: 0.435709
Epoch: 5 	Training Loss: 0.430075 	Test Loss: 0.431090
Epoch: 6 	Training Loss: 0.424777 	Test Loss: 0.426827
Epoch: 7 	Training Loss: 0.420459 	Test Loss: 0.429685
Epoch: 8 	Training Loss: 0.417830 	Test Loss: 0.428514
Epoch: 9 	Training Loss: 0.415146 	Test Loss: 0.425935
Epoch: 10 	Training Loss: 0.413365 	Test Loss: 0.423997
All done.


In [119]:
mlp_g.load_state_dict(torch.load('model_google.pt'))
testloader = torch.utils.data.DataLoader(test_data, batch_size=1, num_workers=1)

In [120]:
predictions = predict(mlp_g, testloader)
predictions = np.array(predictions)

In [121]:
print("Test Accuracy:", accuracy(Y_test, predictions))

Test Accuracy: 0.809973924380704


In [122]:
# Vectorize X_train_3:
X_train_avg_3 = X_train_3.apply(avg_vector_google)
X_train_df_3 = pd.DataFrame.from_dict(dict(zip(X_train_avg_3.index, X_train_avg_3.values))).T
X_train_vectorized_google_3 = X_train_df_3.values

In [123]:
# Vectorize X_test_3:
X_test_avg_3 = X_test_3.apply(avg_vector_google)
X_test_df_3 = pd.DataFrame.from_dict(dict(zip(X_test_avg_3.index, X_test_avg_3.values))).T
X_test_vectorized_google_3 = X_test_df_3.values

In [None]:
# Formatting the data for DataLoader (coverts to tensors):
train_data = []
for i in range(len(X_train_vectorized_google_3)):
    train_data.append([X_train_vectorized_google_3[i], Y_train_3[i]])
    
test_data = []
for i in range(len(X_test_vectorized_google_3)):
    test_data.append([X_test_vectorized_google_3[i], Y_test_3[i]])

In [124]:
class MLP_Google_3(nn.Module):

    def __init__(self):
        super().__init__()

        input_dim = 300
        hidden_1 = 50
        hidden_2 = 10
        
        self.fc1 = nn.Linear(input_dim, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        self.fc3 = nn.Linear(hidden_2, 3)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        x = x.view(-1, 300)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x

In [125]:
n_epochs = 10
trainloader = torch.utils.data.DataLoader(train_data, batch_size=20, num_workers=1)
testloader = torch.utils.data.DataLoader(test_data, batch_size=20, num_workers=1)
mlp_g_3 = MLP_Google_3()
criterion = nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(mlp_g.parameters(), lr=1e-4)
optimizer = torch.optim.SGD(mlp_g_3.parameters(), lr=0.01)

test_loss_min = 5

for epoch in range(n_epochs):
    train_loss = 0
    test_loss = 0
    
    mlp_g_3.train()
    for data, target in trainloader:
        # Zero the gradients
        optimizer.zero_grad()

        # Perform forward pass
        output = mlp_g_3(data.float())

        # Compute loss
        loss = criterion(output, target)

        # Perform backward pass
        loss.backward()

        # Perform optimization
        optimizer.step()

        # Print statistics
        train_loss += loss.item()*data.size(0)
        
    with torch.no_grad():
        for data, target in testloader:
            output = mlp_g_3(data.float())
            loss = criterion(output, target)
            test_loss += loss.item()*data.size(0)
        
    train_loss = train_loss/len(trainloader.dataset)
    test_loss = test_loss/len(testloader.dataset)
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tTest Loss: {:.6f}'.format(
        epoch+1, 
        train_loss,
        test_loss
        ))
    
    if test_loss <= test_loss_min:
        torch.save(mlp_g_3.state_dict(), 'model_google_3.pt')
        test_loss_min = test_loss

  # Process is complete.
print('All done.')

Epoch: 1 	Training Loss: 0.617759 	Test Loss: 0.488001
Epoch: 2 	Training Loss: 0.466237 	Test Loss: 0.452972
Epoch: 3 	Training Loss: 0.446832 	Test Loss: 0.443936
Epoch: 4 	Training Loss: 0.438610 	Test Loss: 0.439470
Epoch: 5 	Training Loss: 0.433162 	Test Loss: 0.435645
Epoch: 6 	Training Loss: 0.428534 	Test Loss: 0.430747
Epoch: 7 	Training Loss: 0.423448 	Test Loss: 0.428099
Epoch: 8 	Training Loss: 0.421179 	Test Loss: 0.430775
Epoch: 9 	Training Loss: 0.417592 	Test Loss: 0.424281
Epoch: 10 	Training Loss: 0.415827 	Test Loss: 0.428536
All done.


In [126]:
mlp_g_3.load_state_dict(torch.load('model_google_3.pt'))
testloader = torch.utils.data.DataLoader(test_data, batch_size=1, num_workers=1)

In [127]:
predictions = predict(mlp_g_3, testloader)
predictions = np.array(predictions)

In [128]:
print("Test Accuracy:", accuracy(Y_test, predictions))

Test Accuracy: 0.8077173804031692


### Binary & Ternary Models For First 10 W2V Vectors Models: