In [3]:
import pandas as pd
import numpy as np
import nltk
nltk.download('wordnet')
import re
from bs4 import BeautifulSoup
 

[nltk_data] Downloading package wordnet to /Users/abhay/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# ! pip install bs4 # in case you don't have it installed

# Dataset: https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Jewelry_v1_00.tsv.gz

## Read Data

In [5]:
# Path to the uncompressed, tab-separated Amazon Jewelry review dump.
filepath = './amazon_reviews_us_Jewelry_v1_00.tsv'
# on_bad_lines='skip' drops rows the parser cannot split instead of raising.
reviews_df = pd.read_csv(
    filepath,
    sep='\t',
    on_bad_lines='skip',
)

  reviews_df = pd.read_csv(filepath, sep='\t', on_bad_lines='skip')


In [6]:
reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1766992 entries, 0 to 1766991
Data columns (total 15 columns):
 #   Column             Dtype  
---  ------             -----  
 0   marketplace        object 
 1   customer_id        int64  
 2   review_id          object 
 3   product_id         object 
 4   product_parent     int64  
 5   product_title      object 
 6   product_category   object 
 7   star_rating        object 
 8   helpful_votes      float64
 9   total_votes        float64
 10  vine               object 
 11  verified_purchase  object 
 12  review_headline    object 
 13  review_body        object 
 14  review_date        object 
dtypes: float64(2), int64(2), object(11)
memory usage: 202.2+ MB


In [7]:
reviews_df.head(2)

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,50423057,R135Q3VZ4DQN5N,B00JWXFDMG,657335467,Everbling Purple and Clear Briolette Drop Swar...,Jewelry,5,0.0,0.0,N,Y,Beauties!,so beautiful even tho clearly not high end ......,2015-08-31
1,US,11262325,R2N0QQ6R4T7YRY,B00W5T1H9W,26030170,925 Sterling Silver Finish 6ct Simulated Diamo...,Jewelry,5,0.0,0.0,N,N,Great product.,"Great product.. I got this set for my mother, ...",2015-08-31


In [8]:
# Keep only the columns used downstream. The explicit .copy() makes the result an
# independent frame, so later column assignments on reviews_df do not trigger
# pandas' SettingWithCopyWarning.
reviews_df = reviews_df[['review_id', 'star_rating', 'review_headline', 'review_body', 'review_date']].copy()

In [9]:
reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1766992 entries, 0 to 1766991
Data columns (total 5 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   review_id        object
 1   star_rating      object
 2   review_headline  object
 3   review_body      object
 4   review_date      object
dtypes: object(5)
memory usage: 67.4+ MB


## Randomly selecting reviews from each star_rating_class

In [10]:
# Parse star_rating as a number; values that cannot be parsed become NaN (errors='coerce').
# .assign() returns a new frame, so this works even when reviews_df is a slice of another frame.
reviews_df = reviews_df.assign(
    star_rating=pd.to_numeric(reviews_df['star_rating'], errors='coerce')
)
# Drop rows without a usable rating. .copy() makes the filtered frame independent so the
# assignments below do not raise SettingWithCopyWarning.
reviews_df = reviews_df[reviews_df['star_rating'].notna()].copy()
reviews_df['star_rating'] = reviews_df['star_rating'].astype(int)

# Convert all reviews to string. Missing bodies are replaced with '' first: a bare
# astype(str) would turn NaN into the literal text 'nan', which would then be
# treated as a real word by the text-processing steps.
reviews_df['review_body'] = reviews_df['review_body'].fillna('').astype(str)

## Keep Reviews and Ratings

In [11]:
reviews_df.head(1)

Unnamed: 0,review_id,star_rating,review_headline,review_body,review_date
0,R135Q3VZ4DQN5N,5,Beauties!,so beautiful even tho clearly not high end ......,2015-08-31


 ## We select 20000 reviews randomly from each rating class.



In [12]:
# Draw 20000 reviews for each star rating 1..5. Every draw is made independently
# with random_state=1, one frame per rating class.
rating_1, rating_2, rating_3, rating_4, rating_5 = (
    reviews_df[reviews_df.star_rating.eq(stars)].sample(20000, random_state=1)
    for stars in range(1, 6)
)

In [13]:
# Stack the five per-rating samples (in rating order 1..5) into one frame;
# the original row labels are kept.
sampled_reviews_df = pd.concat(
    [rating_1, rating_2, rating_3, rating_4, rating_5]
)

In [14]:
sampled_reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 179967 to 1073422
Data columns (total 5 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   review_id        100000 non-null  object
 1   star_rating      100000 non-null  int64 
 2   review_headline  99998 non-null   object
 3   review_body      100000 non-null  object
 4   review_date      100000 non-null  object
dtypes: int64(1), object(4)
memory usage: 4.6+ MB


# Data Cleaning



# Pre-processing

In [15]:
# Lower-case every review body (values were converted to str earlier in the notebook).
sampled_reviews_df['review_body'] = sampled_reviews_df['review_body'].str.lower()

In [16]:
#Remove URLs
import re

def URLRemoval(sentence):
    """Return ``sentence`` with URL-like tokens deleted.

    A token is removed from the first occurrence of ``http`` up to the next
    whitespace character. Surrounding whitespace is left untouched, and text
    that does not contain ``http`` (for example ``www.`` links) is unchanged.
    """
    url_pattern = re.compile(r"http\S+")
    return url_pattern.sub("", sentence)

sampled_reviews_df['review_body'] = sampled_reviews_df['review_body'].apply(URLRemoval)

In [17]:
def nonAlphabeticRemoval(sentence):
    """Return ``sentence`` keeping only ASCII letters and space characters.

    Digits, punctuation (including apostrophes), tabs and newlines are deleted
    outright, not replaced by a space, so words separated only by such
    characters end up joined together.
    """
    disallowed_run = re.compile(r"[^a-zA-Z ]+")
    return disallowed_run.sub("", sentence)

sampled_reviews_df['review_body'] = sampled_reviews_df['review_body'].apply(nonAlphabeticRemoval)

In [18]:
# Expand contractions: each whitespace-separated token is passed through
# contractions.fix() and the results are re-joined with single spaces.
# NOTE(review): review_body has already had non-alphabetic characters (including
# apostrophes) removed at this point, so fix() only sees forms such as "dont" -
# confirm this ordering is intended.
import contractions
sampled_reviews_df['review_body_contracted'] = sampled_reviews_df['review_body'].apply(
    lambda text: ' '.join(contractions.fix(token) for token in text.split())
)

In [19]:
sampled_reviews_df.head(1)

Unnamed: 0,review_id,star_rating,review_headline,review_body,review_date,review_body_contracted
179967,R3NZV5F5X17AO9,1,Cheaply Made Chain,chain was cheaply made defective clasp returned,2015-06-08,chain was cheaply made defective clasp returned


In [20]:
sampled_reviews_df.to_csv('sampled_reviews_df.tsv', sep="\t")

## remove the stop words 

In [21]:
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('omw-1.4')
print(stopwords.words('english'))


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to /Users/abhay/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/abhay/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/abhay/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [22]:
from nltk.tokenize import word_tokenize
# English stop-word set, built on first use and then shared by every call.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def stopWordRemoval(sentence):
    """Return ``sentence`` with English stop words removed.

    The sentence is split with ``word_tokenize``; tokens that exactly match
    (case-sensitively) an entry of NLTK's English stop-word list are dropped,
    and the remaining tokens are joined with single spaces.
    """
    # The stop-word list is cached instead of being re-read from the corpus on
    # every call, which matters when this function is applied row by row.
    stop_words = _english_stop_words()
    return ' '.join(
        token for token in word_tokenize(sentence) if token not in stop_words
    )

In [23]:
# Remove stop words from the contraction-expanded text, one review at a time.
sampled_reviews_df['review_body_without_stop_words'] = (
    sampled_reviews_df['review_body_contracted'].apply(stopWordRemoval)
)

In [24]:
sampled_reviews_df.head(1)

Unnamed: 0,review_id,star_rating,review_headline,review_body,review_date,review_body_contracted,review_body_without_stop_words
179967,R3NZV5F5X17AO9,1,Cheaply Made Chain,chain was cheaply made defective clasp returned,2015-06-08,chain was cheaply made defective clasp returned,chain cheaply made defective clasp returned


In [25]:
sampled_reviews_df.to_csv('sampled_reviews_df.tsv', sep="\t")

## perform lemmatization  

In [26]:
from nltk.stem import WordNetLemmatizer
def lemmatizeSentence(sentence):
    """Return ``sentence`` with every token replaced by its WordNet lemma.

    Tokens come from ``word_tokenize``; each one is lemmatized without a
    part-of-speech argument (a POS tag could be supplied to refine the result),
    and the lemmas are joined with single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    return ' '.join(
        lemmatizer.lemmatize(token) for token in word_tokenize(sentence)
    )

In [27]:
# Lemmatize the stop-word-free text, one review at a time.
sampled_reviews_df['review_body_after_lemma'] = (
    sampled_reviews_df['review_body_without_stop_words'].apply(lemmatizeSentence)
)

In [28]:
sampled_reviews_df.head(1)

Unnamed: 0,review_id,star_rating,review_headline,review_body,review_date,review_body_contracted,review_body_without_stop_words,review_body_after_lemma
179967,R3NZV5F5X17AO9,1,Cheaply Made Chain,chain was cheaply made defective clasp returned,2015-06-08,chain was cheaply made defective clasp returned,chain cheaply made defective clasp returned,chain cheaply made defective clasp returned


In [29]:
sampled_reviews_df.to_csv('sampled_reviews_df.tsv', sep="\t")

# TF-IDF Feature Extraction

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit a TF-IDF vocabulary capped at 10000 features on the lemmatized reviews and
# transform those same reviews in one step.
tfidfVector = TfidfVectorizer(max_features=10000)
lemmatized_reviews = sampled_reviews_df['review_body_after_lemma'].to_list()
fittedValue = tfidfVector.fit_transform(lemmatized_reviews)

In [31]:
# Column labels for the TF-IDF matrix. scikit-learn 1.0 introduced
# get_feature_names_out() and 1.2 removed get_feature_names(), so prefer the new
# accessor and fall back to the old one only on older installations.
feature_names = (
    tfidfVector.get_feature_names_out()
    if hasattr(tfidfVector, 'get_feature_names_out')
    else tfidfVector.get_feature_names()
)
# NOTE(review): toarray() densifies the whole matrix (one row per review, up to
# 10000 columns); this can need several GB of RAM - confirm that is acceptable.
data = pd.DataFrame(fittedValue.toarray(), columns=feature_names)



### Train-test split

In [32]:
from sklearn.model_selection import train_test_split
# 80/20 split, stratified on the star rating so both parts keep the class balance.
star_labels = sampled_reviews_df['star_rating'].to_list()
trainData, testData, trainLabels, testLabels = train_test_split(
    data,
    star_labels,
    test_size=0.2,
    random_state=42,
    stratify=star_labels,
)

# Perceptron

In [33]:
from sklearn.neural_network import MLPClassifier
# Train a multi-layer perceptron with a single hidden layer of 64 units on the
# TF-IDF training features; early stopping is enabled and progress is printed.
mlpModel = MLPClassifier(
    hidden_layer_sizes=(64,),
    random_state=1,
    early_stopping=True,
    max_iter=200,
    verbose=True,
).fit(trainData.values, trainLabels)

In [1]:
mlpModel.score(testData.values, testLabels)

NameError: name 'mlpModel' is not defined

# SVM

# Logistic Regression

# Naive Bayes