In [2]:
import re 
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [3]:
df = pd.read_csv("https://raw.githubusercontent.com/benvictoria17/DataAnalytics/main/dataset/Amazon%20Musical%20Instruments%20Reviews/Musical_instruments_reviews.csv")

In [4]:
df.shape

(10261, 9)

In [5]:
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A2IBPI20UZIR0U,1384719342,"cassandra tu ""Yeah, well, that's just like, u...","[0, 0]","Not much to write about here, but it does exac...",5.0,good,1393545600,"02 28, 2014"
1,A14VAT5EAX3D9S,1384719342,Jake,"[13, 14]",The product does exactly as it should and is q...,5.0,Jake,1363392000,"03 16, 2013"
2,A195EZSQDW3E21,1384719342,"Rick Bennette ""Rick Bennette""","[1, 1]",The primary job of this device is to block the...,5.0,It Does The Job Well,1377648000,"08 28, 2013"
3,A2C00NNG1ZQQG2,1384719342,"RustyBill ""Sunday Rocker""","[0, 0]",Nice windscreen protects my MXL mic and preven...,5.0,GOOD WINDSCREEN FOR THE MONEY,1392336000,"02 14, 2014"
4,A94QU4C90B1AX,1384719342,SEAN MASLANKA,"[0, 0]",This pop filter is great. It looks and perform...,5.0,No more pops when I record my vocals.,1392940800,"02 21, 2014"


In [6]:
df.dtypes

reviewerID         object
asin               object
reviewerName       object
helpful            object
reviewText         object
overall           float64
summary            object
unixReviewTime      int64
reviewTime         object
dtype: object

In [7]:
def change_score(rating):
    if rating < 3:
        return 0
    elif rating > 3:
        return 2
    else:
        return 1

df_score = df["overall"]
df_score = df_score.map(change_score)
df["overall"] = df_score
df.tail()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
10256,A14B2YH83ZXMPP,B00JBIVXGC,Lonnie M. Adams,"[0, 0]","Great, just as expected. Thank to all.",2,Five Stars,1405814400,"07 20, 2014"
10257,A1RPTVW5VEOSI,B00JBIVXGC,Michael J. Edelman,"[0, 0]",I've been thinking about trying the Nanoweb st...,2,"Long life, and for some players, a good econom...",1404259200,"07 2, 2014"
10258,AWCJ12KBO5VII,B00JBIVXGC,Michael L. Knapp,"[0, 0]",I have tried coated strings in the past ( incl...,2,Good for coated.,1405987200,"07 22, 2014"
10259,A2Z7S8B5U4PAKJ,B00JBIVXGC,"Rick Langdon ""Scriptor""","[0, 0]","Well, MADE by Elixir and DEVELOPED with Taylor...",2,Taylor Made,1404172800,"07 1, 2014"
10260,A2WA8TDCTGUADI,B00JBIVXGC,TheTerrorBeyond,"[0, 0]","These strings are really quite good, but I wou...",2,"These strings are really quite good, but I wou...",1405468800,"07 16, 2014"


In [8]:
df["overall"].value_counts()

2    9022
1     772
0     467
Name: overall, dtype: int64

In [9]:
def convert_to_list(str_lst):
    str_ = str_lst.strip("[]").replace(","," ")
    lst = str_.split()
    lst_to_int = list(map(int, lst))
    return lst_to_int
        
def total_rating(lst_rating):
    return lst_rating[1] 

def helpful_rating(lst_rating):
    return lst_rating[0] 

df["helpful"] = df["helpful"].map(convert_to_list) # "[x,y]" -> [x,y]
df["total_ratings"] = df["helpful"].map(total_rating) # y
df["helpful"] = df["helpful"].map(helpful_rating) # x

df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,total_ratings
0,A2IBPI20UZIR0U,1384719342,"cassandra tu ""Yeah, well, that's just like, u...",0,"Not much to write about here, but it does exac...",2,good,1393545600,"02 28, 2014",0
1,A14VAT5EAX3D9S,1384719342,Jake,13,The product does exactly as it should and is q...,2,Jake,1363392000,"03 16, 2013",14
2,A195EZSQDW3E21,1384719342,"Rick Bennette ""Rick Bennette""",1,The primary job of this device is to block the...,2,It Does The Job Well,1377648000,"08 28, 2013",1
3,A2C00NNG1ZQQG2,1384719342,"RustyBill ""Sunday Rocker""",0,Nice windscreen protects my MXL mic and preven...,2,GOOD WINDSCREEN FOR THE MONEY,1392336000,"02 14, 2014",0
4,A94QU4C90B1AX,1384719342,SEAN MASLANKA,0,This pop filter is great. It looks and perform...,2,No more pops when I record my vocals.,1392940800,"02 21, 2014",0


In [10]:
print("Number rows having common values of [reviewerName, reviewText, unixReviewTime] =", df[df.duplicated(subset=["reviewerName", "reviewText", "unixReviewTime"])].shape[0])

Number rows having common values of [reviewerName, reviewText, unixReviewTime] = 0


In [11]:
print("Number of rows, in which helpful > total_ratings =",df[df["helpful"] > df["total_ratings"]].shape[0])

Number of rows, in which helpful > total_ratings = 0


In [12]:
df.isnull().sum()

reviewerID         0
asin               0
reviewerName      27
helpful            0
reviewText         7
overall            0
summary            0
unixReviewTime     0
reviewTime         0
total_ratings      0
dtype: int64

In [13]:
df.drop(df[df["reviewText"].isnull()].index, axis=0, inplace=True)
df.reset_index(drop=True, inplace=True)

In [14]:
def remove_html(text):
    html_pattern = re.compile("<.*?>")
    text = re.sub(html_pattern, " ", text) # Substitute HTML tag with space
    return text

def remove_spl_char(text):
    text = re.sub(r"[?|!|.|,|)|(|\|/|#|\'|\"]", r"", text) # All the special characters removed
    return text

def in_lowercase(text):
    text = text.lower()
    return text

stop_words = set(stopwords.words("english")) # List of all the stop words

def remove_stopwords(text):
    filtered_text_lst = []
    text_lst = text.split()
    for word in text_lst:
        if word not in stop_words:
            filtered_text_lst.append(word)
        else:
            continue
    filtered_word = " ".join(filtered_text_lst)
    return filtered_word

stem = PorterStemmer()
def stemming(text):
    stemmed_txt_lst = []
    text_lst = text.split()
    for word in text_lst:
        stemmed_word = stem.stem(word)
        stemmed_txt_lst.append(stemmed_word)
    stemmed_txt_lst = " ".join(stemmed_txt_lst)
    return stemmed_txt_lst
    
def text_preprocessing(text):
    rem_html_txt = remove_html(text) # Remove HTML
    rem_spl_char_txt = remove_spl_char(rem_html_txt) # Remove Special Characters
    lowercase_txt = in_lowercase(rem_spl_char_txt) # Conversion in lowercase
    rem_stopwords_txt = remove_stopwords(lowercase_txt) # Remove stopwords
    stemmed_txt = stemming(text)
    final_txt = stemmed_txt
    return final_txt

df["final_review"] = df["reviewText"].map(text_preprocessing)
print("Before Text Preprocessing- ", "\n")
print(df["reviewText"][0], "\n")
print("After Text Preprocessing- ", "\n")
print(df["final_review"][0])

Before Text Preprocessing-  

Not much to write about here, but it does exactly what it's supposed to. filters out the pop sounds. now my recordings are much more crisp. it is one of the lowest prices pop filters on amazon so might as well buy it, they honestly work the same despite their pricing, 

After Text Preprocessing-  

not much to write about here, but it doe exactli what it' suppos to. filter out the pop sounds. now my record are much more crisp. it is one of the lowest price pop filter on amazon so might as well buy it, they honestli work the same despit their pricing,


In [15]:
reviews = []
def construct_reviews_lst(review):
    review_split = review.split()
    reviews.append(review_split)
df["final_review"].map(construct_reviews_lst)   

print(df["final_review"].iloc[0]) # Before
reviews[0] 

not much to write about here, but it doe exactli what it' suppos to. filter out the pop sounds. now my record are much more crisp. it is one of the lowest price pop filter on amazon so might as well buy it, they honestli work the same despit their pricing,


['not',
 'much',
 'to',
 'write',
 'about',
 'here,',
 'but',
 'it',
 'doe',
 'exactli',
 'what',
 "it'",
 'suppos',
 'to.',
 'filter',
 'out',
 'the',
 'pop',
 'sounds.',
 'now',
 'my',
 'record',
 'are',
 'much',
 'more',
 'crisp.',
 'it',
 'is',
 'one',
 'of',
 'the',
 'lowest',
 'price',
 'pop',
 'filter',
 'on',
 'amazon',
 'so',
 'might',
 'as',
 'well',
 'buy',
 'it,',
 'they',
 'honestli',
 'work',
 'the',
 'same',
 'despit',
 'their',
 'pricing,']

In [17]:
w2v_model = Word2Vec(reviews, vector_size=50, min_count=5)
def avg_w2v(reviews):
    text_vector = []
    for review in reviews:
        review_vec_sum = np.zeros(50)
        num_words = 0
        for word in review:
            try:
                word_vec = w2v_model.wv[word]
                review_vec_sum += word_vec
                num_words += 1
            except:
                pass
        avg_review_vector = review_vec_sum / num_words
        text_vector.append(avg_review_vector)
    return text_vector

text_vector = np.array(avg_w2v(reviews)) 
print(df["final_review"][0])

print("\n\nVector Representation of above text - ")
text_vector[0] 

not much to write about here, but it doe exactli what it' suppos to. filter out the pop sounds. now my record are much more crisp. it is one of the lowest price pop filter on amazon so might as well buy it, they honestli work the same despit their pricing,


Vector Representation of above text - 


array([ 0.17475755, -0.35378106, -0.42393964,  0.09785367, -0.17032797,
       -0.05104926,  0.49417332,  0.47967612, -0.35972262, -0.25008195,
        0.28523428, -0.31540545,  0.43778842,  0.30244684, -0.33784105,
        1.10185047,  0.47256438,  0.21782782, -0.64956004,  0.1097148 ,
        0.11025078,  0.31815014,  0.5678942 , -0.06164812,  0.26819361,
       -0.16839348, -0.28421945,  0.31620463, -0.12708545, -0.32856199,
       -0.21759072,  0.09058963,  0.22830556,  0.15017771, -0.60820809,
       -0.08033748,  0.1141738 ,  0.08093768,  0.27820454, -0.37364091,
        0.77315059,  0.37992832,  0.0517144 ,  0.38140865,  0.47308853,
        0.11246037,  0.02499377, -0.78818469,  0.25615816,  0.03671107])

In [18]:
def create_feature_names():
    text_features = []
    for index in range(1,51):
        feature_name = "text-feature-"+ str(index)
        text_features.append(feature_name)
    return text_features
text_features = create_feature_names()

def create_df_txt_vec(text_vector):
    df_text_lst = []
    for vector in text_vector:
        vector_reshape = np.reshape(vector ,(50, 1)).T
        df_vector = pd.DataFrame(vector_reshape, columns=text_features)
        df_text_lst.append(df_vector)
    df_text = pd.concat(df_text_lst, ignore_index=True)
    return df_text

df_text = create_df_txt_vec(text_vector)

df_final = pd.concat([df_text, df["helpful"], df["total_ratings"], df["overall"]], axis=1)

In [19]:
df_final.head()

Unnamed: 0,text-feature-1,text-feature-2,text-feature-3,text-feature-4,text-feature-5,text-feature-6,text-feature-7,text-feature-8,text-feature-9,text-feature-10,...,text-feature-44,text-feature-45,text-feature-46,text-feature-47,text-feature-48,text-feature-49,text-feature-50,helpful,total_ratings,overall
0,0.174758,-0.353781,-0.42394,0.097854,-0.170328,-0.051049,0.494173,0.479676,-0.359723,-0.250082,...,0.381409,0.473089,0.11246,0.024994,-0.788185,0.256158,0.036711,0,0,2
1,-0.011216,-0.333489,-0.18853,-0.033778,0.107343,-0.094726,0.044379,0.40825,-0.292075,-0.304488,...,0.183532,0.443912,0.278432,0.213817,-0.851107,0.321744,0.184368,13,14,2
2,-0.298874,-0.251139,-0.002743,-0.250483,-0.318461,-0.106541,-0.109154,0.301816,-0.492155,-0.459679,...,0.317405,0.462119,0.452929,-0.062902,-0.446603,0.066848,0.421023,1,1,2
3,-0.150091,-0.227379,0.026772,-0.142237,-0.166871,0.014411,-0.039234,0.34285,-0.144623,-0.460504,...,0.394085,0.608508,0.40263,-0.024339,-0.425605,0.006842,0.307974,0,0,2
4,-0.06536,-0.116549,-0.034588,-0.045736,-0.357636,-0.224357,-0.08632,0.392834,-0.452386,-0.954828,...,0.245058,0.176507,0.091516,0.302508,-0.579151,0.401698,0.226064,0,0,2


In [20]:
df_features = df_final.drop("overall", axis=1)
df_target = df_final["overall"]

df_features_columns = df_features.columns
df_features_scaled = StandardScaler().fit_transform(df_features)
df_features_scaled = pd.DataFrame(df_features_scaled, columns=df_features_columns)
X_train, X_test, y_train, y_test = train_test_split(df_features_scaled, df_target, test_size=0.25, train_size=0.75)
df_final["overall"].value_counts()

2    9015
1     772
0     467
Name: overall, dtype: int64

In [21]:
oversampling_smote = SMOTE(sampling_strategy={1:5000, 0:5000})
undersampling = RandomUnderSampler(sampling_strategy={2:5000})
pipeline = Pipeline([('under', undersampling), ('smote', oversampling_smote)])
df_train_resampled = pipeline.fit_resample(X_train, y_train)
X_train = df_train_resampled[0] # Resampled X_train 
y_train = df_train_resampled[1] # Resampled y_train

X_train, y_train = shuffle(X_train, y_train)

In [22]:
model = LogisticRegression(C = 6, solver="sag", max_iter=5000) 
model.fit(X_train, y_train)

LogisticRegression(C=6, max_iter=5000, solver='sag')

In [23]:
acc_test = model.score(X_test, y_test)*100
acc_test = round(acc_test, 2)
print("Accuracy of LogisticRegression model on test set: " + str(acc_test) + "%")

Accuracy of LogisticRegression model on test set: 66.07%


In [24]:
K = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] # Values of hyperparameter K

cv_f1_mean= []

for value in K:
    model = KNeighborsClassifier(n_neighbors=value)
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring="f1_micro")
    cv_f1_mean.append(np.mean(scores))
    
max_f1 = max(cv_f1_mean)
index_max_f1 = cv_f1_mean.index(max_f1)

print("Optimal value of hyperparameter K: " + str(K[index_max_f1]))
print("F1 score at optimal K: " + str(max_f1))

Optimal value of hyperparameter K: 1
F1 score at optimal K: 0.8556666666666667


In [25]:
model = KNeighborsClassifier(n_neighbors=1) # Calculating test accuracy of model
model.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=1)

In [27]:
acc_test = model.score(X_test, y_test)*100
acc_test = round(acc_test, 2)
print("Accuracy of 1-NN model on test set: " + str(acc_test) + "%")

Accuracy of 1-NN model on test set: 57.1%


In [28]:
model = SVC(C = 15,kernel="rbf") # Calculating test accuracy of model
model.fit(X_train, y_train)

SVC(C=15)

In [29]:
acc_test = model.score(X_test, y_test)*100
acc_test = round(acc_test, 2)
print("Accuracy of SVM model on test set: " + str(acc_test) + "%")

Accuracy of SVM model on test set: 81.55%


In [30]:
model = GaussianNB()
model.fit(X_train, y_train)

GaussianNB()

In [31]:
acc_test = model.score(X_test, y_test)*100
acc_test = round(acc_test, 2)
print("Accuracy of GaussianNB model on test set: " + str(acc_test) + "%")

Accuracy of GaussianNB model on test set: 51.68%


In [32]:
model = RandomForestClassifier()
model.fit(X_train, y_train)

RandomForestClassifier()

In [33]:
acc_test = model.score(X_test, y_test)*100
acc_test = round(acc_test, 2)
print("Accuracy of Random Forest model on test set: " + str(acc_test) + "%")

Accuracy of Random Forest model on test set: 80.81%
