In [1]:
# importing packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import sys
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [3]:
d1 = pd.read_csv('./boat_bassheads900_reviews.csv')
d2 = pd.read_csv('./boat_bassheads900_reviews_1.csv')
d3 = pd.read_csv('./boat_bassheads900_reviews_2.csv')
d4 = pd.read_csv('./boat_bassheads900_reviews_3.csv')

In [4]:
# Using Concat function to merge all the four datasets
final_df = pd.concat([d1, d2, d3, d4], ignore_index = True)

In [5]:
final_df.head()

Unnamed: 0,review,rating
0,Its just fine and has lifespan of 1 year only ...,4.0 out of 5 stars
1,"For this price point ,this headphone is a grea...",4.0 out of 5 stars
2,Best experience ever in headphones I’ve used t...,4.0 out of 5 stars
3,overall is good,4.0 out of 5 stars
4,Bass Quality and sound is great,4.0 out of 5 stars


In [7]:
# Value count for the different ratings
final_df['rating'].value_counts()

4.0 out of 5 stars    753
3.0 out of 5 stars    615
5.0 out of 5 stars    475
2.0 out of 5 stars    140
1.0 out of 5 stars    114
Name: rating, dtype: int64

In [10]:
final_df.duplicated().sum()

134

In [11]:
final_df.drop_duplicates(inplace = True)

In [12]:
final_df['rating'] = final_df['rating'].apply(lambda x: x.split()[0].split('.')[0])

In [15]:
final_df['rating'].tail()

2092    2
2093    3
2094    2
2095    3
2096    2
Name: rating, dtype: object

In [17]:
final_df = final_df.astype({'rating':'int64'})

In [21]:
final_df['rating'].describe()

count    1963.000000
mean        3.602140
std         1.082309
min         1.000000
25%         3.000000
50%         4.000000
75%         4.000000
max         5.000000
Name: rating, dtype: float64

In [26]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, LancasterStemmer
import string
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\balun\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [25]:
def clean(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub('[""''_]', '', text)
    text = re.sub('\n', '', text)
    return text

def decontract_text(text):
    text = re.sub(r"won\'t", "will not", text)
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"won\’t", "will not", text)
    text = re.sub(r"can\’t", "can not", text)
    text = re.sub(r"\'t've", " not have", text)
    text = re.sub(r"\'d've", " would have", text)
    text = re.sub(r"\'clock", "f the clock", text)
    text = re.sub(r"\'cause", " because", text)
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    text = re.sub(r"n\’t", " not", text)
    text = re.sub(r"\’re", " are", text)
    text = re.sub(r"\’s", " is", text)
    text = re.sub(r"\’d", " would", text)
    text = re.sub(r"\’ll", " will", text)
    text = re.sub(r"\’t", " not", text)
    text = re.sub(r"\’ve", " have", text)
    text = re.sub(r"\’m", " am", text)
    return text

def final_clean(text):
    text = re.sub('[^a-zA-Z]', ' ', text)
    stop = set(stopwords.words('english'))
    filtered_words = [word for word in text.split() if word not in stop]
    ps = LancasterStemmer()
    text = [ps.stem(word) for word in filtered_words]
    text = ' '.join(text)
    return text

In [27]:
final_df['review'] = final_df['review'].apply(lambda text: decontract_text(text))
final_df['review'] = final_df['review'].apply(lambda text: clean(text))
final_df['review'] = final_df['review'].apply(lambda text: final_clean(text))

In [29]:
vec = CountVectorizer()
X = vec.fit_transform(final_df["review"])
X

<1963x3368 sparse matrix of type '<class 'numpy.int64'>'
	with 26320 stored elements in Compressed Sparse Row format>

In [31]:
df1 = pd.DataFrame(X.toarray(),columns=vec.get_feature_names_out())

In [32]:
final_df['rating'].value_counts()

4    678
3    597
5    435
2    140
1    113
Name: rating, dtype: int64

In [34]:
final_df['Happy_Unhappy'] = final_df['rating'].apply(lambda x: 1 if x>3 else 0)

In [35]:
final_df

Unnamed: 0,review,rating,Happy_Unhappy
0,fin lifesp year gre year mic headphon stop wor...,4,1
1,pric point headphon gre steal us headphon dai ...,4,1
2,best expery ev headphon us til sup sound qual ...,4,1
3,overal good,4,1
4,bass qual sound gre,4,1
...,...,...,...
2092,produc qual got fail within three month,2,0
2093,first produc defect replac got nic produc got ...,3,0
2094,poor connect nee toggl connect jack rep okay l...,2,0
2095,overal produc good nic sound qual jack good us...,3,0


In [36]:
final_df['Happy_Unhappy'].value_counts()

1    1113
0     850
Name: Happy_Unhappy, dtype: int64

In [38]:
y = final_df['Happy_Unhappy']
X_train, X_test, y_train, y_test = train_test_split(df1, y, test_size=0.2, stratify = y)

In [39]:
print(y_train.value_counts())
print(y_test.value_counts())

1    890
0    680
Name: Happy_Unhappy, dtype: int64
1    223
0    170
Name: Happy_Unhappy, dtype: int64


## Support Vector Classifiers

In [45]:
model = svm.SVC()
model.fit(X_train, y_train)
X_train_pred = model.predict(X_train)
X_test_pred = model.predict(X_test)
print(classification_report(y_train, X_train_pred))

              precision    recall  f1-score   support

           0       0.92      0.74      0.82       680
           1       0.83      0.95      0.88       890

    accuracy                           0.86      1570
   macro avg       0.87      0.85      0.85      1570
weighted avg       0.87      0.86      0.86      1570



## Random Forest Classifier

In [49]:
model = RandomForestClassifier(n_jobs = -1, max_depth = 10, n_estimators = 50)
model.fit(X_train, y_train)
X_train_pred = model.predict(X_train)
X_test_pred = model.predict(X_test)
print(classification_report(y_train, X_train_pred))

              precision    recall  f1-score   support

           0       0.98      0.33      0.49       680
           1       0.66      1.00      0.79       890

    accuracy                           0.71      1570
   macro avg       0.82      0.66      0.64      1570
weighted avg       0.80      0.71      0.66      1570



## `Random Forest Classifier` is giving the best accuracy