In [1]:
#Importing required libraries
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize, regexp_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import string
import re

In [2]:
#Loading the data
data=pd.read_csv("Rating_Project_Data.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,Rating,Review_Title,Review_Desc
0,0,5.0 out of 5 stars,The Best Lg has to offer,Its a very well built ultra light 17 inch lapt...
1,1,5.0 out of 5 stars,Just awesome. Best in class,Great laptop. Mindblowing speed. Fantastic di...
2,2,5.0 out of 5 stars,Go for it,-
3,3,4.0 out of 5 stars,Very solow speed and hang laptop bad,Very slow and not good 8gb ram according speed...
4,4,1.0 out of 5 stars,Its a regret!!,Really hate the product. Its wastage of money....


In [3]:
#Checking the shape of the data
data.shape

(18273, 4)

In [4]:
#Extracting only the rating number value(1 to 5) from the whole text of Rating
data[["Ratings", "Rating_5"]]=data.Rating.str.split(".", expand=True)

In [5]:
#Removing the unwanted columns from the dataset.
data.drop(["Unnamed: 0", "Rating", "Rating_5", "Review_Title"],axis=1, inplace=True)

In [6]:
data.head(10)

Unnamed: 0,Review_Desc,Ratings
0,Its a very well built ultra light 17 inch lapt...,5
1,Great laptop. Mindblowing speed. Fantastic di...,5
2,-,5
3,Very slow and not good 8gb ram according speed...,4
4,Really hate the product. Its wastage of money....,1
5,The Product Worth it,5
6,Please don't buy this laptop. The screen keep...,1
7,Firstly for the correct information of all pot...,4
8,After seeing the reviews I was confused. But ...,5
9,This is a review after more than 6 months of u...,2


In [7]:
#Lets check if we have any null values
data.isnull().sum()

Review_Desc    1
Ratings        0
dtype: int64

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18273 entries, 0 to 18272
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Review_Desc  18272 non-null  object
 1   Ratings      18273 non-null  object
dtypes: object(2)
memory usage: 285.6+ KB


In [9]:
data.describe()

Unnamed: 0,Review_Desc,Ratings
count,18272,18273
unique,15099,6
top,Good,5
freq,104,7214


In [10]:
data["Ratings"].value_counts()

5    7214
1    4619
4    3245
3    1762
2    1204
-     229
Name: Ratings, dtype: int64

In [11]:
#Lets replace the "-" values by Nan values to identify the null values.

data.replace({'-': np.nan}, inplace=True)

In [12]:
data.isnull().sum()

Review_Desc     51
Ratings        229
dtype: int64

In [13]:
#Lets drop these null values and proceed further on the preprocessing.
data.dropna(inplace=True)

In [14]:
data.isnull().sum()

Review_Desc    0
Ratings        0
dtype: int64

In [15]:
data["Ratings"].value_counts()

5    7189
1    4609
4    3235
3    1757
2    1203
Name: Ratings, dtype: int64

###### Preprocessing Of the data:

In [16]:
#Lets first convert all the words to lowercase.

def convert_lowercase(review):
    return review.lower()

data["Review_Desc"]=data.Review_Desc.apply(lambda x: convert_lowercase(x))

data.head()   

Unnamed: 0,Review_Desc,Ratings
0,its a very well built ultra light 17 inch lapt...,5
1,great laptop. mindblowing speed. fantastic di...,5
3,very slow and not good 8gb ram according speed...,4
4,really hate the product. its wastage of money....,1
5,the product worth it,5


In [17]:
#Punctuations
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [18]:
#Removing the punctuations from the data as it does not provide the required information on predicting the rating.
#str.maketrans removes any punctuations 

def remove_punct(review):
    return review.translate(str.maketrans('', '', string.punctuation))

data["Review_Desc"]=data.Review_Desc.apply(lambda x: remove_punct(x))
data.head(11)


Unnamed: 0,Review_Desc,Ratings
0,its a very well built ultra light 17 inch lapt...,5
1,great laptop mindblowing speed fantastic disp...,5
3,very slow and not good 8gb ram according speed...,4
4,really hate the product its wastage of money i...,1
5,the product worth it,5
6,please dont buy this laptop the screen keeps ...,1
7,firstly for the correct information of all pot...,4
8,after seeing the reviews i was confused but i...,5
9,this is a review after more than 6 months of u...,2
10,i purchased gr0011au 1tb hdd no ssd ryzen 3 32...,4


In [19]:
# Using word_tokenize to tokenize sentence into words

def tokenized_rev(review):
    return word_tokenize(review, "english")
data["Review_Desc"]=data.Review_Desc.apply(lambda x: tokenized_rev(x))
data.head(11)

    


Unnamed: 0,Review_Desc,Ratings
0,"[its, a, very, well, built, ultra, light, 17, ...",5
1,"[great, laptop, mindblowing, speed, fantastic,...",5
3,"[very, slow, and, not, good, 8gb, ram, accordi...",4
4,"[really, hate, the, product, its, wastage, of,...",1
5,"[the, product, worth, it]",5
6,"[please, dont, buy, this, laptop, the, screen,...",1
7,"[firstly, for, the, correct, information, of, ...",4
8,"[after, seeing, the, reviews, i, was, confused...",5
9,"[this, is, a, review, after, more, than, 6, mo...",2
10,"[i, purchased, gr0011au, 1tb, hdd, no, ssd, ry...",4


In [20]:
#Loading stopwords
stop_words=stopwords.words('English')
type(stop_words), stop_words, len(stop_words)

(list,
 ['i',
  'me',
  'my',
  'myself',
  'we',
  'our',
  'ours',
  'ourselves',
  'you',
  "you're",
  "you've",
  "you'll",
  "you'd",
  'your',
  'yours',
  'yourself',
  'yourselves',
  'he',
  'him',
  'his',
  'himself',
  'she',
  "she's",
  'her',
  'hers',
  'herself',
  'it',
  "it's",
  'its',
  'itself',
  'they',
  'them',
  'their',
  'theirs',
  'themselves',
  'what',
  'which',
  'who',
  'whom',
  'this',
  'that',
  "that'll",
  'these',
  'those',
  'am',
  'is',
  'are',
  'was',
  'were',
  'be',
  'been',
  'being',
  'have',
  'has',
  'had',
  'having',
  'do',
  'does',
  'did',
  'doing',
  'a',
  'an',
  'the',
  'and',
  'but',
  'if',
  'or',
  'because',
  'as',
  'until',
  'while',
  'of',
  'at',
  'by',
  'for',
  'with',
  'about',
  'against',
  'between',
  'into',
  'through',
  'during',
  'before',
  'after',
  'above',
  'below',
  'to',
  'from',
  'up',
  'down',
  'in',
  'out',
  'on',
  'off',
  'over',
  'under',
  'again',
  'further'

In [21]:
stop_words.append(('its','firstly', 'first'))

In [22]:
len(stop_words)

180

In [23]:
 #Lets derive the negative sentiment stopwords and keep them in our data which will help us to predict the lower ratings
keep_words=['ain','aren',"aren't",'couldn',"couldn't",'didn',"didn't",'doesn',"doesn't",'hadn',"hadn't",'hasn',"hasn't",'haven',"haven't",'isn', "isn't",'don',"don't",'no','nor','not', 'mightn',"mightn't",'mustn',"mustn't",'needn',"needn't",'shan',"shan't",'shouldn',"shouldn't",'wasn',"wasn't",'weren',"weren't",'won',"won't",'wouldn',"wouldn't"]
type(keep_words)

list

In [24]:
#Lets remove the keep_words from the stop_words
for i in keep_words:
    stop_words.remove(i)

In [25]:
len(stop_words) #We can see the keep_words are now removed from the stop_words. 

140

In [26]:
data.head()

Unnamed: 0,Review_Desc,Ratings
0,"[its, a, very, well, built, ultra, light, 17, ...",5
1,"[great, laptop, mindblowing, speed, fantastic,...",5
3,"[very, slow, and, not, good, 8gb, ram, accordi...",4
4,"[really, hate, the, product, its, wastage, of,...",1
5,"[the, product, worth, it]",5


In [27]:
#Now lets remove the stop_words from our data

def remove_stopwords(review):
    reviews_with_no_stopwords= [rev for rev in review if not rev in stop_words]
    reviews=' '.join(reviews_with_no_stopwords)
    return reviews

data["Review_Desc"]=data.Review_Desc.apply(lambda x: remove_stopwords(x))
data.head(11)

Unnamed: 0,Review_Desc,Ratings
0,well built ultra light 17 inch laptop needed p...,5
1,great laptop mindblowing speed fantastic displ...,5
3,slow not good 8gb ram according speed slow han...,4
4,really hate product wastage money taking long ...,1
5,product worth,5
6,please dont buy laptop screen keeps freezing b...,1
7,firstly correct information potential buyers g...,4
8,seeing reviews confused decided go got deliver...,5
9,review 6 months usagei azuz laptop since many ...,2
10,purchased gr0011au 1tb hdd no ssd ryzen 3 3250...,4


In [28]:
df1=data.copy()
df1.head(20)

Unnamed: 0,Review_Desc,Ratings
0,well built ultra light 17 inch laptop needed p...,5
1,great laptop mindblowing speed fantastic displ...,5
3,slow not good 8gb ram according speed slow han...,4
4,really hate product wastage money taking long ...,1
5,product worth,5
6,please dont buy laptop screen keeps freezing b...,1
7,firstly correct information potential buyers g...,4
8,seeing reviews confused decided go got deliver...,5
9,review 6 months usagei azuz laptop since many ...,2
10,purchased gr0011au 1tb hdd no ssd ryzen 3 3250...,4


In [29]:
#Now Lets remove the numbers from the data
def remove_num(review):
    review = re.sub(r'[0-9]+', "", review)
    return review

df1["Review_Desc"]=df1.Review_Desc.apply(lambda x: remove_num(x))
df1.head(20)

Unnamed: 0,Review_Desc,Ratings
0,well built ultra light inch laptop needed por...,5
1,great laptop mindblowing speed fantastic displ...,5
3,slow not good gb ram according speed slow hang...,4
4,really hate product wastage money taking long ...,1
5,product worth,5
6,please dont buy laptop screen keeps freezing b...,1
7,firstly correct information potential buyers g...,4
8,seeing reviews confused decided go got deliver...,5
9,review months usagei azuz laptop since many p...,2
10,purchased grau tb hdd no ssd ryzen economica...,4


In [30]:
df2=df1.copy()
df2.head(20)

Unnamed: 0,Review_Desc,Ratings
0,well built ultra light inch laptop needed por...,5
1,great laptop mindblowing speed fantastic displ...,5
3,slow not good gb ram according speed slow hang...,4
4,really hate product wastage money taking long ...,1
5,product worth,5
6,please dont buy laptop screen keeps freezing b...,1
7,firstly correct information potential buyers g...,4
8,seeing reviews confused decided go got deliver...,5
9,review months usagei azuz laptop since many p...,2
10,purchased grau tb hdd no ssd ryzen economica...,4


In [31]:
# Lemmatization
lm = WordNetLemmatizer()
def lemmatiz_words(review):
    lemmatized_rev=[]
    for i in review:
        lemmatized_rev.append(lm.lemmatize(i))
    return "".join(lemmatized_rev)

df1["Review_Desc"]=df1.Review_Desc.apply(lambda x: lemmatiz_words(x))
df1.head(20)

Unnamed: 0,Review_Desc,Ratings
0,well built ultra light inch laptop needed por...,5
1,great laptop mindblowing speed fantastic displ...,5
3,slow not good gb ram according speed slow hang...,4
4,really hate product wastage money taking long ...,1
5,product worth,5
6,please dont buy laptop screen keeps freezing b...,1
7,firstly correct information potential buyers g...,4
8,seeing reviews confused decided go got deliver...,5
9,review months usagei azuz laptop since many p...,2
10,purchased grau tb hdd no ssd ryzen economica...,4


In [32]:
df2=df1.copy()
df2.head(20)

Unnamed: 0,Review_Desc,Ratings
0,well built ultra light inch laptop needed por...,5
1,great laptop mindblowing speed fantastic displ...,5
3,slow not good gb ram according speed slow hang...,4
4,really hate product wastage money taking long ...,1
5,product worth,5
6,please dont buy laptop screen keeps freezing b...,1
7,firstly correct information potential buyers g...,4
8,seeing reviews confused decided go got deliver...,5
9,review months usagei azuz laptop since many p...,2
10,purchased grau tb hdd no ssd ryzen economica...,4


In [33]:
#Now lets remove the emojis present in our data. 

import re
def remove_emoji(review):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F" # emoticons
                           u"\U0001F300-\U0001F5FF" # symbols & pictographs
                           u"\U0001F680-\U0001F6FF" # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF" # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', review)


#remove_emoji("I am happy 🙂😊")

In [34]:
df2["Review_Desc"]=df2.Review_Desc.apply(lambda x: remove_emoji(x))
df2.head(20)

Unnamed: 0,Review_Desc,Ratings
0,well built ultra light inch laptop needed por...,5
1,great laptop mindblowing speed fantastic displ...,5
3,slow not good gb ram according speed slow hang...,4
4,really hate product wastage money taking long ...,1
5,product worth,5
6,please dont buy laptop screen keeps freezing b...,1
7,firstly correct information potential buyers g...,4
8,seeing reviews confused decided go got deliver...,5
9,review months usagei azuz laptop since many p...,2
10,purchased grau tb hdd no ssd ryzen economica...,4


In [35]:
#Now lets convert the emoji's to words

#try:
 #   import cPickle as pickle
#except ImportError: 
 #   import pickle
#import re

#with open('Emoji_Dict.p', 'rb') as fp:
 #   Emoji_Dict = pickle.load(fp)
#Emoji_Dict = {v: k for k, v in Emoji_Dict.items()}

#def convert_emojis_to_word(review):
 #   for emot in Emoji_Dict:
  #      reviews = re.sub(r'('+emot+')', "_".join(Emoji_Dict[emot].replace(",","").replace(":","").split()), review)
   # return reviews

###### Feature Extraction

Now our data is ready. 
CountVectorizer transform a given text into a vector on the basis of the frequency (count) of each word that occurs in the entire text.
###min_df: When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold.

In [36]:
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer(min_df=8)  #min_df=8, the word frequency which are more than 8 are considered

feature_vector = count_vectorizer.fit(df2["Review_Desc"])

#to get a list of all unique words
features = feature_vector.get_feature_names()

#to get a sparse matrix of the words in the text
df_features = count_vectorizer.transform(df2["Review_Desc"])

In [37]:
len(features)

5056

In [38]:
df2.head()

Unnamed: 0,Review_Desc,Ratings
0,well built ultra light inch laptop needed por...,5
1,great laptop mindblowing speed fantastic displ...,5
3,slow not good gb ram according speed slow hang...,4
4,really hate product wastage money taking long ...,1
5,product worth,5


###### Train_Test_Split:

In [42]:
#Lets check for the best random_state

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

maxAcc=0
maxRS=0
for i in range(1,200):
    x_train,x_test,y_train,y_test=train_test_split(df_features.toarray(), df2["Ratings"],test_size=0.2,random_state=i) #Train/Test Splitting
    rfc=RandomForestClassifier()           #LinearRegression
    rfc.fit(x_train,y_train)         #Fitting the Model
    pred=rfc.predict(x_test)         #Predition
    acc=accuracy_score(y_test,pred)   #Accuracy score
    if acc>maxAcc:
        maxAcc=acc
        maxRS=i
        
print("Best Acc:",maxAcc,"on Rand State ",maxRS)

Best Acc: 0.7146429563767713 on Rand State  171


In [43]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(df_features.toarray(), df2["Ratings"], test_size=0.2, random_state=14)

len(x_train), len(y_train), len(x_test), len(y_test)

(14394, 14394, 3599, 3599)

In [44]:
len(x_train), len(y_train), len(x_test), len(y_test)

(14394, 14394, 3599, 3599)

###### Model Building:

Lets use multiple algorithms to train and predict our model.

In [45]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [47]:
gnb=GaussianNB()
dtc=DecisionTreeClassifier()
gbc=GradientBoostingClassifier()
rfc=RandomForestClassifier()
svc=SVC()
knc=KNeighborsClassifier()
mnb=MultinomialNB()

model = [gnb, dtc, gbc, rfc, svc, knc, mnb]

for m in model:
    m.fit(x_train, y_train)
    y_pred=m.predict(x_test)
    acc=accuracy_score(y_test, y_pred)
    
    print("Accuracy_Score of ",m, "is", acc)
    print("Confusion_Matrix :", confusion_matrix(y_test, y_pred))
    print("Classification_Report :", classification_report(y_test, y_pred))
    print('\n')
    print("*************************************************************************")
    print("*************************************************************************")

Accuracy_Score of  GaussianNB() is 0.243123089747152
Confusion_Matrix : [[255 461 166  41  21]
 [ 35 123  35  14   9]
 [ 25 145 109  35  21]
 [ 63 234 115 170  56]
 [112 652 250 234 218]]
Classification_Report :               precision    recall  f1-score   support

           1       0.52      0.27      0.36       944
           2       0.08      0.57      0.13       216
           3       0.16      0.33      0.22       335
           4       0.34      0.27      0.30       638
           5       0.67      0.15      0.24      1466

    accuracy                           0.24      3599
   macro avg       0.35      0.32      0.25      3599
weighted avg       0.49      0.24      0.27      3599



*************************************************************************
*************************************************************************
Accuracy_Score of  DecisionTreeClassifier() is 0.612948041122534
Confusion_Matrix : [[ 688   53   56   58   89]
 [  74   82   14   14   32]
 [  79   

###### Hyper Parameter Tuning and Cross Validation:

In [48]:
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [49]:
ran_class=RandomForestClassifier()
param={'max_features' : ['sqrt','auto', 'log2'],
       'n_estimators' : [100, 500, 1000],
       'min_samples_leaf' : [1, 5, 10, 15],
       'min_samples_split' : [5, 10, 15, 20],
       'random_state': list(range(0,50)),
       'bootstrap': [True, False],
       'criterion' : ['entropy','gini']
      }
     
cv=KFold(n_splits=10,shuffle=False)    

In [50]:
RCV=RandomizedSearchCV(estimator=ran_class, param_distributions=param, cv=cv, verbose=2)
RCV.fit(x_train, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[CV] END bootstrap=True, criterion=gini, max_features=sqrt, min_samples_leaf=10, min_samples_split=10, n_estimators=500, random_state=19; total time= 3.0min
[CV] END bootstrap=True, criterion=gini, max_features=sqrt, min_samples_leaf=10, min_samples_split=10, n_estimators=500, random_state=19; total time= 2.9min
[CV] END bootstrap=True, criterion=gini, max_features=sqrt, min_samples_leaf=10, min_samples_split=10, n_estimators=500, random_state=19; total time= 3.0min
[CV] END bootstrap=True, criterion=gini, max_features=sqrt, min_samples_leaf=10, min_samples_split=10, n_estimators=500, random_state=19; total time= 2.9min
[CV] END bootstrap=True, criterion=gini, max_features=sqrt, min_samples_leaf=10, min_samples_split=10, n_estimators=500, random_state=19; total time= 2.9min
[CV] END bootstrap=True, criterion=gini, max_features=sqrt, min_samples_leaf=10, min_samples_split=10, n_estimators=500, random_state=19; total time= 2.

[CV] END bootstrap=True, criterion=gini, max_features=log2, min_samples_leaf=1, min_samples_split=20, n_estimators=100, random_state=34; total time=  13.6s
[CV] END bootstrap=True, criterion=gini, max_features=log2, min_samples_leaf=1, min_samples_split=20, n_estimators=100, random_state=34; total time=  13.5s
[CV] END bootstrap=True, criterion=gini, max_features=log2, min_samples_leaf=1, min_samples_split=20, n_estimators=100, random_state=34; total time=  13.4s
[CV] END bootstrap=True, criterion=gini, max_features=log2, min_samples_leaf=1, min_samples_split=20, n_estimators=100, random_state=34; total time=  13.6s
[CV] END bootstrap=True, criterion=gini, max_features=log2, min_samples_leaf=1, min_samples_split=20, n_estimators=100, random_state=34; total time=  13.4s
[CV] END bootstrap=True, criterion=gini, max_features=log2, min_samples_leaf=1, min_samples_split=20, n_estimators=100, random_state=34; total time=  13.4s
[CV] END bootstrap=True, criterion=gini, max_features=log2, min_

RandomizedSearchCV(cv=KFold(n_splits=10, random_state=None, shuffle=False),
                   estimator=RandomForestClassifier(),
                   param_distributions={'bootstrap': [True, False],
                                        'criterion': ['entropy', 'gini'],
                                        'max_features': ['sqrt', 'auto',
                                                         'log2'],
                                        'min_samples_leaf': [1, 5, 10, 15],
                                        'min_samples_split': [5, 10, 15, 20],
                                        'n_estimators': [100, 500, 1000],
                                        'random_state': [0, 1, 2, 3, 4, 5, 6, 7,
                                                         8, 9, 10, 11, 12, 13,
                                                         14, 15, 16, 17, 18, 19,
                                                         20, 21, 22, 23, 24, 25,
                                      

In [51]:
print("Best Parameters : ",RCV.best_params_)
print("Best Estimator : ",RCV.best_estimator_)

Best Parameters :  {'random_state': 49, 'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'auto', 'criterion': 'gini', 'bootstrap': False}
Best Estimator :  RandomForestClassifier(bootstrap=False, min_samples_split=5, random_state=49)


In [52]:
rcv_pred=RCV.predict(x_test)
print("Confusion_Matrix : \n", confusion_matrix(y_test,rcv_pred))
print("Accuracy_Score :", accuracy_score(y_test,rcv_pred))
print("Classification_Report :\n",classification_report(y_test,rcv_pred))

Confusion_Matrix : 
 [[ 850    0    2    8   84]
 [ 101   64    1    6   44]
 [  98    1  108   24  104]
 [  69    1    4  189  375]
 [  72    2    2   43 1347]]
Accuracy_Score : 0.7107529869408169
Classification_Report :
               precision    recall  f1-score   support

           1       0.71      0.90      0.80       944
           2       0.94      0.30      0.45       216
           3       0.92      0.32      0.48       335
           4       0.70      0.30      0.42       638
           5       0.69      0.92      0.79      1466

    accuracy                           0.71      3599
   macro avg       0.79      0.55      0.59      3599
weighted avg       0.73      0.71      0.68      3599



We have found the RandomForestclassifier as our best model with hyperparameter and cross validation and got the accuray of 71%. 