# Mounting Drive

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Importing Libraries


In [0]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from gensim.parsing import preprocessing, remove_stopwords

# Loading training dataset

In [4]:
dataframe = pd.read_csv('/content/drive/My Drive/Colab Notebooks/consumer_reviews.csv')
dataframe

Unnamed: 0,sub-categories,primary-categories,rating,reviews,title
0,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,3,I order 3 of them and one of the item is bad q...,... 3 of them and one of the item is bad quali...
1,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,4,Bulk is always the less expensive way to go fo...,... always the less expensive way to go for pr...
2,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,5,Well they are not Duracell but for the price i...,... are not Duracell but for the price i am ha...
3,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,5,Seem to work as well as name brand batteries a...,... as well as name brand batteries at a much ...
4,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,5,These batteries are very long lasting the pric...,... batteries are very long lasting the price ...
5,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,5,Bought a lot of batteries for Christmas and th...,... batteries for Christmas and the AmazonBasi...
6,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,5,ive not had any problame with these batteries ...,... batteries have ordered them in the past be...
7,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,5,Well if you are looking for cheap non-recharge...,... batteries that last quite a while then the...
8,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,3,These do not hold the amount of high power jui...,... do not hold the amount of high power juice...
9,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,4,AmazonBasics AA AAA batteries have done well b...,... done well by me appear to have a good shel...


# Data Pre-Processing

Here, the text cleaning process takes place. The text here is stripped of the following:
1. numeric characters
2. non-alphanumeric characters
3. multiple white-spaces
4. hashtags

Stemming of the text is also done here.

The above preprocessing steps are required to reduce the size of the sparse matrix created later. The text elements removed above do not affect the accuracy of the model severly.

The reviews are tokenized, cleaned(as mentioned above) and the converted back to strings.

**Example:**

*'I order 3 of them and one of the item is bad quality. Is missing backup spring so I have to put a pcs of aluminum to make the battery work.'*

Gets converted to:

*'i order of them and on of the item is bad qualiti is miss backup spring so i have to put a pc of aluminum to make the batteri work'*

In [0]:
FILTERS = [
    preprocessing.strip_numeric,  #strips numeric data
    preprocessing.strip_non_alphanum, #strips non-numeric data
    preprocessing.strip_multiple_whitespaces, #strips multiple whitespaces
    preprocessing.strip_punctuation, #strips punctuation
    preprocessing.strip_tags, #strips hashtags
    preprocessing.stem_text  #stemming 
]

def clean(string):
    word_list = preprocessing.preprocess_string(string, FILTERS)
    return ' '.join(word.lower() for word in word_list)     #Convert the cleaned reviews from a list back to a string

In [0]:
clean_reviews_list = [clean(reviews) for reviews in dataframe["reviews"]]

In [0]:
dataframe["clean_reviews"] = clean_reviews_list

In [54]:
dataframe

Unnamed: 0,sub-categories,primary-categories,rating,reviews,title,clean_reviews
0,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,3,I order 3 of them and one of the item is bad q...,... 3 of them and one of the item is bad quali...,i order of them and on of the item is bad qual...
1,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,4,Bulk is always the less expensive way to go fo...,... always the less expensive way to go for pr...,bulk is alwai the less expens wai to go for pr...
2,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,5,Well they are not Duracell but for the price i...,... are not Duracell but for the price i am ha...,well thei ar not duracel but for the price i a...
3,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,5,Seem to work as well as name brand batteries a...,... as well as name brand batteries at a much ...,seem to work as well as name brand batteri at ...
4,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,5,These batteries are very long lasting the pric...,... batteries are very long lasting the price ...,these batteri ar veri long last the price is g...
5,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,5,Bought a lot of batteries for Christmas and th...,... batteries for Christmas and the AmazonBasi...,bought a lot of batteri for christma and the a...
6,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,5,ive not had any problame with these batteries ...,... batteries have ordered them in the past be...,iv not had ani problam with these batteri have...
7,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,5,Well if you are looking for cheap non-recharge...,... batteries that last quite a while then the...,well if you ar look for cheap non recharg batt...
8,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,3,These do not hold the amount of high power jui...,... do not hold the amount of high power juice...,these do not hold the amount of high power jui...
9,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,4,AmazonBasics AA AAA batteries have done well b...,... done well by me appear to have a good shel...,amazonbas aa aaa batteri have done well by me ...


A new dataframe containing only the clean reviews and their respective ratings is created below.

In [0]:
cleaned_dataframe = pd.DataFrame(
                        { "reviews" : dataframe["clean_reviews"],
                          "rating" : dataframe["rating"]  
                        })

In [10]:
cleaned_dataframe

Unnamed: 0,reviews,rating
0,i order of them and on of the item is bad qual...,3
1,bulk is alwai the less expens wai to go for pr...,4
2,well thei ar not duracel but for the price i a...,5
3,seem to work as well as name brand batteri at ...,5
4,these batteri ar veri long last the price is g...,5
5,bought a lot of batteri for christma and the a...,5
6,iv not had ani problam with these batteri have...,5
7,well if you ar look for cheap non recharg batt...,5
8,these do not hold the amount of high power jui...,3
9,amazonbas aa aaa batteri have done well by me ...,4


The rating series is being categorised here. 
* 0 - indicating negative reviews
* 1 - indicating positive reviews
* 2 - indicating neutral reviews




In [11]:
cleaned_dataframe["rating"] = [1 if rating >= 4 else 0 if rating < 3 else 2 for rating in cleaned_dataframe["rating"]]
cleaned_dataframe

Unnamed: 0,reviews,rating
0,i order of them and on of the item is bad qual...,2
1,bulk is alwai the less expens wai to go for pr...,1
2,well thei ar not duracel but for the price i a...,1
3,seem to work as well as name brand batteri at ...,1
4,these batteri ar veri long last the price is g...,1
5,bought a lot of batteri for christma and the a...,1
6,iv not had ani problam with these batteri have...,1
7,well if you ar look for cheap non recharg batt...,1
8,these do not hold the amount of high power jui...,2
9,amazonbas aa aaa batteri have done well by me ...,1


# Bag Of Words (Count Vectorizer)

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
count_vectorized_data = cv.fit_transform(cleaned_dataframe["reviews"])
count_vectorized_data

<28332x6832 sparse matrix of type '<class 'numpy.int64'>'
	with 562398 stored elements in Compressed Sparse Row format>

In [0]:
count_vectorized_data = np.asarray(count_vectorized_data.toarray())

Creating 'bag of words' dataframe.

In [0]:
count_vectorized_dataframe = pd.DataFrame(
                                data = count_vectorized_data)

In [15]:
count_vectorized_dataframe

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,6792,6793,6794,6795,6796,6797,6798,6799,6800,6801,6802,6803,6804,6805,6806,6807,6808,6809,6810,6811,6812,6813,6814,6815,6816,6817,6818,6819,6820,6821,6822,6823,6824,6825,6826,6827,6828,6829,6830,6831
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# Splitting of training data

In [0]:
X = count_vectorized_dataframe.iloc[:, :]
Y = cleaned_dataframe.loc[:, "rating"]
X_Train, X_Test, Y_Train, Y_Test = train_test_split(X, Y, test_size=0.3, random_state=42)

# Model 1: Random Forest Classifier (CountVectorizer)

In [0]:
from sklearn.ensemble import RandomForestClassifier

In [0]:
Random_forest = RandomForestClassifier(n_estimators=200, n_jobs=-1)

In [19]:
Random_forest.fit(X_Train, Y_Train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [0]:
predictions = Random_forest.predict(X_Test)

# Evaluation (based on training dataset)

In [21]:
accuracy_score(Y_Test, predictions)

0.9489411764705883

In [22]:
print(np.unique(predictions, return_counts=True))
print(predictions.size)

(array([0, 1, 2]), array([ 260, 8060,  180]))
8500


In [23]:
print(Y_Test.value_counts())
print(Y_Test.size)

1    7663
0     478
2     359
Name: rating, dtype: int64
8500


# Importing validation dataset

In [24]:
validation_dataframe = pd.read_csv('/content/drive/My Drive/Colab Notebooks/validation.csv')
validation_dataframe

Unnamed: 0,reviews.rating,reviews.text,reviews.title
0,5.0,I realize that many people will believe that t...,My favorite tech product this year!
1,5.0,Great tablet from Amazon! I works quickly and ...,Great inexpensive tablet.
2,5.0,This is my very first kindle. I am enjoying re...,Cool way to read more often
3,5.0,Small and light weight. Even in a leather case...,Great E Reader
4,5.0,Easy to use for books and audio. Will use on v...,Love the Kindle.
5,5.0,"Works as it should, this was a replacement of ...",Great reader
6,2.0,"Hard to use, Lots of ads, and Randomly closes ...",Not a huge fan
7,3.0,"I'm mostly happy, but like so many electronics...",Slow to get where you want to be
8,3.0,After the initial trial questions found our Al...,Not what I expected
9,5.0,I brought 3 of them. 2 were gifts. We love the...,Awesome Kindle


# Prepocessing validation dataset

Cleaning the reviews present in the validation set

In [0]:
clean_reviews_list_validation = [clean(reviews) for reviews in validation_dataframe["reviews.text"]]

Creating a dataframe consisting of clean reviews and their respective ratings

In [0]:
cleaned_dataframe_validation = pd.DataFrame(
                        { "reviews" : clean_reviews_list_validation,
                          "rating" : validation_dataframe["reviews.rating"]  
                        })

In [27]:
cleaned_dataframe_validation

Unnamed: 0,reviews,rating
0,i realiz that mani peopl will believ that thi ...,5.0
1,great tablet from amazon i work quickli and sm...,5.0
2,thi is my veri first kindl i am enjoi read mor...,5.0
3,small and light weight even in a leather case ...,5.0
4,easi to us for book and audio will us on vacat,5.0
5,work as it should thi wa a replac of the same ...,5.0
6,hard to us lot of ad and randomli close app,2.0
7,i m mostli happi but like so mani electron thi...,3.0
8,after the initi trial question found our alexa...,3.0
9,i brought of them were gift we love them aweso...,5.0


The rating series is being categorised here.

* 0 - indicating negative reviews
* 1 - indicating positive reviews
* 2 - indicating neutral reviews

In [28]:
cleaned_dataframe_validation["rating"] = [1 if rating >= 4 else 0 if rating < 3 else 2 for rating in cleaned_dataframe_validation["rating"]]
cleaned_dataframe_validation

Unnamed: 0,reviews,rating
0,i realiz that mani peopl will believ that thi ...,1
1,great tablet from amazon i work quickli and sm...,1
2,thi is my veri first kindl i am enjoi read mor...,1
3,small and light weight even in a leather case ...,1
4,easi to us for book and audio will us on vacat,1
5,work as it should thi wa a replac of the same ...,1
6,hard to us lot of ad and randomli close app,0
7,i m mostli happi but like so mani electron thi...,2
8,after the initi trial question found our alexa...,2
9,i brought of them were gift we love them aweso...,1


# Bag of words dataframe (Count Vectorization) for validation dataset 

In [0]:
count_vectorized_data_validation = cv.transform(cleaned_dataframe_validation["reviews"])
count_vectorized_data_validation = np.asarray(count_vectorized_data_validation.toarray())

Creating a 'bag of words' dataframe for the validation set.

In [30]:
count_vectorized_dataframe_validation = pd.DataFrame(
                                data = count_vectorized_data_validation)
count_vectorized_dataframe_validation

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,6792,6793,6794,6795,6796,6797,6798,6799,6800,6801,6802,6803,6804,6805,6806,6807,6808,6809,6810,6811,6812,6813,6814,6815,6816,6817,6818,6819,6820,6821,6822,6823,6824,6825,6826,6827,6828,6829,6830,6831
0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# Splitting of validation dataset (X_Test, Y_Test)

In [0]:
X_Test_validation = count_vectorized_dataframe_validation.iloc[:, :]
Y_Test_validation = cleaned_dataframe_validation.loc[:, "rating"]

# Model 1: Predictions on validation set

In [0]:
predictions_validation = Random_forest.predict(X_Test_validation)

# Evaluation of model 1 on validation set

In [33]:
accuracy_score(Y_Test_validation, predictions_validation)

0.94505

In [34]:
print(Y_Test_validation.value_counts())

1    18639
2      904
0      457
Name: rating, dtype: int64


# Tf- idf model

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [0]:
tf_idf = TfidfVectorizer(analyzer="word")
tf_idf_data = tf_idf.fit_transform(cleaned_dataframe["reviews"]).toarray()

Creating a 'tf-idf' dataframe of the training dataset

In [37]:
tf_idf_vectorized_dataframe = pd.DataFrame(
                                data = tf_idf_data)
tf_idf_vectorized_dataframe

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,6792,6793,6794,6795,6796,6797,6798,6799,6800,6801,6802,6803,6804,6805,6806,6807,6808,6809,6810,6811,6812,6813,6814,6815,6816,6817,6818,6819,6820,6821,6822,6823,6824,6825,6826,6827,6828,6829,6830,6831
0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.251893,0.269438,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
print(tf_idf.vocabulary_)

{'order': 4170, 'of': 4104, 'them': 5944, 'and': 248, 'on': 4132, 'the': 5933, 'item': 3171, 'is': 3157, 'bad': 468, 'qualiti': 4715, 'miss': 3776, 'backup': 464, 'spring': 5570, 'so': 5450, 'have': 2749, 'to': 6056, 'put': 4704, 'pc': 4331, 'aluminum': 200, 'make': 3598, 'batteri': 526, 'work': 6687, 'bulk': 823, 'alwai': 202, 'less': 3411, 'expens': 2073, 'wai': 6489, 'go': 2534, 'for': 2309, 'product': 4627, 'like': 3457, 'these': 5962, 'well': 6573, 'thei': 5941, 'ar': 325, 'not': 4042, 'duracel': 1790, 'but': 839, 'price': 4581, 'am': 204, 'happi': 2718, 'seem': 5190, 'as': 354, 'name': 3908, 'brand': 753, 'at': 383, 'much': 3867, 'better': 610, 'veri': 6423, 'long': 3507, 'last': 3351, 'great': 2622, 'bought': 737, 'lot': 3530, 'christma': 1035, 'amazonbas': 214, 'cell': 946, 'been': 578, 'good': 2551, 'haven': 2752, 'notic': 4053, 'differ': 1583, 'between': 614, 'amazon': 213, 'basic': 508, 'just': 3239, 'easier': 1833, 'purchas': 4694, 'arriv': 345, 'hous': 2896, 'hand': 2706, 

In [39]:
tf_idf.get_feature_names()

['aa',
 'aaa',
 'aaaaa',
 'abandon',
 'abc',
 'abcmous',
 'abg',
 'abi',
 'abil',
 'abit',
 'abj',
 'abk',
 'abl',
 'abnorm',
 'abount',
 'about',
 'abov',
 'abroad',
 'absolut',
 'absolutelt',
 'absorb',
 'abund',
 'abus',
 'abv',
 'abysm',
 'ac',
 'acc',
 'acced',
 'accent',
 'accept',
 'access',
 'accessori',
 'accid',
 'accident',
 'accommod',
 'accomod',
 'accompani',
 'accomplish',
 'accord',
 'accordingli',
 'account',
 'acct',
 'accur',
 'accustom',
 'acdelco',
 'acer',
 'acess',
 'achiev',
 'acid',
 'aciv',
 'acm',
 'acord',
 'acoust',
 'acquaint',
 'acrobat',
 'across',
 'act',
 'action',
 'activ',
 'actual',
 'acuiti',
 'ad',
 'adapt',
 'adaqu',
 'add',
 'addag',
 'addict',
 'addit',
 'addition',
 'addon',
 'address',
 'adept',
 'adequ',
 'adh',
 'adhd',
 'adict',
 'adjust',
 'admit',
 'admittedli',
 'adolesc',
 'adopt',
 'adoptor',
 'ador',
 'adult',
 'adultsa',
 'adust',
 'adv',
 'advanc',
 'advantag',
 'adventur',
 'adveris',
 'advers',
 'advert',
 'advertis',
 'advic',
 

# Splitting of training dataset

In [0]:
X_tf_idf = tf_idf_vectorized_dataframe.iloc[:, :]
Y_tf_idf = cleaned_dataframe.loc[:, "rating"]
X_Train_tf_idf, X_Test_tf_idf, Y_Train_tf_idf, Y_Test_tf_idf = train_test_split(X_tf_idf, Y_tf_idf, test_size=0.3, random_state=42)

# Model 2: Random Forest Classifier (Tf-Idf)

In [0]:
Random_forest_tf_idf = RandomForestClassifier(n_estimators=200, n_jobs=-1)

In [42]:
Random_forest_tf_idf.fit(X_Train_tf_idf, Y_Train_tf_idf)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [0]:
predictions_tf_idf = Random_forest.predict(X_Test_tf_idf)

# Evaluation (based on training dataset)

In [44]:
accuracy_score(Y_Test_tf_idf, predictions_tf_idf)

0.9023529411764706

# Tf-idf dataframe for validation dataset


Creating a 'tf-idf' dataframe of the validation set.

In [45]:
tf_idf_data_validation = tf_idf.transform(cleaned_dataframe_validation["reviews"]).toarray()
tf_idf_vectorized_dataframe_validation = pd.DataFrame(
                                data = tf_idf_data_validation)
tf_idf_vectorized_dataframe_validation

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,6792,6793,6794,6795,6796,6797,6798,6799,6800,6801,6802,6803,6804,6805,6806,6807,6808,6809,6810,6811,6812,6813,6814,6815,6816,6817,6818,6819,6820,6821,6822,6823,6824,6825,6826,6827,6828,6829,6830,6831
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.064349,0.0,0.0,0.0,0.0,0.0,0.0,0.088362,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.054339,0.0,0.000000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.185992,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
X_Test_tf_idf_validation = tf_idf_vectorized_dataframe_validation.iloc[:, :]
Y_Test_tf_idf_validation = cleaned_dataframe_validation.loc[:, "rating"]
print(X_Test_tf_idf_validation.shape)
print(Y_Test_tf_idf_validation.shape)

(20000, 6832)
(20000,)


# Model 2: Predictions on validation set

In [0]:
predictions_tf_idf_validation = Random_forest_tf_idf.predict(X_Test_tf_idf_validation)

# Evaluation of model 2 on validation dataset

In [48]:
accuracy_score(Y_Test_tf_idf_validation, predictions_tf_idf_validation)

0.945