In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
import string
import math
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from sklearn.model_selection import learning_curve,GridSearchCV
%matplotlib inline

In [2]:
# LOADING THE DATASET AND SEEING THE DETAILS
data = pd.read_csv('train.csv')
# SHAPE OF THE DATASET
print("Shape of the dataset:")
print(data.shape)
# COLUMN NAMES
print("Column names:")
print(data.columns)
# DATATYPE OF EACH COLUMN
print("Datatype of each column:")
print(data.dtypes)
# SEEING FEW OF THE ENTRIES
print("Few dataset entries:")
print(data.head())
# DATASET SUMMARY
data.describe(include='all')

Shape of the dataset:
(146811, 3)
Column names:
Index(['review_id', 'review', 'rating'], dtype='object')
Datatype of each column:
review_id     int64
review       object
rating        int64
dtype: object
Few dataset entries:
   review_id                                             review  rating
0          0  Ga disappointed neat products .. Meletot Hilsn...       1
1          1    Rdtanya replace broken glass, broken chargernya       1
2          2  Nyesel bngt dsni shopping antecedent photo mes...       1
3          3      Sent a light blue suit goods ga want a refund       1
4          4  Pendants came with dents and scratches on its ...       1


Unnamed: 0,review_id,review,rating
count,146811.0,146811,146811.0
unique,,115328,
top,,Excellent product quality,
freq,,1378,
mean,73405.0,,3.562764
std,42380.829522,,1.260537
min,0.0,,1.0
25%,36702.5,,3.0
50%,73405.0,,4.0
75%,110107.5,,5.0


In [4]:
# Seperate the dataset into X and Y for prediction
x = data['review']
y = data['rating']
print(x.head())
print(y.head())

0    Ga disappointed neat products .. Meletot Hilsn...
1      Rdtanya replace broken glass, broken chargernya
2    Nyesel bngt dsni shopping antecedent photo mes...
3        Sent a light blue suit goods ga want a refund
4    Pendants came with dents and scratches on its ...
Name: review, dtype: object
0    1
1    1
2    1
3    1
4    1
Name: rating, dtype: int64


In [5]:
# CLEANING THE REVIEWS - REMOVAL OF STOPWORDS AND PUNCTUATION
def text_process(text):
    nopunc = [char for char in text if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

In [6]:
vocab = CountVectorizer(analyzer=text_process).fit(x)
print(len(vocab.vocabulary_))
r0 = x[0]
print(r0)
vocab0 = vocab.transform([r0])
print(vocab0)
print("Getting the words back:")
print(vocab.get_feature_names()[19648])
print(vocab.get_feature_names()[10643])

94887
Ga disappointed neat products .. Meletot Hilsnyaa Speed ​​of delivery is good.
  (0, 10608)	1
  (0, 11917)	1
  (0, 16363)	1
  (0, 23219)	1
  (0, 40513)	1
  (0, 42138)	1
  (0, 47581)	1
  (0, 63453)	1
  (0, 71212)	1
  (0, 90055)	1
Getting the words back:
Pit
Gak


In [7]:
x = vocab.transform(x)
#Shape of the matrix:
print("Shape of the sparse matrix: ", x.shape)
#Non-zero occurences:
print("Non-Zero occurences: ",x.nnz)

# DENSITY OF THE MATRIX
density = (x.nnz/(x.shape[0]*x.shape[1]))*100
print("Density of the matrix = ",density)

Shape of the sparse matrix:  (146811, 94887)
Non-Zero occurences:  1326688
Density of the matrix =  0.009523651352382709


In [8]:
# SPLITTING THE DATASET INTO TRAINING SET AND TESTING SET
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=101)

In [9]:
# Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb.fit(x_train,y_train)
predmnb = mnb.predict(x_test)
print("Confusion Matrix for Multinomial Naive Bayes:")
print(confusion_matrix(y_test,predmnb))
print("Score:",round(accuracy_score(y_test,predmnb)*100,2))
print("Classification Report:",classification_report(y_test,predmnb))

Confusion Matrix for Multinomial Naive Bayes:
[[1747  147  777  146  184]
 [ 575  387 1199  178  150]
 [ 456  268 4340  899 1121]
 [ 216  176 1460 3065 3598]
 [ 221  174 1396 2954 3529]]
Score: 44.5
Classification Report:               precision    recall  f1-score   support

           1       0.54      0.58      0.56      3001
           2       0.34      0.16      0.21      2489
           3       0.47      0.61      0.53      7084
           4       0.42      0.36      0.39      8515
           5       0.41      0.43      0.42      8274

    accuracy                           0.45     29363
   macro avg       0.44      0.43      0.42     29363
weighted avg       0.44      0.45      0.44     29363



In [16]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
rmfr = RandomForestClassifier()
rmfr.fit(x_train,y_train)
predrmfr = rmfr.predict(x_test)
print("Confusion Matrix for Random Forest Classifier:")
print(confusion_matrix(y_test,predrmfr))
print("Score:",round(accuracy_score(y_test,predrmfr)*100,2))
print("Classification Report:",classification_report(y_test,predrmfr))

Confusion Matrix for Random Forest Classifier:
[[1757  240  639  203  162]
 [ 548  607  919  212  203]
 [ 512  271 4255  975 1071]
 [ 260   86 1404 3424 3341]
 [ 257   75 1342 3347 3253]]
Score: 45.28
Classification Report:               precision    recall  f1-score   support

           1       0.53      0.59      0.55      3001
           2       0.47      0.24      0.32      2489
           3       0.50      0.60      0.54      7084
           4       0.42      0.40      0.41      8515
           5       0.41      0.39      0.40      8274

    accuracy                           0.45     29363
   macro avg       0.46      0.45      0.45     29363
weighted avg       0.45      0.45      0.45     29363



In [14]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(x_train,y_train)
preddt = dt.predict(x_test)
print("Confusion Matrix for Decision Tree:")
print(confusion_matrix(y_test,preddt))
print("Score:",round(accuracy_score(y_test,preddt)*100,2))
print("Classification Report:",classification_report(y_test,preddt))

Confusion Matrix for Decision Tree:
[[1473  371  615  287  255]
 [ 492  741  686  275  295]
 [ 613  562 3574 1142 1193]
 [ 366  246 1493 3315 3095]
 [ 356  241 1506 3197 2974]]
Score: 41.13
Classification Report:               precision    recall  f1-score   support

           1       0.45      0.49      0.47      3001
           2       0.34      0.30      0.32      2489
           3       0.45      0.50      0.48      7084
           4       0.40      0.39      0.40      8515
           5       0.38      0.36      0.37      8274

    accuracy                           0.41     29363
   macro avg       0.41      0.41      0.41     29363
weighted avg       0.41      0.41      0.41     29363



In [15]:
# Support Vector Machine
from sklearn.svm import SVC
svm = SVC(random_state=101)
svm.fit(x_train,y_train)
predsvm = svm.predict(x_test)
print("Confusion Matrix for Support Vector Machines:")
print(confusion_matrix(y_test,predsvm))
print("Score:",round(accuracy_score(y_test,predsvm)*100,2))
print("Classification Report:",classification_report(y_test,predsvm))

Confusion Matrix for Support Vector Machines:
[[1869   43  673  263  153]
 [ 570  481 1058  195  185]
 [ 417  131 4501 1052  983]
 [ 157   21 1255 3691 3391]
 [ 173   22 1196 3584 3299]]
Score: 47.14
Classification Report:               precision    recall  f1-score   support

           1       0.59      0.62      0.60      3001
           2       0.69      0.19      0.30      2489
           3       0.52      0.64      0.57      7084
           4       0.42      0.43      0.43      8515
           5       0.41      0.40      0.41      8274

    accuracy                           0.47     29363
   macro avg       0.53      0.46      0.46     29363
weighted avg       0.48      0.47      0.46     29363



In [16]:
# Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier
"""# parameter evaluation
gbe = GradientBoostingClassifier(random_state=0)
parameters = {
     'learning_rate': [0.05, 0.1, 0.5],
    'max_features': [0.5, 1],
    'max_depth': [3, 4, 5]}
gridsearch=GridSearchCV(gbe,parameters,cv=100,scoring='roc_auc')
gridsearch.fit(x,y)
print(gridsearch.best_params_)
print(gridsearch.best_score_)"""
#Boosting
gbi = GradientBoostingClassifier(learning_rate=0.1,max_depth=5,max_features=0.5,random_state=999999)
gbi.fit(x_train,y_train)
predgbi = gbi.predict(x_test)
print("Confusion Matrix for Gradient Boosting Classifier:")
print(confusion_matrix(y_test,predgbi))
print("Score:",round(accuracy_score(y_test,predgbi)*100,2))
print("Classification Report:",classification_report(y_test,predgbi))

KeyboardInterrupt: 

In [26]:
# K Nearest Neighbour Algorithm
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(x_train,y_train)
predknn = knn.predict(x_test)
print("Confusion Matrix for K Neighbors Classifier:")
print(confusion_matrix(y_test,predknn))
print("Score: ",round(accuracy_score(y_test,predknn)*100,2))
print("Classification Report:")
print(classification_report(y_test,predknn))

Confusion Matrix for K Neighbors Classifier:
[[1546  351  577  288  239]
 [ 619  703  676  267  224]
 [ 827  560 3694 1121  882]
 [ 572  286 1823 3144 2690]
 [ 537  293 1711 3156 2577]]
Score:  39.72
Classification Report:
              precision    recall  f1-score   support

           1       0.38      0.52      0.44      3001
           2       0.32      0.28      0.30      2489
           3       0.44      0.52      0.47      7084
           4       0.39      0.37      0.38      8515
           5       0.39      0.31      0.35      8274

    accuracy                           0.40     29363
   macro avg       0.38      0.40      0.39     29363
weighted avg       0.39      0.40      0.39     29363



In [27]:
# XGBoost Classifier
import xgboost
from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb.fit(x_train,y_train)
predxgb = xgb.predict(x_test)
print("Confusion Matrix for XGBoost Classifier:")
print(confusion_matrix(y_test,predxgb))
print("Score: ",round(accuracy_score(y_test,predxgb)*100,2))
print("Classification Report:")
print(classification_report(y_test,predxgb))

Confusion Matrix for XGBoost Classifier:
[[ 745   26  649 1412  169]
 [ 183  409  758  987  152]
 [ 144   85 3811 2372  672]
 [  67   14 1315 4369 2750]
 [  72   17 1259 4197 2729]]
Score:  41.08
Classification Report:
              precision    recall  f1-score   support

           1       0.62      0.25      0.35      3001
           2       0.74      0.16      0.27      2489
           3       0.49      0.54      0.51      7084
           4       0.33      0.51      0.40      8515
           5       0.42      0.33      0.37      8274

    accuracy                           0.41     29363
   macro avg       0.52      0.36      0.38     29363
weighted avg       0.46      0.41      0.40     29363



In [None]:
# MULTILAYER PERCEPTRON CLASSIFIER
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier()
mlp.fit(x_train,y_train)
predmlp = mlp.predict(x_test)
print("Confusion Matrix for Multilayer Perceptron Classifier:")
print(confusion_matrix(y_test,predmlp))
print("Score:",round(accuracy_score(y_test,predmlp)*100,2))
print("Classification Report:")
print(classification_report(y_test,predmlp))

In [17]:
#Testing model

pr = data['review'][0]
print(pr)
print("Actual Rating: ",data['rating'][0])
pr_t = vocab.transform([pr])
print("Predicted Rating:")
svm.predict(pr_t)[0]

Ga disappointed neat products .. Meletot Hilsnyaa Speed ​​of delivery is good.
Actual Rating:  1
Predicted Rating:


1

In [19]:
# Loading test set

testing = pd.read_csv("test.csv")
testing.head()   

Unnamed: 0,review_id,review
0,1,"Great danger, cool, motif and cantik2 jg model..."
1,2,One of the shades don't fit well
2,3,Very comfortable
3,4,Fast delivery. Product expiry is on Dec 2022. ...
4,5,it's sooooo cute! i like playing with the glit...


In [20]:
# Adding new column

testing['rating'] = ""
testing.head()


Unnamed: 0,review_id,review,rating
0,1,"Great danger, cool, motif and cantik2 jg model...",
1,2,One of the shades don't fit well,
2,3,Very comfortable,
3,4,Fast delivery. Product expiry is on Dec 2022. ...,
4,5,it's sooooo cute! i like playing with the glit...,


In [None]:
# Using the best model to predict the rating for the test set

for index,row in testing.iterrows():
    pr = testing['review'][index]
    pr_t = vocab.transform([pr])
    testing['rating'][index]= rmfr.predict(pr_t)[0]
    
    if index % 10000 ==0:
        print(index)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


0
10000


In [None]:
testing=testing.drop(['review'], axis=1)

In [None]:
testing.to_csv(r'C:\Users\Andy_uni\Desktop\submission(rmfr).csv', index = False, header=True)