In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from wordcloud import WordCloud, STOPWORDS
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix

import string
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

#from keras.utils import to_categorical
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense , LSTM , Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping

import warnings
warnings.filterwarnings('ignore')

import pickle

from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

In [12]:
data = pd.read_csv('Reviews.csv')

In [13]:
data.shape

(568454, 10)

In [14]:
data.dropna(how='any',inplace=True)
data.drop_duplicates(inplace=True, subset=['Score','Text'])
idx = data[data["HelpfulnessNumerator"]>data["HelpfulnessDenominator"]].index
data.drop(index=idx, inplace=True)

In [15]:
data.shape

(393659, 10)

In [16]:
class_1 = data.loc[data.Score==1].sample(35000)
class_2 = data.loc[data.Score==2].sample(20000)
class_4 = data.loc[data.Score==4].sample(20000)
class_5 = data.loc[data.Score==5].sample(35000)
data = pd.concat([class_1,class_2,class_4,class_5])
data.shape

(110000, 10)

In [17]:
def create_target(x):
    if x > 3:
        return "Positive"
    elif x < 3:
        return "Negative"
data['target'] = data['Score'].apply(create_target)

In [18]:
final_stopwords = stopwords.words('english')
addition = ['•', '!', '"', '#', '”', '“', '$', '%', '&', "'", '–', '(', ')', '*','’', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '…']
final_stopwords.extend(addition)

In [19]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
HTMLTAGS = re.compile('<.*?>')
table = str.maketrans(dict.fromkeys(string.punctuation))
remove_digits = str.maketrans('', '', string.digits)
MULTIPLE_WHITESPACE = re.compile(r"\s+")

In [20]:
def preprocessor(review):
    review = HTMLTAGS.sub(r'', review)
    review = review.translate(table)
    review = review.translate(remove_digits)
    review = review.lower()
    review = MULTIPLE_WHITESPACE.sub(" ", review).strip()
    review = [word for word in review.split() if word not in final_stopwords]
    #review = ' '.join([stemmer.stem(word) for word in review])
    review = ' '.join([lemmatizer.lemmatize(word) for word in review])
    return review

data['Text'] = data['Text'].apply(preprocessor) 

In [21]:
X = data['Text']
y = data['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1, stratify=y)

In [22]:
X_train.shape, X_test.shape

((77000,), (33000,))

In [23]:
bow_vectorizer = CountVectorizer(max_features=10000)
bow_vectorizer.fit(X_train)
bow_X_train = bow_vectorizer.transform(X_train)
bow_X_test = bow_vectorizer.transform(X_test)

In [24]:
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
tfidf_vectorizer.fit(X_train)
tfidf_X_train = tfidf_vectorizer.transform(X_train)
tfidf_X_test = tfidf_vectorizer.transform(X_test)

In [25]:
labelEncoder = LabelEncoder()

y_train = labelEncoder.fit_transform(y_train)
y_test = labelEncoder.transform(y_test)
labels = labelEncoder.classes_.tolist()

In [26]:
def train_and_eval(model, trainX, trainY, testX, testY):
    model.fit(trainX, trainY)
    y_preds_train = model.predict(trainX)
    y_preds_test = model.predict(testX)
    print()
    print(model)
    print(f"Train accuracy score : {accuracy_score(y_train, y_preds_train)}")
    print(f"Test accuracy score : {accuracy_score(y_test, y_preds_test)}")
    print('\n',40*'-')

In [27]:
C = [0.001, 0.01, 0.1, 1, 10]
for c in C: 
    log_model = LogisticRegression(C=c, max_iter=500, random_state=1)
    train_and_eval(log_model,bow_X_train,y_train,bow_X_test,y_test)


LogisticRegression(C=0.001, max_iter=500, random_state=1)
Train accuracy score : 0.8328181818181818
Test accuracy score : 0.8298181818181818

 ----------------------------------------

LogisticRegression(C=0.01, max_iter=500, random_state=1)
Train accuracy score : 0.8701818181818182
Test accuracy score : 0.8602727272727273

 ----------------------------------------

LogisticRegression(C=0.1, max_iter=500, random_state=1)
Train accuracy score : 0.8939480519480519
Test accuracy score : 0.8692727272727273

 ----------------------------------------

LogisticRegression(C=1, max_iter=500, random_state=1)
Train accuracy score : 0.9124675324675324
Test accuracy score : 0.8642121212121212

 ----------------------------------------

LogisticRegression(C=10, max_iter=500, random_state=1)
Train accuracy score : 0.9214805194805195
Test accuracy score : 0.8514848484848485

 ----------------------------------------


In [28]:
clf = svm.SVC(kernel='rbf')
train_and_eval(clf,bow_X_train,y_train,bow_X_test,y_test)


SVC()
Train accuracy score : 0.9387142857142857
Test accuracy score : 0.8655151515151516

 ----------------------------------------


In [26]:
rfc=RandomForestClassifier(random_state=42)

param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['entropy', 'gini']
}
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(bow_X_train,y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
             param_grid={'criterion': ['entropy', 'gini'],
                         'max_depth': [4, 5, 6, 7, 8],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [200, 500]})

In [27]:
CV_rfc.best_params_

{'criterion': 'gini',
 'max_depth': 8,
 'max_features': 'log2',
 'n_estimators': 500}

In [28]:
clf = RandomForestClassifier(random_state=42, max_features='log2', n_estimators= 500, max_depth=8, criterion='gini')
train_and_eval(clf,bow_X_train,y_train,bow_X_test,y_test)


RandomForestClassifier(max_depth=8, max_features='log2', n_estimators=500,
                       random_state=42)
Train accuracy score : 0.8472597402597403
Test accuracy score : 0.829030303030303

 ----------------------------------------
