In [23]:
import numpy as np
import pandas as pd

from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix

import string
import re

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

from tensorflow.keras.utils import to_categorical

import joblib

from sklearn import svm

In [24]:
data = pd.read_csv('Reviews.csv')

In [25]:
data.dropna(how='any',inplace=True)
data.drop_duplicates(inplace=True, subset=['Score','Text'])
idx = data[data["HelpfulnessNumerator"]>data["HelpfulnessDenominator"]].index
data.drop(index=idx, inplace=True)

In [26]:
data['Score'].value_counts()

5    250737
4     56073
1     36277
3     29770
2     20802
Name: Score, dtype: int64

In [27]:
class_1 = data.loc[data.Score==1].sample(36277)
class_2 = data.loc[data.Score==2].sample(20802)
class_4 = data.loc[data.Score==4].sample(21000)
class_5 = data.loc[data.Score==5].sample(36300)
data = pd.concat([class_1,class_2,class_4,class_5])
data.shape

(114379, 10)

In [28]:
def create_target(x):
    if x > 3:
        return "Positive"
    elif x < 3:
        return "Negative"
data['target'] = data['Score'].apply(create_target)

In [29]:
# final_stopwords = stopwords.words('english')
# addition = ['•', '!', '"', '#', '”', '“', '$', '%', '&', "'", '–', '(', ')', '*','’', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '…']
# final_stopwords.extend(addition)

In [30]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
HTMLTAGS = re.compile('<.*?>')
table = str.maketrans(dict.fromkeys(string.punctuation))
remove_digits = str.maketrans('', '', string.digits)
MULTIPLE_WHITESPACE = re.compile(r"\s+")

In [31]:
def preprocessor(review):
    review = HTMLTAGS.sub(r'', review)
    review = review.translate(table)
    review = review.translate(remove_digits)
    review = review.lower()
    review = MULTIPLE_WHITESPACE.sub(" ", review).strip()
    #review = [word for word in review.split() if word not in final_stopwords]
    #review = ' '.join([stemmer.stem(word) for word in review])
    review = review.split()
    review = ' '.join([lemmatizer.lemmatize(word) for word in review])
    #review = ' '.join(k for k in review)
    #print(review)
    return review

data['Text'] = data['Text'].apply(preprocessor) 

In [32]:
X = data['Text']
y = data['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0, stratify=y)

In [11]:
# labelEncoder = LabelEncoder()

# y_train = labelEncoder.fit_transform(y_train)
# y_test = labelEncoder.transform(y_test)
# labels = labelEncoder.classes_.tolist()

In [13]:
model=make_pipeline(TfidfVectorizer(),svm.SVC(kernel='rbf'))
model.fit(X_train, y_train)

In [14]:
#joblib.dump(model,"svm_model_2class.sav")

In [37]:
load_model = joblib.load("svm_model_2class.sav")

In [38]:
labels_train = load_model.predict(X_train)
labels_test = load_model.predict(X_test)

print("Training Accuracy:",accuracy_score(y_train, labels_train)*100)
print("Validation Accuracy:",accuracy_score(y_test, labels_test)*100)

Training Accuracy: 92.54480734403298
Validation Accuracy: 92.67063006353092


In [39]:
from sklearn.linear_model import LogisticRegression
model=make_pipeline(TfidfVectorizer(),LogisticRegression(C=1, max_iter=500, random_state=1))
model.fit(X_train, y_train)

labels_train = model.predict(X_train)
labels_test = model.predict(X_test)

print("Training Accuracy:",accuracy_score(y_train, labels_train)*100)
print("Validation Accuracy:",accuracy_score(y_test, labels_test)*100)

Training Accuracy: 90.20545806532193
Validation Accuracy: 88.0515241592353


In [34]:
from sklearn.naive_bayes import MultinomialNB
model=make_pipeline(TfidfVectorizer(),MultinomialNB())
model.fit(X_train, y_train)

labels_train = model.predict(X_train)
labels_test = model.predict(X_test)

print("Training Accuracy:",accuracy_score(y_train, labels_train)*100)
print("Validation Accuracy:",accuracy_score(y_test, labels_test)*100)

Training Accuracy: 89.00143633297945
Validation Accuracy: 85.35874570146295


In [35]:
from sklearn.ensemble import RandomForestClassifier
model=make_pipeline(TfidfVectorizer(),RandomForestClassifier())
model.fit(X_train, y_train)

labels_train = model.predict(X_train)
labels_test = model.predict(X_test)

print("Training Accuracy:",accuracy_score(y_train, labels_train)*100)
print("Validation Accuracy:",accuracy_score(y_test, labels_test)*100)

Training Accuracy: 99.99750202960095
Validation Accuracy: 84.56315206621204


In [36]:
from xgboost import XGBClassifier
model=make_pipeline(TfidfVectorizer(),XGBClassifier())
model.fit(X_train, y_train)

labels_train = model.predict(X_train)
labels_test = model.predict(X_test)

print("Training Accuracy:",accuracy_score(y_train, labels_train)*100)
print("Validation Accuracy:",accuracy_score(y_test, labels_test)*100)



Training Accuracy: 89.95441204021732
Validation Accuracy: 84.93034912863554


In [None]:
# text = input("Enter any text:\t")
# text = preprocessor(text)
# print(load_model.predict([text]))

In [None]:
# model=make_pipeline(TfidfVectorizer(),svm.SVC(kernel='rbf'))
# model.fit(X, y)

In [None]:
# joblib.dump(model,"svm_2class.sav")