In [1]:
import numpy as np
import pandas as pd

from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix

import string
import re

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

from tensorflow.keras.utils import to_categorical

import joblib
import pickle

from sklearn import svm

import matplotlib.pyplot as plt

In [2]:
# Read the raw review table from the CSV sitting next to the notebook.
data = pd.read_csv(filepath_or_buffer='Reviews.csv')

In [3]:
# Remove rows with any missing value, then keep one row per (Score, Text) pair.
data = data.dropna(how='any').drop_duplicates(subset=['Score','Text'])

# A row whose helpfulness numerator exceeds its denominator is discarded.
bad_helpfulness = data["HelpfulnessNumerator"] > data["HelpfulnessDenominator"]
data = data.loc[~bad_helpfulness]

In [4]:
# Number of remaining reviews per star rating (displayed as the cell output).
score_counts = data['Score'].value_counts()
score_counts

5    250737
4     56073
1     36277
3     29770
2     20802
Name: Score, dtype: int64

In [5]:
# Build a smaller working set by drawing a fixed number of reviews per star
# rating. A fixed seed makes the draw repeatable across kernel restarts, and
# each request is capped at the number of rows actually available so the cell
# does not raise ValueError when a rating has fewer rows than requested.
SAMPLE_SEED = 0


def _sample_score(frame, score, n_rows):
    """Return up to `n_rows` randomly chosen rows of `frame` whose Score equals `score`."""
    rows = frame.loc[frame.Score == score]
    return rows.sample(min(n_rows, len(rows)), random_state=SAMPLE_SEED)


class_1 = _sample_score(data, 1, 30000)
class_2 = _sample_score(data, 2, 15000)
class_3 = _sample_score(data, 3, 29770)
class_4 = _sample_score(data, 4, 15000)
class_5 = _sample_score(data, 5, 30000)

# NOTE: the rows stay grouped by rating in this order (1, 2, 3, 4, 5); nothing
# here shuffles them, so a positional slice of `data` is not a random subset.
data = pd.concat([class_1,class_2,class_3,class_4,class_5])
data.shape

(119770, 10)

In [6]:
def create_target(x):
    """Map a numeric score to a sentiment label.

    Scores above 3 give "Positive", scores below 3 give "Negative", and
    anything else (including exactly 3) gives "Neutral".
    """
    if x > 3:
        return "Positive"
    if x < 3:
        return "Negative"
    return "Neutral"
# Derive the three-class sentiment label from each row's star rating.
data['target'] = data['Score'].map(create_target)

In [7]:
# Shared text-cleaning resources, built once and reused for every review.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Patterns: anything between a pair of angle brackets (non-greedy), and any
# run of whitespace characters.
HTMLTAGS = re.compile('<.*?>')
MULTIPLE_WHITESPACE = re.compile(r"\s+")

# Translation tables that delete every character in string.punctuation and
# string.digits respectively.
table = str.maketrans('', '', string.punctuation)
remove_digits = str.maketrans('', '', string.digits)

In [8]:
def preprocessor(review):
    """Normalise one review string for vectorisation.

    Steps, in order: remove `<...>` tags, delete punctuation and digits,
    lower-case, collapse whitespace runs to single spaces and trim the ends,
    then lemmatize each whitespace-separated token and re-join with spaces.

    Tags and punctuation are deleted without inserting a space, so two words
    separated only by a tag or a punctuation mark end up joined.
    """
    without_tags = HTMLTAGS.sub('', review)
    stripped = without_tags.translate(table).translate(remove_digits)
    collapsed = MULTIPLE_WHITESPACE.sub(" ", stripped.lower()).strip()
    lemmas = map(lemmatizer.lemmatize, collapsed.split())
    return ' '.join(lemmas)

# Replace every review with its cleaned, lemmatized form.
data['Text'] = data['Text'].map(preprocessor)

In [9]:
# Features are the cleaned review texts; labels are the sentiment classes.
X, y = data['Text'], data['target']

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0, stratify=y)

In [13]:
# Hold out 30% of the reviews for validation (83839 train rows when the frame
# has 119770 rows, the same train size the positional slice used).
#
# `data` is assembled by concatenating the per-rating samples in rating order,
# so slicing by position would put whole ratings on one side of the split.
# Shuffling with stratification keeps the label proportions equal in both
# parts. The positional version also assigned the held-out texts to `y_train`
# instead of `X_test`, which left `X_test` undefined for the cells below.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=0, stratify=y
)

In [37]:
# TF-IDF features feeding an RBF-kernel support vector classifier.
svm_pipeline_steps = (TfidfVectorizer(), svm.SVC(kernel='rbf'))
model = make_pipeline(*svm_pipeline_steps)
model.fit(X_train, y_train)

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()), ('svc', SVC())])

In [38]:
#joblib.dump(model,"svm_model_3class.sav")

['svm_model_3class.sav']

In [14]:
# Restore the previously saved SVM pipeline from disk.
SVM_MODEL_PATH = "svm_model_3class.sav"
load_model = joblib.load(SVM_MODEL_PATH)

In [None]:
# Score the restored pipeline on the held-out reviews only; the training-set
# evaluation is left disabled.
#labels_train = load_model.predict(X_train)
#print("Training Accuracy:",accuracy_score(y_train, labels_train)*100)
labels_test = load_model.predict(X_test)

validation_accuracy = accuracy_score(y_test, labels_test) * 100
print("Validation Accuracy:", validation_accuracy)

In [None]:
# Raw confusion matrix: rows are true labels, columns are predictions.
cm = confusion_matrix(y_test, labels_test)
print("Confusion Matrix is: \n", cm)

# Divide each row by its total so the diagonal holds, per true class, the
# fraction of its samples that were predicted correctly.
row_totals = cm.sum(axis=1, keepdims=True)
cm = cm / row_totals
print("classWise accuracy is:\t", cm.diagonal() * 100)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# TF-IDF features feeding a multinomial Naive Bayes classifier.
model = make_pipeline(TfidfVectorizer(), MultinomialNB()).fit(X_train, y_train)

labels_train = model.predict(X_train)
labels_test = model.predict(X_test)

# Accuracy on both parts, reported as percentages.
train_accuracy = accuracy_score(y_train, labels_train) * 100
validation_accuracy = accuracy_score(y_test, labels_test) * 100
print("Training Accuracy:", train_accuracy)
print("Validation Accuracy:", validation_accuracy)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# TF-IDF features feeding a random forest. The forest is seeded so that the
# accuracies printed below are the same on every run.
model=make_pipeline(TfidfVectorizer(),RandomForestClassifier(random_state=0))
model.fit(X_train, y_train)

labels_train = model.predict(X_train)
labels_test = model.predict(X_test)

# Accuracy on both parts, reported as percentages.
print("Training Accuracy:",accuracy_score(y_train, labels_train)*100)
print("Validation Accuracy:",accuracy_score(y_test, labels_test)*100)

In [None]:
from xgboost import XGBClassifier

# XGBClassifier expects integer class ids (0 .. n_classes-1), while `y_train`
# holds string labels. Encode the labels for fitting and decode the
# predictions so `labels_train` / `labels_test` stay comparable with
# `y_train` / `y_test`.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# TF-IDF features feeding a gradient-boosted tree classifier.
model=make_pipeline(TfidfVectorizer(),XGBClassifier())
model.fit(X_train, y_train_encoded)

labels_train = label_encoder.inverse_transform(model.predict(X_train))
labels_test = label_encoder.inverse_transform(model.predict(X_test))

# Accuracy on both parts, reported as percentages.
print("Training Accuracy:",accuracy_score(y_train, labels_train)*100)
print("Validation Accuracy:",accuracy_score(y_test, labels_test)*100)

In [None]:
# text = input("Enter any text:\t")
# text = preprocessor(text)
# print(load_model.predict([text]))

In [40]:
# model=make_pipeline(TfidfVectorizer(),svm.SVC(kernel='rbf'))
# model.fit(X, y)

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()), ('svc', SVC())])

In [41]:
# joblib.dump(model,"svm_3class.sav")

['svm_3class.sav']