
# **MODEL IMPLEMENTATION AND EVALUATION**
This is the stage where the four models are built, optimized and evaluated.

**Models used:**  Support Vector Machine, Naive Bayes, KNN, Decision Trees

**Evaluation methods used:** accuracy, precision, recall, f1_score and confusion matrix

In [None]:
#LIBRARIES
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
import pickle
import warnings
warnings.simplefilter("ignore")

In [None]:
#LOADING DATASET
# Decoded as latin1 because the file contains special characters.
df = pd.read_json(path_or_buf='data/final_pre_process.json', encoding='latin1')

#REMOVE MAX
# Lift the display limits so every column and every row is shown.
pd.options.display.max_columns = None
pd.options.display.max_rows = None


## **DOUBLE-CHECKING...**
Double-checking whether there are any NULL values in the dataset, since any that remain would cause issues later on.

In [None]:
#ROWS LEFT WITH A NULL AFTER PRE-PROCESSING
# Boolean mask: True for every row holding at least one missing value.
checkNULL = df.isna().any(axis="columns")
df[checkNULL]

In [None]:
# Remove duplicated rows and renumber the index from 0.
df = df.drop_duplicates(ignore_index=True)

In [None]:
#UPDATED VP VALUES
# Relative frequency of each value in the 'verified' column
# (normalize=True yields proportions instead of raw counts).
df["verified"].value_counts(normalize=True)

The change barely had any effect on the T/F values, and thus we are ready to proceed.

# **SVM (ABSA)**

INPUTS = REVIEWTEXT , MEAN ABSA, OVERALL(RATING)

COVERS BOTH COUNT VECTORIZER AND TFIDF VECTORIZER

## **Count Vectorizer**
Word vectorization maps words or phrases from a lexicon to a matching vector of real numbers, which may then be used to determine word predictions and semantics. This is done because models only understand numerical data.

We are going to be utilizing two vectorization methods, the first being the count vectorizer. CountVectorizer simply counts the number of times each word appears in the document, which results in a bias in favor of the most common terms.

In [None]:
# Remove the 'Overall Sentiment' column in place, then print the remaining schema.
df.drop(columns=['Overall Sentiment'], inplace=True)
df.info()

CHANGE TO STRING AS TRAINING DOES NOT ACCEPT NUMBERED NAMED COLUMNS

In [None]:
# Store the rating as text so its dummy columns get string names ('1'..'5').
df = df.astype({'overall': str})

### **Data Preparation**

In [None]:
# One-hot encode the rating and attach the dummy columns to the data
rating_dummies = pd.get_dummies(df['overall'])
dummy_creat = pd.concat([df, rating_dummies], axis=1)

# Word counts per review.
# NOTE: the vectoriser is fitted on every row of dummy_creat in this cell.
count_vect = CountVectorizer()
comment_feature = count_vect.fit_transform(dummy_creat['reviewText'])

text_feature_df = pd.DataFrame(
    comment_feature.todense(),
    columns=count_vect.get_feature_names_out(),
)

# Final table: word counts, then ABSA score, rating dummies and the target last
extra_columns = ['Mean ABSA Sentiment', '1', '2', '3', '4', '5', 'verified']
cv_final_feature_df = pd.concat(
    [text_feature_df, dummy_creat[extra_columns]],
    axis=1,
)

In [None]:
# Preview the first rows of the count-vectorised feature table.
cv_final_feature_df.head()

### **DATASET SPLIT**

SPLIT DATASET 80/20

In [None]:
# Separate predictors from the target by POSITION rather than by label.
# The target 'verified' is the last column of cv_final_feature_df. Selecting
# by name would also match a vocabulary column called "verified" if the
# vectoriser produced one, which would pull the target into the features and
# turn the target into a two-column frame.
train_x_cv = cv_final_feature_df.iloc[:, :-1]
train_y_cv = cv_final_feature_df.iloc[:, -1]

In [None]:
# Hold out 20% of the rows for testing; the seed fixes the shuffle.
X_train_cv, X_test_cv, y_train_cv, y_test_cv = train_test_split(
    train_x_cv,
    train_y_cv,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Preview the first rows of the count-vectorised training features.
X_train_cv.head()

The data is split 80/20, a ratio determined by trial and error. This split produced the highest accuracy for the models, and thus we are going with it.

### **SVM (ABSA + COUNT VECTORIZER)**

In [None]:
# ABSA TRAIN

In [None]:
#SVM MODEL - COUNT VECTORIZER (ABSA)
# Linear SVM with a fixed seed, trained on the count-vectorised training split.
svm1 = LinearSVC(tol=1e-5, random_state=0)
svm1.fit(X_train_cv, y_train_cv)

In [None]:
# Preview the first rows of the count-vectorised test features.
X_test_cv.head()

**PREDICTION & EVALUATION**

In [None]:
#PREDICTION
# Label the held-out count-vectorised rows with the fitted SVM.
prediction = svm1.predict(X=X_test_cv)

In [None]:
#EVALUATION
# Accuracy, precision, recall and F1, each expressed as a percentage.
svm_a1, svm_p1, svm_r1, svm_f11 = (
    metric(y_test_cv, prediction) * 100
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

In [None]:
#CONFUSION MATRIX
# Rows and columns follow the class order learned by the classifier.
class_labels = svm1.classes_
cm = confusion_matrix(y_test_cv, prediction, labels=class_labels)
display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
display.plot()

In [None]:
# Report the SVM (ABSA + count vectorizer) scores.
for caption, score in (
    ("Accuracy: ", svm_a1),
    ("Precision: ", svm_p1),
    ("Recall: ", svm_r1),
    ("F1 Score: ", svm_f11),
):
    print(caption, score)


## **TFIDF VECTORIZER**
We examine the total document weightage of a word in TfidfVectorizer. It assists us in coping with the most common terms. We may use it to penalize them. The word counts are weighted by a measure of how frequently they appear in the documents in TfidfVectorizer.

In [None]:
# One-hot encode the rating and attach the dummy columns to the data
rating_dummies = pd.get_dummies(df['overall'])
dummy_creat = pd.concat([df, rating_dummies], axis=1)

# TF-IDF weights per review, with English stop words removed.
# NOTE: the vectoriser is fitted on every row of dummy_creat in this cell.
tfid_vect = TfidfVectorizer(stop_words='english')
comment_feature = tfid_vect.fit_transform(dummy_creat['reviewText'])

text_feature_df = pd.DataFrame(
    comment_feature.todense(),
    columns=tfid_vect.get_feature_names_out(),
)

# Final table: TF-IDF weights, then ABSA score, rating dummies and the target last
extra_columns = ['Mean ABSA Sentiment', '1', '2', '3', '4', '5', 'verified']
tfidf_final_feature_df = pd.concat(
    [text_feature_df, dummy_creat[extra_columns]],
    axis=1,
)

### **DATASET SPLIT**

SPLIT DATASET 80/20

In [None]:
# Separate predictors from the target by POSITION rather than by label.
# The target 'verified' is the last column of tfidf_final_feature_df. Selecting
# by name would also match a vocabulary column called "verified" if the
# vectoriser produced one, which would pull the target into the features and
# turn the target into a two-column frame.
train_x_tfidf = tfidf_final_feature_df.iloc[:, :-1]
train_y_tfidf = tfidf_final_feature_df.iloc[:, -1]

In [None]:
# Hold out 20% of the rows for testing (the 80/20 split stated for this
# section), so the scores are comparable with the other experiments that
# also use test_size=0.2 and the same seed.
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(
    train_x_tfidf, train_y_tfidf, test_size=0.2, random_state=42)

In [None]:
# Preview the first rows of the TF-IDF training features.
X_train_tfidf.head()

In [None]:
# Independent copies of the test split, re-indexed from 0.
new_xtest_tfidf, new_ytest_tfidf = (
    part.copy().reset_index(drop=True)
    for part in (X_test_tfidf, y_test_tfidf)
)

### **SVM (ABSA + TFIDF VECTORIZER)**

In [None]:
#SVM MODEL - TFIDF VECTORIZER (ABSA)
# Linear SVM with a fixed seed, trained on the TF-IDF training split.
svm1 = LinearSVC(tol=1e-5, random_state=0)
svm1.fit(X_train_tfidf, y_train_tfidf)

**PREDICTION & EVALUATION**

In [None]:
#PREDICTION
# Label the held-out TF-IDF rows with the fitted SVM.
prediction = svm1.predict(X=X_test_tfidf)

In [None]:
#EVALUATION
# Accuracy, precision, recall and F1, each expressed as a percentage.
svm_a2, svm_p2, svm_r2, svm_f12 = (
    metric(y_test_tfidf, prediction) * 100
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

In [None]:
#CONFUSION MATRIX
# Rows and columns follow the class order learned by the classifier.
class_labels = svm1.classes_
cm = confusion_matrix(y_test_tfidf, prediction, labels=class_labels)
display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
display.plot()

In [None]:
# Report the SVM (ABSA + TF-IDF vectorizer) scores.
for caption, score in (
    ("Accuracy: ", svm_a2),
    ("Precision: ", svm_p2),
    ("Recall: ", svm_r2),
    ("F1 Score: ", svm_f12),
):
    print(caption, score)

# **SVM (NON-ABSA)**

INPUTS = REVIEWTEXT

COVERS BOTH COUNT VECTORIZER AND TFIDF VECTORIZER

In [None]:
# Reload the pre-processed data from disk.
# Decoded as latin1 because the file contains special characters.
df = pd.read_json(path_or_buf='data/final_pre_process.json', encoding='latin1')

**RECHECK**

In [None]:
#CHECKING WHICH ROW IS NULL FROM PRE-PROCESSING
# No timing magic here: a bare `%timeit` has no statement to measure and is
# not valid Python outside IPython.
# Boolean mask: True for every row holding at least one missing value.
checkNULL = df.isnull().any(axis=1)
df[checkNULL]

In [None]:
#UPDATED VP VALUES
# Relative frequency of each value in the 'verified' column
# (normalize=True yields proportions instead of raw counts).
df["verified"].value_counts(normalize=True)

In [None]:
#DROP DUPLICATES
# Remove duplicated rows and renumber the index from 0.
df = df.drop_duplicates(ignore_index=True)

In [None]:
# Preview the first rows of the de-duplicated data.
df.head()

**MODELING**

In [None]:
#ASSIGN THE VARIABLES
# X is the input (review text); y is the target ('verified' flag).
X, y = df['reviewText'], df['verified']

In [None]:
#SPLIT DATA
# 80/20 train/test split of the review text against the 'verified' flag.
# random_state pins the shuffle so the split is reproducible.
# (No `%timeit` here: a bare magic has no statement to time and is not valid
# Python outside IPython.)
X_train, X_test, y_train, y_test = train_test_split(
    df['reviewText'], df['verified'], test_size=0.2, random_state=42)

# Row counts, kept as strings for printing
entiredf = format(df.shape[0])
traindf = format(X_train.shape[0])
testdf = format(X_test.shape[0])

print('Number of rows:')
print('Entire dataset:', entiredf)
print('Train dataset:', traindf)
print('Test dataset:',testdf)

## **COUNT VECTORIZER**

In [None]:
# Bag-of-words counts with English stop words removed
count_vectorizer = CountVectorizer(stop_words='english')

# Learn the vocabulary from the training text and vectorise it in a single
# pass; a separate fit() beforehand would only repeat the same work.
train_c = count_vectorizer.fit_transform(X_train)
print('\nVocabulary: \n', count_vectorizer.vocabulary_)

# The test text is only transformed, using the vocabulary learned from training
test_c = count_vectorizer.transform(X_test)

### **SVM (COUNT VECTORIZER NON ABSA)**

**PREDICTION & EVALUATION**

In [None]:
#SVM MODEL - COUNT VECTORIZER (NON-ABSA)
# fit() returns the estimator itself, so construction and training are chained.
svm1 = LinearSVC(tol=1e-5, random_state=0).fit(train_c, y_train)
prediction = svm1.predict(test_c)

In [None]:
#EVALUATION
# Accuracy, precision, recall and F1, each expressed as a percentage.
svm_na_a1, svm_na_p1, svm_na_r1, svm_na_f11 = (
    metric(y_test, prediction) * 100
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

In [None]:
#CONFUSION MATRIX
# Rows and columns follow the class order learned by the classifier.
class_labels = svm1.classes_
cm = confusion_matrix(y_test, prediction, labels=class_labels)
display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
display.plot()

In [None]:
# Report the SVM (non-ABSA, count vectorizer) scores.
for caption, score in (
    ("Accuracy: ", svm_na_a1),
    ("Precision: ", svm_na_p1),
    ("Recall: ", svm_na_r1),
    ("F1 Score: ", svm_na_f11),
):
    print(caption, score)

## **TFIDF VECTORIZER**

In [None]:
# TF-IDF weights with English stop words removed
TFIDF_vectorizer = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and IDF weights from the training text and vectorise it
# in a single pass; a separate fit() beforehand would only repeat the same work.
train_tf = TFIDF_vectorizer.fit_transform(X_train)
print('\nVocabulary: \n', TFIDF_vectorizer.vocabulary_)

# The test text is only transformed, using what was learned from training
test_tf = TFIDF_vectorizer.transform(X_test)

### **SVM (TFIDF VECTORIZER NON ABSA)**

**PREDICTION & EVALUATION**

In [None]:
#SVM MODEL - TFIDF VECTORIZER (NON-ABSA)
# fit() returns the estimator itself, so construction and training are chained.
svm2 = LinearSVC(tol=1e-5, random_state=0).fit(train_tf, y_train)
prediction = svm2.predict(test_tf)

In [None]:
#EVALUATION
# Accuracy, precision, recall and F1, each expressed as a percentage.
svm_na_a2, svm_na_p2, svm_na_r2, svm_na_f12 = (
    metric(y_test, prediction) * 100
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

In [None]:
#CONFUSION MATRIX
# Rows and columns follow the class order learned by the classifier.
class_labels = svm2.classes_
cm = confusion_matrix(y_test, prediction, labels=class_labels)
display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
display.plot()

In [None]:
# Report the SVM (non-ABSA, TF-IDF vectorizer) scores.
for caption, score in (
    ("Accuracy: ", svm_na_a2),
    ("Precision: ", svm_na_p2),
    ("Recall: ", svm_na_r2),
    ("F1 Score: ", svm_na_f12),
):
    print(caption, score)

# **NAIVE BAYES**

## **COUNT VECTORIZER**

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes on the count-vectorised review text.
# fit() returns the estimator itself, so construction and training are chained.
nb = MultinomialNB().fit(train_c, y_train)
prediction = nb.predict(test_c)

#EVALUATION
# Accuracy, precision, recall and F1, each expressed as a percentage.
nb_a1, nb_p1, nb_r1, nb_f11 = (
    metric(y_test, prediction) * 100
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

#CONFUSION MATRIX
class_labels = nb.classes_
cm = confusion_matrix(y_test, prediction, labels=class_labels)
display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
display.plot()


In [None]:
# Report the Naive Bayes (count vectorizer) scores.
for caption, score in (
    ("Accuracy: ", nb_a1),
    ("Precision: ", nb_p1),
    ("Recall: ", nb_r1),
    ("F1 Score: ", nb_f11),
):
    print(caption, score)

## **TFIDF VECTORIZER**

In [None]:
# Multinomial Naive Bayes on the TF-IDF review text.
# fit() returns the estimator itself, so construction and training are chained.
nb = MultinomialNB().fit(train_tf, y_train)
prediction = nb.predict(test_tf)

#EVALUATION
# Accuracy, precision, recall and F1, each expressed as a percentage.
nb_a2, nb_p2, nb_r2, nb_f12 = (
    metric(y_test, prediction) * 100
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

#CONFUSION MATRIX
class_labels = nb.classes_
cm = confusion_matrix(y_test, prediction, labels=class_labels)
display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
display.plot()

In [None]:
# Report the Naive Bayes scores for the TF-IDF run (the *2 variables);
# the *1 variables belong to the count-vectorizer run.
print("Accuracy: ", nb_a2)
print("Precision: ", nb_p2)
print("Recall: ", nb_r2)
print("F1 Score: ", nb_f12)


# **KNN MODEL**

## **COUNT VECTORIZER**

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbours classifier on the count-vectorised review text.
# fit() returns the estimator itself, so construction and training are chained.
knn = KNeighborsClassifier(n_neighbors=5).fit(train_c, y_train)
prediction = knn.predict(test_c)

#EVALUATION
# Accuracy, precision, recall and F1, each expressed as a percentage.
knn_a1, knn_p1, knn_r1, knn_f11 = (
    metric(y_test, prediction) * 100
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

#CONFUSION MATRIX
class_labels = knn.classes_
cm = confusion_matrix(y_test, prediction, labels=class_labels)
display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
display.plot()


In [None]:
# Report the KNN (count vectorizer) scores.
for caption, score in (
    ("Accuracy: ", knn_a1),
    ("Precision: ", knn_p1),
    ("Recall: ", knn_r1),
    ("F1 Score: ", knn_f11),
):
    print(caption, score)


## **TFIDF VECTORIZER**

In [None]:
# 5-nearest-neighbours classifier on the TF-IDF review text.
# fit() returns the estimator itself, so construction and training are chained.
knn = KNeighborsClassifier(n_neighbors=5).fit(train_tf, y_train)
prediction = knn.predict(test_tf)

#EVALUATION
# Accuracy, precision, recall and F1, each expressed as a percentage.
knn_a2, knn_p2, knn_r2, knn_f12 = (
    metric(y_test, prediction) * 100
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

#CONFUSION MATRIX
class_labels = knn.classes_
cm = confusion_matrix(y_test, prediction, labels=class_labels)
display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
display.plot()

In [None]:
# Report the KNN (TF-IDF vectorizer) scores.
for caption, score in (
    ("Accuracy: ", knn_a2),
    ("Precision: ", knn_p2),
    ("Recall: ", knn_r2),
    ("F1 Score: ", knn_f12),
):
    print(caption, score)


# **DECISION TREE MODEL**

## **COUNT VECTORIZER**

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default settings on the count-vectorised review text.
# fit() returns the estimator itself, so construction and training are chained.
dt = DecisionTreeClassifier().fit(train_c, y_train)
prediction = dt.predict(test_c)

#EVALUATION
# Accuracy, precision, recall and F1, each expressed as a percentage.
dt_a1, dt_p1, dt_r1, dt_f11 = (
    metric(y_test, prediction) * 100
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

#CONFUSION MATRIX
class_labels = dt.classes_
cm = confusion_matrix(y_test, prediction, labels=class_labels)
display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
display.plot()

In [None]:
# Report the decision tree (count vectorizer) scores under a titled header.
print('\nDecision Tree - Count Vectorizer')
for caption, score in (
    ('Accuracy:', dt_a1),
    ('Precision:', dt_p1),
    ('Recall:', dt_r1),
    ('F1 Score:', dt_f11),
):
    print(caption, score)

## **TFIDF VECTORIZER**

In [None]:
# Fresh decision tree for the TF-IDF features. This must be an assignment:
# multiplying the previous tree by a new one (`dt * ...`) raises a TypeError
# and never creates the new model.
dt = DecisionTreeClassifier()
dt.fit(train_tf, y_train)
prediction = dt.predict(test_tf)

#EVALUATION
# Each metric is expressed as a percentage.
dt_a2 = accuracy_score(y_test, prediction)*100
dt_p2 = precision_score(y_test, prediction)* 100
dt_r2 = recall_score(y_test, prediction)*100
dt_f12 = f1_score(y_test, prediction)*100

#CONFUSION MATRIX
cm =  confusion_matrix(y_test, prediction, labels=dt.classes_)
display = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=dt.classes_)
display.plot()


In [None]:
# Report the decision tree (TF-IDF vectorizer) scores under a titled header.
print('\nDecision Tree - TFIDF Vectorizer')
for caption, score in (
    ('Accuracy:', dt_a2),
    ('Precision:', dt_p2),
    ('Recall:', dt_r2),
    ('F1 Score:', dt_f12),
):
    print(caption, score)

# **DISPLAY RESULTS**

In [None]:
#DISPLAYING THE RESULTS
# One row per experiment: (header, accuracy, precision, recall, F1).
# The headers carry their own leading newline so the sections are separated
# by a blank line in the printed output.
summary_rows = (
    ('SVM ABSA- Count Vectorizer', svm_a1, svm_p1, svm_r1, svm_f11),
    ('\nSVM ABSA - TFIDF Vectorizer', svm_a2, svm_p2, svm_r2, svm_f12),
    ('\nSVM NON-ABSA- Count Vectorizer', svm_na_a1, svm_na_p1, svm_na_r1, svm_na_f11),
    ('\nSVM NON-ABSA - TFIDF Vectorizer', svm_na_a2, svm_na_p2, svm_na_r2, svm_na_f12),
    ('\nNaive Bayes - Count Vectorizer', nb_a1, nb_p1, nb_r1, nb_f11),
    ('\nNaive Bayes - TFIDF Vectorizer', nb_a2, nb_p2, nb_r2, nb_f12),
    ('\nKNN - Count Vectorizer', knn_a1, knn_p1, knn_r1, knn_f11),
    ('\nKNN - TFIDF Vectorizer', knn_a2, knn_p2, knn_r2, knn_f12),
    ('\nDecision Tree - Count Vectorizer', dt_a1, dt_p1, dt_r1, dt_f11),
    ('\nDecision Tree - TFIDF Vectorizer', dt_a2, dt_p2, dt_r2, dt_f12),
)

for header, acc, prec, rec, f1 in summary_rows:
    print(header)
    print('Accuracy:', acc)
    print('Precision:', prec)
    print('Recall:', rec)
    print('F1 Score:', f1)


In [None]:
# Lift the display limits so every column and every row is shown, then preview the data.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
df.head()