In [10]:
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.utils import shuffle
#CountVectorizer --> Bag of words
#TfidfVectorizer --> Tf-IDF
#gensim.models.Word2Vec --> word embeddings (word2vec)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE


In [2]:
# Load the preprocessed review dataset and preview its first rows.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, and the
# cleaned_reviews values appear to have lost the first character of each
# review (e.g. 'good' -> 'ood', 'like' -> 'ike') -- verify the upstream
# preprocessing step that produced this file.
reviews_csv = 'preprocessed_data.csv'
df = pd.read_csv(reviews_csv)
df.head()

Unnamed: 0,Id,Review,Label,cleaned_reviews,z_score
0,0,good and interesting,5,ood interesting,0.4865
1,1,"This class is very helpful to me. Currently, I...",5,"class helpful . currently , 'm still learning ...",0.4865
2,2,like!Prof and TAs are helpful and the discussi...,5,ike ! prof ta helpful discussion among student...,0.4865
3,3,Easy to follow and includes a lot basic and im...,5,asy follow includes lot basic important techni...,0.4865
4,4,Really nice teacher!I could got the point eazl...,4,eally nice teacher ! could got point eazliy v,-0.675


In [21]:
# Features: the cleaned review text. Target: the review label.
# Missing reviews are replaced by empty strings so the vectorizer only ever
# receives str values. The fill is done on a new Series rather than with
# fillna(..., inplace=True) on a column pulled out of df: the in-place form
# may or may not write through to df depending on the pandas copy-on-write
# mode, and it can emit chained-assignment warnings.
X = df['cleaned_reviews'].fillna('')
Y = df['Label']

In [22]:
# Hold out 20% of the reviews for testing; the seed is fixed so the split is
# reproducible.
TEST_FRACTION = 0.2
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

# TF-IDF features: the vectorizer is fitted on the training text only and
# then applied unchanged to the test text.
vectorizer = TfidfVectorizer()
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)


In [8]:
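# Handling Imbalanced Dataset using SMOTE
# NOTE: this step is currently disabled -- the models below are trained on the
# original (non-oversampled) TF-IDF training set. To enable oversampling,
# uncomment the two lines below and pass X_res / y_res to the shuffle step
# instead of X_train_vect / y_train.
# smote = SMOTE(random_state=42)
# X_res, y_res = smote.fit_resample(X_train_vect, y_train)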

In [23]:
# Shuffle the training features and labels together, with a fixed seed.
# NOTE: the SMOTE oversampling step is commented out, so despite the
# "_resampled" names these are the original (non-oversampled) training
# features and labels, only reordered.
X_resampled, y_resampled = shuffle(X_train_vect, y_train, random_state=42)

In [24]:
# Build the three classifiers that are compared on the TF-IDF features.
# The Naive Bayes variant used here is MultinomialNB (not GaussianNB).
nb_tfidf = MultinomialNB()                          # Naive Bayes
knn_tfidf = KNeighborsClassifier(n_neighbors=5)     # KNN, 5 neighbours
svm_tfidf = LinearSVC(dual=False, max_iter=10000)   # linear SVM

# Fit every classifier on the same shuffled training data, in the order
# Naive Bayes -> KNN -> SVM.
for classifier in (nb_tfidf, knn_tfidf, svm_tfidf):
    classifier.fit(X_resampled, y_resampled)

# Keep the fitted SVM as the cell's displayed value.
svm_tfidf

In [25]:
# Predict the held-out test set with each fitted model.
nb_pred_tfidf = nb_tfidf.predict(X_test_vect)    # Naive Bayes
knn_pred_tfidf = knn_tfidf.predict(X_test_vect)  # KNN
svm_pred_tfidf = svm_tfidf.predict(X_test_vect)  # SVM

# Report the accuracy of each model, one per line with a blank line between
# consecutive reports (and none after the last).
accuracy_reports = (
    ('Naive Bayes(TF-IDF) accurancy: ', nb_pred_tfidf),
    ('KNN(TF-IDF) accurancy: ', knn_pred_tfidf),
    ('SVM(TF-IDF) accurancy: ', svm_pred_tfidf),
)
for position, (label, predictions) in enumerate(accuracy_reports):
    if position > 0:
        print()
    print(label, accuracy_score(y_test, predictions))

Naive Bayes(TF-IDF) accurancy:  0.7601626016260162

KNN(TF-IDF) accurancy:  0.7505021520803443

SVM(TF-IDF) accurancy:  0.7830224772835964


In [26]:
# Persist the three fitted TF-IDF models with pickle.
# The paths use forward slashes throughout: the previous 'D:/Study\College'
# spelling contained '\C', an invalid escape sequence that newer Python
# versions warn about.
# NOTE(review): this is an absolute, machine-specific directory; consider a
# configurable or relative path.
# NOTE(review): the fitted `vectorizer` is not saved here. New text has to be
# transformed with that same vectorizer before these models can predict on
# it -- confirm it is persisted elsewhere.
model_dir = 'D:/Study/College/IV Sem/Mini Project/Codes/Trained Models/newModels'
file1 = model_dir + '/nb_tfidf.sav'
file2 = model_dir + '/knn_tfidf.sav'
file3 = model_dir + '/svm_tfidf.sav'

# Write each model inside a `with` block so the file handle is flushed and
# closed deterministically (previously the handles were never closed).
for model, path in ((nb_tfidf, file1), (knn_tfidf, file2), (svm_tfidf, file3)):
    with open(path, 'wb') as model_file:
        pickle.dump(model, model_file)