In [1]:
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.utils import shuffle
#CountVectorizer --> Bag of words
#TfidfVectorizer --> Tf-IDF
#gensim.models.Word2Vec --> word embeddings (word2vec)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE


In [2]:
# Load the preprocessed review dataset (path is relative to the working directory)
# and preview its first rows.
# NOTE(review): the preview shows `Unnamed: 0` / `Unnamed: 0.1` columns, which look
# like index columns written by earlier `to_csv` calls — confirm and consider dropping.
# NOTE(review): the previewed `cleaned_reviews` values appear to be missing their first
# character (e.g. 'ood interesting' for 'good and interesting') — presumably an
# upstream preprocessing issue; verify.
df = pd.read_csv('preprocessed_data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Id,Review,Label,cleaned_reviews,z_score,sentiment
0,0,0,good and interesting,5,ood interesting,0.4865,pos
1,1,1,"This class is very helpful to me. Currently, I...",5,"class helpful . currently , 'm still learning ...",0.4865,pos
2,2,2,like!Prof and TAs are helpful and the discussi...,5,ike ! prof ta helpful discussion among student...,0.4865,pos
3,3,3,Easy to follow and includes a lot basic and im...,5,asy follow includes lot basic important techni...,0.4865,pos
4,4,4,Really nice teacher!I could got the point eazl...,4,eally nice teacher ! could got point eazliy v,-0.675,pos


In [4]:
# Features: the cleaned review text. Missing reviews become empty strings so that
# every entry handed to the text vectorizers is a string.
# `fillna` is called without `inplace=True`: it returns a new Series, so the column
# inside `df` is left untouched and no in-place write happens on a Series selected
# from a DataFrame (which can raise SettingWithCopyWarning and behaves differently
# under pandas Copy-on-Write).
X = df['cleaned_reviews'].fillna('')
# Target: the sentiment label of each review.
Y = df['sentiment']

In [6]:
# Bag of words: learn the vocabulary from X and encode each review as token counts.
# NOTE(review): both vectorizers are fitted on all of X; if the train/test split is
# done afterwards, test reviews contribute to the vocabulary / IDF weights — confirm
# whether fitting on the training portion only is intended.
bow = CountVectorizer()
X_bow = bow.fit_transform(X)
# TF-IDF: learn vocabulary and IDF weights from X and encode each review.
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(X)
# Load the saved Word2Vec model from disk (path is relative to the working directory).
w2v = Word2Vec.load('./trained-word2vec-studentReviews.model')
# Sentence embedding helper: average the word vectors of a sentence's words.
def sentence_to_vector(sentence, model):
    """Return the mean word vector of ``sentence``.

    Parameters
    ----------
    sentence : str
        Text that is split on whitespace into words.
    model :
        Object exposing ``wv`` (supporting ``word in model.wv`` and
        ``model.wv[word]``) and ``vector_size``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of the words found in ``model.wv``,
        or a zero vector of length ``model.vector_size`` when no word is found.
    """
    vocab_vectors = model.wv
    found = []
    for token in sentence.split():
        if token in vocab_vectors:
            found.append(vocab_vectors[token])
    if len(found) == 0:
        # No in-vocabulary word (this also covers an empty sentence).
        return np.zeros(model.vector_size)
    return np.mean(found, axis=0)

# Embed every review in X with `sentence_to_vector` and collect the resulting
# vectors into a single NumPy array.
X_w2v = np.array([sentence_to_vector(review, w2v) for review in X])

In [8]:
# test_size=0.2   --> 20% of the rows are held out for testing, the remaining 80% are used for training.
# random_state=42 --> fixes the shuffle so the split is identical on every run (reproducible debugging).
# All three feature matrices and the labels are passed through ONE call, so they are
# split on the same row indices by construction (no reliance on repeating the same
# random_state across calls) and no throw-away label copies are bound to `_`,
# which IPython uses for the last cell output.
(X_train_bow, X_test_bow,
 X_train_tfidf, X_test_tfidf,
 X_train_w2v, X_test_w2v,
 Y_train, Y_test) = train_test_split(
    X_bow, X_tfidf, X_w2v, Y, test_size=0.2, random_state=42
)

In [9]:
# --- Naive Bayes ---
# MultinomialNB is fitted on the bag-of-words and TF-IDF features; GaussianNB,
# which can handle continuous data, is fitted on the Word2Vec sentence vectors.
nb_bow = MultinomialNB().fit(X_train_bow, Y_train)
nb_tfidf = MultinomialNB().fit(X_train_tfidf, Y_train)
nb_w2v = GaussianNB().fit(X_train_w2v, Y_train)

# --- K-nearest neighbours (n_neighbors=5), one model per feature representation ---
knn_bow = KNeighborsClassifier(n_neighbors=5).fit(X_train_bow, Y_train)
knn_tfidf = KNeighborsClassifier(n_neighbors=5).fit(X_train_tfidf, Y_train)
knn_w2v = KNeighborsClassifier(n_neighbors=5).fit(X_train_w2v, Y_train)
# `fit` returns the estimator itself; ending on the bare name keeps the cell's displayed value.
knn_w2v



In [10]:
# --- Linear SVM ---
# SVMs generally perform better on standardised/normalised inputs. Scaling of the
# Word2Vec features is left disabled here (author's note: the word2vec data is
# already scaled). If it is re-enabled, fit the scaler on the training split only
# and apply `transform` (not `fit_transform`) to the test split:
# scaler = StandardScaler()
# X_train_w2v_scaled = scaler.fit_transform(X_train_w2v)
# X_test_w2v_scaled = scaler.transform(X_test_w2v)

# One LinearSVC per feature representation, all with the same settings.
svm_bow = LinearSVC(dual=False, max_iter=10000).fit(X_train_bow, Y_train)
svm_tfidf = LinearSVC(dual=False, max_iter=10000).fit(X_train_tfidf, Y_train)
svm_w2v = LinearSVC(dual=False, max_iter=10000).fit(X_train_w2v, Y_train)
# `fit` returns the estimator itself; ending on the bare name keeps the cell's displayed value.
svm_w2v

In [11]:
# Predict on the held-out test split with every fitted model.
# Naive Bayes
nb_pred_bow, nb_pred_tfidf, nb_pred_w2v = (
    nb_bow.predict(X_test_bow),
    nb_tfidf.predict(X_test_tfidf),
    nb_w2v.predict(X_test_w2v),
)
# KNN
knn_pred_bow, knn_pred_tfidf, knn_pred_w2v = (
    knn_bow.predict(X_test_bow),
    knn_tfidf.predict(X_test_tfidf),
    knn_w2v.predict(X_test_w2v),
)
# SVM
svm_pred_bow, svm_pred_tfidf, svm_pred_w2v = (
    svm_bow.predict(X_test_bow),
    svm_tfidf.predict(X_test_tfidf),
    svm_w2v.predict(X_test_w2v),
)

# Accuracy report: one group of lines per classifier family, with a blank line
# between groups. Labels are kept verbatim so the printed output is unchanged.
accuracy_report = (
    (
        ('Naive Bayes(Bag of Words) accurancy: ', nb_pred_bow),
        ('Naive Bayes(TF-IDF) accurancy: ', nb_pred_tfidf),
        ('Naive Bayes(Word2Vec) accurancy: ', nb_pred_w2v),
    ),
    (
        ('KNN(Bag of Words) accurancy: ', knn_pred_bow),
        ('KNN(TF-IDF) accurancy: ', knn_pred_tfidf),
        ('KNN(Word2Vec) accurancy: ', knn_pred_w2v),
    ),
    (
        ('SVM(Bag of Words) accurancy: ', svm_pred_bow),
        ('SVM(TF-IDF) accurancy: ', svm_pred_tfidf),
        ('SVM(Word2Vec) accurancy: ', svm_pred_w2v),
    ),
)
for group_idx, group in enumerate(accuracy_report):
    if group_idx > 0:
        print()
    for title, predicted in group:
        print(title, accuracy_score(Y_test, predicted))


Naive Bayes(Bag of Words) accurancy:  0.9208512673362028
Naive Bayes(TF-IDF) accurancy:  0.9298900047824008
Naive Bayes(Word2Vec) accurancy:  0.647297943567671

KNN(Bag of Words) accurancy:  0.9289813486370158
KNN(TF-IDF) accurancy:  0.9301769488283118
KNN(Word2Vec) accurancy:  0.9194165471066476

SVM(Bag of Words) accurancy:  0.9245337159253946
SVM(TF-IDF) accurancy:  0.9331420373027259
SVM(Word2Vec) accurancy:  0.9303204208512673


In [12]:
# Persist every trained classifier to disk with pickle.
# The target directory is written once; the resulting paths are unchanged.
MODEL_DIR = 'D:/Study/College/IV Sem/Mini Project/Codes/Trained Models/FreshModels/'

file1 = MODEL_DIR + 'nb_bow.sav'
file2 = MODEL_DIR + 'nb_tfidf.sav'
file3 = MODEL_DIR + 'nb_w2v.sav'
file4 = MODEL_DIR + 'knn_bow.sav'
file5 = MODEL_DIR + 'knn_tfidf.sav'
file6 = MODEL_DIR + 'knn_w2v.sav'
file7 = MODEL_DIR + 'svm_bow.sav'
file8 = MODEL_DIR + 'svm_tfidf.sav'
file9 = MODEL_DIR + 'svm_w2v.sav'

# NOTE(review): only the classifiers are saved here. The bag-of-words / TF-IDF
# models need the same fitted vectorizers at prediction time — confirm those are
# persisted elsewhere.
for model_path, model in (
    (file1, nb_bow),
    (file2, nb_tfidf),
    (file3, nb_w2v),
    (file4, knn_bow),
    (file5, knn_tfidf),
    (file6, knn_w2v),
    (file7, svm_bow),
    (file8, svm_tfidf),
    (file9, svm_w2v),
):
    # `with` closes the file handle (flushing the pickle to disk) even if
    # `pickle.dump` raises; a bare `open(...)` argument leaves it unclosed.
    with open(model_path, 'wb') as model_file:
        pickle.dump(model, model_file)