In [1]:
import pickle
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
#CountVectorizer --> Bag of words
#TfidfVectorizer --> Tf-IDF
#gensim.models.Word2Vec --> word embeddings (word2vec)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

### Load the dataset

In [2]:
# Read the preprocessed reviews (path is relative to the notebook's working directory)
# and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='preprocessed_data.csv')
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Id,Review,Label,cleaned_reviews,sentiment
0,0,0,good and interesting,5,good interesting,Positive
1,1,1,"This class is very helpful to me. Currently, I...",5,class helpful currently im still learning clas...,Positive
2,2,2,like!Prof and TAs are helpful and the discussi...,5,likeprof ta helpful discussion among student q...,Positive
3,3,3,Easy to follow and includes a lot basic and im...,5,easy follow includes lot basic important techn...,Positive
4,4,4,Really nice teacher!I could got the point eazl...,4,really nice teacheri could got point eazliy v,Positive


In [4]:
# Features: the cleaned review text. Target: the sentiment label.
# Missing reviews are replaced with empty strings so every entry is a str.
# fillna() returns a new Series here instead of mutating in place, so re-running
# this cell is idempotent and `df` is never altered as a side effect.
X = df['cleaned_reviews'].fillna('')
Y = df['sentiment']

### Load the vectorizers

In [5]:
# Load the saved vectorizers
file1 = 'D:/Study/College/IV Sem/Mini Project/Codes/Trained Models/FreshModels/bow'
file2 = 'D:/Study/College/IV Sem/Mini Project/Codes/Trained Models/FreshModels/tfidf'

# NOTE: pickle.load can execute arbitrary code while deserialising, so only
# unpickle files that you produced yourself.
# The `with` blocks guarantee the file handles are closed even if loading fails.
with open(file1, 'rb') as bow_file:
    bow = pickle.load(bow_file)
with open(file2, 'rb') as tfidf_file:
    tfidf = pickle.load(tfidf_file)

# Load the Word2Vec model (gensim's own save/load format, not a raw pickle.load call)
w2v = Word2Vec.load('D:/Study/College/IV Sem/Mini Project/Codes/Trained Models/FreshModels/w2v')

### Transforming the data using loaded vectorizers

In [6]:
# Vectorise the review text with the loaded vectorizers.
# Only transform() is called in this cell, so nothing is re-fitted here.
X_bow, X_tfidf = bow.transform(X), tfidf.transform(X)

In [7]:
#creating sentence vector:
def sentence_to_vector(sentence, model):
    """Average the word vectors of a sentence into one fixed-length vector.

    Parameters
    ----------
    sentence : str or None
        Whitespace-separated text. ``None`` or a float NaN (how pandas marks a
        missing cell) is treated as a sentence with no words.
    model : object
        Must expose ``model.wv`` (supporting ``word in model.wv`` and
        ``model.wv[word]``) and ``model.vector_size``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of the words found in ``model.wv``,
        or a zero vector of length ``model.vector_size`` when no word is found.
    """
    # A missing value has no .split(); map it to the same zero vector that an
    # empty / fully out-of-vocabulary sentence gets instead of crashing.
    if sentence is None or (isinstance(sentence, float) and np.isnan(sentence)):
        return np.zeros(model.vector_size)

    keyed_vectors = model.wv
    word_vectors = [
        keyed_vectors[word] for word in sentence.split() if word in keyed_vectors
    ]
    if not word_vectors:  # no word of the sentence is in the model's vocabulary
        return np.zeros(model.vector_size)
    return np.mean(word_vectors, axis=0)

# Embed every review with the Word2Vec model and collect the results in one array
X_w2v = np.array(
    [sentence_to_vector(review, w2v) for review in X]
)

### Splitting the Dataset

In [8]:
# test_size=0.2 --> 20% of the data goes to testing and the remaining 80% to training.
# random_state=42 --> the shuffle is seeded, so the split is the same on every run,
# which keeps results comparable while debugging.
# All three feature matrices are passed to a single call so they are split on the
# same row partition; the one Y_train / Y_test pair is therefore valid for each of them.
(
    X_train_bow, X_test_bow,
    X_train_tfidf, X_test_tfidf,
    X_train_w2v, X_test_w2v,
    Y_train, Y_test,
) = train_test_split(X_bow, X_tfidf, X_w2v, Y, test_size=0.2, random_state=42)

### Model Training

In [9]:
# --- Naive Bayes ---
# Bag-of-words and TF-IDF features use the multinomial variant; the Word2Vec
# features use the Gaussian variant, which can handle continuous data.
nb_bow = MultinomialNB().fit(X_train_bow, Y_train)
nb_tfidf = MultinomialNB().fit(X_train_tfidf, Y_train)
nb_w2v = GaussianNB().fit(X_train_w2v, Y_train)

# --- K-nearest neighbours (k = 5), one model per feature representation ---
knn_bow = KNeighborsClassifier(n_neighbors=5).fit(X_train_bow, Y_train)
knn_tfidf = KNeighborsClassifier(n_neighbors=5).fit(X_train_tfidf, Y_train)
knn_w2v = KNeighborsClassifier(n_neighbors=5).fit(X_train_w2v, Y_train)

# --- Linear SVM, one model per feature representation ---
# dual=False selects the primal formulation; max_iter is raised to 10000.
svm_bow = LinearSVC(dual=False, max_iter=10000).fit(X_train_bow, Y_train)
svm_tfidf = LinearSVC(dual=False, max_iter=10000).fit(X_train_tfidf, Y_train)
svm_w2v = LinearSVC(dual=False, max_iter=10000).fit(X_train_w2v, Y_train)



### Saving the Trained Models

In [11]:
# Destination paths for the nine trained models.
# NOTE: file1 and file2 are rebound here; earlier in the notebook they held the
# vectorizer paths, so re-run the vectorizer-loading cell before relying on them again.
file1 = 'D:/Study/College/IV Sem/Mini Project/Codes/Trained Models/New Models/nb_bow.sav'
file2 = 'D:/Study/College/IV Sem/Mini Project/Codes/Trained Models/New Models/nb_tfidf.sav'
file3 = 'D:/Study/College/IV Sem/Mini Project/Codes/Trained Models/New Models/nb_w2v.sav'
file4 = 'D:/Study/College/IV Sem/Mini Project/Codes/Trained Models/New Models/knn_bow.sav'
file5 = 'D:/Study/College/IV Sem/Mini Project/Codes/Trained Models/New Models/knn_tfidf.sav'
file6 = 'D:/Study/College/IV Sem/Mini Project/Codes/Trained Models/New Models/knn_w2v.sav'
file7 = 'D:/Study/College/IV Sem/Mini Project/Codes/Trained Models/New Models/svm_bow.sav'
file8 = 'D:/Study/College/IV Sem/Mini Project/Codes/Trained Models/New Models/svm_tfidf.sav'
file9 = 'D:/Study/College/IV Sem/Mini Project/Codes/Trained Models/New Models/svm_w2v.sav'

# Pickle each model to its path. The `with` block closes (and therefore flushes)
# every file deterministically, even if pickling one of the models raises.
for trained_model, model_path in (
    (nb_bow, file1),
    (nb_tfidf, file2),
    (nb_w2v, file3),
    (knn_bow, file4),
    (knn_tfidf, file5),
    (knn_w2v, file6),
    (svm_bow, file7),
    (svm_tfidf, file8),
    (svm_w2v, file9),
):
    with open(model_path, 'wb') as model_file:
        pickle.dump(trained_model, model_file)