In [1]:
from imblearn.over_sampling import SMOTE
from sklearn.utils import shuffle
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

#CountVectorizer --> Bag of words
#TfidfVectorizer --> Tf-IDF
#gensim.models.Word2Vec --> word embeddings (word2vec)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.neighbors import LocalOutlierFactor

### Load the data 

In [2]:
df = pd.read_csv('preprocessed_data.csv')
df.isnull().sum()

Id                  0
Review              0
Label               0
cleaned_reviews    23
dtype: int64

### dealing with outliers

In [3]:
df

Unnamed: 0,Id,Review,Label,cleaned_reviews
0,0,good and interesting,5,ood interesting
1,1,"This class is very helpful to me. Currently, I...",5,"class helpful . currently , 'm still learning ..."
2,2,like!Prof and TAs are helpful and the discussi...,5,ike ! prof ta helpful discussion among student...
3,3,Easy to follow and includes a lot basic and im...,5,asy follow includes lot basic important techni...
4,4,Really nice teacher!I could got the point eazl...,4,eally nice teacher ! could got point eazliy v
...,...,...,...,...
107013,107013,Trendy topic with talks from expertises in the...,4,rendy topic talk expertise field . covered are...
107014,107014,"Wonderful! Simple and clear language, good ins...",5,"onderful ! simple clear language , good instru..."
107015,107015,an interesting and fun course. thanks. dr quincy,5,n interesting fun course . thanks . dr quincy
107016,107016,"very broad perspective, up to date information...",4,"ery broad perspective , date information , use..."


In [4]:
# Calculate Z-scores for Labels (assuming Labels is the sentiment score)
df['z_score'] = (df['Label'] - df['Label'].mean()) / df['Label'].std()
# Identify outliers based on Z-score
outliers_z = df[np.abs(df['z_score']) > 3]
df = df[np.abs(df['z_score']) <= 3]
df

Unnamed: 0,Id,Review,Label,cleaned_reviews,z_score
0,0,good and interesting,5,ood interesting,0.4865
1,1,"This class is very helpful to me. Currently, I...",5,"class helpful . currently , 'm still learning ...",0.4865
2,2,like!Prof and TAs are helpful and the discussi...,5,ike ! prof ta helpful discussion among student...,0.4865
3,3,Easy to follow and includes a lot basic and im...,5,asy follow includes lot basic important techni...,0.4865
4,4,Really nice teacher!I could got the point eazl...,4,eally nice teacher ! could got point eazliy v,-0.6750
...,...,...,...,...,...
107013,107013,Trendy topic with talks from expertises in the...,4,rendy topic talk expertise field . covered are...,-0.6750
107014,107014,"Wonderful! Simple and clear language, good ins...",5,"onderful ! simple clear language , good instru...",0.4865
107015,107015,an interesting and fun course. thanks. dr quincy,5,n interesting fun course . thanks . dr quincy,0.4865
107016,107016,"very broad perspective, up to date information...",4,"ery broad perspective , date information , use...",-0.6750


In [5]:
X = df['cleaned_reviews']
Y = df['Label']
X.fillna('', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.fillna('', inplace=True)


In [6]:
df.isnull().sum()

Id                 0
Review             0
Label              0
cleaned_reviews    0
z_score            0
dtype: int64

In [8]:
df.to_csv('preprocessed_data.csv', index = False)

In [7]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
# Vectorization using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

### Handling imbalanced dataset

In [8]:
# Handling Imbalanced Dataset using SMOTE
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train_vect, y_train)

### Shuffling and reducing the dimensions of the dataset

In [9]:
# Shuffle the data (combined features and target) after oversampling
X_resampled, y_resampled = shuffle(X_res, y_res, random_state=42)
X_dense = X_res.toarray()

In [10]:
pca_model = PCA(n_components=50)
pca_model.fit(X_dense)
pca_file = 'D:/Study/College/IV Sem/Mini Project/Codes/Trained Models/pca-model2.sav'
pickle.dump(pca_model, open(pca_file, 'wb'))
X_dense_pca = pca_model.transform(X_dense)

### HyperParameter traning for diffrenet models

In [11]:
# Define models and their parameters
model_params = {
    'knn': {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': [3, 5, 11, 19],
            'weights': ['uniform', 'distance'],
            'metric': ['euclidean', 'manhattan']
        }
    },
    'svm': {
        'model': SVC(gamma='auto'),
        'params': {
            'C': [1, 10, 20],
            'kernel': ['rbf', 'linear']
        }
    }
}

In [12]:
#trying with these models and saving the results to a csv file
scores = []

for model_name, mp in model_params.items():
    clf =  RandomizedSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(X_dense_pca, y_res)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

In [None]:
df = pd.DataFrame(scores,columns=['model','best_score','best_params'])
df.to_csv('hyperParameterResults.csv', index=False)