# Importing Required Packages

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, GradientBoostingRegressor, GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from nltk.stem import PorterStemmer
import nltk
from sklearn.neighbors import KNeighborsClassifier
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC
import time
from nltk.stem import WordNetLemmatizer

In [None]:
import sys
print(sys.executable)
!{sys.executable} -m pip install gensim

In [None]:
import gensim
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pandas as pd
from huggingface_hub import hf_hub_download
import gensim.downloader as api

# Importing Word2Vec Model

In [4]:
model = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin', 
    binary=True
)

Testing 1: Checking Similar Words

In [9]:
similar_words = model.most_similar('best')
for word, score in similar_words:
    print(f"{word}: {score}")

finest: 0.6383625864982605
worst: 0.5835109949111938
greatest: 0.5824129581451416
strongest: 0.5785795450210571
smartest: 0.5695626139640808
easiest: 0.5528990626335144
good: 0.5467196702957153
thebest: 0.5430120825767517
quickest: 0.5399831533432007
healthiest: 0.5372902750968933


Testing 2: Checking similarity between two words

In [10]:
similarity = model.similarity("can't", "cannot")
print(similarity)

0.4316191


Testing 3: Checking Vector for specific Word

In [67]:
vector = model.get_vector('technology')
print(vector)

[ 0.04345703  0.00183105  0.18457031  0.00671387 -0.11376953  0.00479126
  0.07128906 -0.09228516 -0.01531982 -0.09521484 -0.08642578 -0.08544922
 -0.03125    -0.13476562 -0.19921875 -0.05224609 -0.04956055 -0.03271484
 -0.08496094 -0.06933594 -0.11328125 -0.01916504 -0.0625      0.19042969
 -0.03491211  0.10058594 -0.01916504  0.04833984  0.05737305 -0.27539062
 -0.00204468 -0.08837891 -0.12451172 -0.08398438  0.13085938 -0.14746094
 -0.00756836 -0.11132812  0.203125   -0.00817871 -0.12695312  0.22851562
  0.03588867 -0.00671387  0.05444336 -0.06054688 -0.20214844 -0.09912109
  0.01269531  0.15332031  0.0222168   0.01342773 -0.12890625 -0.23632812
 -0.05126953  0.203125    0.04492188 -0.12695312  0.36914062 -0.05810547
 -0.03588867 -0.09716797 -0.24707031  0.04589844 -0.296875    0.11132812
 -0.14550781  0.22558594 -0.02770996  0.03979492 -0.17871094 -0.05932617
  0.08544922  0.10009766  0.06103516  0.00473022  0.10546875  0.18847656
 -0.01196289 -0.01116943  0.10449219  0.08398438  0

Testing 4: Word exist in model or not

In [12]:
print('python' in model.key_to_index)

True


Vocab Size

In [13]:
print("Vocab size:", len(model.key_to_index))

Vocab size: 3000000


# Dataset Cleaning

In [14]:
# Creating pandas dataframe for csv
train_df = pd.read_csv('train_2.csv', usecols=['selected_text', 'sentiment', 'Age of User'], low_memory=False)

In [68]:
df = train_df.copy()

In [17]:
# Dropping the null values with rows
df = df.dropna()

In [18]:
# Dropping duplicate rows
df = df.drop_duplicates()

In [19]:
# Label Encoding the target column ('sentiment') for values (Positive, Neutral, Negative)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['sentiment'] = le.fit_transform(df['sentiment'])

# Preprocessing

Preprocessing 1: Lower Case

In [20]:
df['selected_text'] = df['selected_text'].str.lower()

Preprocessing 2: De-contractions Words

In [21]:
contractions = { 
"ain't": "am not / are not / is not / has not / have not",
"aren't": "are not / am not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he had / he would",
"he'd've": "he would have",
"he'll": "he shall / he will",
"he'll've": "he shall have / he will have",
"he's": "he has / he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how has / how is / how does",
"I'd": "I had / I would",
"I'd've": "I would have",
"I'll": "I shall / I will",
"I'll've": "I shall have / I will have",
"I'm": "I am",
"I've": "I have",
"isn't": "is not",
"it'd": "it had / it would",
"it'd've": "it would have",
"it'll": "it shall / it will",
"it'll've": "it shall have / it will have",
"it's": "it has / it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she had / she would",
"she'd've": "she would have",
"she'll": "she shall / she will",
"she'll've": "she shall have / she will have",
"she's": "she has / she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as / so is",
"that'd": "that would / that had",
"that'd've": "that would have",
"that's": "that has / that is",
"there'd": "there had / there would",
"there'd've": "there would have",
"there's": "there has / there is",
"they'd": "they had / they would",
"they'd've": "they would have",
"they'll": "they shall / they will",
"they'll've": "they shall have / they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we had / we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what shall / what will",
"what'll've": "what shall have / what will have",
"what're": "what are",
"what's": "what has / what is",
"what've": "what have",
"when's": "when has / when is",
"when've": "when have",
"where'd": "where did",
"where's": "where has / where is",
"where've": "where have",
"who'll": "who shall / who will",
"who'll've": "who shall have / who will have",
"who's": "who has / who is",
"who've": "who have",
"why's": "why has / why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you had / you would",
"you'd've": "you would have",
"you'll": "you shall / you will",
"you'll've": "you shall have / you will have",
"you're": "you are",
"you've": "you have"
}

In [22]:
def contr_words(text, contractions_dict):
    text = str(text)
    for contraction, expansion in contractions_dict.items():
        text = re.sub(r'\b' + re.escape(contraction) + r'\b', expansion, text, flags=re.IGNORECASE)
    return text

df['selected_text'] = df['selected_text'].apply(lambda x: contr_words(x, contractions))

Preprocessing 3: Removing links

In [23]:
# Removing Links using regex expressions.
df['selected_text'] = df['selected_text'].str.replace(r'http\S+|www\.\S+', ' ', regex=True)

Preprocessing 4: Removing Stopwords

In [None]:
nltk.download('stopwords')
english_stopwords = set(stopwords.words('english'))

In [25]:
# Removing stopwords using NLTK library.
def remove_stopwords(text):
    tokens = word_tokenize(text.lower())  
    tokens_wo_stopwords = [word for word in tokens if word not in english_stopwords] 
    return ' '.join(tokens_wo_stopwords)

df['selected_text'] = df['selected_text'].apply(remove_stopwords)

Preprocessin 5: Removing HTML

In [26]:
df['selected_text'] = df['selected_text'].str.replace(r'<[^>]+>', ' ', regex=True)

Preprocessing 6: Removing Special Characters

In [27]:
df['selected_text'] = df['selected_text'].str.replace(r'[^a-zA-Z0-9\s]', ' ', regex=True)

Preprocessing 7: Removing Punctuation

In [28]:
df['selected_text'] = df['selected_text'].str.replace(r'[^\w\s]', ' ', regex=True)

Preprocessing 8: Removing Extra Spaces

In [29]:
# Replacing multiple spaces with single space
df['selected_text'] = df['selected_text'].str.replace(r'\s+', ' ', regex=True)

# Implementing Word2Vec Model

In [32]:
# Converting text into a numerical vector representation
# Using pre-trained Word2Vec model to represent words as numerical vectors
def text_to_word2vec_vector(text):
    words = text.split()
    
    vector = np.zeros(model.vector_size)  
    word_count = 0
    
    for word in words:
        if word in model.key_to_index:
            vector += model.get_vector(word)
            word_count += 1
        
    if word_count > 0:
        vector /= word_count
    
    return vector

# Train, Test, Split

In [33]:
X = df['selected_text'].apply(text_to_word2vec_vector).tolist()
X = np.array(X)  
y = df['sentiment']

In [34]:
X.shape

(24160, 300)

In [35]:
# Spliting Train and test data by 90% (train) and 10% (test).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

# Model 1: Random Forest

In [36]:
# Defining model parameters
rf_classifier = RandomForestClassifier(
    n_estimators=100, 
    random_state=42
)

In [37]:
# Start training of model
start_time = time.time()
rf_classifier.fit(X_train, y_train)
rf_training_time = time.time() - start_time
rf_pred = rf_classifier.predict(X_test)

In [38]:
print("Accuracy:", accuracy_score(y_test, rf_pred))
print("RF Training Time: {:.4f} seconds".format(rf_training_time))

Accuracy: 0.7690397350993378
RF Training Time: 35.3709 seconds


# Model 2: Logistic Regression

In [39]:
# Defining model parameters and start training
from sklearn.linear_model import LogisticRegression

lr_classifier = LogisticRegression(
    max_iter=1000,
    random_state=42
)

start_time = time.time()
lr_classifier.fit(X_train, y_train)
lr_training_time = time.time() - start_time
lr_pred = lr_classifier.predict(X_test)

In [40]:
print("Accuracy:", accuracy_score(y_test, lr_pred))
print("LR Training Time: {:.4f} seconds".format(lr_training_time))

Accuracy: 0.7682119205298014
LR Training Time: 0.4062 seconds


# Model 3: SVM 

In [41]:
# Defining model parameters and start training
from sklearn.svm import SVC

svm_classifier = SVC(
    kernel='rbf',  
    C=1.0,         
    random_state=42
)

start_time = time.time()
svm_classifier.fit(X_train, y_train)
svm_training_time = time.time() - start_time
svm_pred = svm_classifier.predict(X_test)

In [42]:
print("Accuracy:", accuracy_score(y_test, svm_pred))
print("SVM Training Time: {:.4f} seconds".format(svm_training_time))

Accuracy: 0.7984271523178808
SVM Training Time: 38.3339 seconds


# Model 4: Naive Bayes

In [53]:
# Start training of model
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import MultinomialNB

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_scaled, y_train)

In [54]:
y_pred = nb_classifier.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.6783940397350994


# Hyper tunning: SVM

In [43]:
# Using GridSearchCV implemented muliple parameters list for hyper tunning for SVM Model
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [0.1, 1, 10, 100], 
    'gamma': ['scale', 'auto', 0.1, 1, 10],  
    'kernel': ['rbf', 'poly', 'sigmoid'] 
}

svm_classifier = SVC(random_state=42)

start_time = time.time()
grid_search = GridSearchCV(
    estimator=svm_classifier, 
    param_grid=param_grid, 
    cv=5,  
    scoring='accuracy',
    n_jobs=-1  
)

In [None]:
grid_search.fit(X_train, y_train)
best_svm = grid_search.best_estimator_
svm_pred = best_svm.predict(X_test)
svm_training_time = time.time() - start_time

In [60]:
print("Best Parameters:", grid_search.best_params_)
print("Test Accuracy:", accuracy_score(y_test, svm_pred))
print("SVM Training Time: {:.4f} seconds".format(svm_training_time))

Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
Test Accuracy: 0.7984271523178808
SVM Training Time: 4912.2493 seconds


# Hyper tunning: Random Forest

In [49]:
# Using GridSearchCV implemented muliple parameters list for hyper tunning for Random Forest Model
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [50]:
rf_classifier = RandomForestClassifier(random_state=42)

start_time = time.time()
grid_search = GridSearchCV(
    estimator=rf_classifier, 
    param_grid=param_grid, 
    cv=5, 
    n_jobs=-1,
    verbose=2
)

In [None]:
grid_search.fit(X_train, y_train)
best_rf_classifier = grid_search.best_estimator_
training_time = time.time() - start_time
rf_pred = best_rf_classifier.predict(X_test)

In [52]:
print("Best Parameters:", grid_search.best_params_)
print("Test Accuracy:", accuracy_score(y_test, rf_pred))
print("Training Time: {:.4f} seconds".format(training_time))

Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
Test Accuracy: 0.7690397350993378
Training Time: 1449.5872 seconds


# Hyper Tuning: Naive Bayes

In [64]:
# Using GridSearchCV implemented muliple parameters list for hyper tunning for Naive Bayes Model
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

param_grid = {
    'alpha': [0.001, 0.01, 0.1, 1, 10],  
    'fit_prior': [True, False]
}

grid_search = GridSearchCV(
    MultinomialNB(), 
    param_grid, 
    cv=5, 
    scoring='accuracy'
)

In [65]:
grid_search.fit(X_train_scaled, y_train)
best_nb_classifier = grid_search.best_estimator_
y_pred = best_nb_classifier.predict(X_test_scaled)

In [66]:
print("Best Parameters:", grid_search.best_params_)
print("Test Set Accuracy:", accuracy_score(y_test, y_pred))

Best Parameters: {'alpha': 1, 'fit_prior': False}
Test Set Accuracy: 0.7392384105960265


# Extracting Models

In [55]:
# Extracting the trained models using joblib module
import joblib
joblib.dump(svm_classifier, 'sentiment_svm_model.joblib')
joblib.dump(rf_classifier, 'sentiment_rf_model.joblib')
joblib.dump(lr_classifier, 'sentiment_lr_model.joblib')
joblib.dump(nb_classifier, 'sentiment_nb_model.joblib')

In [56]:
# Created Custom function to test model by giving input 
def predict_sentiment(input_text):
    processed_text = contr_words(input_text.lower(), contractions)
    processed_text = remove_stopwords(processed_text)
    processed_text = re.sub(r'http\S+|www\.\S+', ' ', processed_text)
    processed_text = re.sub(r'[^a-zA-Z0-9\s]', ' ', processed_text)
    input_vector = text_to_word2vec_vector(processed_text)
    
    loaded_model = joblib.load('sentiment_svm_model.joblib')

    prediction = loaded_model.predict([input_vector])[0]
    sentiment_labels = {2: 'Positive', 1: 'Neutral', 0: 'Negative'}
    return sentiment_labels[prediction]

In [57]:
result = predict_sentiment("I love this product")
print(result)