In [1]:
!pip install imbalanced-learn





[notice] A new release of pip is available: 23.2.1 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os

import numpy as np
import pandas as pd



In [3]:
# Load cleaned dataset
# The CSV location can be overridden with the SENTIMENT_DATA_PATH environment
# variable; the default is the original machine-specific path, so existing runs
# behave exactly as before.
DATA_PATH = os.environ.get(
    'SENTIMENT_DATA_PATH',
    r'C:\Users\User\Downloads\modelling\Final_Clean_Data.csv',
)
df = pd.read_csv(DATA_PATH)

# Display the first 5 entries of the DataFrame
df.head(5)

Unnamed: 0.1,Unnamed: 0,clean_text_processed,Mood
0,0,"['innovation', 'lab', 'officially', 'open', 'c...",0
1,1,"['open', 'aws', 'asia', 'pacific', 'seoul', 'r...",0
2,2,"['beginner', 'guide', 'scaling', 'million', 'u...",0
3,3,"['bridging', 'aws', 'azure', 'environment', 'v...",0
4,4,"['elk', 'aws', 'elasticsearch', 'service', 'el...",0


In [4]:
# Drop the leftover CSV index column. errors='ignore' keeps this cell idempotent:
# re-running it after the column is already gone no longer raises KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [5]:
# Number of rows per Mood label (class distribution of the target)
mood_counts = df.loc[:, 'Mood'].value_counts()
mood_counts

Mood
0    64171
1    26503
2     5290
Name: count, dtype: int64

In [6]:
# Preview the text column and its label side by side (first five rows)
df.loc[:, ['clean_text_processed', 'Mood']].head()

Unnamed: 0,clean_text_processed,Mood
0,"['innovation', 'lab', 'officially', 'open', 'c...",0
1,"['open', 'aws', 'asia', 'pacific', 'seoul', 'r...",0
2,"['beginner', 'guide', 'scaling', 'million', 'u...",0
3,"['bridging', 'aws', 'azure', 'environment', 'v...",0
4,"['elk', 'aws', 'elasticsearch', 'service', 'el...",0


In [7]:
# Pull the processed-text column out as a standalone Series for inspection
testing_text = pd.Series(data=df.loc[:, 'clean_text_processed'])
testing_text

0        ['innovation', 'lab', 'officially', 'open', 'c...
1        ['open', 'aws', 'asia', 'pacific', 'seoul', 'r...
2        ['beginner', 'guide', 'scaling', 'million', 'u...
3        ['bridging', 'aws', 'azure', 'environment', 'v...
4        ['elk', 'aws', 'elasticsearch', 'service', 'el...
                               ...                        
95959    ['anyone', 'good', 'workflow', 'using', 'aws',...
95960    ['aws', 'csa', 'associate', 'professional', 't...
95961    ['curious', 'change', 'opinion', 'aws', 'effic...
95962    ['new', 'aws', 'startup', 'blog', 'startup', '...
95963    ['top', 'paying', 'certification', 'aws', 'cer...
Name: clean_text_processed, Length: 95964, dtype: object

In [8]:
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
from imblearn.pipeline import make_pipeline

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=10000, stop_words='english')

# Convert text data into numerical features
# NOTE(review): the vectorizer is fitted on every row before cross-validation
# splits the data, so vocabulary/IDF statistics include the held-out folds.
# Consider moving the vectorizer into the pipeline — confirm with the author.
X = tfidf.fit_transform(df['clean_text_processed'])

# Ensure y is a 1D array of labels
y = df['Mood']

# Initialize SMOTE
smote = SMOTE(random_state=777, k_neighbors=1)

# SMOTE is intentionally NOT applied to the full dataset here. Resampling all
# rows up front produced arrays that no later cell used, and oversampling before
# the train/test split would place synthetic points derived from test rows into
# the training data. The `smote` step inside each pipeline resamples the
# training portion of every fold instead.


In [9]:
# XGBoost classifier with a fixed seed, followed by a pipeline that chains the
# shared SMOTE sampler and this classifier.
xgb_model = xgb.XGBClassifier(
    random_state=42,
    eval_metric='logloss',
    use_label_encoder=False,
)
SMOTE_pipeline_xgb = make_pipeline(smote, xgb_model)

def _take_rows(data, index):
    """Select rows by position from a pandas object (.iloc) or an array/sparse matrix."""
    return data.iloc[index] if hasattr(data, "iloc") else data[index]


def model_cv(splits, X, Y, pipeline, average_method):
    """Run stratified k-fold cross-validation and print per-fold and mean scores.

    Parameters
    ----------
    splits : int
        Number of stratified folds.
    X : array, sparse matrix or pandas object
        Feature rows; sliced by position for every fold.
    Y : array-like or pandas Series
        One label per row of X. Rows are selected by position, so a Series
        whose index is not the default 0..n-1 range is handled correctly
        (plain ``Y[train_index]`` would look the integers up as index labels).
    pipeline : estimator
        Object exposing ``fit`` and ``predict``. It is refitted in place on
        every fold and is left fitted on the last fold's training split.
    average_method : str
        Value passed as ``average`` to precision, recall and F1.

    Returns
    -------
    None
        Results are printed: per-class scores for each fold, then the mean
        accuracy / precision / recall / F1 over all folds (in percent).
    """
    kfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=777)
    accuracy, precision, recall, f1 = [], [], [], []

    for train_index, test_index in kfold.split(X, Y):
        X_train, X_test = _take_rows(X, train_index), _take_rows(X, test_index)
        y_train, y_test = _take_rows(Y, train_index), _take_rows(Y, test_index)

        # Fit the model on the training set
        model_fit = pipeline.fit(X_train, y_train)

        # Predict on the test set
        y_pred = model_fit.predict(X_test)

        # Calculate scores. Accuracy reuses y_pred instead of calling
        # pipeline.score, which would run a second prediction pass.
        accuracy.append(np.mean(np.asarray(y_pred) == np.asarray(y_test)) * 100)
        precision.append(precision_score(y_test, y_pred, average=average_method) * 100)
        recall.append(recall_score(y_test, y_pred, average=average_method) * 100)
        f1.append(f1_score(y_test, y_pred, average=average_method) * 100)

        # Print individual fold scores (one value per class, in label order)
        print('              negative    neutral     positive')
        print('precision:', precision_score(y_test, y_pred, average=None))
        print('recall:   ', recall_score(y_test, y_pred, average=None))
        print('f1 score: ', f1_score(y_test, y_pred, average=None))
        print('-'*50)

    # Print overall scores (mean over folds)
    print("accuracy: %.2f%%" % np.mean(accuracy))
    print("precision: %.2f%%" % np.mean(precision))
    print("recall: %.2f%%" % np.mean(recall))
    print("f1 score: %.2f%%" % np.mean(f1))

# 5-fold stratified CV of the SMOTE + XGBoost pipeline, macro-averaged metrics
model_cv(
    splits=5,
    X=X,
    Y=y,
    pipeline=SMOTE_pipeline_xgb,
    average_method='macro',
)


              negative    neutral     positive
precision: [0.92296445 0.91183295 0.53429355]
recall:    [0.93439813 0.81566038 0.7362949 ]
f1 score:  [0.9286461  0.86106961 0.61923688]
--------------------------------------------------
              negative    neutral     positive
precision: [0.91850321 0.9140107  0.5337931 ]
recall:    [0.93525012 0.80607433 0.731569  ]
f1 score:  [0.92680102 0.85665597 0.61722488]
--------------------------------------------------
              negative    neutral     positive
precision: [0.91502622 0.9149029  0.54507915]
recall:    [0.93805517 0.79098283 0.74858223]
f1 score:  [0.9263976  0.84844193 0.63082437]
--------------------------------------------------
              negative    neutral     positive
precision: [0.91632404 0.916019   0.54870357]
recall:    [0.93774349 0.80041502 0.74007561]
f1 score:  [0.92691004 0.85432397 0.63018109]
--------------------------------------------------
              negative    neutral     positive
precision

In [11]:
import pickle

# Cross-validation refits the pipeline on every fold, so at this point xgb_model
# only holds the fit from the last fold's training split. Refit on the full
# dataset so the persisted model has seen all available rows.
SMOTE_pipeline_xgb.fit(X, y)

# Save the model to a pickle file
# NOTE(review): the fitted `tfidf` vectorizer is not persisted here; predicting
# on raw text also needs it — confirm it is saved elsewhere.
with open('sentiment_model_xgb.pkl', 'wb') as f:
    pickle.dump(xgb_model, f)

In [12]:
# Import necessary libraries
from sklearn.naive_bayes import MultinomialNB

# Define the Naive Bayes model
nb_model = MultinomialNB()

# Define the pipeline with SMOTE and Naive Bayes
SMOTE_pipeline_nb = make_pipeline(smote, nb_model)

# model_cv is defined once, in the XGBoost cross-validation cell earlier in the
# notebook. The identical copy that was pasted here has been removed so both
# models are always evaluated by the same code.

# Perform cross-validation
model_cv(5, X, y, SMOTE_pipeline_nb, 'macro')


              negative    neutral     positive
precision: [0.9465527  0.70534591 0.39460448]
recall:    [0.78511882 0.84641509 0.81568998]
f1 score:  [0.85831097 0.76946827 0.53189522]
--------------------------------------------------
              negative    neutral     positive
precision: [0.93915094 0.69808055 0.39025481]
recall:    [0.77567399 0.83701188 0.82514178]
f1 score:  [0.84962021 0.76125933 0.52989378]
--------------------------------------------------
              negative    neutral     positive
precision: [0.94015587 0.71567515 0.40071238]
recall:    [0.7895434  0.83286172 0.85066163]
f1 score:  [0.85829239 0.76983435 0.54479419]
--------------------------------------------------
              negative    neutral     positive
precision: [0.941868   0.70931357 0.41071429]
recall:    [0.79281596 0.83040936 0.84782609]
f1 score:  [0.86093836 0.7650995  0.55336212]
--------------------------------------------------
              negative    neutral     positive
precision

In [13]:
import pickle

# Cross-validation refits the pipeline on every fold, so at this point nb_model
# only holds the fit from the last fold's training split. Refit on the full
# dataset so the persisted model has seen all available rows.
SMOTE_pipeline_nb.fit(X, y)

# Save the model to a pickle file
# NOTE(review): the fitted `tfidf` vectorizer is not persisted here; predicting
# on raw text also needs it — confirm it is saved elsewhere.
with open('sentiment_model_nb.pkl', 'wb') as f:
    pickle.dump(nb_model, f)

- XGBoost outperforms Naive Bayes in terms of overall accuracy, precision, and F1 score.
- Naive Bayes has higher recall for the neutral and positive classes but suffers from lower precision, which leads to lower overall performance.

- XGBoost provides a more balanced performance across all sentiment categories: it excels in precision and achieves a higher F1 score for positive sentiment than Naive Bayes.