# Model Training for Sentiment Analysis

In [1]:
# Install the notebook's dependencies into the kernel's own environment
# (%pip targets the running kernel); restart the kernel afterwards if prompted.
# NOTE(review): versions are unpinned, so results may drift between runs —
# consider pinning each package once the working versions are known.
%pip install xgboost scikit-learn datasets pandas joblib

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
from pathlib import Path

import joblib
import pandas as pd
import xgboost as xgb
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer

## Load Dataset

In [None]:
# Load the multiclass sentiment dataset and convert its train and test splits
# to pandas DataFrames. `load_dataset` is already imported in the imports cell
# at the top of the notebook, so it is not re-imported here.
dataset = load_dataset("Sp1786/multiclass-sentiment-analysis-dataset")
df = dataset['train'].to_pandas()    # training frame used to fit the vectorizer and model
test = dataset['test'].to_pandas()   # held-out frame used only for evaluation


In [23]:
# Summarize the training frame: columns, dtypes, and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31232 entries, 0 to 31231
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         31232 non-null  int64 
 1   text       31232 non-null  object
 2   label      31232 non-null  int64 
 3   sentiment  31232 non-null  object
dtypes: int64(2), object(2)
memory usage: 976.1+ KB


In [9]:
# Preview the first rows of the test split to see how `label` and `sentiment` pair up.
test.head()

Unnamed: 0,id,text,label,sentiment
0,9235,getting cds ready for tour,1,neutral
1,16790,"MC, happy mother`s day to your mom ;).. love yah",2,positive
2,24840,A year from now is graduation....i am pretty s...,0,negative
3,20744,because you had chips and sale w/o me,1,neutral
4,6414,Great for organising my work life balance,2,positive


In [11]:
# Summarize the test frame. In the recorded output `text` has one fewer
# non-null value than the other columns, which is why missing text is filled
# before vectorizing.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5206 entries, 0 to 5205
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         5206 non-null   int64 
 1   text       5205 non-null   object
 2   label      5206 non-null   int64 
 3   sentiment  5206 non-null   object
dtypes: int64(2), object(2)
memory usage: 162.8+ KB


In [13]:
# List the test split's column names.
test.columns

Index(['id', 'text', 'label', 'sentiment'], dtype='object')

## Feature Engineering and Training

In [4]:
# Turn the training text into TF-IDF features, keeping at most 1000 terms.
# Missing text is replaced with '' (the same handling the test split gets) so
# that fitting cannot fail on NaN/None documents.
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(df['text'].fillna(''))
y = df['label']

# Three-class gradient-boosted classifier. `random_state` is the documented
# scikit-learn-style seed argument of XGBClassifier; it is used here instead of
# passing `seed` through **kwargs.
model = xgb.XGBClassifier(objective='multi:softmax', num_class=3, random_state=42)
model.fit(X, y)

In [17]:
# Missing text (NaN/None) is replaced with an empty string so the vectorizer
# does not crash on it; only the `text` column is filled.
test = test.fillna({'text': ''})

# The vectorizer was fitted on the training text (df['text']), so the test
# split is only transformed here — never re-fitted — to keep the same vocabulary.
y_test = test['label']
X_test = vectorizer.transform(test['text'])


## Save Model and Vectorizer

In [5]:
# Persist the fitted vectorizer and the trained model into the app's
# sentiment_model folder. The folder is created first because both writes fail
# when it does not exist.
MODEL_DIR = Path('../app/sentiment_model')
MODEL_DIR.mkdir(parents=True, exist_ok=True)

joblib.dump(vectorizer, str(MODEL_DIR / 'vectorizer.pkl'))
# NOTE(review): the '.xgb' extension is kept because the file name is
# presumably what the application loads — confirm before renaming. The warning
# recorded under this cell appears to come from this save call.
model.save_model(str(MODEL_DIR / 'model.xgb'))

  self.get_booster().save_model(fname)


In [19]:
# Predicted class label for every row of the test features.
y_pred = model.predict(X_test)

In [20]:
# Per-class probability estimates for every row of the test features.
# NOTE(review): no later cell visible in this notebook uses `y_proba`.
y_proba = model.predict_proba(X_test)

In [21]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Score the test predictions three ways, then print them in the same order.
test_accuracy = accuracy_score(y_test, y_pred)
per_class_report = classification_report(y_test, y_pred)
# Rows are the true labels and columns the predicted labels.
label_confusion = confusion_matrix(y_test, y_pred)

print("Accuracy: ", test_accuracy)
print(per_class_report)
print(label_confusion)

Accuracy:  0.4022281982328083
              precision    recall  f1-score   support

           0       0.35      0.23      0.28      1546
           1       0.40      0.62      0.48      1930
           2       0.46      0.31      0.37      1730

    accuracy                           0.40      5206
   macro avg       0.40      0.39      0.38      5206
weighted avg       0.40      0.40      0.39      5206

[[ 358  927  261]
 [ 360 1199  371]
 [ 301  892  537]]


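The vectorizer and model are now saved under `../app/sentiment_model/`, so you can run the FastAPI application. Note that the evaluation above shows only about 40% accuracy on the test split of this three-class problem, so treat the saved model as a baseline rather than a finished classifier.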