# Introduction
This notebook leverages pre-defined functions from the `train_naive_bayes.py` script to train a Naive Bayes model on the Sentiment140 dataset using TF-IDF features.


### Setup

In [1]:
import sys

# Make the project's model scripts (e.g. train_naive_bayes.py) importable from
# this notebook. The path is relative, so Python resolves it against the
# kernel's current working directory at import time.
MODELS_SRC_PATH = '../../src/models/'

# Guarded so the cell is idempotent: re-running it does not keep appending
# duplicate entries to sys.path.
if MODELS_SRC_PATH not in sys.path:
    sys.path.append(MODELS_SRC_PATH)

In [2]:
from train_naive_bayes import (
    load_data, vectorize_text, train_naive_bayes,
    evaluate_model, save_model_and_vectorizer
)

### Load the cleaned data

In [3]:
# Read the processed dataset and, in the same step, discard every row whose
# 'clean_text' value is missing.
df = load_data('../../data/processed/cleaned_data.csv').dropna(subset=['clean_text'])

### Feature Engineering: TF-IDF Vectorization

In [4]:
# Build the feature matrix from df. The helper returns two objects: the
# features (X) and the vectorizer (tfidf), which is saved at the end of the
# notebook alongside the model.
# NOTE(review): max_features=1000 presumably caps the TF-IDF vocabulary size —
# confirm against vectorize_text in train_naive_bayes.py.
X, tfidf = vectorize_text(df, max_features=1000)
# Target values come from the 'label' column of the same (NaN-filtered) frame.
y = df['label']

### Train Naive Bayes Model

In [5]:
# Train on all of X and y: no train/test split is made in this notebook's cells.
# NOTE(review): whether train_naive_bayes splits or shuffles internally is not
# visible here — check train_naive_bayes.py.
model = train_naive_bayes(X, y)

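### Evaluate the Model

**Note:** the model is evaluated on the same data it was trained on, so the metrics below measure training-set fit and are likely optimistic compared with performance on held-out data.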

In [7]:
# Score the model on the same X and y that were passed to train_naive_bayes,
# so the numbers below describe fit on the training data rather than
# performance on unseen data.
accuracy, report = evaluate_model(model, X, y)

# Round once up front, then report the headline figure followed by the
# per-class breakdown.
accuracy_rounded = round(accuracy, 2)
print(f"Model Accuracy on Full Dataset: {accuracy_rounded}")
print("\nClassification Report:\n", report)

Model Accuracy on Full Dataset: 0.74

Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.75      0.74    796302
           1       0.74      0.72      0.73    795668

    accuracy                           0.74   1591970
   macro avg       0.74      0.74      0.74   1591970
weighted avg       0.74      0.74      0.74   1591970



### Visualization (e.g., Confusion Matrix, ROC Curve)

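**TODO:** Add the visualizations for this section (e.g., a confusion matrix and an ROC curve for the trained model).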

### Save the model and TF-IDF transformer

In [8]:
# Destination files for the two artifacts produced by this notebook.
model_path = '../../models/naive_bayes_model.pkl'
vectorizer_path = '../../models/tfidf_vectorizer_nb.pkl'

# Hand the trained model and the TF-IDF vectorizer to the project helper,
# model first, each followed in order by its output path.
save_model_and_vectorizer(model, tfidf, model_path, vectorizer_path)