In [19]:
import os
import sys
import pickle
import logging
from datetime import datetime
from typing import Tuple, Dict, List

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.base import BaseEstimator

In [20]:
# Configure Logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[
        logging.FileHandler("model_building.log"),
        logging.StreamHandler(sys.stdout)
    ]
)
logger = logging.getLogger(__name__)

In [21]:
# Configurations
class Config:
    DATA_PATH = "../data/processed/processed_sentiments.csv"
    MODELS_DIR = "models/"
    REPORTS_DIR = "reports/"
    SEED = 42
    TEST_SIZE = 0.2
    SCORING = 'f1_weighted'
    FIG_SIZE = (10, 6)
    CV_FOLDS = 10


In [22]:
config = Config()

os.makedirs(config.MODELS_DIR, exist_ok=True)
os.makedirs(config.REPORTS_DIR, exist_ok=True)

np.random.seed(config.SEED)

In [23]:
# ==============================
# 2. Data Loading
# ==============================

def load_data(filepath: str) -> pd.DataFrame:
    """Load preprocessed sentiment data."""
    try:
        df = pd.read_csv(filepath, parse_dates=['Comment_Date'])
        logger.info(f"Data loaded with {df.shape[0]} rows and {df.shape[1]} columns")
        return df
    except Exception as e:
        logger.error(f"Error loading dataset: {str(e)}")
        raise

In [24]:
# Load
df = load_data(config.DATA_PATH)
df.head(2)

2025-06-01 16:24:40,045 - INFO - Data loaded with 2000 rows and 15 columns


Unnamed: 0,Username,Comment,Comment_Date,Likes,Comment_Length,Has_Typo,Slang_Presence,Sentiment,Comment_DayOfWeek,Comment_Month,Comment_Season,Words_per_Comment,Avg_Word_Length,Engagement_rate,Sentiment_Score
0,williamsonbrett,Absolutely furious about the customer service!,2024-03-20,297,6,0,0,Angry,Wednesday,March,Spring,6,6.833333,49.5,-2.0
1,seth67,Can't believe how bad the customer care was.,2024-12-04,152,8,0,0,Angry,Wednesday,December,Winter,8,4.625,19.0,-2.0


In [25]:
def prepare_features(df: pd.DataFrame) -> Tuple[pd.DataFrame, np.ndarray, Dict[int, int]]:
    """Prepare features and target variable."""
    logger.info("Preparing features and target...")

    initial_rows = df.shape[0]

    # Drop missing target safely
    df = df.dropna(subset=["Sentiment_Score"]).copy()

    final_rows = df.shape[0]
    logger.info(f"Dropped {initial_rows - final_rows} rows due to missing Sentiment_Score")

    # Map original labels to 0,1,2,3,4
    sentiment_mapping = {-2: 0, -1: 1, 0: 2, 1: 3, 2: 4}
    df['Sentiment_Score_Mapped'] = df['Sentiment_Score'].map(sentiment_mapping)

    X = df[['Comment', 'Comment_Length', 'Likes', 'Has_Typo', 'Slang_Presence']]
    y = df['Sentiment_Score_Mapped'].astype(int)

    return X, y, sentiment_mapping

In [26]:
X, y, sentiment_mapping = prepare_features(df)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=config.TEST_SIZE, random_state=config.SEED, stratify=y
)

logger.info(f"Train size: {X_train.shape}, Test size: {X_test.shape}")

2025-06-01 16:24:40,058 - INFO - Preparing features and target...
2025-06-01 16:24:40,060 - INFO - Dropped 417 rows due to missing Sentiment_Score
2025-06-01 16:24:40,063 - INFO - Train size: (1266, 5), Test size: (317, 5)


In [27]:
# ==============================
# 4. Preprocessing Pipeline
# ==============================

def build_preprocessing_pipeline() -> ColumnTransformer:
    """Build preprocessing pipeline."""
    text_pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1, 2)))
    ])
    
    num_pipeline = Pipeline([
        ('scaler', StandardScaler())
    ])
    
    preprocessor = ColumnTransformer([
        ('text', text_pipeline, 'Comment'),
        ('num', num_pipeline, ['Comment_Length', 'Likes', 'Has_Typo', 'Slang_Presence'])
    ])
    
    return preprocessor


In [28]:
# ==============================
# 5. Model Initialization
# ==============================

def initialize_models() -> Dict[str, BaseEstimator]:
    """Initialize machine learning models."""
    models = {
        "Logistic Regression": LogisticRegression(max_iter=1000, class_weight="balanced", random_state=config.SEED),
        "Random Forest": RandomForestClassifier(n_estimators=100, class_weight="balanced", random_state=config.SEED),
        "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=config.SEED),
        "SVM": SVC(kernel='linear', probability=True, class_weight='balanced', random_state=config.SEED),
        "XGBoost": XGBClassifier(n_estimators=100, random_state=config.SEED, use_label_encoder=False, eval_metric="mlogloss"),
        "LightGBM": LGBMClassifier(n_estimators=100, random_state=config.SEED, class_weight="balanced")
    }
    return models


In [29]:
# ==============================
# 6. Model Training and Evaluation
# ==============================

def evaluate_model(model: Pipeline, X_test: pd.DataFrame, y_test: np.ndarray) -> Dict[str, float]:
    """Evaluate model on test data."""
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test) if hasattr(model, "predict_proba") else None

    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "f1_weighted": f1_score(y_test, y_pred, average="weighted"),
    }

    if y_prob is not None:
        metrics["roc_auc_ovr"] = roc_auc_score(y_test, y_prob, multi_class="ovr", average="weighted")
        
    return metrics

def train_and_evaluate_models(X_train: pd.DataFrame, y_train: np.ndarray,
                              X_test: pd.DataFrame, y_test: np.ndarray) -> Dict[str, Dict]:
    """Train and evaluate all models."""
    results = {}
    preprocessor = build_preprocessing_pipeline()
    models = initialize_models()

    for name, model in models.items():
        logger.info(f"Training {name}...")
        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', model)
        ])
        pipeline.fit(X_train, y_train)

        cv_score = cross_val_score(pipeline, X_train, y_train, cv=config.CV_FOLDS, scoring=config.SCORING)
        metrics = evaluate_model(pipeline, X_test, y_test)

        results[name] = {
            "model": pipeline,
            "cv_mean": np.mean(cv_score),
            "cv_std": np.std(cv_score),
            "test_metrics": metrics
        }

        # Confusion matrix
        plot_confusion_matrix(pipeline, X_test, y_test, model_name=name)

    return results

def plot_confusion_matrix(model: Pipeline, X_test: pd.DataFrame, y_test: np.ndarray, model_name: str) -> None:
    """Plot confusion matrix."""
    preds = model.predict(X_test)
    cm = confusion_matrix(y_test, preds)
    plt.figure(figsize=config.FIG_SIZE)
    sns.heatmap(cm, annot=True, cmap="Blues", fmt='d')
    plt.title(f"Confusion Matrix - {model_name}")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.tight_layout()
    plt.savefig(f"{config.REPORTS_DIR}/confusion_matrix_{model_name.replace(' ', '_')}.png")
    plt.close()


In [30]:
# ==============================
# 7. Model Selection and Saving
# ==============================

def select_best_model(results: Dict[str, Dict]) -> Tuple[str, Pipeline]:
    """Select the best performing model."""
    summary = pd.DataFrame({
        "Model": list(results.keys()),
        "CV_Mean": [v['cv_mean'] for v in results.values()],
        "CV_Std": [v['cv_std'] for v in results.values()],
        "Test_Accuracy": [v['test_metrics']['accuracy'] for v in results.values()],
        "Test_F1": [v['test_metrics']['f1_weighted'] for v in results.values()]
    }).sort_values(by="CV_Mean", ascending=False)

    logger.info("\n" + str(summary))
    summary.to_csv(f"{config.REPORTS_DIR}/model_comparison.csv", index=False)

    best_model_name = summary.iloc[0]["Model"]
    best_model = results[best_model_name]["model"]

    return best_model_name, best_model

def save_model(model: Pipeline, model_name: str) -> None:
    """Save the model."""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    path = f"{config.MODELS_DIR}/{model_name.replace(' ', '_').lower()}_{timestamp}.pkl"

    with open(path, "wb") as f:
        pickle.dump(model, f)

    logger.info(f"Model saved at {path}")


In [31]:
# ==============================
# 8. Final Execution
# ==============================

results = train_and_evaluate_models(X_train, y_train, X_test, y_test)
best_model_name, best_model = select_best_model(results)
save_model(best_model, best_model_name)

print("\n" + "="*80)
print(f"BEST MODEL: {best_model_name}")
print("="*80)


2025-06-01 16:24:40,123 - INFO - Training Logistic Regression...
2025-06-01 16:24:40,381 - INFO - Training Random Forest...
2025-06-01 16:24:41,536 - INFO - Training Gradient Boosting...
2025-06-01 16:24:46,789 - INFO - Training SVM...
2025-06-01 16:24:49,949 - INFO - Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


2025-06-01 16:24:53,431 - INFO - Training LightGBM...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000603 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1634
[LightGBM] [Info] Number of data points in the train set: 1266, number of used features: 85
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000405 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1593
[LightGBM] [Info] Number of data points in the train set: 1139, number of used features: 85
[LightGBM] [Info] Start tr



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000606 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1575
[LightGBM] [Info] Number of data points in the train set: 1139, number of used features: 85
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000555 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1598
[LightGBM] [Info] Number of data points in the train set: 1139, number of used features: 85
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000536 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1577
[LightGBM] [Info] Number of data points in the train set: 1139, number of used features: 85
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000623 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1568
[LightGBM] [Info] Number of data points in the train set: 1139, number of used features: 85
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000585 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1595
[LightGBM] [Info] Number of data points in the train set: 1140, number of used features: 85
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000487 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1579
[LightGBM] [Info] Number of data points in the train set: 1140, number of used features: 85
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000611 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1552
[LightGBM] [Info] Number of data points in the train set: 1140, number of used features: 85
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000519 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1592
[LightGBM] [Info] Number of data points in the train set: 1140, number of used features: 85
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294




2025-06-01 16:25:03,920 - INFO - 
                 Model  CV_Mean  CV_Std  Test_Accuracy  Test_F1
0  Logistic Regression      1.0     0.0            1.0      1.0
1        Random Forest      1.0     0.0            1.0      1.0
2    Gradient Boosting      1.0     0.0            1.0      1.0
3                  SVM      1.0     0.0            1.0      1.0
4              XGBoost      1.0     0.0            1.0      1.0
5             LightGBM      1.0     0.0            1.0      1.0
2025-06-01 16:25:03,922 - INFO - Model saved at models//logistic_regression_20250601_162503.pkl

BEST MODEL: Logistic Regression




In [32]:
import os
import pickle

def save_best_clean_model(best_model, best_model_name: str) -> None:
    """
    Save the best model from clean data into ../models directory with dynamic name.
    
    Args:
        best_model: Trained best model object
        best_model_name (str): Name of the best model (lowercase, spaces replaced with '_')
    """
    # 1. Format model name
    model_name_clean = best_model_name.lower().replace(' ', '_')
    
    # 2. Prepare file path
    save_path = f"models/best_model_{model_name_clean}.pkl"
    
    # 3. Ensure directory exists
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    
    # 4. Save model
    with open(save_path, 'wb') as f:
        pickle.dump(best_model, f)
    
    print(f"✅ Best model saved successfully at: {save_path}")

# ===========================
# Example Usage (Your Part)
# ===========================

# Suppose you already have the following variables from your model selection step:
# best_model_name = "Logistic Regression"
# best_model = <your trained Logistic Regression model object>

# Now directly call:
save_best_clean_model(best_model, best_model_name)


✅ Best model saved successfully at: models/best_model_logistic_regression.pkl
