# Model Development & Tuning

## Objectives
1. Load and preprocess the dataset.
2. Vectorize text using TF-IDF.
3. Train and tune three models:
    - Logistic Regression (Baseline)
    - Linear SVM
    - LightGBM
4. Compare performance and select the best model.
5. Save the best model and vectorizer.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score
import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
sns.set_style('whitegrid')

## 1. Load Data

In [2]:
DATA_PATH = '../data/raw'


def load_data(data_path=None):
    """Load the authentic and fake news CSVs into one shuffled, labelled frame.

    Args:
        data_path: Directory containing ``Authentic-48K.csv`` and
            ``Fake-1K.csv``. When omitted, the module-level ``DATA_PATH``
            is used.

    Returns:
        A DataFrame holding every row of both files plus a ``label`` column
        (0 for rows from the authentic file, 1 for rows from the fake file),
        shuffled with a fixed seed and re-indexed from 0. ``None`` is
        returned if either CSV file cannot be found.
    """
    if data_path is None:
        data_path = DATA_PATH

    # Only the file reads are expected to raise FileNotFoundError, so the
    # try body is limited to them.
    try:
        auth = pd.read_csv(os.path.join(data_path, 'Authentic-48K.csv'))
        fake = pd.read_csv(os.path.join(data_path, 'Fake-1K.csv'))
    except FileNotFoundError:
        print(f"Data not found. Please ensure data is in {data_path}")
        return None

    auth['label'] = 0
    fake['label'] = 1

    # No down-sampling of the authentic file is done here: all rows of both
    # files are kept, so be aware of the resulting class imbalance.
    combined = pd.concat([auth, fake], ignore_index=True)

    # Shuffle with a fixed seed so the row order is reproducible.
    return combined.sample(frac=1, random_state=42).reset_index(drop=True)

df = load_data()

# load_data() reports missing CSV files by returning None. Stop here with an
# explicit error rather than failing on the next line with an opaque
# "'NoneType' object has no attribute 'shape'".
if df is None:
    raise FileNotFoundError(f"Could not load the dataset from {DATA_PATH}")

print(f"Data Shape: {df.shape}")
print(df['label'].value_counts())

Data Shape: (49977, 7)
label
0    48678
1     1299
Name: count, dtype: int64


## 2. Preprocessing & Vectorization

In [3]:
# Clean the text column: missing values become empty strings and every
# remaining value is coerced to str.
df['content'] = df['content'].fillna('').astype(str)

# Features are the raw article text; the target is the label column.
X, y = df['content'], df['label']

# Hold out 20% of the rows for testing. Stratifying on y keeps the label
# proportions the same in both splits; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"Train Shape: {X_train.shape}")
print(f"Test Shape: {X_test.shape}")

Train Shape: (39981,)
Test Shape: (9996,)


In [None]:
# TF-IDF features over unigrams and bigrams.
# max_features caps the vocabulary to keep dimensions manageable and
# reduce noise.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
)

# Fit on the training split only, then apply the fitted vectorizer to the
# held-out split.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print(f"TF-IDF Matrix Shape: {X_train_tfidf.shape}")

TF-IDF Matrix Shape: (39981, 5000)


## 3. Model Training & Tuning

In [None]:
# Helper function for evaluation.
# One summary row per evaluated model is collected here.
results = []


def evaluate_model(name, model, X_test, y_test):
    """Score a fitted model on held-out data, print a report, record a summary.

    Args:
        name: Label used in the printed header and in the ``'Model'`` field
            of the recorded summary row.
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix passed to ``model.predict``.
        y_test: True labels compared against the predictions.

    Side effects:
        Prints accuracy, F1 score and the classification report, and stores
        ``{'Model', 'Accuracy', 'F1 Score'}`` in the module-level ``results``
        list. A previous row with the same ``name`` is replaced, so
        re-running a training cell does not leave duplicate rows behind.
    """
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f"--- {name} ---")
    print(f"Accuracy: {acc:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(classification_report(y_test, y_pred))

    # Drop any stale row for this model in place (the list object itself is
    # kept, so existing references to `results` stay valid), then add the
    # fresh one.
    results[:] = [row for row in results if row['Model'] != name]
    results.append({
        'Model': name,
        'Accuracy': acc,
        'F1 Score': f1,
    })

### Model 1: Logistic Regression (Baseline)

In [None]:
# Baseline model: logistic regression, tuning only C.
lr = LogisticRegression(max_iter=1000, random_state=42)
lr_params = {'C': [0.1, 1, 10]}

# 3-fold grid search scored on F1; n_jobs=-1 runs the fits in parallel.
grid_lr = GridSearchCV(
    estimator=lr,
    param_grid=lr_params,
    scoring='f1',
    cv=3,
    n_jobs=-1,
)
grid_lr.fit(X_train_tfidf, y_train)

print(f"Best LR Params: {grid_lr.best_params_}")
evaluate_model('Logistic Regression', grid_lr.best_estimator_, X_test_tfidf, y_test)

### Model 2: Linear SVM

In [None]:
# dual='auto' is set to suppress warnings when n_samples > n_features.
svm = LinearSVC(dual='auto', random_state=42)
svm_params = {'C': [0.1, 1, 10]}

# NOTE: LinearSVC does not provide predict_proba by default, which the Risk
# Score needs. If this model ends up being selected it can be wrapped in
# CalibratedClassifierCV (or its decision function used directly); at this
# stage only a plain grid search is run.
grid_svm = GridSearchCV(
    estimator=svm,
    param_grid=svm_params,
    scoring='f1',
    cv=3,
    n_jobs=-1,
)
grid_svm.fit(X_train_tfidf, y_train)

print(f"Best SVM Params: {grid_svm.best_params_}")
evaluate_model('Linear SVM', grid_svm.best_estimator_, X_test_tfidf, y_test)

### Model 3: LightGBM

In [None]:
lgb_params = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1],
    'num_leaves': [31, 50],
    'device': ['gpu'],  # Attempt to use GPU first
}


def _fit_lgbm_grid(param_grid):
    """Grid-search a fresh LGBMClassifier on the TF-IDF training data.

    Args:
        param_grid: Parameter grid handed to GridSearchCV.

    Returns:
        Tuple ``(estimator, search)``: the unfitted base LGBMClassifier and
        the fitted GridSearchCV built around it.
    """
    estimator = lgb.LGBMClassifier(random_state=42, verbose=-1, force_col_wise=True)
    search = GridSearchCV(estimator, param_grid, cv=3, scoring='f1', n_jobs=-1)
    search.fit(X_train_tfidf, y_train)
    return estimator, search


print("Training LightGBM... (Attempting GPU usage)")
try:
    lgbm, grid_lgbm = _fit_lgbm_grid(lgb_params)
except Exception as e:
    # Deliberately broad: which exception a failed GPU run raises is not
    # pinned down here, and any failure should trigger the CPU retry.
    print(f"GPU Training failed or skipped: {e}. Falling back to CPU.")
    # Same grid, but on CPU. An error in this second attempt propagates.
    lgb_params['device'] = ['cpu']
    lgbm, grid_lgbm = _fit_lgbm_grid(lgb_params)

print(f"Best LightGBM Params: {grid_lgbm.best_params_}")
evaluate_model('LightGBM', grid_lgbm.best_estimator_, X_test_tfidf, y_test)

## 4. Model Selection

In [None]:
results_df = pd.DataFrame(results)
print(results_df)

# Identify best model based on F1 Score
best_model_name = results_df.sort_values(by='F1 Score', ascending=False).iloc[0]['Model']
print(f"\nBest Model: {best_model_name}")

if best_model_name == 'Logistic Regression':
    best_model = grid_lr.best_estimator_
elif best_model_name == 'Linear SVM':
    # LinearSVC has no predict_proba, so wrap the tuned SVM in a sigmoid
    # calibrator to obtain probability output.
    # The calibrator is cross-fitted (cv=3) on the TRAINING split only; the
    # test split must never be used for calibration, otherwise the held-out
    # evaluation is no longer independent of the saved model.
    best_model = CalibratedClassifierCV(grid_svm.best_estimator_, method='sigmoid', cv=3)
    best_model.fit(X_train_tfidf, y_train)
else:
    best_model = grid_lgbm.best_estimator_

## 5. Save Artifacts

In [None]:
# Create the output directory if needed. exist_ok=True makes this a single
# call that is safe to re-run, with no separate existence check that could
# race with another process creating the directory.
os.makedirs('../models', exist_ok=True)

# Save Vectorizer (needed at inference time to turn raw text into the same
# TF-IDF feature space the model was trained on).
with open('../models/tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

# Save Model
with open('../models/best_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

print("Artifacts saved to ../models/")