<a href="https://colab.research.google.com/github/ansharyis/ml-colab-project/blob/main/notebooks/04.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive (Colab-only helper) so that files
# stored on Drive can be read through the local filesystem.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
# Root folder of the project data on the mounted Drive; dataset paths are built from it.
DATA_DIR = "/content/drive/MyDrive/ML_Project_Data"


## Load the dataset

In [3]:
# Project: ML Weight Prediction
# Notebook: PCA improved
# Owner: ....
# Description: PCA Improved Rev A

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Read the processed training table from Drive, then echo its (rows, columns)
# shape as a quick sanity check that the load worked.
train_csv_path = f"{DATA_DIR}/PROCESSED/train_df_final_after_null_removal.csv"
train_df = pd.read_csv(train_csv_path)
print(train_df.shape)



(20340, 59)


In [11]:
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

## Prepare explanatory variable, dependent variable, and preprocessing pipeline

The preprocessing pipeline handles:

- **Numeric columns:** median imputation + standard scaling (good for linear/ridge/lasso models)

- **Categorical columns:** most-frequent imputation + one-hot encoding

In [12]:
# -------------------------------------------------
# Settings for this stage
# -------------------------------------------------
TARGET = "WEIGHTLBTC_A"
USE_LOG_TARGET = True  # keep True for inverse transform to pounds

# -------------------------------------------------
# 1) Prepare X / y
# -------------------------------------------------
# Rows without a target value are dropped; everything except the target is a feature.
df_model = train_df.dropna(subset=[TARGET]).copy()
X = df_model.drop(columns=[TARGET])

# The target is modelled either as log1p(target) or on its raw scale.
target_as_float = df_model[TARGET].astype(float)
y = np.log1p(target_as_float) if USE_LOG_TARGET else target_as_float

# -------------------------------------------------
# 2) Detect numeric vs categorical columns
# -------------------------------------------------
# object/category dtypes are treated as categorical; every other column as numeric.
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
categorical_names = set(cat_cols)
num_cols = [col for col in X.columns if col not in categorical_names]

# -------------------------------------------------
# 3) Define Transformers (memory-aware)
# -------------------------------------------------
# Numeric branch: fill gaps with the column median, then standardize.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot encode.
# The encoder emits a sparse matrix and ignores categories unseen during fit.
one_hot_encoder = OneHotEncoder(
    handle_unknown="ignore",
    sparse_output=True,
    min_frequency=0.01,  # remove if your sklearn doesn't support it
)
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", one_hot_encoder),
])

# Route each column group to its branch; columns in neither list are dropped.
preprocessor = ColumnTransformer(
    [
        ("cat", categorical_transformer, cat_cols),
        ("num", numeric_transformer, num_cols),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
)

# -------------------------------------------------
# 4) Train/test split
# -------------------------------------------------
# 80/20 hold-out split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=44)


## PCA IMPROVED

In [14]:
# -------------------------------------------------
# 5) Baseline Ridge WITHOUT PCA (for fair comparison)
# -------------------------------------------------
RIDGE_ALPHA = 100  # your winning alpha (adjust if needed)


def _to_pounds(values):
    """Map model-scale values back to pounds.

    When USE_LOG_TARGET is True the model works on log1p(target), so the
    inverse is expm1; otherwise the values are returned unchanged.
    """
    return np.expm1(values) if USE_LOG_TARGET else values


def _rmse(y_true, y_pred):
    """Return the root mean squared error between y_true and y_pred."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


ridge_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", Ridge(alpha=RIDGE_ALPHA))
])

ridge_pipeline.fit(X_train, y_train)
y_pred_ridge = ridge_pipeline.predict(X_test)

# Both models are scored in pounds, so convert predictions and truth once here.
y_pred_ridge_pounds = _to_pounds(y_pred_ridge)
y_test_pounds = _to_pounds(y_test)

rmse_ridge = _rmse(y_test_pounds, y_pred_ridge_pounds)
print(f"Baseline Ridge RMSE (no PCA): {rmse_ridge:.2f} lbs")

# -------------------------------------------------
# 6) Ridge + PCA (TruncatedSVD) pipeline + GridSearch
# -------------------------------------------------
# The PCA pipeline gets its own unfitted copy of the preprocessor. Pipeline.fit
# fits its steps in place, so reusing the instance already fitted inside
# `ridge_pipeline` would let any direct `pcr_pipeline.fit(...)` silently refit
# the baseline's transformer.
# Note: per the scikit-learn docs, TruncatedSVD does not center its input,
# so this is a PCA-like reduction rather than textbook PCA.
pcr_pipeline = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("pca", TruncatedSVD(random_state=42)),
    ("model", Ridge(alpha=RIDGE_ALPHA))
])

# NOTE(review): the recorded run selected 57, the largest value in this grid,
# so the optimum may lie beyond it; consider extending the grid.
params = {
    "pca__n_components": [2, 5, 10, 20, 30, 40, 50, 57]
}

print("\nRunning PCA (TruncatedSVD) Search...")
# Candidates are ranked by cross-validated MSE on the scale of y_train
# (the model scale), while the RMSE reported below is computed in pounds.
grid_pca = GridSearchCV(
    estimator=pcr_pipeline,
    param_grid=params,
    cv=5,
    scoring="neg_mean_squared_error",
    n_jobs=1   # keep 1 for Colab memory safety
)

grid_pca.fit(X_train, y_train)

# -------------------------------------------------
# 7) Evaluate best PCA model
# -------------------------------------------------
best_pca_model = grid_pca.best_estimator_
y_pred_pca = best_pca_model.predict(X_test)

y_pred_pounds_pca = _to_pounds(y_pred_pca)

rmse_pca = _rmse(y_test_pounds, y_pred_pounds_pca)

print(f"Best n_components: {grid_pca.best_params_['pca__n_components']}")
print(f"PCA + Ridge RMSE: {rmse_pca:.2f} lbs")

# -------------------------------------------------
# 8) Compare PCA vs full Ridge
# -------------------------------------------------
# Strict comparison: a tie is reported as "did not improve".
if rmse_pca < rmse_ridge:
    print("✅ PCA improved performance (reduced noise / dimensionality).")
else:
    print("⚠️ PCA did not improve performance (information loss vs full Ridge).")

Baseline Ridge RMSE (no PCA): 16.07 lbs

Running PCA (TruncatedSVD) Search...
Best n_components: 57
PCA + Ridge RMSE: 16.07 lbs
⚠️ PCA did not improve performance (information loss vs full Ridge).
