<a href="https://colab.research.google.com/github/ansharyis/ml-colab-project/blob/main/notebooks/02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
# Make Google Drive visible inside the Colab runtime; the data paths used
# later in this notebook live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Mounted at /content/drive


In [5]:
# Root folder of the project data. The path sits under /content/drive, so
# Google Drive has to be mounted there before anything is read from it.
DATA_DIR = "/content/drive/MyDrive/ML_Project_Data"

## Load the dataset

In [10]:
# Project: ML Weight Prediction
# Notebook: Linear Regression Model
# Owner: ....
# Description: Linear Regression Model Rev A

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Read the processed training table from the PROCESSED subfolder of DATA_DIR,
# then print its (rows, columns) shape as a quick sanity check.
train_csv_path = f"{DATA_DIR}/PROCESSED/train_df_final_after_null_removal.csv"
train_df = pd.read_csv(train_csv_path)
print(train_df.shape)



(20340, 59)


In [11]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, KFold, cross_val_score, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor


## Prepare explanatory variables, dependent variable, and preprocessing pipeline

The preprocessing pipeline handles:

- **Numeric columns:** median imputation + standard scaling (good for linear/ridge/lasso models)
- **Categorical columns:** most-frequent imputation + one-hot encoding

In [15]:
# Column the model is trained to predict.
TARGET = "WEIGHTLBTC_A"

# Switch: train on log1p(target) instead of the raw target.
# Set to False to model the raw values directly.
USE_LOG_TARGET = True

# =================================================
# Step 1: feature matrix X and target vector y
# =================================================
# Rows with a missing target cannot be used for supervised training.
df_model = train_df.dropna(subset=[TARGET]).copy()
X = df_model.drop(columns=[TARGET])

y = df_model[TARGET].astype(float)
if USE_LOG_TARGET:
    # log1p handles skew and stays defined at 0 (weights should be > 0 anyway).
    y = np.log1p(y)

# =================================================
# Step 2: split the feature columns by dtype
# =================================================
# object/category columns are treated as categorical; every remaining column
# (whatever its dtype) is routed to the numeric branch, in original order.
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
num_cols = X.columns[~X.columns.isin(cat_cols)].tolist()

# =================================================
# Step 3: preprocessing pipeline
# =================================================
# Numeric branch: fill gaps with the column median, then standardize.
numeric_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent level, then one-hot
# encode; levels not seen during fit are ignored at transform time.
categorical_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Each branch only sees its own column list; anything not listed is dropped.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_pipe, num_cols),
        ("cat", categorical_pipe, cat_cols),
    ],
    remainder="drop",
)

# =================================================
# Step 4: hold out 20% of the rows for testing
# =================================================
# A fixed random_state keeps the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## Linear regression model

In [17]:
# -------------------------------------------------
# 5) Build + fit: preprocessing and LinearRegression chained in one pipeline
# -------------------------------------------------
lr_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LinearRegression()),
])

print("Training Linear Regression...")
lr_pipeline.fit(X_train, y_train)

# -------------------------------------------------
# 6) Predict on the held-out rows + evaluate (MSE / RMSE)
# -------------------------------------------------
y_pred = lr_pipeline.predict(X_test)

# When the model was trained on log1p(target), undo the transform with expm1
# so the errors below are expressed in pounds rather than log units.
y_pred_pounds = np.expm1(y_pred) if USE_LOG_TARGET else y_pred
y_test_pounds = np.expm1(y_test) if USE_LOG_TARGET else y_test

mse_lr = mean_squared_error(y_test_pounds, y_pred_pounds)
rmse_lr = np.sqrt(mse_lr)

print(f"Linear Regression RMSE: {rmse_lr:.2f}")
print(f"Linear Regression MSE : {mse_lr:.2f}")

Training Linear Regression...
Linear Regression RMSE: 16.06
Linear Regression MSE : 258.08
