
# ITI113 — Two-Model Training (Colab + Google Drive)
This notebook is **user-friendly** and **Drive-aware**. It will:
- Mount your Google Drive (you'll authorize with **your own account**).
- Use (or create) a folder called **`Movie_IT113`** inside your Drive.
- Train **two models** (baseline vs stronger) on a CSV.
- Save all outputs to `Movie_IT113/artifacts/` in your Drive so your teammate/lecturer can see them.

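> If you're Calvin or the lecturer: open this notebook from the shared Drive folder. You'll be asked to mount **your own Drive**. The notebook looks for `Movie_IT113` directly under **My Drive** and creates an empty folder there if it is missing — so if the folder was only shared with you, first add a shortcut to it in My Drive, or set `PROJECT_DIR` manually (see section 4).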


## 0) Setup (install packages, mount Drive)

In [None]:
# If running on Colab, install dependencies and mount Drive
try:
    import google.colab  # type: ignore
    IN_COLAB = True
except Exception:
    IN_COLAB = False

if IN_COLAB:
    !pip -q install pandas scikit-learn joblib
    from google.colab import drive
    print("Mounting your Google Drive… (you'll authorize with your own Google account)")
    drive.mount('/content/drive')

print("Ready.")


## 1) Choose your project folder in Drive

In [None]:
import os, json, pandas as pd

# Resolve the project folder. On Colab it defaults to My Drive/Movie_IT113;
# anywhere else (or when the setup cell was not run) it falls back to the
# current working directory. Replace PROJECT_DIR with the exact path if your
# shared folder lives elsewhere inside your Drive.
DRIVE_ROOT = "/content/drive/MyDrive" if globals().get("IN_COLAB") else "."
PROJECT_DIR = os.path.join(DRIVE_ROOT, "Movie_IT113")
ART_DIR = os.path.join(PROJECT_DIR, "artifacts")

# Creating the folders is safe to repeat: existing directories are left untouched.
for _folder in (PROJECT_DIR, ART_DIR):
    os.makedirs(_folder, exist_ok=True)

print("Project directory:", PROJECT_DIR)
print("Artifacts directory:", ART_DIR)


## 2) Dataset — use the demo CSV or replace with your own

In [None]:
# Dataset location and target column. Point CSV_NAME at a file you placed in the
# Movie_IT113 folder to use your own data; when the file is missing, a small
# synthetic demo CSV is generated at that path instead.
CSV_NAME = "movie_boxoffice_demo.csv"    # you can change this to your dataset file name
TARGET   = "revenue"                      # for the demo we use a regression target
CSV_PATH = os.path.join(PROJECT_DIR, CSV_NAME)


def _build_demo_frame(n_rows=400, seed=42):
    """Return a synthetic box-office DataFrame with ``n_rows`` rows.

    Revenue is a linear mix of budget, popularity, rating, runtime, sequel
    flag, genre, release season and franchise strength, plus Gaussian noise,
    floored at 0.5. The generator is seeded, so the output is reproducible.
    """
    import numpy as np

    rng = np.random.default_rng(seed)

    # Additive revenue bonuses per category; the dict order also fixes the
    # order of the choices handed to the random generator.
    genre_effect = {"Action":25.0,"Comedy":10.0,"Drama":5.0,"Horror":8.0,"Romance":6.0,"Sci-Fi":22.0,"Animation":15.0}
    fran_effect = {"none":0.0,"weak":18.0,"strong":60.0}
    genres = list(genre_effect)
    franchises = list(fran_effect)

    # The draw order below determines the generated values; keep it stable.
    budget = rng.uniform(5, 200, size=n_rows)
    runtime = np.clip(rng.normal(110, 18, size=n_rows), 80, 180)
    popularity = rng.uniform(0, 50, size=n_rows)
    vote_avg = rng.uniform(3.0, 8.5, size=n_rows)
    genre = rng.choice(genres, size=n_rows)
    release_month = rng.integers(1, 13, size=n_rows)
    is_sequel = (rng.random(n_rows) < 0.25).astype(int)
    franchise = rng.choice(franchises, size=n_rows, p=[0.6, 0.25, 0.15])
    noise = rng.normal(0, 30.0, size=n_rows)

    # Seasonal uplift: +15 for May–August releases, +8 for November–December.
    summer = np.isin(release_month, [5, 6, 7, 8])
    holidays = np.isin(release_month, [11, 12])
    season_uplift = np.where(summer, 15.0, 0.0) + np.where(holidays, 8.0, 0.0)

    genre_bonus = np.array([genre_effect[g] for g in genre])
    franchise_bonus = np.array([fran_effect[f] for f in franchise])

    # Accumulate the terms one by one, left to right.
    revenue = 1.2 * budget + 1.5 * popularity
    revenue += 5.0 * (vote_avg - 5.0)
    revenue += 0.10 * (runtime - 110.0)
    revenue += is_sequel * 40.0
    revenue += genre_bonus
    revenue += season_uplift
    revenue += franchise_bonus
    revenue += noise
    revenue = np.maximum(revenue, 0.5)

    return pd.DataFrame({
        "budget_musd": np.round(budget, 2),
        "runtime_min": np.round(runtime, 0).astype(int),
        "popularity": np.round(popularity, 2),
        "vote_average": np.round(vote_avg, 2),
        "genre": genre,
        "release_month": release_month,
        "is_sequel": is_sequel,
        "franchise_strength": franchise,
        "revenue": np.round(revenue, 2),
    })


if os.path.exists(CSV_PATH):
    print(f"Using existing CSV at: {CSV_PATH}")
else:
    # Create a small realistic demo dataset
    _build_demo_frame().to_csv(CSV_PATH, index=False)
    print(f"Demo CSV created at: {CSV_PATH}")

# Quick peek (always read back from disk so we show what training will see)
df = pd.read_csv(CSV_PATH)
display(df.head(3))
print("Columns:", list(df.columns))


## 3) Train two models (baseline vs strong) and save to Drive

In [None]:
import pandas as pd, numpy as np, os, json
from pandas.api.types import is_numeric_dtype
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, r2_score, mean_squared_error, mean_absolute_error
from joblib import dump

def basic_clean(X: pd.DataFrame) -> pd.DataFrame:
    """Drop columns that cannot help a model.

    A column is removed when it is entirely missing or holds at most one
    distinct value (missing values count as a value). When nothing needs
    dropping, the input frame itself is returned unchanged.
    """
    uninformative = [
        col
        for col in X.columns
        if X[col].isna().all() or X[col].nunique(dropna=False) <= 1
    ]
    return X.drop(columns=uninformative) if uninformative else X

def infer_task(y: pd.Series) -> str:
    """Guess the learning task from the target column.

    Returns "regression" for a numeric target with more than 10 distinct
    non-missing values, and "classification" for everything else.
    """
    if not pd.api.types.is_numeric_dtype(y):
        return "classification"
    if y.nunique(dropna=True) > 10:
        return "regression"
    return "classification"

def split_data(X, y, test_size=0.2, random_state=42):
    """Split features and target into train and test parts.

    The split is stratified on ``y`` when ``infer_task`` classifies the target
    as a classification problem; otherwise it is a plain random split.
    Returns whatever ``train_test_split`` returns for ``(X, y)``.
    """
    stratify_on = None
    if infer_task(y) == "classification":
        stratify_on = y
    return train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=stratify_on,
    )

# Load & split
# The target column comes from TARGET (set in the dataset cell) rather than a
# hard-coded name, so changing TARGET for your own CSV actually takes effect.
# NOTE: the two models trained below are regressors, so TARGET should be a
# numeric column.
df = pd.read_csv(CSV_PATH)
if TARGET not in df.columns:
    raise ValueError(
        f"Target column {TARGET!r} not found in {CSV_PATH}. "
        "Change TARGET if using your own CSV."
    )
y = df[TARGET]
X = basic_clean(df.drop(columns=[TARGET]))

# Route each remaining feature to the numeric or the categorical branch.
num_cols = [c for c in X.columns if is_numeric_dtype(X[c])]
cat_cols = [c for c in X.columns if c not in num_cols]

# Numeric features: fill gaps with the median, then standardize.
num_pipe = Pipeline([("imputer", SimpleImputer(strategy="median")),
                     ("scaler", StandardScaler(with_mean=True))])
# Categorical features: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at prediction time.
cat_pipe = Pipeline([("imputer", SimpleImputer(strategy="most_frequent")),
                     ("onehot", OneHotEncoder(handle_unknown="ignore"))])
pre = ColumnTransformer([("num", num_pipe, num_cols),
                         ("cat", cat_pipe, cat_cols)])

X_train, X_test, y_train, y_test = split_data(X, y, test_size=0.2, random_state=42)

from sklearn.base import clone

# Two models for regression: a linear baseline (A) and a random forest (B).
modelA_name, modelA = "linreg", LinearRegression()
modelB_name, modelB = "rf_reg", RandomForestRegressor(n_estimators=400, random_state=42)

# Give each pipeline its own unfitted copy of the preprocessor. A pipeline
# fits its steps in place, so sharing one ColumnTransformer object would let
# the second fit overwrite the preprocessing state inside the first pipeline.
pipeA = make_pipeline(clone(pre), modelA)
pipeB = make_pipeline(clone(pre), modelB)

pipeA.fit(X_train, y_train)
pipeB.fit(X_train, y_train)

# Evaluate
def eval_regression(y_true, y_pred):
    """Return regression metrics as plain floats.

    The result is a dict with keys "rmse" (square root of the mean squared
    error), "mae" (mean absolute error) and "r2" (R² score), ready to be
    written out as JSON.
    """
    rmse = mean_squared_error(y_true, y_pred) ** 0.5
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return {"rmse": float(rmse), "mae": float(mae), "r2": float(r2)}

# Score both fitted pipelines on the held-out test split.
yA, yB = pipeA.predict(X_test), pipeB.predict(X_test)
mA, mB = eval_regression(y_test, yA), eval_regression(y_test, yB)

# Save artifacts to Drive: the fitted pipelines first, then the JSON files
# (per-model metrics and the list of feature columns used for training).
for _fname, _pipe in (("model_A.joblib", pipeA), ("model_B.joblib", pipeB)):
    dump(_pipe, os.path.join(ART_DIR, _fname))

_json_outputs = {
    "metrics_A.json": mA,
    "metrics_B.json": mB,
    "feature_columns.json": {"columns": list(X_train.columns)},
}
for _fname, _payload in _json_outputs.items():
    with open(os.path.join(ART_DIR, _fname), "w") as f:
        json.dump(_payload, f, indent=2)

# Side-by-side comparison table: one row per model, one column per metric.
keys = sorted(set(mA) | set(mB))
cmp = pd.DataFrame(
    [[metrics.get(k, None) for k in keys] for metrics in (mA, mB)],
    columns=keys,
    index=[modelA_name, modelB_name],
)
display(cmp)

print("Artifacts saved in:", ART_DIR)
print("Files:", os.listdir(ART_DIR))


## 4) (Optional) Change folder path if your shared folder lives elsewhere

In [None]:
# If your shared folder is NOT in MyDrive/Movie_IT113, set PROJECT_DIR manually, e.g.:
# PROJECT_DIR = "/content/drive/Shareddrives/<YourSharedDriveName>/Movie_IT113"
# ART_DIR = os.path.join(PROJECT_DIR, "artifacts")
# os.makedirs(ART_DIR, exist_ok=True)   # the artifacts folder must exist before saving
# NOTE: CSV_PATH was built from the previous PROJECT_DIR in the dataset cell, so
# re-run the dataset cell as well; otherwise training still reads the CSV from
# the old folder.
# Then re-run the training cell above to save artifacts there.
print("If you need to change folder location, edit PROJECT_DIR above and re-run the training cell.")
