# Movie Revenue – Colab Workflow (All-in-One)
This notebook runs fully in **Google Colab**.

**What it does**
1. Mounts Google Drive
2. Uses your Drive folder: `Movie_RevenueIT113`
3. Installs requirements
4. Loads `tmdb_movies.csv`
5. Trains Linear Regression (baseline) and RandomForest
6. Saves the best model (lowest test RMSE) to `artifacts/model.pkl` and its metrics to `artifacts/metrics.json`
7. Copies artifacts back to Drive so your teammate can deploy

> Edit the folder/file names below if yours are different.


In [2]:
# ========== 1) Mount Drive ==========
# Attach Google Drive to the Colab filesystem so the project folder is reachable.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [3]:
# ========== 2) Project paths  ==========
# Drive folder holding the project inputs, and the CSV inside it.
DRIVE_PROJECT = "/content/drive/MyDrive/Movie_RevenueIT113"
CSV_ON_DRIVE  = DRIVE_PROJECT + "/tmdb_movies.csv"
# Scratch folder on the Colab VM where the notebook does its work.
COLAB_ROOT    = "/content/movie_revenue_project"

# Echo the resolved locations, one per line, so a wrong path is spotted early.
print(
    f"Drive project: {DRIVE_PROJECT}",
    f"CSV path: {CSV_ON_DRIVE}",
    f"Colab working root: {COLAB_ROOT}",
    sep="\n",
)

Drive project: /content/drive/MyDrive/Movie_RevenueIT113
CSV path: /content/drive/MyDrive/Movie_RevenueIT113/tmdb_movies.csv
Colab working root: /content/movie_revenue_project


In [4]:
# ========== 3) Prepare working folder & install requirements ==========
import os, shutil, textwrap, json, joblib, numpy as np, pandas as pd
from pathlib import Path


# If you already manually placed files (requirements.txt, app/, notebooks/), you can skip unzip.
if not os.path.exists(COLAB_ROOT):
    os.makedirs(COLAB_ROOT, exist_ok=True)

# Copy requirements.txt from Drive if present; otherwise, create a minimal one
req_drive = f"{DRIVE_PROJECT}/requirements.txt"
req_local = f"{COLAB_ROOT}/requirements.txt"

if os.path.exists(req_drive):
    shutil.copy(req_drive, req_local)
else:
    with open(req_local, "w") as f:
        f.write("pandas\nnumpy\nscikit-learn==1.5.1\njoblib\n")

# Install
%cd $COLAB_ROOT
!pip install -q -r requirements.txt

print("Working dir:", os.getcwd())
print("Files:", os.listdir())

/content/movie_revenue_project
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m80.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m90.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m85.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.4/69.4 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.9/386.9 kB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.5/133.5 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━

In [5]:
# ========== 4) Place CSV into project structure ==========
import pandas as pd

# Anchor the data folder to COLAB_ROOT rather than the current working
# directory, so this cell works even if the `%cd` cell was not the last thing
# to change the working directory.
data_dir = Path(COLAB_ROOT) / "data" / "processed"
data_dir.mkdir(parents=True, exist_ok=True)
local_csv = data_dir / "tmdb_movies.csv"

# Fail with a readable message before attempting the copy.
assert os.path.exists(CSV_ON_DRIVE), f"CSV not found on Drive: {CSV_ON_DRIVE}"
shutil.copy(CSV_ON_DRIVE, local_csv)

df = pd.read_csv(local_csv)
assert not df.empty, f"CSV loaded but contains no rows: {local_csv}"
print("Loaded rows:", len(df))
df.head()

Loaded rows: 3


Unnamed: 0,budget,runtime,popularity,vote_average,vote_count,release_year,cast_count,main_genre,production_company_t1,revenue
0,1000000,95,10.5,6.5,120,2015,8,Action,Marvel Studios,5000000
1,50000000,120,45.3,7.8,5400,2019,15,Drama,Warner Bros,200000000
2,12000000,105,20.1,5.9,320,2021,10,Comedy,Indie Prod,30000000


In [6]:
# ========== 5) Train two models (baseline & RF) and choose best by RMSE ==========
import warnings

from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

target = "revenue"
num_feats = ["budget","runtime","popularity","vote_average","vote_count","release_year","cast_count"]
cat_feats = ["main_genre","production_company_t1"]

# Fail early with a readable message if the CSV does not have the expected schema.
missing_cols = [c for c in num_feats + cat_feats + [target] if c not in df.columns]
assert not missing_cols, f"CSV is missing expected columns: {missing_cols}"

X = df[num_feats + cat_feats].copy()
y = df[target].astype(float)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# R^2 is undefined for fewer than two test samples and comes back as NaN;
# say so up front instead of leaving an unexplained NaN in the metrics.
if len(y_test) < 2:
    warnings.warn(
        f"Test split has only {len(y_test)} row(s); R^2 will be NaN and the "
        "other metrics are not meaningful. Use a larger dataset."
    )

# Unfitted preprocessing template: scale numeric columns, one-hot encode the
# categorical ones (categories unseen at fit time are encoded as all zeros).
pre = ColumnTransformer([
    ("num", StandardScaler(), num_feats),
    ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_feats)
])

def score_model(model):
    """Evaluate a fitted model on the held-out split.

    Reads the notebook-level ``X_test`` / ``y_test`` and returns a dict with
    ``rmse``, ``mae`` and ``r2`` as plain Python floats.
    """
    pred = model.predict(X_test)
    rmse = float(np.sqrt(mean_squared_error(y_test, pred)))
    mae  = float(mean_absolute_error(y_test, pred))
    r2   = float(r2_score(y_test, pred))
    return {"rmse": rmse, "mae": mae, "r2": r2}

# Each pipeline gets its own clone of the preprocessor. Pipeline does not clone
# its steps, so sharing one instance would let fitting one pipeline silently
# refit the transformer inside the other.

# Baseline
lin = Pipeline([("pre", clone(pre)), ("est", LinearRegression())]).fit(X_train, y_train)
m_lin = score_model(lin)

# Random Forest quick
rf = Pipeline([("pre", clone(pre)), ("est", RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1))]).fit(X_train, y_train)
m_rf = score_model(rf)

m_lin, m_rf



({'rmse': 68335691.59118266, 'mae': 68335691.59118266, 'r2': nan},
 {'rmse': 96400000.0, 'mae': 96400000.0, 'r2': nan})

In [7]:
# ========== 6) Save best model + metrics to artifacts/ ==========
ART = Path("artifacts")
ART.mkdir(exist_ok=True)

# Lower test RMSE wins; on a tie (or if a comparison involves NaN) the linear
# baseline is kept.
best_model, best_metrics = (rf, m_rf) if m_rf["rmse"] < m_lin["rmse"] else (lin, m_lin)

joblib.dump(best_model, ART/"model.pkl")

# json.dump would write non-finite floats as bare NaN/Infinity tokens, which
# are not valid JSON and are rejected by strict parsers. Store them as null
# instead; allow_nan=False guarantees nothing non-finite slips through.
metrics_for_json = {
    name: (value if np.isfinite(value) else None)
    for name, value in best_metrics.items()
}
with open(ART/"metrics.json","w") as f:
    json.dump(metrics_for_json, f, indent=2, allow_nan=False)

print("Saved:")
print(" - artifacts/model.pkl")
print(" - artifacts/metrics.json")
print("Best metrics:", best_metrics)

Saved:
 - artifacts/model.pkl
 - artifacts/metrics.json
Best metrics: {'rmse': 68335691.59118266, 'mae': 68335691.59118266, 'r2': nan}


In [None]:

\# ========== 7) Copy artifacts back to Drive ==========
drive_artifacts = f"{DRIVE_PROJECT}/artifacts"
os.makedirs(drive_artifacts, exist_ok=True)

shutil.copy(ART/"model.pkl", f"{drive_artifacts}/model.pkl")
shutil.copy(ART/"metrics.json", f"{drive_artifacts}/metrics.json")

print("Artifacts on Drive:", drive_artifacts)
!ls -lah "$DRIVE_PROJECT/artifacts" | sed -n '1,10p'

Artifacts on Drive: /content/drive/MyDrive/Movie_RevenueIT113/artifacts
total 4.5K
-rw------- 1 root root   72 Aug 18 07:05 metrics.json
-rw------- 1 root root 3.7K Aug 18 07:05 model.pkl


---
## Notes
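- Keep the **genre** and **company** options in your Streamlit app aligned with what appears in your training CSV. The encoder is built with `handle_unknown="ignore"`, so an unseen category will not raise an error; it is silently encoded as all zeros, which can degrade the prediction.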
- To retrain: re-run the notebook; the new model and metrics will overwrite the previous `artifacts/model.pkl` and `artifacts/metrics.json`.
- For deployment, put these in a GitHub repo and deploy via **Streamlit Cloud**:
  - `app/streamlit_app.py`
  - `artifacts/model.pkl`
  - `artifacts/metrics.json`
  - `requirements.txt`
