In [3]:
# Allow imports from the project's root directory (e.g. the `src` package).
import sys, os

# Absolute path of the parent of the current working directory.
# NOTE(review): assumes the notebook is run from a sub-folder of the project — confirm.
PROJECT_ROOT = os.path.abspath(os.path.join('..'))
# Guard the append so that re-running this cell does not add duplicate
# entries to sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

import pandas as pd
from src.features import (
    parse_year, parse_duration, clean_votes,
    count_stars, desc_len, primary_genre
)

# Read the raw dataset and list the columns it provides
df = pd.read_csv(
    "../data/raw/IMBD.csv",
    low_memory=False,
)
print("Columns:", df.columns.tolist())


Columns: ['title', 'year', 'certificate', 'duration', 'genre', 'rating', 'description', 'stars', 'votes']


In [6]:
# Build the numeric / derived columns used by the Decision Tree.
# Each entry is (new column, source column, converter); the order matches
# the order in which the columns are added to the frame.
feature_specs = [
    ("year_num", "year", parse_year),
    ("duration_min", "duration", parse_duration),
    ("votes_num", "votes", clean_votes),
    ("stars_count", "stars", count_stars),
    ("desc_len", "description", desc_len),
    ("genre_primary", "genre", primary_genre),
]
for new_col, source_col, converter in feature_specs:
    df[new_col] = df[source_col].map(converter)

# Verify that every required column is now present
needed = [
    "year_num", "duration_min", "votes_num", "stars_count",
    "desc_len", "genre_primary", "certificate", "rating",
]
missing = [name for name in needed if name not in df.columns]
print("Missing after FE:", missing)

# Drop records that have no target rating
df = df.dropna(subset=["rating"])
df.head(3)


Missing after FE: []


Unnamed: 0,title,year,certificate,duration,genre,rating,description,stars,votes,year_num,duration_min,votes_num,stars_count,desc_len,genre_primary
0,Cobra Kai,(2018– ),TV-14,30 min,"Action, Comedy, Drama",8.5,Decades after their 1984 All Valley Karate Tou...,"['Ralph Macchio, ', 'William Zabka, ', 'Courtn...",177031,2018.0,30.0,177031.0,7,21,Action
1,The Crown,(2016– ),TV-MA,58 min,"Biography, Drama, History",8.7,Follows the political rivalries and romance of...,"['Claire Foy, ', 'Olivia Colman, ', 'Imelda St...",199885,2016.0,58.0,199885.0,7,23,Biography
2,Better Call Saul,(2015–2022),TV-MA,46 min,"Crime, Drama",8.9,The trials and tribulations of criminal lawyer...,"['Bob Odenkirk, ', 'Rhea Seehorn, ', 'Jonathan...",501384,2015.0,46.0,501384.0,7,19,Crime


In [9]:
# === Cell 3 (ROBUST) — recreate any missing features before selecting X ===
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Numeric feature columns passed to the median imputer
expected_num = [
    "year_num",
    "duration_min",
    "votes_num",
    "stars_count",
    "desc_len",
]
# Categorical feature columns passed to the one-hot encoder
expected_cat = [
    "certificate",
    "genre_primary",
]

def _parse_year(s):
    if pd.isna(s): return np.nan
    m = re.search(r"(19|20)\d{2}", str(s))
    return int(m.group(0)) if m else np.nan

def _parse_duration(s):
    if pd.isna(s): return np.nan
    t = str(s).lower()
    h = re.search(r"(\d+)\s*h", t)
    mins = re.findall(r"(\d+)\s*min", t)
    total = 0
    if h: total += int(h.group(1)) * 60
    if mins: total += int(mins[-1])
    if total == 0:
        m = re.search(r"(\d+)\s*min", t)
        if m: total = int(m.group(1))
    return total if total>0 else np.nan

def _clean_votes(v):
    if pd.isna(v): return np.nan
    return pd.to_numeric(str(v).replace(",","").strip(), errors="coerce")

def _count_stars(s):
    if pd.isna(s): return 0
    return max(1, str(s).count(",") + 1)

def _desc_len(s):
    if pd.isna(s): return 0
    return len(str(s).split())

def _primary_genre(s):
    if pd.isna(s): return np.nan
    return str(s).split(",")[0].strip()

# 1) Print the current columns so they can be inspected
print("Current columns:", list(df.columns))

# 2) Recreate any derived feature that is missing from df.
#    Only the missing columns are rebuilt, so features that already exist
#    are not overwritten.
_feature_builders = {
    "year_num":      ("year", _parse_year),
    "duration_min":  ("duration", _parse_duration),
    "votes_num":     ("votes", _clean_votes),
    "stars_count":   ("stars", _count_stars),
    "desc_len":      ("description", _desc_len),
    "genre_primary": ("genre", _primary_genre),
}
# Iterate an ordered list rather than the set: set iteration order for
# strings can change between interpreter runs, which made the printed list
# of missing features come out in a different order each time.
required_features = expected_num + ["genre_primary"]
must_have = set(required_features)
missing_now = [c for c in required_features if c not in df.columns]
if missing_now:
    print("Missing features -> recomputing:", missing_now)
    # Rebuild each missing feature from its raw source column
    for feature_name in missing_now:
        source_col, builder = _feature_builders[feature_name]
        df[feature_name] = df[source_col].map(builder)

# 3) Drop rows that have no target value
df = df.dropna(subset=["rating"])

# 4) Check once more that every model input column exists
still_missing = [c for c in expected_num + expected_cat if c not in df.columns]
if still_missing:
    raise RuntimeError(f"Các cột sau vẫn thiếu trong df: {still_missing}. Hãy kiểm tra lại tên cột gốc (year/duration/genre/...) và Cell 1-2.")

# 5) Assemble the feature matrix / target, then fit and evaluate the model
X = df[expected_num + expected_cat]
y = df["rating"].astype(float)

# Numeric columns: fill gaps with the median
numeric_step = ("num", SimpleImputer(strategy="median"), expected_num)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode (categories not seen during fit are ignored at predict time)
categorical_pipeline = Pipeline([
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("oh", OneHotEncoder(handle_unknown="ignore")),
])
categorical_step = ("cat", categorical_pipeline, expected_cat)

preprocess = ColumnTransformer([numeric_step, categorical_step])

model = Pipeline([
    ("prep", preprocess),
    ("tree", DecisionTreeRegressor(max_depth=6, random_state=42)),
])

# Hold out 20% of the rows for evaluation (fixed seed for repeatability)
Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, random_state=42)
model.fit(Xtr, ytr)
pred = model.predict(Xte)

mae_score = mean_absolute_error(yte, pred)
r2_value = r2_score(yte, pred)
print("MAE:", round(mae_score, 3))
print("R2 :", round(r2_value, 3))


Current columns: ['title', 'year', 'certificate', 'duration', 'genre', 'rating', 'description', 'stars', 'votes']
Missing features -> recomputing: ['votes_num', 'stars_count', 'year_num', 'genre_primary', 'duration_min', 'desc_len']
MAE: 0.724
R2 : 0.363


In [10]:
import joblib, os
# Persist the fitted pipeline, creating the output directory if needed
model_dir = "../models"
model_path = "../models/decision_tree_rating_regressor.pkl"
os.makedirs(model_dir, exist_ok=True)
joblib.dump(model, model_path)
print("Saved to " + model_path)


Saved to ../models/decision_tree_rating_regressor.pkl
