In [1]:
# === PATH FIX & CONSTS ===
import sys, os
from pathlib import Path
# Put the parent directory on the import path so packages that live there can be
# imported (presumably the `src` package used further down -- verify layout).
PROJECT_ROOT = os.path.abspath("..")
if PROJECT_ROOT not in sys.path:  # guard: re-running this cell must not stack duplicate entries
    sys.path.append(PROJECT_ROOT)

# Locations of the raw dataset and the two serialized models.
# These are relative paths, so they resolve against the current working directory.
DATA_RAW = Path("../data/raw/IMBD.csv")
MODEL_DT = Path("../models/decision_tree_rating_regressor.pkl")
MODEL_NB = Path("../models/naive_bayes_genre_from_description.pkl")

# Report up front whether each artifact is present on disk.
print("DATA_RAW exists:", DATA_RAW.exists())
print("MODEL_DT exists:", MODEL_DT.exists())
print("MODEL_NB exists:", MODEL_NB.exists())


DATA_RAW exists: True
MODEL_DT exists: True
MODEL_NB exists: True


In [2]:
import joblib

def load_model_or_hint(path: "Path | str", train_hint: str):
    """Load a serialized model with joblib, or fail with a hint on how to train it.

    Args:
        path: Location of the model file. A plain string is accepted and
            converted to ``Path``.
        train_hint: Text naming what to run to produce the model; it is
            embedded verbatim in the error message.

    Returns:
        The object that ``joblib.load`` deserializes from ``path``.

    Raises:
        FileNotFoundError: If ``path`` is not an existing regular file
            (missing, or a directory).
    """
    path = Path(path)  # tolerate str input instead of crashing on a missing .exists attribute
    # is_file() rather than exists(): a directory at this path is not a loadable model
    if not path.is_file():
        raise FileNotFoundError(
            f"❌ Không tìm thấy model: {path}\n"
            f"👉 Hãy chạy {train_hint} trước (vd: `python -m src.train_tree` hoặc `python -m src.train_nb`)."
        )
    # NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
    # only point this at model files you trust.
    return joblib.load(path)


In [3]:
import pandas as pd

# Load the regressor, or stop with an error that explains how to train it
dt = load_model_or_hint(MODEL_DT, "notebook 02_train_decision_tree.ipynb hoặc src.train_tree")

# Hand-written sample input with the expected columns (per the author's note: must match the pipeline)
# Columns the model needs: ['year_num','duration_min','votes_num','stars_count','desc_len','certificate','genre_primary']
sample_record = {
    "year_num": 2022,
    "duration_min": 120,
    "votes_num": 150000,
    "stars_count": 3,
    "desc_len": 28,
    "certificate": "PG-13",
    "genre_primary": "Action",
}
sample_dt = pd.DataFrame([sample_record])  # single-row frame

# predict() is called on the one-row frame; take its first element
rating_preds = dt.predict(sample_dt)
pred_rating = rating_preds[0]
print(f"⭐ Dự đoán rating cho sample: {pred_rating:.2f}")


⭐ Dự đoán rating cho sample: 6.75


In [4]:
# (Optional) Take one real row from the CSV and turn it into an X with the right columns
from src.features import parse_year, parse_duration, clean_votes, count_stars, desc_len, primary_genre

# derived column -> (raw source column, function from src.features mapped over it)
derived_features = {
    "year_num":      ("year", parse_year),
    "duration_min":  ("duration", parse_duration),
    "votes_num":     ("votes", clean_votes),
    "stars_count":   ("stars", count_stars),
    "desc_len":      ("description", desc_len),
    "genre_primary": ("genre", primary_genre),
}

df_raw = pd.read_csv(DATA_RAW, low_memory=False).copy()
for feature_name, (source_col, parser) in derived_features.items():
    df_raw[feature_name] = df_raw[source_col].map(parser)

need_cols = ["year_num","duration_min","votes_num","stars_count","desc_len","certificate","genre_primary"]
complete_rows = df_raw.dropna(subset=need_cols)  # keep only rows with every needed column present
row = complete_rows.iloc[0]                      # first such record
X_one = pd.DataFrame([row[need_cols].to_dict()])
print("X_one:", X_one.to_dict(orient="records")[0])
print("Pred (from real row):", float(dt.predict(X_one)[0]))


X_one: {'year_num': 2018.0, 'duration_min': 30.0, 'votes_num': 177031.0, 'stars_count': 7, 'desc_len': 21, 'certificate': 'TV-14', 'genre_primary': 'Action'}
Pred (from real row): 8.23125


In [5]:
import pandas as pd

# Load the text classifier, or stop with an error that explains how to train it
nb = load_model_or_hint(MODEL_NB, "notebook 03_train_naive_bayes.ipynb hoặc src.train_nb")

# Sample texts to classify
sample_descriptions = [
    "A retired hitman embarks on a high-octane revenge mission across the city.",
    "A heartfelt family story about growing up and finding your true self.",
]
texts = pd.Series(sample_descriptions)

pred_labels = nb.predict(texts)

# Print each text next to its predicted label, separated by a dash line
text_label_pairs = zip(texts.tolist(), pred_labels.tolist())
for text, label in text_label_pairs:
    print("—")
    print("Text:", text)
    print("Predicted genre:", label)


—
Text: A retired hitman embarks on a high-octane revenge mission across the city.
Predicted genre: Action
—
Text: A heartfelt family story about growing up and finding your true self.
Predicted genre: Comedy


In [6]:
import pandas as pd
# Re-read the raw CSV (this rebinds df_raw) and classify the first five non-null descriptions
df_raw = pd.read_csv(DATA_RAW, low_memory=False)
descriptions = df_raw["description"].dropna().astype(str)
texts_real = descriptions.head(5)

print("Predict on first 5 descriptions:")
real_preds = nb.predict(texts_real)
print(real_preds)


Predict on first 5 descriptions:
['Action' 'Drama' 'Comedy' 'Drama' 'Animation']


In [7]:
import pandas as pd
# Pair each sample text with its predicted genre in one table
out = pd.DataFrame({
    "sample_text": texts,
    "pred_genre": pred_labels
})
out_path = Path("../data/processed/note04_nb_predictions_sample.csv")
# to_csv does not create missing folders; make sure the target directory exists
# so a fresh checkout without data/processed does not fail here
out_path.parent.mkdir(parents=True, exist_ok=True)
out.to_csv(out_path, index=False, encoding="utf-8")
print("Saved:", out_path)


Saved: ..\data\processed\note04_nb_predictions_sample.csv
