In [1]:
# ====== 1. Import Library ======
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    classification_report,
)

import joblib  # untuk save model


In [2]:
# ====== 2. Set paths & load the dataset ======

# The project root is taken to be the parent of the current working directory,
# i.e. the notebook is expected to run from a sub-folder of the project
# (for example Membangun_model). If the notebook is run from the project root
# itself, use this instead:
# PROJECT_ROOT = Path.cwd()
PROJECT_ROOT = Path.cwd().parent

# CSV location inside the project: <root>/gaming_raw/videos_with_genre.csv
DATA_PATH = PROJECT_ROOT.joinpath("gaming_raw", "videos_with_genre.csv")

# Echo the resolved locations so a wrong working directory is easy to spot.
print("PROJECT_ROOT:", PROJECT_ROOT)
print("DATA_PATH   :", DATA_PATH)

df = pd.read_csv(DATA_PATH)
df.head()


PROJECT_ROOT: c:\Aldy-Kuliah\Semester 7\mlOps\project
DATA_PATH   : c:\Aldy-Kuliah\Semester 7\mlOps\project\gaming_raw\videos_with_genre.csv


Unnamed: 0,video_id,title,description,tags,channel_id,channel_title,published_at,view_count,like_count,comment_count,duration,genres_list,primary_genre
0,zo7i8VTpfNM,"Would You Risk Dying For $500,000?",i cant believe he did that at the end\nJack Li...,,UCX6OQ3DkcsbYNE6H8uQQuVA,MrBeast,2025-09-27T16:00:04Z,99385196,2090058,29437,PT25M26S,[],
1,Yqq5UbDVTWI,"If You Dig It, You Keep It!","If you find the expensive item in the sand, yo...",Ben Azelart|Stay Wild|Stay Wild Vlogs|Cam Huff...,UClQ3NafOy_42dJ0toK3QUKw,Stay Wild,2025-10-19T16:32:39Z,23779949,326989,12311,PT43M50S,['Battle Royale'],Battle Royale
2,9OHRtUHezTk,"$10,000 Every Boss You Beat",The new Feastables Hazelnut Cups are now avail...,,UCIPPMRA040LQr5QPyJEbmXA,MrBeast Gaming,2025-08-23T16:00:22Z,20393868,351251,4824,PT26M43S,['Sandbox'],Sandbox
3,qTMKHZelGAs,We Built New York In Minecraft,I canâ€™t believe these builders made New York C...,,UCIPPMRA040LQr5QPyJEbmXA,MrBeast Gaming,2025-11-15T17:00:01Z,14643233,518423,49117,PT15M55S,['Sandbox'],Sandbox
4,Skz60dSkmgM,Bahasa Indonesia vs MalaysiaðŸ‡®ðŸ‡©ðŸ‡²ðŸ‡¾#fyp #indonesi...,,,UCXU54id8Mh8QfQ3E8UB_Ijg,Fattah Ardhi,2025-08-23T12:33:17Z,16448511,301082,1427,PT52S,[],


In [3]:
# ====== 3. Clean the genre labels ======

# Show the raw label distribution first, counting missing labels as well.
print("Distribusi awal primary_genre (termasuk NaN):")
print(df["primary_genre"].value_counts(dropna=False))

# Keep only the rows that have a primary_genre; work on an independent copy
# so later column assignments do not touch the original frame.
has_genre = df["primary_genre"].notna()
df_clf = df.loc[has_genre].copy()

print("\nSetelah buang NaN:")
print(df_clf["primary_genre"].value_counts())


Distribusi awal primary_genre (termasuk NaN):
primary_genre
Simulation       123
Sandbox          114
NaN              103
Battle Royale     86
MOBA              51
Horror            23
RPG                8
FPS                5
Strategy           5
Sports             5
Name: count, dtype: int64

Setelah buang NaN:
primary_genre
Simulation       123
Sandbox          114
Battle Royale     86
MOBA              51
Horror            23
RPG                8
FPS                5
Strategy           5
Sports             5
Name: count, dtype: int64


In [4]:
# Optional: keep only the genres that have at least `min_samples` rows.
min_samples = 20

# value_counts() is ordered by descending frequency; that order is preserved
# in valid_genres.
genre_counts = df_clf["primary_genre"].value_counts()
valid_genres = [
    genre for genre, n_rows in genre_counts.items() if n_rows >= min_samples
]

# Drop every row whose genre did not reach the threshold.
keep_mask = df_clf["primary_genre"].isin(valid_genres)
df_clf = df_clf.loc[keep_mask].copy()

print("\nGenre yang dipakai:", valid_genres)
print("Jumlah data akhir:", len(df_clf))
print(df_clf["primary_genre"].value_counts())



Genre yang dipakai: ['Simulation', 'Sandbox', 'Battle Royale', 'MOBA', 'Horror']
Jumlah data akhir: 397
primary_genre
Simulation       123
Sandbox          114
Battle Royale     86
MOBA              51
Horror            23
Name: count, dtype: int64


In [5]:
# ====== 4. Buat Fitur Teks Gabungan ======

def combine_text(row):
    """Join a row's title, description and tags into one space-separated string.

    ``row`` is anything exposing ``.get(key, default)`` (a DataFrame row or a
    dict). Only values that are ``str`` instances are kept; any other value,
    such as a missing entry stored as NaN, is skipped. A column that is absent
    from ``row`` falls back to the empty string and is joined like any other
    string.
    """
    text_columns = ("title", "description", "tags")
    candidates = (row.get(name, "") for name in text_columns)
    return " ".join(value for value in candidates if isinstance(value, str))

# Build the combined text feature row by row and store it as a new column.
combined_text = df_clf.apply(combine_text, axis=1)
df_clf["text"] = combined_text

# Preview the new feature next to the title and the label.
preview_columns = ["title", "primary_genre", "text"]
df_clf[preview_columns].head()


Unnamed: 0,title,primary_genre,text
1,"If You Dig It, You Keep It!",Battle Royale,"If You Dig It, You Keep It! If you find the ex..."
2,"$10,000 Every Boss You Beat",Sandbox,"$10,000 Every Boss You Beat The new Feastables..."
3,We Built New York In Minecraft,Sandbox,We Built New York In Minecraft I canâ€™t believe...
5,ASMR Keyboard Clicky VS Keyboard Madu! [Roblox...,Sandbox,ASMR Keyboard Clicky VS Keyboard Madu! [Roblox...
6,"Most Expensive Art Wins $10,000!",Battle Royale,"Most Expensive Art Wins $10,000! We customized..."


In [6]:
# ====== 5. Split the data: train / validation / test ======

X = df_clf["text"]
y = df_clf["primary_genre"]

# Step 1: hold out 20% of all rows as the test set, stratified by genre.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y,
)

# Step 2: split the remaining 80% into train and validation.
# 0.25 of 0.8 = 0.2 of the full data, so the overall split is 60/20/20.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp,
)

# Report the size of each partition.
for label, part in (
    ("Train size:", X_train),
    ("Val size  :", X_val),
    ("Test size :", X_test),
):
    print(label, len(part))

print("\nDistribusi label Train:")
print(y_train.value_counts())


Train size: 237
Val size  : 80
Test size : 80

Distribusi label Train:
primary_genre
Simulation       73
Sandbox          68
Battle Royale    52
MOBA             31
Horror           13
Name: count, dtype: int64


In [7]:
# ====== 6. Helper: Train & Eval 1 Konfigurasi ======

def train_eval_single_config(params, X_train, y_train, X_val, y_val, random_state=42):
    """Train one TF-IDF + LinearSVC pipeline and score it on the validation set.

    Parameters
    ----------
    params : dict
        Hyperparameters for this run. Required keys:
            - "max_features": forwarded to ``TfidfVectorizer(max_features=...)``
            - "ngram_range":  forwarded to ``TfidfVectorizer(ngram_range=...)``
            - "C":            forwarded to ``LinearSVC(C=...)``
    X_train, y_train
        Training texts and labels; the pipeline is fitted on these only.
    X_val, y_val
        Validation texts and labels; used only for scoring.
    random_state : int or None, default 42
        Seed forwarded to ``LinearSVC`` so repeated runs of the same
        configuration give the same model. The default matches the seed used
        for the train/val/test splits. Pass ``None`` to leave the classifier
        unseeded.

    Returns
    -------
    tuple (result, pipeline)
        ``result`` is a dict with keys "params" (the dict passed in),
        "val_accuracy", "val_f1_macro" and "val_f1_weighted".
        ``pipeline`` is the fitted ``Pipeline``.

    Raises
    ------
    KeyError
        If ``params`` lacks any of the required keys.
    """
    # Fail early with a message naming every missing key, instead of a bare
    # KeyError on whichever key happens to be looked up first.
    required_keys = ("max_features", "ngram_range", "C")
    missing = [key for key in required_keys if key not in params]
    if missing:
        raise KeyError(f"params is missing required key(s): {missing}")

    max_features = params["max_features"]
    ngram_range = params["ngram_range"]
    C = params["C"]

    # TF-IDF features (English stop words removed) feeding a linear SVM.
    pipeline = Pipeline([
        ("tfidf", TfidfVectorizer(
            max_features=max_features,
            ngram_range=ngram_range,
            stop_words="english",
        )),
        # Seeded so that model selection across configurations is repeatable.
        ("clf", LinearSVC(C=C, random_state=random_state)),
    ])

    # Fit on the training split only.
    pipeline.fit(X_train, y_train)

    # Score on the validation split.
    y_val_pred = pipeline.predict(X_val)
    acc_val = accuracy_score(y_val, y_val_pred)
    f1_macro_val = f1_score(y_val, y_val_pred, average="macro")
    f1_weighted_val = f1_score(y_val, y_val_pred, average="weighted")

    result = {
        "params": params,
        "val_accuracy": acc_val,
        "val_f1_macro": f1_macro_val,
        "val_f1_weighted": f1_weighted_val,
    }

    return result, pipeline


In [8]:
# ====== 7. Define the hyperparameter grid ======

param_grid = {
    "max_features": [20000, 30000, 50000],
    "ngram_range": [(1, 1), (1, 2)],
    "C": [0.5, 1.0, 2.0],
}

# Expand the grid into one dict per combination. max_features varies slowest
# and C varies fastest, which fixes the order in which configurations are run.
param_list = [
    {"max_features": mf, "ngram_range": ng, "C": c_value}
    for mf in param_grid["max_features"]
    for ng in param_grid["ngram_range"]
    for c_value in param_grid["C"]
]

print("Total kombinasi:", len(param_list))
param_list[:3]


Total kombinasi: 18


[{'max_features': 20000, 'ngram_range': (1, 1), 'C': 0.5},
 {'max_features': 20000, 'ngram_range': (1, 1), 'C': 1.0},
 {'max_features': 20000, 'ngram_range': (1, 1), 'C': 2.0}]

In [9]:
# ====== 8. Hyperparameter tuning loop ======
# Trains one pipeline per entry of param_list, scores each on the validation
# split, and keeps the configuration with the highest validation macro-F1.

# Every run's metrics, in run order.
results = []
# Best run so far (metrics dict and its fitted pipeline); None until run 1.
best_result = None
best_pipeline = None

for i, params in enumerate(param_list, start=1):
    print(f"\n=== Run {i}/{len(param_list)} ===")
    print("Params:", params)

    result, pipeline = train_eval_single_config(
        params=params,
        X_train=X_train,
        y_train=y_train,
        X_val=X_val,
        y_val=y_val,
    )

    print(f"  â†’ val_accuracy : {result['val_accuracy']:.4f}")
    print(f"  â†’ val_f1_macro : {result['val_f1_macro']:.4f}")

    results.append(result)

    # Strict ">" comparison: on a tie, the earlier configuration is kept.
    if (best_result is None) or (result["val_f1_macro"] > best_result["val_f1_macro"]):
        best_result = result
        best_pipeline = pipeline
        print(f"[BEST UPDATE] New best F1-macro: {best_result['val_f1_macro']:.4f}")

# Summary of the winning configuration (selected on validation macro-F1).
print("\n=== Tuning selesai ===")
print("Best params       :", best_result["params"])
print("Best val_accuracy :", best_result["val_accuracy"])
print("Best val_f1_macro :", best_result["val_f1_macro"])



=== Run 1/18 ===
Params: {'max_features': 20000, 'ngram_range': (1, 1), 'C': 0.5}
  â†’ val_accuracy : 0.8500
  â†’ val_f1_macro : 0.8533
[BEST UPDATE] New best F1-macro: 0.8533

=== Run 2/18 ===
Params: {'max_features': 20000, 'ngram_range': (1, 1), 'C': 1.0}
  â†’ val_accuracy : 0.8375
  â†’ val_f1_macro : 0.8314

=== Run 3/18 ===
Params: {'max_features': 20000, 'ngram_range': (1, 1), 'C': 2.0}
  â†’ val_accuracy : 0.8375
  â†’ val_f1_macro : 0.8314

=== Run 4/18 ===
Params: {'max_features': 20000, 'ngram_range': (1, 2), 'C': 0.5}
  â†’ val_accuracy : 0.8625
  â†’ val_f1_macro : 0.8631
[BEST UPDATE] New best F1-macro: 0.8631

=== Run 5/18 ===
Params: {'max_features': 20000, 'ngram_range': (1, 2), 'C': 1.0}
  â†’ val_accuracy : 0.8750
  â†’ val_f1_macro : 0.8751
[BEST UPDATE] New best F1-macro: 0.8751

=== Run 6/18 ===
Params: {'max_features': 20000, 'ngram_range': (1, 2), 'C': 2.0}
  â†’ val_accuracy : 0.8750
  â†’ val_f1_macro : 0.8751

=== Run 7/18 ===
Params: {'max_features': 300

In [10]:
# ====== 9. Evaluate the best model on the test set ======

# Predict on the held-out test split with the pipeline chosen during tuning.
y_test_pred = best_pipeline.predict(X_test)

# Same three metrics that were reported on the validation split.
test_acc = accuracy_score(y_test, y_test_pred)
test_f1_macro, test_f1_weighted = (
    f1_score(y_test, y_test_pred, average=averaging)
    for averaging in ("macro", "weighted")
)

print(f"[TEST] Accuracy     : {test_acc:.4f}")
print(f"[TEST] F1-macro     : {test_f1_macro:.4f}")
print(f"[TEST] F1-weighted  : {test_f1_weighted:.4f}")

# Per-class precision / recall / F1 breakdown.
test_report = classification_report(y_test, y_test_pred)
print("\n[TEST] Classification Report:\n")
print(test_report)


[TEST] Accuracy     : 0.9375
[TEST] F1-macro     : 0.9284
[TEST] F1-weighted  : 0.9362

[TEST] Classification Report:

               precision    recall  f1-score   support

Battle Royale       1.00      0.82      0.90        17
       Horror       1.00      0.80      0.89         5
         MOBA       0.91      1.00      0.95        10
      Sandbox       0.92      0.96      0.94        23
   Simulation       0.93      1.00      0.96        25

     accuracy                           0.94        80
    macro avg       0.95      0.92      0.93        80
 weighted avg       0.94      0.94      0.94        80

