In [None]:
import pickle
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Raise pandas' text display width (in characters) so wide frames are not
# wrapped across several lines when printed.
pd.set_option('display.width', 1000)

In [None]:
# Load the training table (first CSV column becomes the index), then keep only
# rows that have a box-office value and whose value does not exceed 5e8.
# The steps are chained so the cell is idempotent on re-run, and the final
# .copy() detaches the filtered result so that columns added to `df` later do
# not raise SettingWithCopyWarning.
df = (
    pd.read_csv("train.csv", index_col=0)
    .dropna(subset=["box_office"])
    .loc[lambda frame: frame["box_office"] <= 5e8]
    .copy()
)
display(df)

## Classification Model

In [None]:
# Edges of the box-office classes. With right=False each class is the
# half-open interval [lower, upper).
bins = [0, 1e6, 10e6, 50e6, 100e6, 200e6, 500e6]

# Class index -> human-readable label (used for reports).
labels_dict = {
    0: '(0) 0-1M',
    1: '(1) 1-10M',
    2: '(2) 10-50M',
    3: '(3) 50-100M',
    4: '(4) 100-200M',
    5: '(5) 200M-500M'
}

# The loading step keeps rows with box_office <= 5e8, but a half-open top
# interval [200e6, 500e6) would leave a value of exactly 5e8 unbinned (NaN),
# and the integer cast below would then fail. Cut with an open-ended top edge
# so that boundary value lands in the last class; `bins` itself is unchanged.
cut_edges = bins[:-1] + [float("inf")]
box_office_classes = pd.cut(
    df['box_office'],
    bins=cut_edges,
    labels=list(labels_dict),
    right=False,
)

# Anything still unbinned is below bins[0] (or missing); fail with a clear message.
assert box_office_classes.notna().all(), (
    "box_office contains values that fall outside every class bin"
)

df['box_office_category'] = box_office_classes.astype(int)

In [None]:
# ---- Train / test split ----------------------------------------------------
# Features are every column except the raw target and the class derived from it.
X = df.drop(columns=['box_office_category', 'box_office'])
y = df['box_office_category']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise using statistics of the training split only; the same fitted
# scaler is applied to the test split. X_train / X_test are arrays from here on.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# ---- Class re-weighting ----------------------------------------------------
# Inverse-frequency weights: every training row gets N_train / count(its class),
# so rarer classes contribute more to the loss.
class_counts = Counter(y_train)
n_train_rows = sum(class_counts.values())
scale_weights = {cls: n_train_rows / count for cls, count in class_counts.items()}
sample_weights = y_train.map(scale_weights)

# ---- Model -----------------------------------------------------------------
xgb_model = xgb.XGBClassifier(
    objective="multi:softmax",
    num_class=len(labels_dict),
    subsample=0.7,
    n_estimators=200,
    max_depth=4,
    learning_rate=0.05,
    eval_metric="mlogloss",
    random_state=42
)

xgb_model.fit(X_train, y_train, sample_weight=sample_weights)

# ---- Cross-validation on the training split --------------------------------
# cross_validate fits fresh clones of the estimator, so the model fitted above
# is left untouched.
# NOTE(review): the sample weights used for the final fit are NOT forwarded to
# these CV fits, so the CV scores describe an unweighted model. Forwarding them
# needs `params=` / `fit_params=` depending on the scikit-learn version - confirm
# the installed version before changing this.
scoring = {
    'accuracy': 'accuracy',
    'precision_macro': 'precision_macro',
    'recall_macro': 'recall_macro',
    'f1_macro': 'f1_macro'
}
cv_metrics = cross_validate(xgb_model, X_train, y_train, cv=3, scoring=scoring, return_train_score=True)
print("CV Metrics:", cv_metrics)

# ---- Hold-out evaluation ---------------------------------------------------
y_test_pred = xgb_model.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
print("Test Precision:", precision_score(y_test, y_test_pred, average='macro'))
print("Test Recall:", recall_score(y_test, y_test_pred, average='macro'))
print("Test F1 Score:", f1_score(y_test, y_test_pred, average='macro'))
# Pass `labels` explicitly: without it, classification_report raises a
# ValueError whenever some class is absent from both y_test and the
# predictions, because the number of observed classes no longer matches
# the number of target_names.
print(classification_report(
    y_test,
    y_test_pred,
    labels=list(labels_dict),
    target_names=list(labels_dict.values()),
))

# ---- Persist ---------------------------------------------------------------
# The classifier was trained on standardised inputs, so pickling it on its own
# would make `.predict()` on raw feature frames silently wrong. Bundle the
# already-fitted scaler and classifier into one pipeline; the saved object
# still exposes `.predict()` but now accepts unscaled features.
inference_pipeline = Pipeline([("scaler", scaler), ("classifier", xgb_model)])
with open("final_model.pkl", "wb") as f:
    pickle.dump(inference_pipeline, f)

# ---- Feature importance ----------------------------------------------------
# Importances come back in feature order, so they line up with X.columns.
importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': xgb_model.feature_importances_})
top_n = importance_df.sort_values(by='Importance', ascending=False).head(20)

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_n['Feature'], top_n['Importance'])
ax.set_xlabel("Feature Importance")
ax.set_title("Top 20 Features")
ax.invert_yaxis()  # most important feature at the top
plt.show()


## Run The Model For Predictions

In [None]:
# Restore a previously saved classifier.
# NOTE(review): this reads 'models/classification.pkl', whereas the training
# cell writes "final_model.pkl" - confirm which artifact is meant to be used.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# artifacts produced by a trusted source.
with open('models/classification.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# Rebuild the feature table: same CSV, first column as index, target removed.
# NOTE(review): the frame is passed to the model unscaled - verify that the
# loaded object performs any preprocessing it was trained with.
df = pd.read_csv("train.csv", index_col=0).drop(columns=['box_office'])

# Attach the predictions, then export only that column (keyed by the index).
predicted = model.predict(df)
df["predictions"] = predicted
df["predictions"].to_csv("predictions.csv", index=True)