In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Widen pandas' text display so wide frames are not wrapped when printed.
pd.options.display.width = 1000

In [None]:
# Load the training data; the first CSV column becomes the row index.
df = pd.read_csv("train.csv", index_col=0)

# Show the loaded frame using the notebook's rich display.
display(df)

## Classification

In [None]:
# Bin edges for box office. The cut below uses right=False, so every
# interval is half-open: [lower, upper).
bins = [0, 1e6, 10e6, 50e6, 100e6, 200e6, 500e6]

# Display name for each class code 0..5, one per interval above.
labels_dict = dict(enumerate([
    '(0) 0-1M',
    '(1) 1-10M',
    '(2) 10-50M',
    '(3) 50-100M',
    '(4) 100-200M',
    '(5) 200M-500M',
]))

# NOTE(review): a box_office that is missing or outside [0, 500e6) matches no
# bin; presumably the data never contains such rows, since the integer cast
# below cannot represent a missing category - confirm.
box_office_bins = pd.cut(
    df['box_office'],
    bins=bins,
    labels=list(labels_dict),
    right=False,
)
df['box_office_category'] = box_office_bins.astype(int)

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from collections import Counter
import matplotlib.pyplot as plt


# Target is the binned class; features are everything except the class and
# the raw box office value it was derived from.
y = df['box_office_category']
X = df.drop(columns=['box_office_category', 'box_office'])

# Hold out 20% of the rows for the final evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Test features (taken before scaling) plus the true class, for error analysis.
df_test = pd.DataFrame(data=X_test, columns=X.columns)
df_test["Actual"] = y_test.values

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Per-row weight = (training rows) / (rows in that row's class), so rows from
# rarer classes receive larger weights.
class_counts = Counter(y_train)
n_train_rows = sum(class_counts.values())
scale_weights = {
    cls: n_train_rows / count
    for cls, count in class_counts.items()
}
sample_weights = y_train.map(scale_weights)

# Multi-class XGBoost classifier with one class per entry in labels_dict.
xgb_model = xgb.XGBClassifier(
    random_state=42,
    eval_metric="mlogloss",
    num_class=len(labels_dict),
    objective="multi:softmax",
)

# Candidate hyper-parameter values sampled by the randomized search.
param_dist = dict(
    max_depth=[3, 4, 5, 6, 7],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    n_estimators=[100, 200, 300, 400],
    subsample=[0.5, 0.7, 0.8, 0.9, 1.0],
    colsample_bytree=[0.7, 0.8, 0.9, 1.0],
)

# Metrics tracked during cross-validation; each key doubles as the scorer name.
scoring = {
    name: name
    for name in ('accuracy', 'precision_macro', 'recall_macro', 'f1_macro')
}

# 10 random settings, 3-fold CV each; the final model is refit on the setting
# with the best mean accuracy.
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    scoring=scoring,
    refit='accuracy',
    random_state=42,
    n_jobs=-1,
)

# The per-row weights are forwarded to the estimator's fit to counter class imbalance.
random_search.fit(X_train, y_train, sample_weight=sample_weights)

print("Best Parameters:", random_search.best_params_)
print("Best CV Accuracy:", random_search.best_score_)

# Report the cross-validated mean of every metric for the selected (refit)
# candidate. Taking the maximum of each metric independently would mix scores
# that belong to different hyper-parameter settings and would not describe
# the model that is actually used afterwards.
cv_results = random_search.cv_results_
best_idx = random_search.best_index_
print("\nCV Results (mean scores):")
for metric in scoring.keys():
    key = f"mean_test_{metric}"
    if key in cv_results:
        print(f"{metric}: {cv_results[key][best_idx]:.4f}")

# Evaluate the refit best model on the held-out (scaled) test features.
best_model = random_search.best_estimator_
y_test_pred = best_model.predict(X_test)

print("\n--- Test Set Metrics ---")
test_accuracy = accuracy_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred, average='macro')
test_recall = recall_score(y_test, y_test_pred, average='macro')
test_f1 = f1_score(y_test, y_test_pred, average='macro')

print("Test Accuracy:", test_accuracy)
print("Test Precision:", test_precision)
print("Test Recall:", test_recall)
print("Test F1 Score:", test_f1)

# Pass the full list of class codes explicitly. Without `labels`, the report
# only covers classes that occur in y_test / y_test_pred, and it raises when
# that count differs from the number of target_names (e.g. a rare class that
# did not land in the test split).
class_ids = list(labels_dict.keys())
print("\nClassification Report (Test Set):\n",
      classification_report(y_test, y_test_pred,
                            labels=class_ids,
                            target_names=[labels_dict[c] for c in class_ids]))

# Predictions on the training rows (not used further in this cell).
y_train_pred = best_model.predict(X_train)

# 3-fold CV of the selected model on the training data, also recording the
# training-fold scores.
cv_metrics = cross_validate(
    best_model,
    X_train,
    y_train,
    scoring=scoring,
    cv=3,
    return_train_score=True,
)

# Actual-vs-predicted table using the display names, with row/column totals.
y_test_labels = list(map(labels_dict.__getitem__, y_test))
y_pred_labels = list(map(labels_dict.__getitem__, y_test_pred))
actual_named = pd.Series(y_test_labels, name="Actual")
predicted_named = pd.Series(y_pred_labels, name="Predicted")
crosstab_df = pd.crosstab(actual_named, predicted_named, margins=True)
print(crosstab_df)

# Order the test rows by how many classes apart the prediction was.
df_test["Predicted"] = y_test_pred
df_test["Error_Size"] = (df_test["Actual"] - df_test["Predicted"]).abs()
df_test = df_test.sort_values(by="Error_Size", ascending=False)

# Replace class codes with display names, then attach the raw box office
# value looked up by row index.
for label_col in ("Actual", "Predicted"):
    df_test[label_col] = df_test[label_col].map(labels_dict)
df_test["box_office"] = df.loc[df_test.index, "box_office"]

# Export only the rows where the predicted class differs from the actual one.
# NOTE(review): index=False drops the row index from the file; confirm the
# index is not needed to identify the rows.
is_wrong = df_test["Actual"] != df_test["Predicted"]
misclassified_df = df_test[is_wrong]
misclassified_df[["box_office", "Actual", "Predicted"]].to_csv(
    "misclassified_movies.csv", index=False)

# Pair every feature column with the importance reported by the fitted model.
feature_importances = best_model.feature_importances_
importance_df = pd.DataFrame(
    dict(Feature=X.columns, Importance=feature_importances)
)

# Keep the n highest-importance features.
n = 20
ranked_importances = importance_df.sort_values(by='Importance', ascending=False)
top_n_features = ranked_importances.head(n)

# Horizontal bar chart; the y-axis is inverted so the most important feature
# is drawn at the top.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_n_features['Feature'], top_n_features['Importance'])
ax.set_xlabel('Feature Importance')
ax.set_title(f'Top {n} Feature Importances')
ax.invert_yaxis()
plt.show()


In [None]:
import pickle

# Persist the tuned classifier. Create the output directory first so that a
# fresh checkout without a models/ folder does not fail with FileNotFoundError.
os.makedirs('models', exist_ok=True)
with open('models/classification.pkl', 'wb') as f:
    pickle.dump(best_model, f)

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Start again from the CSV; the first column becomes the row index.
df = pd.read_csv("train.csv", index_col=0)

# Bin edges for box office, used as half-open intervals [lower, upper).
bins = [0, 1e6, 10e6, 50e6, 100e6, 200e6, 500e6]

# Display name for each class code 0..5, one per interval above.
labels_dict = dict(enumerate([
    '(0) 0-1M',
    '(1) 1-10M',
    '(2) 10-50M',
    '(3) 50-100M',
    '(4) 100-200M',
    '(5) 200M-500M',
]))

# Derive the integer class from the raw value, then discard the raw value so
# it cannot be used as a feature.
box_office_bins = pd.cut(
    df['box_office'],
    bins=bins,
    labels=list(labels_dict),
    right=False,
)
df['box_office_category'] = box_office_bins.astype(int)
df = df.drop(columns=['box_office'])

# Separate the class column from the feature columns of df.
target = 'box_office_category'
y = df[target]
X = df.drop(columns=[target])

# Encode the class values as consecutive integers.
le = LabelEncoder()
y_encoded = le.fit(y).transform(y)

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

# XGBoost classifier with library defaults apart from the arguments below.
# Per the original note, use_label_encoder=False and an explicit eval_metric
# were set to avoid warnings.
clf = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42,
)

# Fit on the training split and score plain accuracy on the held-out split.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Persist the classifier trained in this cell. Create the output directory
# first so a missing models/ folder does not raise FileNotFoundError.
# NOTE(review): this is the same path the tuned model was written to earlier
# in the notebook, so that file is overwritten here - confirm this is intended.
os.makedirs('models', exist_ok=True)
with open('models/classification.pkl', 'wb') as f:
    pickle.dump(clf, f)


In [None]:
# Read the pickled classifier back from disk.
# NOTE(review): pickle.load executes code from the file; only load files
# produced by this notebook.
with open('models/classification.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# Feature matrix: the training CSV without the raw box office column.
df = pd.read_csv("train.csv", index_col=0).drop(columns=['box_office'])

# Predict a class for every row and show the resulting array.
predictions = model.predict(df)
display(predictions)

## Regression

In [None]:
# Load the raw CSV (default integer index this time).
df = pd.read_csv("train.csv")

# Build a "<title> (<release_year>)" key, keep the first row per key, index
# the frame by it, drop the now-redundant title column, and discard rows
# without a box office value.
title_key = df["title"] + " (" + df["release_year"].astype(str) + ")"
df = (
    df.assign(title_id=title_key)
    .drop_duplicates(subset=["title_id"])
    .set_index("title_id")
    .drop(columns="title")
    .dropna(subset=["box_office"])
)

display(df)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

# df is the prepared frame with 'box_office' as the regression target.
X = df.drop(columns=["box_office"])
y = df["box_office"]

# Save feature names for later feature importance reporting.
feature_names = X.columns

# Split BEFORE scaling so that the scaler's statistics come from the training
# rows only; fitting it on the full data would leak information about the
# test rows into preprocessing. Same seed and test size as before, so the
# same rows end up in each split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit on the training split, then apply the same transform to both splits.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix under the train-fitted scaling, kept under its previous
# name in case another cell still refers to it.
X_scaled = scaler.transform(X)

# Base random forest; the search below samples replacements for the
# hyper-parameters listed in param_distributions.
rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=True,
    random_state=42,
    n_jobs=-1,
)

# --- Randomized Search Cross-Validation ---
# Candidate values for each tuned hyper-parameter.
param_distributions = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 20, 30, 40],
    min_samples_split=[5, 10, 15],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

# 20 sampled settings, each scored by 5-fold CV on negative MSE.
random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_distributions,
    n_iter=20,
    cv=5,
    scoring='neg_mean_squared_error',
    random_state=42,
    n_jobs=-1,
)

# Run the search on the training split and keep the refit winner.
random_search.fit(X_train, y_train)
best_rf = random_search.best_estimator_

print("Best parameters from RandomizedSearchCV:",
      random_search.best_params_,
      sep="\n")
# best_score_ is a negative MSE; flip the sign and take the root to get RMSE.
print("Best CV RMSE:", np.sqrt(-random_search.best_score_))

# --- Held-out test set: score the best estimator ---
y_pred_best = best_rf.predict(X_test)

# RMSE is the square root of the mean squared error.
rmse_test_best = np.sqrt(mean_squared_error(y_test, y_pred_best))
mape_test_best = mean_absolute_percentage_error(y_test, y_pred_best)
r2_test_best = r2_score(y_test, y_pred_best)

test_report_lines = [
    "\nRandom Forest (Best Estimator) - Test Set Evaluation Metrics:",
    f"RMSE: {rmse_test_best:.4f}",
    f"MAPE: {mape_test_best:.4%}",
    f"R² Score: {r2_test_best:.4f}",
]
print("\n".join(test_report_lines))

# --- Cross-Validation Metrics for the Best Estimator ---
# A single multi-metric cross_validate call scores all three metrics on the
# same five fits, instead of refitting the forest on every fold once per
# metric. Folds, estimator and seed are unchanged, so the numbers are the same.
from sklearn.model_selection import cross_validate

cv_scores = cross_validate(
    best_rf,
    X_train,
    y_train,
    cv=5,
    scoring={
        'neg_mse': 'neg_mean_squared_error',
        'r2': 'r2',
        'neg_mape': 'neg_mean_absolute_percentage_error',
    },
    n_jobs=-1,
)

# The error scorers are negated by scikit-learn (higher is better), so flip
# the sign; RMSE additionally takes the square root of the per-fold MSE.
cv_rmse = np.sqrt(-cv_scores['test_neg_mse'])
cv_r2 = cv_scores['test_r2']
cv_mape = -cv_scores['test_neg_mape']

print("\nCross-Validation Metrics (Best Estimator on Training Data):")
print(f"CV RMSE: {cv_rmse.mean():.4f} ± {cv_rmse.std():.4f}")
print(f"CV R²: {cv_r2.mean():.4f} ± {cv_r2.std():.4f}")
print(f"CV MAPE: {cv_mape.mean():.4%} ± {cv_mape.std():.4%}")

# --- Feature importances of the best estimator, highest first ---
feature_importances = best_rf.feature_importances_
importance_table = pd.DataFrame(
    dict(Feature=feature_names, Importance=feature_importances)
)
importance_df = importance_table.sort_values(by='Importance', ascending=False)

print("\nFeature Importances:", importance_df, sep="\n")


In [None]:
# Persist the tuned random forest. Create the output directory first so a
# missing models/ folder does not raise FileNotFoundError.
os.makedirs('models', exist_ok=True)
with open('models/regression.pkl', 'wb') as f:
    pickle.dump(best_rf, f)

In [None]:
import matplotlib.pyplot as plt

# Uses y_test and y_pred_best from the model evaluation cell
# (y_pred_best = best_rf.predict(X_test)).

# Residual = actual - predicted; positive means the model under-predicted.
residuals = y_test - y_pred_best

# --- Residuals Scatter Plot ---
plt.figure(figsize=(8, 6))
plt.scatter(y_pred_best, residuals, alpha=0.7)
plt.hlines(y=0, xmin=y_pred_best.min(), xmax=y_pred_best.max(), colors='red', linestyles='--')
plt.xlabel("Predicted Values")
plt.ylabel("Residuals (Actual - Predicted)")
plt.title("Residuals vs. Predicted Values")
plt.xscale('log')
# Residuals are signed. A plain log axis cannot show zero or negative values,
# so it would hide every over-prediction as well as the y=0 reference line;
# symlog is logarithmic in both directions and linear around zero.
plt.yscale('symlog')
plt.show()

# --- Residuals Histogram ---
plt.figure(figsize=(8, 6))
plt.hist(residuals, bins=30, edgecolor='k', alpha=0.7)
plt.xlabel("Residual")
plt.ylabel("Frequency")
plt.xscale('symlog')
plt.title("Distribution of Residuals")
plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Uses y_test and y_pred_best from the model evaluation cell.
# Residual = actual - predicted.
residuals = y_test - y_pred_best

# Standard deviation of the residuals, used as the scaling unit.
residuals_std_value = np.std(residuals)

# Residuals expressed in units of their own standard deviation.
standardized_residuals = residuals / residuals_std_value

# --- Scatter Plot of Standardized Residuals vs. Predicted Values ---
plt.figure(figsize=(8, 6))
plt.scatter(y_pred_best, standardized_residuals, alpha=0.7)
plt.hlines(y=0, xmin=y_pred_best.min(), xmax=y_pred_best.max(), colors='red', linestyles='--')
plt.xlabel("Predicted Values")
plt.ylabel("Standardized Residuals")
# Fixed: this call previously used the undefined name `plx`, which raised a
# NameError before the figure could be shown.
plt.xscale('log')
# Standardized residuals are signed; a plain log axis would drop every
# non-positive point and the y=0 line, so use symlog instead.
plt.yscale('symlog')
plt.title("Standardized Residuals vs. Predicted Values")
plt.show()

# --- Histogram of Standardized Residuals ---
plt.figure(figsize=(8, 6))
plt.hist(standardized_residuals, bins=30, edgecolor='k', alpha=0.7)
plt.xlabel("Standardized Residual")
plt.ylabel("Frequency")
plt.title("Distribution of Standardized Residuals")
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Predicted vs. actual on log-log axes; the dashed red diagonal spans the
# range of the actual values and marks where predicted equals actual.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred_best, alpha=0.7)

actual_range = [y_test.min(), y_test.max()]
ax.plot(actual_range, actual_range, color='red', linestyle='--')

ax.set_xlabel("Actual Values")
ax.set_ylabel("Predicted Values")
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_title("Predicted vs. Actual Values")
plt.show()
