In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the full dataset from the CSV one directory up; low_memory=False is passed
# through to pandas' CSV reader unchanged.
df = pd.read_csv(filepath_or_buffer="../complete_data.csv", low_memory=False)

In [None]:
# Keep only rows whose box_office exceeds 100000.
# .copy() detaches the result from the unfiltered frame so that later column
# assignments (e.g. df['box_office_category'] = ...) write to an independent
# DataFrame instead of a slice, which would raise SettingWithCopyWarning.
df = df[df["box_office"] > 100000].copy()

In [None]:
# Render the filtered DataFrame as this cell's output.
display(df)

In [None]:
# Bin edges for box_office; with right=False each bin is [lower, upper).
bins = [0, 1e6, 10e6, 50e6, 100e6, 200e6, 500e6, float('inf')]

# Integer class code -> human-readable bucket label (one entry per bin).
labels_dict = {
    0: '0-1M',
    1: '1-10M',
    2: '10-50M',
    3: '50-100M',
    4: '100-200M',
    5: '200M-500M',
    6: '500M+'
}

# Attach the integer bucket code as a new column.
# assign() builds a new frame rather than writing into `df` in place, so this
# cell does not emit SettingWithCopyWarning when `df` is a filtered slice.
# astype(int) raises if any value fell outside the bins (NaN category),
# which surfaces bad rows here instead of later in training.
df = df.assign(
    box_office_category=pd.cut(
        df['box_office'],
        bins=bins,
        labels=list(labels_dict),
        right=False,
    ).astype(int)
)

display(df)

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score

# Target is the integer bucket code; titles are carried along only so the
# test rows can be identified afterwards.
y = df['box_office_category']
titles = df['title']

# Features: every column except the target, the raw box_office it was derived
# from, and the title.
X = df.drop(columns=['box_office_category', 'box_office', "title"])

# Stratified 80/20 split; X, y and titles are split together so rows stay aligned.
X_train, X_test, y_train, y_test, titles_train, titles_test = train_test_split(
    X, y, titles, test_size=0.2, random_state=42, stratify=y
)

# Test-set frame with the title and the true class attached.
# .values is used so the columns are set positionally.
df_test = X_test.assign(title=titles_test.values, Actual=y_test.values)

In [None]:
# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
from collections import Counter

# Per-class weights inversely proportional to class frequency, using the
# "balanced" normalisation n_samples / (n_classes * class_count) so the
# average row weight stays near 1.
class_counts = Counter(y_train)
n_train = sum(class_counts.values())
scale_weights = {
    cls: n_train / (len(class_counts) * count)
    for cls, count in class_counts.items()
}

# One weight per training row, looked up from that row's class.
sample_weight = y_train.map(scale_weights).to_numpy()

# NOTE: scale_pos_weight is a single float intended for binary objectives and
# is not a per-class vector, so it is not used here. Multiclass imbalance is
# handled through sample_weight in fit() instead.
model = xgb.XGBClassifier(
    objective="multi:softmax",
    num_class=len(labels_dict),
    eval_metric="mlogloss",
    random_state=42,
)

model.fit(X_train, y_train, sample_weight=sample_weight)


In [None]:
# Predict bucket codes for the held-out rows.
y_pred = model.predict(X_test)

# Attach predictions by index (y_test carries the original row index) rather
# than by position, so the assignment stays correct even if df_test has been
# re-sorted by a later cell and this cell is re-run.
df_test["Predicted"] = pd.Series(y_pred, index=y_test.index)

# Pass the full list of class codes explicitly: without `labels`, the report
# infers classes from the data and raises when a bucket is absent from
# y_test/y_pred because target_names would no longer match in length.
class_ids = list(labels_dict.keys())

print("Accuracy:", accuracy_score(y_test, y_pred))
print(
    "\nClassification Report:\n",
    classification_report(
        y_test,
        y_pred,
        labels=class_ids,
        target_names=[labels_dict[cls] for cls in class_ids],
        zero_division=0,  # buckets never predicted score 0 instead of warning
    ),
)

In [None]:
import pandas as pd

# Convert numeric categories back to readable labels
y_test_labels = [labels_dict[label] for label in y_test]
y_pred_labels = [labels_dict[label] for label in y_pred]

# Confusion table of actual vs. predicted bucket, with row/column totals ("All").
crosstab_df = pd.crosstab(pd.Series(y_test_labels, name="Actual"),
                          pd.Series(y_pred_labels, name="Predicted"),
                          margins=True)

# The labels are strings, so crosstab orders them lexicographically
# ('1-10M', '10-50M', '100-200M', ..., '50-100M'). Reindex to the bucket order
# from labels_dict so the diagonal reads low -> high, and show buckets that
# never occur as explicit zero rows/columns.
bucket_order = list(labels_dict.values()) + ["All"]
crosstab_df = crosstab_df.reindex(index=bucket_order, columns=bucket_order, fill_value=0)

print(crosstab_df)


In [None]:
# Error_Size = number of buckets between the true and predicted class;
# rows are ordered with the largest misses first.
df_test = df_test.assign(
    Error_Size=(df_test["Actual"] - df_test["Predicted"]).abs()
).sort_values(by="Error_Size", ascending=False)

In [None]:
# Replace integer class codes with readable bucket labels.
# Guarded on dtype so the cell is idempotent: once the columns hold strings,
# mapping them through the int-keyed labels_dict again would turn every value
# into NaN.
for col in ("Actual", "Predicted"):
    if pd.api.types.is_numeric_dtype(df_test[col]):
        df_test[col] = df_test[col].map(labels_dict)

# Bring back the raw box_office value for each test row, looked up by the
# row index that df_test shares with df.
df_test["box_office"] = df.loc[df_test.index, "box_office"]


In [None]:
# Rows where the predicted bucket differs from the actual one.
is_wrong = df_test["Actual"].ne(df_test["Predicted"])
misclassified_df = df_test.loc[is_wrong]

# Write the identifying columns to CSV, without the index column.
export_columns = ["title", "box_office", "Actual", "Predicted"]
misclassified_df.loc[:, export_columns].to_csv("misclassified_movies.csv", index=False)



In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Importance score per feature from the fitted classifier, paired with the
# feature names by position.
feature_importances = model.feature_importances_
importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances
})

# Number of features to show.
n = 10

# Highest-importance features first, truncated to the top n.
top_n_features = importance_df.sort_values(by='Importance', ascending=False).head(n)

# Horizontal bar chart; the y-axis is inverted so the most important feature
# is drawn at the top.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_n_features['Feature'], top_n_features['Importance'])
ax.set_xlabel('Feature Importance')
ax.set_title(f'Top {n} Feature Importances')
ax.invert_yaxis()
plt.show()



In [None]:
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_validate, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score
import numpy as np

# Regression task: predict the raw box_office value. The derived bucket column
# is dropped from the features because it is computed from the target.
X = df.drop(columns=["box_office", "title", "box_office_category"])
y = df["box_office"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Bagging ensemble; no base estimator is passed, so the library default is used.
bagging_model = BaggingRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1
)

# Hyper-parameter grid searched by GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 500, 1000],  # Number of estimators in the ensemble
    'max_samples': [0.5, 0.8, 1.0],          # Fraction of samples to train each estimator on
    'max_features': [0.5, 0.8, 1.0]          # Fraction of features to train each estimator on
}

# 5-fold grid search, selecting on (negated) mean squared error
grid_search = GridSearchCV(bagging_model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Single best estimator found by the search (refit on all of X_train)
best_model = grid_search.best_estimator_

# Held-out predictions from the best estimator
y_pred = best_model.predict(X_test)

# Calculate evaluation metrics for the train-test split
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
mape_test = mean_absolute_percentage_error(y_test, y_pred)
r2_test = r2_score(y_test, y_pred)

# Calculate Scaled RMSE (scaled by the range of the target variable)
y_range = y.max() - y.min()
scaled_rmse = rmse_test / y_range

print("Train-test split evaluation metrics:")
print(f"RMSE: {rmse_test:.4f}")
print(f"Scaled RMSE: {scaled_rmse:.4f}")
print(f"MAPE: {mape_test:.4%}")
print(f"R^2 Score: {r2_test:.4f}")

# Cross-validate the best estimator on the training split.
# All three metrics are scored in ONE cross_validate pass (5 fits) instead of
# three separate cross_val_score calls (15 fits); the folds and the fixed
# random_state are the same, so the scores are unchanged.
cv_metrics = cross_validate(
    best_model,
    X_train,
    y_train,
    cv=5,
    scoring={
        'mse': 'neg_mean_squared_error',
        'r2': 'r2',
        'mape': 'neg_mean_absolute_percentage_error',
    },
    n_jobs=-1,
)

# sklearn reports error metrics negated; flip the sign back before use.
cv_results = cv_metrics['test_mse']
mse_cv = -cv_results
rmse_cv = np.sqrt(mse_cv)

cv_r2_scores = cv_metrics['test_r2']
cv_mape_scores = -cv_metrics['test_mape']

# Print cross-validation evaluation metrics
print("\nCross-validation evaluation metrics:")
print(f"Cross-validated RMSE: {rmse_cv.mean():.4f} ± {rmse_cv.std():.4f}")
print(f"Cross-validated R²: {cv_r2_scores.mean():.4f} ± {cv_r2_scores.std():.4f}")
print(f"Cross-validated MAPE: {cv_mape_scores.mean():.4%} ± {cv_mape_scores.std():.4%}")
