In [28]:
# Step 1: Import necessary libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from xgboost import XGBRegressor
import ast

In [29]:
# Step 2: Load and Merge Data

# Load train and test datasets
train = pd.read_csv(r"D:\AI\ML\TMDB Box Office Prediction\train.csv")
test = pd.read_csv(r"D:\AI\ML\TMDB Box Office Prediction\test.csv")

# Drop unnecessary columns
columns_to_drop = ['imdb_id', 'poster_path', 'original_title', 'title', 'overview', 'tagline', 'cast', 'crew']
df = pd.concat([train, test], ignore_index=True).drop(columns=columns_to_drop)

In [30]:
# Step 3: Preprocess Release Date

# Convert release_date to datetime and extract year, month, and day of the week
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')
df['release_year'] = df['release_date'].dt.year
df['release_year'] = df['release_year'].apply(lambda x: x-100 if x > 2022 else x)
df['release_month'] = df['release_date'].dt.month
df['release_wday'] = df['release_date'].dt.dayofweek

# Drop rows with missing release_date and runtime
df.dropna(subset=['release_date', 'runtime'], inplace=True)


  df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')


In [31]:
# Step 4: Convert Boolean Columns

# Convert homepage and belongs_to_collection to binary indicators
df['homepage'] = df['homepage'].notna().astype(int)
df['belongs_to_collection'] = df['belongs_to_collection'].notna().astype(int)
df['status'] = df['status'].fillna('Released')

In [32]:
# Step 5: Define a Function to Create Dummy Variables

def dummy_new2(col, key):
    temp = pd.DataFrame(df[df[col].notnull()][col].apply(lambda x: [i[key] for i in ast.literal_eval(x)]))
    temp = pd.get_dummies(temp.explode(col)).groupby(level=0).sum()
    df.loc[df[col].notnull(), temp.columns.tolist()] = temp
    df.drop(columns=col, inplace=True)

In [33]:
# Step 6: Apply the Function to Relevant Columns

columns_to_dummy = ['genres', 'production_companies', 'production_countries', 'spoken_languages', 'Keywords']
keys = ['id', 'id', 'iso_3166_1', 'iso_639_1', 'id']

for col, key in zip(columns_to_dummy, keys):
    dummy_new2(col, key)

In [34]:
# Step 7: Drop Remaining Unnecessary Columns

df.drop(columns=['release_date'], inplace=True)

In [35]:
# Step 8: Split Data Back into Train and Test Sets

df_train = df[df['id'] <= 3000]
df_test = df[df['id'] > 3000]

In [36]:
# Step 9: Handle Missing Values and Prepare Data for Modeling

# Fill missing values with 0
df_train.fillna(0, inplace=True)
df_test.fillna(0, inplace=True)

# Define features and target variable
X_train = df_train.drop(columns=['revenue', 'id'])
y_train = df_train['revenue']
X_test = df_test.drop(columns=['revenue', 'id'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train.fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test.fillna(0, inplace=True)


In [37]:
# Step 10: Train and Evaluate Model

# Use one-hot encoding
X_train = pd.get_dummies(X_train, columns=['original_language', 'status'])
X_test = pd.get_dummies(X_test, columns=['original_language', 'status'])

# Align columns of X_train and X_test
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Initialize and train XGBRegressor
xgb = XGBRegressor(tree_method="hist", enable_categorical=True)
xgb.fit(X_train, y_train)

# Make predictions on the training set (for demonstration purposes)
y_pred_train = xgb.predict(X_train)

# Evaluate the model on the training set
print(f"Train R2 Score: {r2_score(y_train, y_pred_train)}")

Train R2 Score: 0.9698115638680059


In [38]:
# Step 11: Prepare Submission File

# Make predictions on the test set
pred = xgb.predict(X_test)

# Ensure submission has all IDs from the original test.csv
original_test = pd.read_csv(r"D:\AI\ML\TMDB Box Office Prediction\test.csv")
submission = pd.DataFrame({'id': original_test['id']})

# Merge predictions with all test IDs
submission = submission.merge(pd.DataFrame({'id': df_test['id'], 'revenue': pred}), on='id', how='left')

# Fill missing revenue predictions with a default value (e.g., 0)
submission['revenue'].fillna(0, inplace=True)

# Save to CSV
submission.to_csv('submission.csv', index=False)


# Save to CSV
submission.to_csv('submission.csv', index=False)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  submission['revenue'].fillna(0, inplace=True)
