# IMDB Movie Box Office Prediction using Random Forest Regression

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

## Data Loading and Preprocessing

In [None]:
# Load the IMDB dataset
def load_and_preprocess_data(filepath):
    """Load the raw IMDB CSV and return a cleaned, model-ready DataFrame.

    Processing steps, in order:

    1. Rename the raw columns to readable names (e.g. ``Gross`` ->
       ``Gross Revenue``).
    2. Drop rows that have no gross revenue.
    3. Collapse certificates into three buckets (``'U'``, ``'UA'``, ``'A'``);
       rows whose certificate is missing or not in the mapping are dropped.
    4. Keep only rows whose release year is exactly four digits and convert
       the year to ``int``. Missing or malformed years (``'PG'``,
       ``'2010x'``) are dropped instead of raising.
    5. Parse ``Runtime`` (text such as ``'142 min'``) into integer minutes.
    6. Convert ``Gross Revenue`` (text with thousands separators) to a float
       expressed in millions.
    7. Replace ``Genre`` with ``Genres`` (number of listed genres) and
       ``Primary Genre`` (the first listed genre).
    8. Replace ``Metascore`` with the boolean flag ``Metascore Exists``.

    Args:
        filepath: Anything ``pandas.read_csv`` accepts (path or file-like
            object).

    Returns:
        A new DataFrame containing the surviving rows and derived columns.

    Raises:
        KeyError: If a column required by the steps above is absent.
        ValueError: If a surviving row has a runtime or gross revenue that
            cannot be parsed as a number.
    """
    data = pd.read_csv(filepath).rename(columns={
        'Released_Year': 'Release Year',
        'Certificate': 'Age Rating',
        'IMDB_Rating': 'IMDB Rating',
        'Meta_score': 'Metascore',
        'No_of_Votes': 'Votes',
        'Gross': 'Gross Revenue',
    })

    # Each filter is followed by .copy() so that the column assignments below
    # write to an independent frame rather than to a slice of the previous one
    # (which is what triggers pandas' SettingWithCopyWarning).
    data = data[data['Gross Revenue'].notnull()].copy()

    # Standardise the certificate; anything outside the mapping becomes NaN
    # and is removed by the filter that follows.
    rating_buckets = {
        'U': 'U', 'G': 'U', 'PG': 'U', 'GP': 'U', 'TV-PG': 'U',
        'UA': 'UA', 'PG-13': 'UA', 'U/A': 'UA', 'Passed': 'UA', 'Approved': 'UA',
        'A': 'A', 'R': 'A',
    }
    data['Age Rating'] = data['Age Rating'].map(rating_buckets)
    data = data[data['Age Rating'].notnull()].copy()

    # fullmatch (not match) so trailing junk such as '2010x' is rejected
    # up front; na=False turns missing years into False instead of a NaN
    # mask, which boolean indexing would refuse.
    has_valid_year = data['Release Year'].str.fullmatch(r'\d{4}', na=False)
    data = data[has_valid_year].copy()
    data['Release Year'] = data['Release Year'].astype(int)

    # Pull the leading number out of 'NNN min' explicitly rather than slicing
    # off a fixed number of trailing characters; values that do not fit the
    # pattern become NaN and make astype(int) raise instead of being
    # silently mis-parsed.
    minutes = data['Runtime'].str.extract(r'^\s*(\d+)\s*min', expand=False)
    data['Runtime'] = minutes.astype(int)

    # Strip thousands separators, then scale to millions
    gross_text = data['Gross Revenue'].str.replace(',', '', regex=False)
    data['Gross Revenue'] = gross_text.astype(float) * (10**-6)

    # Genre count and primary genre, both derived from a single split
    genre_lists = data['Genre'].str.split(', ')
    data['Genres'] = genre_lists.str.len()
    data['Primary Genre'] = genre_lists.str[0]
    data = data.drop(columns='Genre')

    # Only the presence of a Metascore is kept, not its value
    data['Metascore Exists'] = data['Metascore'].notnull()
    data = data.drop(columns='Metascore')

    return data

# Run the cleaning routine on the raw CSV export; the resulting frame is
# bound to `data`
DATASET_PATH = 'imdb_dataset.csv'
data = load_and_preprocess_data(DATASET_PATH)

## Exploratory Data Analysis

In [None]:
# Basic data overview.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
data.info()

# Descriptive statistics for the main numeric columns
print(data[['Release Year', 'Runtime', 'IMDB Rating', 'Gross Revenue']].describe())

In [None]:
# Histogram with a KDE overlay for the Gross Revenue column
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['Gross Revenue'], kde=True, ax=ax)
ax.set_title('Distribution of Gross Revenue')
ax.set_xlabel('Gross Revenue (Millions)')
plt.show()

## Prepare Data for Modeling

In [None]:
# Model inputs (X) and the regression target (y)
feature_columns = [
    'Release Year',
    'Age Rating',
    'Runtime',
    'IMDB Rating',
    'Votes',
    'Metascore Exists',
    'Genres',
    'Primary Genre',
]
target_column = 'Gross Revenue'

X = data[feature_columns]
y = data[target_column]

# Hold out 20% of the rows for evaluation; the seed keeps the split reproducible
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

## Create Preprocessing and Model Pipeline

In [None]:
# Column groups routed through the ColumnTransformer
categorical_features = ['Age Rating', 'Primary Genre']
numeric_features = [
    'Release Year', 'Runtime', 'IMDB Rating', 'Votes', 'Metascore Exists', 'Genres',
]

# One-hot encode the categorical columns (categories not seen during fit are
# ignored at predict time) and forward the remaining columns unchanged
preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('num', 'passthrough', numeric_features),
])

# Random forest hyperparameters, gathered in one place
forest_params = dict(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
)
forest = RandomForestRegressor(**forest_params)

# Preprocessing and regression chained into a single estimator
rf_model = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', forest)])

# Train on the training split
rf_model.fit(X_train, y_train)

## Model Evaluation

In [None]:
# Predict on the held-out test split
y_pred = rf_model.predict(X_test)

# Collect the metrics as label -> value, in the order they are reported
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
metric_report = {
    "R² Score:": r2_score(y_test, y_pred),
    "Mean Absolute Error:": mean_absolute_error(y_test, y_pred),
    "Root Mean Squared Error:": rmse,
}
for label, value in metric_report.items():
    print(label, value)

## Feature Importance Visualization

In [None]:
# Read the fitted transformer from the pipeline itself rather than from the
# outer `preprocessor` variable, so the names always describe the exact
# estimator that produced the importances (a pipeline configured to clone its
# steps would leave the outer object unfitted).
fitted_preprocessor = rf_model.named_steps['preprocessor']
encoded_columns = ['Age Rating', 'Primary Genre']
passthrough_columns = ['Release Year', 'Runtime', 'IMDB Rating', 'Votes', 'Metascore Exists', 'Genres']

# The ColumnTransformer lists 'cat' before 'num', so the one-hot columns come
# first in the transformed matrix, followed by the passthrough columns.
one_hot_names = (
    fitted_preprocessor.named_transformers_['cat']
    .get_feature_names_out(encoded_columns)
    .tolist()
)
feature_names = one_hot_names + passthrough_columns

# Get feature importances
importances = rf_model.named_steps['regressor'].feature_importances_

# Fail with a clear message if the reconstructed names drift out of sync
# with the columns the model was actually trained on.
if len(feature_names) != len(importances):
    raise ValueError(
        f"Feature name count ({len(feature_names)}) does not match "
        f"importance count ({len(importances)})"
    )

# Create DataFrame for visualization, most important feature first
feature_importance = pd.DataFrame({
    'feature': feature_names,
    'importance': importances
}).sort_values('importance', ascending=False)

# Plot the ten most important features
plt.figure(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=feature_importance.head(10))
plt.title('Top 10 Feature Importances in Random Forest Model')
plt.xlabel('Importance Score')
plt.tight_layout()
plt.show()

## Example Prediction

In [None]:
# Attributes of one hypothetical movie, keyed by the feature columns used for training
sample_record = {
    'Release Year': 2023,
    'Age Rating': 'UA',
    'Runtime': 120,
    'IMDB Rating': 7.5,
    'Votes': 500000,
    'Metascore Exists': True,
    'Genres': 2,
    'Primary Genre': 'Action',
}

# Wrap each value in a list to build a one-row DataFrame
sample_movie = pd.DataFrame(
    {column: [value] for column, value in sample_record.items()}
)

# Run the sample through the fitted pipeline and report the estimate
predicted_revenue = rf_model.predict(sample_movie)
print(f"Predicted Box Office Revenue: ${predicted_revenue[0]:.2f} Million")