# Movie Theater Revenue Optimization

Data Analytics & Predictive Modeling Project

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error, r2_score

# Seed NumPy's global random number generator so that any NumPy-based
# randomness in this notebook is repeatable across runs. The scikit-learn
# calls below pass random_state=42 explicitly rather than relying on this seed.
np.random.seed(42)


In [None]:

# Read the theater sales CSV and convert Show_Date from its raw CSV
# representation into pandas datetimes in a single chained expression.
df = (
    pd.read_csv('../data/movie_theater_sales.csv')
    .assign(Show_Date=lambda frame: pd.to_datetime(frame['Show_Date']))
)
# Last expression of the cell: previews the first rows in the notebook output.
df.head()


## Data Overview

In [None]:

# Summary statistics for the loaded frame, shown as the cell output.
# NOTE(review): with pandas defaults this summarizes only the numeric columns
# when the frame has mixed dtypes — confirm against the installed version.
df.describe()


## Feature Engineering

In [None]:

# Binary target for the classifier: 1 when a row's Total_Revenue is strictly
# above the median of the whole dataset, 0 otherwise (ties with the median
# are labelled 0).
revenue_median = df['Total_Revenue'].median()
df['High_Revenue'] = df['Total_Revenue'].gt(revenue_median).astype(int)
# Preview the frame with the new column appended.
df.head()


## Exploratory Data Analysis (EDA)

In [None]:

# Total Revenue by Show Time
# One bar per Show_Time value; bar height is the sum of Total_Revenue for
# that show time (estimator=sum), with error bars disabled.
# NOTE(review): `ci=` is deprecated in seaborn >= 0.12 in favour of
# `errorbar=None` — confirm the installed version before switching.
plt.figure(figsize=(8, 5))
revenue_ax = sns.barplot(
    data=df,
    x='Show_Time',
    y='Total_Revenue',
    estimator=sum,
    ci=None,
    palette='coolwarm',
)
revenue_ax.set_title('Total Revenue by Show Time')
# Tilt the category labels so they do not overlap.
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:

# Average Occupancy by City
# One bar per City value; bar height is the mean Occupancy_Rate of that
# city's rows (estimator=np.mean), with error bars disabled.
# NOTE(review): `ci=` is deprecated in seaborn >= 0.12 in favour of
# `errorbar=None` — confirm the installed version before switching.
plt.figure(figsize=(8, 5))
occupancy_ax = sns.barplot(
    data=df,
    x='City',
    y='Occupancy_Rate',
    estimator=np.mean,
    ci=None,
    palette='muted',
)
occupancy_ax.set_title('Average Occupancy Rate by City')
# Tilt the category labels so they do not overlap.
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:

# Correlation Heatmap
# Correlate only the numeric/boolean columns. The frame also contains the
# datetime column Show_Date and, presumably, text columns such as City and
# Show_Time (they are used as categories in the bar plots). pandas >= 2.0 no
# longer drops non-numeric columns silently in DataFrame.corr() and raises
# instead, so the selection is made explicitly; select_dtypes works on old
# and new pandas alike and mirrors what older versions did implicitly.
corr_matrix = df.select_dtypes(include=['number', 'bool']).corr()

plt.figure(figsize=(8,6))
# annot/fmt print each pairwise coefficient in its cell with two decimals.
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Heatmap')
plt.tight_layout()
plt.show()


## Classification Model - Decision Tree

In [None]:

# Classify rows as high/low revenue from four show-level columns.
# NOTE(review): High_Revenue is Total_Revenue thresholded at its median; if
# Total_Revenue is itself computed from ticket and concession columns, these
# features leak the target and the accuracy will be optimistic — confirm.
feature_cols = ['Occupancy_Rate', 'Ticket_Price', 'Tickets_Sold', 'Concession_Sales']
X, y = df[feature_cols], df['High_Revenue']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Depth is capped at 5; fit() returns the estimator itself, so construction
# and training are chained.
dtree = DecisionTreeClassifier(max_depth=5, random_state=42).fit(X_train, y_train)

# Accuracy on the held-out rows.
y_pred = dtree.predict(X_test)
dtree_accuracy = accuracy_score(y_test, y_pred)

print(f"Decision Tree Classifier Accuracy: {dtree_accuracy:.2%}")


## Regression Model - Linear Regression

In [None]:

# Regress Total_Revenue on the same feature columns used by the classifier.
# NOTE(review): if Total_Revenue is a deterministic function of these
# columns, the fit mostly recovers that formula — confirm how it is computed.
X_reg, y_reg = df[feature_cols], df['Total_Revenue']

# Same 70/30 split parameters and seed as the classification cell.
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg,
    y_reg,
    test_size=0.3,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training are chained.
regressor = LinearRegression().fit(X_train_reg, y_train_reg)
y_pred_reg = regressor.predict(X_test_reg)

# Held-out error metrics: RMSE is in the units of Total_Revenue, R2 is unitless.
mse = mean_squared_error(y_test_reg, y_pred_reg)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_reg, y_pred_reg)

print(f"Linear Regression RMSE: {rmse:.2f}")
print(f"Linear Regression R2 Score: {r2:.2f}")


In [None]:

# Actual vs Predicted Revenue
# Scatter of held-out actual revenue against the regression's predictions.
plt.figure(figsize=(7, 5))
scatter_ax = sns.scatterplot(x=y_test_reg, y=y_pred_reg, alpha=0.6)

# Dashed red y = x reference line spanning the range of the actual values;
# points on this line are predicted exactly.
actual_range = [y_test_reg.min(), y_test_reg.max()]
scatter_ax.plot(actual_range, actual_range, 'r--', lw=2)

scatter_ax.set_xlabel('Actual Revenue')
scatter_ax.set_ylabel('Predicted Revenue')
scatter_ax.set_title('Actual vs Predicted Revenue (Linear Regression)')
plt.tight_layout()
plt.show()


In [None]:

# Residuals distribution
# Residual = actual minus predicted, so positive values are under-predictions.
residuals = y_test_reg - y_pred_reg

# 30-bin histogram of the residuals with a kernel density estimate overlaid.
plt.figure(figsize=(7, 5))
residual_ax = sns.histplot(residuals, kde=True, color='purple', bins=30)
residual_ax.set_title('Distribution of Residuals (Prediction Errors)')
residual_ax.set_xlabel('Error ($)')
plt.tight_layout()
plt.show()


## Final Business Recommendations


- Focus on **Evening Shows** and **IMAX screens** to maximize revenue.
- Increase **concession sales** through promotions and upselling.
- Use the **predictive models** above to inform scheduling and pricing decisions.
- Target marketing efforts at underperforming cities (e.g. those with the lowest average occupancy rate).
