In [None]:
!pip install pandas scikit-learn matplotlib seaborn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Prompt for a CSV upload (Google Colab only). The code below relies on
# `uploaded` being a mapping whose keys are the uploaded file names.
from google.colab import files
uploaded = files.upload()

# If nothing was uploaded, stop with a clear message instead of the bare
# StopIteration that next() would raise on an empty mapping.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and choose a CSV file.")

# Load the first uploaded file by name (any additional uploads are ignored)
csv_name = next(iter(uploaded))
df = pd.read_csv(csv_name)

In [None]:
# Peek at the first five rows to get a feel for the columns
df.head(n=5)

In [None]:
# Money columns: strip the rupee sign and commas, then convert to float
money_cols = ['Budget', 'Lifetime Collection', 'Opening Day Collection']
for col in money_cols:
    df[col] = df[col].replace('[₹,]', '', regex=True).astype(float)

# Parse the release date and derive month/year. Because of errors='coerce',
# unparseable dates become NaT, which leaves NaN in both derived columns.
df['Release Date'] = pd.to_datetime(df['Release Date'], errors='coerce')
df['Release Month'] = df['Release Date'].dt.month
df['Release Year'] = df['Release Date'].dt.year

# Missing genres get their own 'Unknown' category
df['Genre'] = df['Genre'].fillna('Unknown')

# Drop rows missing the target or ANY numeric model input. The opening-day
# figure and the date-derived columns are used as regression features later
# in the notebook, and scikit-learn's LinearRegression rejects NaN inputs,
# so checking only Budget / Lifetime Collection is not enough.
required_cols = money_cols + ['Release Month', 'Release Year']
df = df.dropna(subset=required_cols)

In [None]:
# Expand Genre into indicator columns (first category dropped as the baseline)
df = pd.get_dummies(df, columns=['Genre'], drop_first=True)

# Model inputs: the numeric columns plus every generated Genre_* indicator
numeric_inputs = ['Budget', 'Opening Day Collection', 'Release Month', 'Release Year']
genre_dummies = [name for name in df.columns if name.startswith('Genre_')]
features = numeric_inputs + genre_dummies

# Feature matrix and regression target
X = df[features]
y = df['Lifetime Collection']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
revenue_split = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = revenue_split

In [None]:
# Fit a linear regression on the training split (fit() returns the estimator)
lr = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows
y_pred = lr.predict(X_test)

In [None]:
# Score the held-out predictions: mean squared error and R²
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.2f}")
print(f"R² Score: {r2:.2f}")

In [None]:
# Actual vs predicted revenue for the test split
fig, ax = plt.subplots(figsize=(6, 6))
sns.scatterplot(x=y_test, y=y_pred, ax=ax)

# Dashed red diagonal = perfect prediction, drawn over the full target range
lo, hi = y.min(), y.max()
ax.plot([lo, hi], [lo, hi], 'r--')

ax.set_xlabel("Actual Revenue")
ax.set_ylabel("Predicted Revenue")
ax.set_title("Linear Regression: Actual vs Predicted")
ax.grid(True)
plt.show()

In [None]:
# Profit = lifetime collection minus budget
df['Profit'] = df['Lifetime Collection'].sub(df['Budget'])

# Strictly positive profit is a 'Hit'; everything else (break-even included)
# is a 'Flop'
is_hit = df['Profit'] > 0
df['Verdict'] = is_hit.map({True: 'Hit', False: 'Flop'})

# Show the new columns next to the figures they were derived from
df[['Title', 'Budget', 'Lifetime Collection', 'Profit', 'Verdict']].head()

In [None]:
# Same feature columns as the revenue model; the target is now profit
X_profit, y_profit = df[features], df['Profit']

# 80/20 split with a fixed seed
X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(
    X_profit, y_profit, random_state=42, test_size=0.2
)

# Fit on the training rows
profit_model = LinearRegression().fit(X_train_p, y_train_p)

# Predict the held-out rows and score them
y_profit_pred = profit_model.predict(X_test_p)
mse_profit, r2_profit = (
    mean_squared_error(y_test_p, y_profit_pred),
    r2_score(y_test_p, y_profit_pred),
)

print(f"[Profit Prediction] MSE: {mse_profit:.2f}, R² Score: {r2_profit:.2f}")

In [None]:
# Side-by-side table of actual and predicted profit for the test rows
# (predictions rounded to two decimals)
comparison = y_test_p.to_frame(name='Actual Profit').assign(
    **{'Predicted Profit': y_profit_pred.round(2)}
)
comparison.head()