In [None]:
!pip install pandas scikit-learn matplotlib seaborn

In [None]:
import pandas as pd

# Read the movie data from CSV into a DataFrame.
df = pd.read_csv("movies.csv")  # update with your correct file path

# Quick sanity check: print the column index first, then the first few rows.
for preview in (df.columns, df.head()):
    print(preview)

In [None]:
# Clean the raw columns and assemble the model inputs (X) and target (y).

# Rows without a budget or a lifetime collection cannot be used for training.
df = df.dropna(subset=['Budget', 'Lifetime Collection'])

# Strip the rupee sign and thousands separators, then convert to float.
# The pattern is a raw string: '\₹' is not a valid Python escape sequence
# (newer Python versions warn about it), and '₹' needs no escaping inside a
# regex character class, so r'[₹,]' matches exactly the same characters.
currency_noise = r'[₹,]'
for money_col in ('Budget', 'Lifetime Collection'):
    df[money_col] = df[money_col].replace(currency_noise, '', regex=True).astype(float)

# Parse release dates; errors='coerce' turns unparseable values into NaT.
df['Release Date'] = pd.to_datetime(df['Release Date'], errors='coerce')

# Drop rows whose date is missing or could not be parsed. Otherwise the derived
# month/year features would contain NaN, which LinearRegression cannot fit on.
df = df.dropna(subset=['Release Date'])

# Calendar features derived from the parsed release date.
df['Release Month'] = df['Release Date'].dt.month
df['Release Year'] = df['Release Date'].dt.year

# One-hot encode Genre. Missing genres get their own 'Unknown' category, and
# drop_first=True drops one level so the dummy columns are not collinear.
df['Genre'] = df['Genre'].fillna('Unknown')
df = pd.get_dummies(df, columns=['Genre'], drop_first=True)

# Define input features (you can add more): the numeric columns plus every
# dummy column that get_dummies created for Genre.
genre_cols = [col for col in df.columns if col.startswith('Genre_')]
features = ['Budget', 'Release Month', 'Release Year'] + genre_cols
X = df[features]
y = df['Lifetime Collection']

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a linear regression on the training rows (fit() returns the estimator).
model = LinearRegression().fit(X_train, y_train)

# Predict on the held-out rows and report both metrics.
y_pred = model.predict(X_test)

for label, metric in (
    ("Mean Squared Error:", mean_squared_error),
    ("R^2 Score:", r2_score),
):
    print(label, metric(y_test, y_pred))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Scatter of predicted vs. actual revenue for the held-out rows.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_pred, ax=ax)

ax.set_xlabel("Actual Lifetime Collection")
ax.set_ylabel("Predicted Lifetime Collection")
ax.set_title("Actual vs Predicted Box Office Revenue")

# Dashed red diagonal (predicted == actual) spanning the full range of the
# target `y`, used as the reference for a perfect prediction.
lo, hi = y.min(), y.max()
ax.plot([lo, hi], [lo, hi], color='red', linestyle='--')

ax.grid()
plt.show()