# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the raw ratings table from the working directory.
df = pd.read_csv("IMDB-Ratings.csv")

# Runtime: strip the " min" suffix and parse as a number. Unparseable values
# become NaN (the same policy used for Gross below) instead of aborting.
df['Runtime'] = pd.to_numeric(
    df['Runtime'].str.replace(" min", "", regex=False), errors='coerce'
)

# Gross: remove thousands separators, then parse; bad values become NaN.
df['Gross'] = pd.to_numeric(
    df['Gross'].str.replace(",", "", regex=False), errors='coerce'
)

# NOTE: despite its name this is Gross divided by Runtime (no budget column is
# involved). The name is kept because later code selects the column by it.
# A zero runtime is treated as missing so the ratio is NaN rather than +/-inf,
# which would otherwise survive the later fillna and break model fitting.
df['Budget_per_minute'] = df['Gross'] / df['Runtime'].replace(0, np.nan)

# Number of comma-separated entries in Genre; a missing Genre counts as 0
# (str(NaN) would otherwise be split into the single token 'nan').
df['num_genres'] = df['Genre'].apply(
    lambda g: 0 if pd.isna(g) else len(str(g).split(','))
)

# NOTE: this is simply how many of the four star columns are non-null for the
# row; it does not measure popularity. Name kept for the code that follows.
df['cast_popularity'] = df[['Star1', 'Star2', 'Star3', 'Star4']].notna().sum(axis=1)

# Rows without the target value cannot be used for training or evaluation.
df.dropna(subset=["IMDB_Rating"], inplace=True)

# Exploratory check: pairwise correlations between the numeric features and
# the target, drawn as an annotated heatmap.
numeric_cols = [
    'Runtime', 'Meta_score', 'Gross', 'No_of_Votes',
    'Budget_per_minute', 'num_genres', 'cast_popularity',
]
corr_matrix = df[[*numeric_cols, 'IMDB_Rating']].corr()

heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", ax=heatmap_ax)
heatmap_ax.set_title("Correlation with IMDb Rating")
heatmap_fig.tight_layout()
plt.show()

# Keep the ten most frequent directors as their own categories and pool every
# other (or missing) director under 'Other' to limit the number of dummies.
top_directors = df['Director'].value_counts().nlargest(10).index
df['Director'] = df['Director'].where(df['Director'].isin(top_directors), 'Other')

# Genre is a comma-separated list of labels. One-hot encoding the raw string
# would create one column per *combination* of genres, so build one indicator
# column per individual genre instead. Whitespace around the commas is
# normalised first; rows with a missing Genre get all-zero indicators.
genre_dummies = (
    df['Genre']
    .str.replace(r'\s*,\s*', ',', regex=True)
    .str.strip()
    .str.get_dummies(sep=',')
    .add_prefix('Genre_')
)

# Certificate and Director are single-valued, so ordinary one-hot encoding
# applies. The raw Genre column is dropped, as get_dummies did before.
df_encoded = pd.concat(
    [
        pd.get_dummies(
            df.drop(columns=['Genre']),
            columns=["Certificate", "Director"],
            drop_first=True,
        ),
        genre_dummies,
    ],
    axis=1,
)

numeric_features = ['Runtime', 'Meta_score', 'Gross', 'No_of_Votes',
                    'Budget_per_minute', 'num_genres', 'cast_popularity']
feature_cols = numeric_features + \
               [col for col in df_encoded.columns if col.startswith(('Genre_', 'Certificate_', 'Director_'))]

# Missing values are filled with 0. Non-finite ratios (e.g. from a zero
# runtime) are converted to NaN first so they receive the same treatment
# instead of reaching the estimator as infinities.
X = df_encoded[feature_cols].copy()
X[numeric_features] = X[numeric_features].replace([np.inf, -np.inf], np.nan)
X = X.fillna(0)
y = df_encoded['IMDB_Rating']

# Hold out 20% of the rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a depth-limited random forest on the training portion and predict the
# held-out ratings.
model = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report test-set error and explained variance.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
for metric_label, metric_value in (("MSE", mse), ("R² Score", r2)):
    print(f"Random Forest {metric_label}: {metric_value:.4f}")

# Rank features by the fitted forest's importance scores and plot the ten
# highest as a horizontal bar chart.
importances = model.feature_importances_
feature_names = X.columns
importance_df = (
    pd.DataFrame({"Feature": feature_names, "Importance": importances})
    .sort_values(by="Importance", ascending=False)
    .head(10)
)

importance_fig, importance_ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=importance_df, x="Importance", y="Feature", ax=importance_ax)
importance_ax.set_title("Top 10 Feature Importances - Random Forest")
importance_fig.tight_layout()
plt.show()

# Scatter of predicted (x) against actual (y) ratings on the test set. The
# dashed red diagonal spans the range of the actual ratings and marks where
# a perfect prediction would fall.
scatter_fig, scatter_ax = plt.subplots(figsize=(8, 6))
scatter_ax.scatter(y_pred, y_test, alpha=0.6)

rating_lo, rating_hi = min(y_test), max(y_test)
scatter_ax.plot([rating_lo, rating_hi], [rating_lo, rating_hi], 'r--')

scatter_ax.set_title("Predicted vs Actual IMDb Ratings - Random Forest")
scatter_ax.set_xlabel("Predicted")
scatter_ax.set_ylabel("Actual")
scatter_fig.tight_layout()
plt.show()