# 🎬 Movie Rating Prediction

In [1]:

# Movie Rating Prediction with Genre, Director, and Cast Features

## 1. Import Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

## 2. Load Dataset
file_path = "IMDb Movies India.csv"  # Replace with correct path if different
df = pd.read_csv(file_path, encoding="ISO-8859-1")

## 3. Clean and Preprocess Data
# Drop malformed first row
df = df.drop(index=0).reset_index(drop=True)

# Convert rating and votes to numeric
df["Rating"] = pd.to_numeric(df["Rating"], errors="coerce")
df["Votes"] = pd.to_numeric(df["Votes"], errors="coerce")

# Drop rows without ratings
df = df.dropna(subset=["Rating"])

# Keep necessary columns
features = ["Genre", "Director", "Actor 1", "Actor 2", "Actor 3", "Votes"]
df_model = df[features + ["Rating"]].copy()

# Fill missing values
for col in ["Genre", "Director", "Actor 1", "Actor 2", "Actor 3"]:
    df_model[col] = df_model[col].fillna("Unknown")
df_model["Votes"] = df_model["Votes"].fillna(0)

## 4. Reduce Cardinality of Categorical Features
categorical_features = ["Genre", "Director", "Actor 1", "Actor 2", "Actor 3"]
for col in categorical_features:
    top_categories = df_model[col].value_counts().nlargest(20).index
    df_model[col] = df_model[col].apply(lambda x: x if x in top_categories else "Other")

## 5. Prepare Features and Target
X = df_model.drop("Rating", axis=1)
y = df_model["Rating"]

## 6. Create Pipeline and Train Model
pipeline = Pipeline(steps=[
    ("preprocessor", ColumnTransformer([
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
    ], remainder='passthrough')),
    ("regressor", LinearRegression())
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pipeline.fit(X_train, y_train)

## 7. Evaluate Model
y_pred = pipeline.predict(X_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f"R² Score: {r2:.3f}")
print(f"Mean Absolute Error: {mae:.3f}")


R² Score: 0.142
Mean Absolute Error: 1.007
