# 🎯 Enhanced Data Cleaning for Movie Dataset
This notebook cleans and preprocesses a movie dataset: it handles missing values, parses release dates, engineers profit features, extracts each movie's main genre, removes extreme revenue outliers, and exports the cleaned data to CSV.

In [None]:
import pandas as pd
import numpy as np
import ast
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset
source_csv = "movies.csv"  # Replace with your actual file
df = pd.read_csv(source_csv)
print("📊 Initial dataset shape:", df.shape)

# Display first few rows (last expression of the cell, so the notebook renders it)
df.head()

In [None]:
# Basic Info
print("\n🔍 Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
df.info()

print("\n📉 Summary Statistics:")
# include='all' summarises non-numeric columns as well as numeric ones.
print(df.describe(include='all'))

In [None]:
# Handle Missing Values
print("\n🧬 Missing Values Before Cleaning:")
null_counts = df.isnull().sum()
# Only report the columns that actually contain missing values.
print(null_counts[null_counts > 0])

# Drop irrelevant columns (errors='ignore' skips any that are absent)
drop_cols = ['homepage', 'tagline', 'status', 'spoken_languages', 'production_companies']
df.drop(columns=drop_cols, inplace=True, errors='ignore')

# Fill numerical columns: monetary fields default to 0, runtime to its median
for money_col in ('budget', 'revenue'):
    df[money_col] = df[money_col].fillna(0)
runtime_median = df['runtime'].median()
df['runtime'] = df['runtime'].fillna(runtime_median)

In [None]:
# Convert release_date to datetime (unparseable values become NaT)
parsed_dates = pd.to_datetime(df['release_date'], errors='coerce')
df['release_date'] = parsed_dates
df['release_year'] = parsed_dates.dt.year

# Drop rows with no release year, then store the year as an integer
df.dropna(subset=['release_year'], inplace=True)
df['release_year'] = df['release_year'].astype(int)

In [None]:
# Clean object columns
# Normalise every non-missing value in object-dtype columns to str. Missing
# cells are deliberately left as NaN: a blanket astype(str) would turn them
# into the literal text "nan", hiding them from isnull()/dropna() and writing
# "nan" into the exported CSV.
object_cols = df.select_dtypes(include='object').columns
for col in object_cols:
    present = df[col].notna()
    df[col] = df[col].astype(str).where(present)

# Drop duplicates
df.drop_duplicates(inplace=True)

In [None]:
# Feature Engineering
df['profit'] = df['revenue'] - df['budget']
# Margin is only computed where a positive budget is known; other rows get 0.
df['profit_margin'] = np.where(df['budget'] > 0, df['profit'] / df['budget'], 0)

# Log transformations
df['log_budget'] = np.log1p(df['budget'])
df['log_revenue'] = np.log1p(df['revenue'])
# Profit is negative for loss-making movies, where a plain log1p yields NaN or
# -inf. Use a signed log instead: identical to log1p for profit >= 0, and
# mirrored (negative) for losses.
df['log_profit'] = np.sign(df['profit']) * np.log1p(df['profit'].abs())

In [None]:
# Extract main genre
def extract_main_genre(genre_str):
    """Return the name of the first genre in a stringified genre list.

    Args:
        genre_str: String holding a Python-literal list of dicts, each with a
            'name' key, e.g. "[{'id': 28, 'name': 'Action'}]".

    Returns:
        The 'name' value of the first list entry, or np.nan when the input
        cannot be parsed, is not a non-empty list, or its first entry has no
        usable 'name' key.
    """
    try:
        genre_list = ast.literal_eval(genre_str)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Unparseable cell: missing value, the text "nan", or malformed input.
        return np.nan

    # Always signal "no genre" with np.nan rather than an implicit None, so
    # the resulting column has a single missing-value marker.
    if not isinstance(genre_list, list) or not genre_list:
        return np.nan

    try:
        return genre_list[0]['name']
    except (KeyError, TypeError, IndexError):
        # First entry is not a dict, or it carries no 'name' key.
        return np.nan

# Derive the main_genre column only when the dataset provides 'genres'.
if 'genres' in df.columns:
    df['main_genre'] = df['genres'].map(extract_main_genre)

In [None]:
# Filter for English-language movies (optional)
if 'original_language' in df.columns:
    # .copy() makes the filtered frame independent of the original, so later
    # column assignments on df do not raise SettingWithCopyWarning.
    df = df[df['original_language'] == 'en'].copy()

In [None]:
# Drop extreme outliers in revenue (optional)
# Only the upper tail is trimmed: rows with revenue above Q3 + 3 * IQR are removed.
q1 = df['revenue'].quantile(0.25)
q3 = df['revenue'].quantile(0.75)
iqr = q3 - q1
upper_limit = q3 + 3 * iqr
# .copy() makes the filtered frame independent of the original, so the column
# assignments that follow do not raise SettingWithCopyWarning.
df = df[df['revenue'] <= upper_limit].copy()

In [None]:
# Create runtime buckets
# pd.cut uses right-closed intervals by default, so the buckets are
# (0, 60], (60, 90], (90, 120], (120, 180] and (180, inf); runtimes outside
# them (e.g. 0) become NaN.
runtime_edges = [0, 60, 90, 120, 180, np.inf]
runtime_labels = [
    'Short (<1hr)',
    'Medium (1-1.5hr)',
    'Standard (1.5-2hr)',
    'Long (2-3hr)',
    'Epic (>3hr)',
]
df['duration_category'] = pd.cut(df['runtime'], bins=runtime_edges, labels=runtime_labels)

In [None]:
# One-hot encoding for main genre
if 'main_genre' in df.columns:
    # drop_first=True omits the indicator column for the first genre level.
    genre_dummies = pd.get_dummies(
        df['main_genre'],
        prefix='genre',
        drop_first=True,
    )
    # Append the indicator columns side by side with the existing ones.
    df = pd.concat([df, genre_dummies], axis='columns')

In [None]:
# Final dataset overview
print("\n👍 Cleaned Dataset Shape:", df.shape)
# Some preview columns are optional (e.g. 'main_genre' is only created when the
# input has a 'genres' column), so show only those that exist instead of
# raising KeyError.
overview_cols = ['title', 'release_year', 'runtime', 'budget', 'revenue', 'profit', 'main_genre']
df[[col for col in overview_cols if col in df.columns]].head()

In [None]:
# Save cleaned data
# index=False keeps the DataFrame's row index out of the exported file.
output_path = "cleaned_movies.csv"
df.to_csv(output_path, index=False)
print("\n💾 Cleaned dataset saved as 'cleaned_movies.csv'")