What this script does:
1. Loads the dataset from a CSV file.
2. Converts run_time to total minutes.
3. Standardizes budget and box_office by converting them to numeric values.
4. Splits multi-value columns (genre, casts, directors, writers) into lists for better analysis.
5. Standardizes the certificate column: missing values and 'Not Rated' both become 'Unknown'.
6. Saves the cleaned dataset as IMDBTop250Movies_Cleaned.csv.

In [3]:
import pandas as pd
import numpy as np

# Read the raw CSV into the working DataFrame
file_path = '/Users/allig/ads/507/IMDBTop250Movies.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Function to convert 'run_time' to minutes
def convert_runtime(runtime):
    """Convert a runtime string such as ``"2h 22m"`` to total minutes.

    Args:
        runtime: Raw cell value. Recognized forms are ``"<H>h <M>m"``,
            ``"<H>h"`` and ``"<M>m"`` (unit letters are case-insensitive).

    Returns:
        The total number of minutes as an ``int`` when the value can be
        parsed. ``np.nan`` for missing values, non-string values, strings
        without any hour/minute component (for example a placeholder such
        as "Not Available"), and strings whose numeric parts are malformed.
    """
    # Anything that is not text (NaN, None, numbers) cannot be parsed here.
    if not isinstance(runtime, str):
        return np.nan

    text = runtime.strip().lower()

    # Split off the hours part, if an 'h' marker is present.
    hours_text, has_hours, remainder = text.partition('h')
    if not has_hours:
        hours_text, remainder = '', text

    # Whatever precedes the first 'm' in the remainder is the minutes part.
    minutes_text, has_minutes, _ = remainder.partition('m')

    # No unit marker at all: report "unknown" rather than a fake 0 minutes.
    if not (has_hours or has_minutes):
        return np.nan

    try:
        hours = int(hours_text) if has_hours else 0
        minutes = int(minutes_text) if has_minutes else 0
    except ValueError:
        # Non-numeric text around the unit markers (e.g. "three hours").
        return np.nan

    return hours * 60 + minutes

# Replace each raw runtime value with its element-wise conversion
df['run_time'] = df['run_time'].map(convert_runtime)

# Function to convert currency values to numeric
def convert_currency(value):
    """Convert a raw currency cell to a float amount.

    Accepted inputs:
        * numbers (``int``/``float``/NumPy scalars) -- returned as ``float``;
        * numeric strings, optionally with thousands separators, a decimal
          part, or a leading currency symbol (``"$25,000,000"``);
        * strings of the form ``"<number> million"`` / ``"<number> billion"``.

    Args:
        value: Raw cell value.

    Returns:
        float: The parsed amount, or ``np.nan`` when the value is missing,
        is not a number or string, or cannot be parsed as a number.
    """
    # bool is a subclass of int; it is not a meaningful amount.
    if isinstance(value, bool):
        return np.nan
    # Cells that pandas already parsed as numbers pass straight through
    # (NaN stays NaN) instead of being discarded.
    if isinstance(value, (int, float, np.integer, np.floating)):
        return float(value)
    if not isinstance(value, str):
        return np.nan

    # Drop thousands separators, surrounding blanks and a leading symbol.
    text = value.replace(',', '').strip().lstrip('$€£¥₹ ').lower()

    if 'million' in text:
        scale = 1e6
    elif 'billion' in text:
        scale = 1e9
    else:
        scale = 1.0

    # Remove the unit word so both "5 million" and "5million" leave "5"
    # as the first token.
    tokens = text.replace('million', ' ').replace('billion', ' ').split()
    if not tokens:
        return np.nan

    try:
        return float(tokens[0]) * scale
    except ValueError:
        # Placeholders such as "Not Available" end up here.
        return np.nan

# Both monetary columns go through the same element-wise conversion
for money_col in ('budget', 'box_office'):
    df[money_col] = df[money_col].map(convert_currency)

# Convert multi-value columns to lists
# Each item is stripped so that "Crime, Drama" and "Crime,Drama" produce the
# same values, and empty fragments (e.g. from a trailing comma) are dropped.
# Non-string cells (such as NaN) become an empty list.
# NOTE: when this frame is later written with to_csv, list cells are stored
# as their string form (e.g. "['Crime', 'Drama']") and must be parsed again
# after reloading the file.
multi_value_columns = ['genre', 'casts', 'directors', 'writers']
for col in multi_value_columns:
    df[col] = df[col].apply(
        lambda x: [item.strip() for item in x.split(',') if item.strip()]
        if isinstance(x, str)
        else []
    )

# Certificate column: missing entries and 'Not Rated' both end up as 'Unknown'
df['certificate'] = df['certificate'].fillna('Unknown').replace('Not Rated', 'Unknown')

# Write the cleaned frame to disk (row index omitted) and report the location
cleaned_file_path = "IMDBTop250Movies_Cleaned.csv"
df.to_csv(path_or_buf=cleaned_file_path, index=False)
print(f"Preprocessing complete. Cleaned dataset saved as '{cleaned_file_path}'.")

Preprocessing complete. Cleaned dataset saved as 'IMDBTop250Movies_Cleaned.csv'.
