In [1]:
import pandas as pd

# Load the dataset
file_path = "/Users/ankita/Downloads/netflix_titles.csv"
df_netflix_raw = pd.read_csv(file_path)

# Step 1: Check for duplicate rows
duplicates = df_netflix_raw.duplicated().sum()

# Step 2: Rename columns to be lowercase with underscores and remove special characters
df_netflix_clean = df_netflix_raw.copy()
df_netflix_clean.columns = (
    df_netflix_clean.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_')
    .str.replace('(', '')
    .str.replace(')', '')
)

# Step 3: Standardize text fields
text_columns = df_netflix_clean.select_dtypes(include='object').columns
for col in text_columns:
    df_netflix_clean[col] = df_netflix_clean[col].astype(str).str.strip()

# Step 4: Convert date_added to datetime
if 'date_added' in df_netflix_clean.columns:
    df_netflix_clean['date_added'] = pd.to_datetime(df_netflix_clean['date_added'], errors='coerce')

# Step 5: Report missing values
missing_summary = df_netflix_clean.isnull().sum().sort_values(ascending=False)

# Step 6: Drop duplicates
df_netflix_clean = df_netflix_clean.drop_duplicates()

# Step 7: Save the cleaned dataset
df_netflix_clean.to_csv("netflix_titles_cleaned.csv", index=False)

# Summary
summary = {
    "Original shape": df_netflix_raw.shape,
    "Cleaned shape": df_netflix_clean.shape,
    "Duplicates removed": duplicates,
    "Missing values (by column)": missing_summary[missing_summary > 0].to_dict()
}

print("Cleaning Summary:")
for key, value in summary.items():
    print(f"{key}: {value}")

Cleaning Summary:
Original shape: (8807, 12)
Cleaned shape: (8807, 12)
Duplicates removed: 0
Missing values (by column): {'date_added': 10}
