In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv("netflix_titles.csv")


# 1. Handle Missing Values

# View missing values
print("Missing values before cleaning:\n", df.isnull().sum())

# Fill missing 'director' and 'cast' with 'Unknown'
df['director'] = df['director'].fillna('Unknown')
df['cast'] = df['cast'].fillna('Unknown')

# Fill missing 'rating' with 'Not Rated'
df['rating'] = df['rating'].fillna('Not Rated')

# Drop rows with missing 'country', 'date_added', or 'duration'
df.dropna(subset=['country', 'date_added', 'duration'], inplace=True)

# 2. Remove Duplicates

df = df.drop_duplicates()


# 3. Standardize Text Values

df['type'] = df['type'].str.strip().str.title()
df['country'] = df['country'].str.strip().str.title()
df['rating'] = df['rating'].str.strip().str.upper()
df['director'] = df['director'].str.strip()
df['cast'] = df['cast'].str.strip()


# 4. Convert 'date_added' to 'dd-mm-yyyy'

df['date_added'] = pd.to_datetime(df['date_added'], format='%B %d, %Y', errors='coerce')
df['date_added'] = df['date_added'].dt.strftime('%d-%m-%Y')


# 5. Rename Columns to be Uniform

df.columns = [col.strip().lower().replace(" ", "_") for col in df.columns]


# 6. Fix Data Types
# Convert date_added back to datetime format
df['date_added'] = pd.to_datetime(df['date_added'], format='%d-%m-%Y', errors='coerce')

# release_year should be integer (already is, but we ensure it)
df['release_year'] = df['release_year'].astype('Int64')  # supports NaNs


# Summary Output

print("\nFinal Missing Values:\n", df.isnull().sum())
print("\nFinal Data Types:\n", df.dtypes)
print("\nCleaned data shape:", df.shape)


# Save the Cleaned Data

df.to_csv("netflix_titles_cleaned.csv", index=False)
print("\n✅ Cleaned dataset saved as 'netflix_titles_cleaned.csv'")


Missing values before cleaning:
 show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

Final Missing Values:
 show_id          0
type             0
title            0
director         0
cast             0
country          0
date_added      85
release_year     0
rating           0
duration         0
listed_in        0
description      0
dtype: int64

Final Data Types:
 show_id                 object
type                    object
title                   object
director                object
cast                    object
country                 object
date_added      datetime64[ns]
release_year             Int64
rating                  object
duration                object
listed_in               object
description             object
dtype: object

Cleaned data shape: (7964, 12)