In [2]:
import kagglehub
import pandas as pd
import os
import shutil

In [3]:
# Download the Kaggle dataset via kagglehub and build the path to the titles
# CSV inside the directory that the download call returns.
dataset_handle = "shivamb/netflix-shows"
csv_filename = "netflix_titles.csv"
path = kagglehub.dataset_download(dataset_handle)
csv_path = os.path.join(path, csv_filename)

Using Colab cache for faster access to the 'netflix-shows' dataset.


In [4]:
# Copy the downloaded CSV into the working directory under a fixed name.
# Left as a bare expression so the cell still displays the destination path.
shutil.copy(src=csv_path, dst='netflix_raw.csv')

'netflix_raw.csv'

In [5]:
# Load the local raw copy, then report its (rows, columns) shape and the
# number of missing values in each column before any cleaning happens.
df = pd.read_csv('netflix_raw.csv')
initial_shape = df.shape
missing_per_column = df.isna().sum()
print(f"Initial Shape: {initial_shape}")
print("--- Missing Values Before ---")
print(missing_per_column)

Initial Shape: (8807, 12)
--- Missing Values Before ---
show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64


In [6]:
# A. Standardize Headers
# Lowercase every column name, then trim leading/trailing whitespace.
# Whitespace inside a name is left untouched.
lowered_columns = df.columns.str.lower()
df.columns = lowered_columns.str.strip()

In [7]:
# B. Handle Missing Values
# 'director', 'cast' and 'country' each have hundreds of nulls. The real values
# cannot be reconstructed, so the gaps are labelled "Unknown" instead of dropped.
for text_column in ('director', 'cast', 'country'):
    df[text_column] = df[text_column].fillna('Unknown')

In [8]:
# Only a handful of rows lack 'date_added' or 'rating', so those rows are
# removed outright. Rebinding `df` (rather than mutating in place) keeps the
# same resulting frame under the same name.
required_columns = ['date_added', 'rating']
df = df.dropna(subset=required_columns)

In [9]:
# C. Fix Date Format
# 'date_added' arrives as text (e.g. "September 25, 2021"). Surrounding
# whitespace is trimmed first, then the text is parsed into pandas datetimes.
trimmed_dates = df['date_added'].str.strip()
df['date_added'] = pd.to_datetime(trimmed_dates)

In [10]:
# D. Handle Duplicates
# Record the row count, keep the first occurrence of each fully identical row,
# and derive how many rows were removed.
# NOTE(review): the comparison spans every column, including 'show_id'. If
# 'show_id' is unique per row this can never find a duplicate -- confirm
# whether a subset of columns should be compared instead.
initial_rows = df.shape[0]
df = df.drop_duplicates(keep='first')
duplicates_removed = initial_rows - df.shape[0]

In [11]:
# E. Clean 'Duration'
# Missing durations are replaced with the placeholder '0 min' so the column
# has no nulls. Existing values are not parsed, split or cast here.
placeholder_duration = '0 min'
df['duration'] = df['duration'].fillna(placeholder_duration)

In [12]:
# Cleaning summary, printed before the export: duplicates removed and the
# total count of null cells still left anywhere in the frame.
remaining_missing = df.isnull().sum().sum()
print("\n--- Cleaning Summary ---")
print(f"Duplicates Removed: {duplicates_removed}")
print(f"Remaining Missing Values: {remaining_missing}")


--- Cleaning Summary ---
Duplicates Removed: 0
Remaining Missing Values: 0


In [13]:
# Write the cleaned frame to CSV without the index column, then confirm.
df.to_csv(path_or_buf='task1_netflix_cleaned.csv', index=False)
print("\nSUCCESS: Saved as 'task1_netflix_cleaned.csv'")


SUCCESS: Saved as 'task1_netflix_cleaned.csv'
