In [1]:
import pandas as pd

# Load dataset
df = pd.read_csv("/content/netflix_titles.csv.zip")

# View basic info
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB
None


In [2]:
# 1. Check and handle missing values
print(df.isnull().sum())
df['director'].fillna("Not Available", inplace=True)
df['cast'].fillna("Not Available", inplace=True)
df['country'].fillna("Not Available", inplace=True)
df['date_added'].fillna(method='ffill', inplace=True)
df['rating'].fillna("Not Rated", inplace=True)
df['duration'].fillna("Unknown", inplace=True)

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['director'].fillna("Not Available", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['cast'].fillna("Not Available", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are settin

In [3]:
# 2. Remove duplicates
df.drop_duplicates(inplace=True)

In [4]:
# 3. Standardize text fields
df['type'] = df['type'].str.strip().str.lower()
df['rating'] = df['rating'].str.strip()

In [5]:
# 4. Convert date format
df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce')

In [6]:
# 5. Rename columns
df.columns = [col.strip().lower().replace(" ", "_") for col in df.columns]

In [7]:
# 6. Check and fix data types
# (most are fine; we fixed date above)


In [8]:
# Save cleaned dataset
df.to_csv("cleaned_netflix_titles.csv", index=False)