In [1]:
import pandas as pd
import numpy as np

# Load the raw catalogue from the CSV file next to the notebook.
df = pd.read_csv('netflix_titles.csv')

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appends a stray "None" line.
df.info()

# Summary statistics for the numeric columns.
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6234 entries, 0 to 6233
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       6234 non-null   int64 
 1   type          6234 non-null   object
 2   title         6234 non-null   object
 3   director      4265 non-null   object
 4   cast          5664 non-null   object
 5   country       5758 non-null   object
 6   date_added    6223 non-null   object
 7   release_year  6234 non-null   int64 
 8   rating        6224 non-null   object
 9   duration      6234 non-null   object
 10  listed_in     6234 non-null   object
 11  description   6234 non-null   object
dtypes: int64(2), object(10)
memory usage: 584.6+ KB
None
            show_id  release_year
count  6.234000e+03    6234.00000
mean   7.670368e+07    2013.35932
std    1.094296e+07       8.81162
min    2.477470e+05    1925.00000
25%    8.003580e+07    2013.00000
50%    8.016337e+07    2016.00000
75%    8.

In [2]:
# Number of missing entries in each column (isna is an alias of isnull).
print(df.isna().sum(axis=0))

show_id            0
type               0
title              0
director        1969
cast             570
country          476
date_added        11
release_year       0
rating            10
duration           0
listed_in          0
description        0
dtype: int64


In [3]:
# Give every title an identifier taken from its row label, so that rows later
# derived from the same title can still be grouped back together.
df = df.assign(original_id=df.index)

In [4]:
# Placeholder used for each column when a value is missing.
# Columns not listed here (e.g. date_added) are left untouched.
fill_values = {
    'director': 'Unknown',
    'cast': 'Unknown',
    'country': 'Unknown',
    'listed_in': 'Unknown',
    'rating': 'Not Rated',
}

# Fill on the frame and rebind it. The previous pattern,
# df[col].fillna(..., inplace=True), is chained assignment through an inplace
# method: it is deprecated and, with pandas Copy-on-Write, only fills a
# temporary Series and leaves `df` unchanged.
df = df.fillna(value=fill_values)

In [5]:
# Store the rating column with the categorical dtype.
df = df.astype({'rating': 'category'})

In [6]:
# Split 'duration' into a numeric value and a unit label.
# The column is first coerced to str so the .str accessor can be used.
df['duration'] = df['duration'].astype(str)

# The patterns are raw strings: '\d' and '\D' inside a normal string literal
# are invalid escape sequences and trigger a warning on recent Python versions.
# expand=False makes a single-group extract return a Series directly.
duration_digits = df['duration'].str.extract(r'(\d+)', expand=False)

# First run of digits -> nullable integer (stays <NA> when no digits are found).
df['duration_value'] = pd.to_numeric(duration_digits, errors='coerce').astype('Int64')

# First run of non-digit characters, with surrounding whitespace removed.
df['duration_unit'] = df['duration'].str.extract(r'(\D+)', expand=False).str.strip()

In [7]:
# Columns whose cells hold several values separated by ', '.
multi_value_cols = ['country', 'listed_in']

# Split into lists on a derived frame only. `df` keeps its original
# comma-separated strings, so this cell can be re-run (a second run would
# otherwise call .str.split on values that are already lists) and a later
# CSV export of `df` does not contain Python list reprs.
df_exploded = df.assign(**{col: df[col].str.split(', ') for col in multi_value_cols})

# One row per combination of country and listed_in entry for every title.
for col in multi_value_cols:
    df_exploded = df_exploded.explode(col)
df_exploded = df_exploded.reset_index(drop=True)

# Calculating split factor: each exploded row is weighted by 1 / (number of
# rows produced from the same title), so the weights of a title sum to 1.
df_exploded['split_factor'] = 1 / df_exploded.groupby('original_id')['original_id'].transform('count')

# director and cast are not split or exploded; they stay as comma-separated strings.

In [14]:
# Preview the first rows of the frame (head() returns 5 rows by default).
print(df.head())

    show_id     type                                    title  \
0  81145628    Movie  Norm of the North: King Sized Adventure   
1  80117401    Movie               Jandino: Whatever it Takes   
2  70234439  TV Show                       Transformers Prime   
3  80058654  TV Show         Transformers: Robots in Disguise   
4  80125979    Movie                             #realityhigh   

                   director  \
0  Richard Finn, Tim Maltby   
1                   Unknown   
2                   Unknown   
3                   Unknown   
4          Fernando Lebrija   

                                                cast  \
0  Alan Marriott, Andrew Toth, Brian Dobson, Cole...   
1                                   Jandino Asporaat   
2  Peter Cullen, Sumalee Montano, Frank Welker, J...   
3  Will Friedle, Darren Criss, Constance Zimmer, ...   
4  Nesta Cooper, Kate Walsh, John Michael Higgins...   

                                    country         date_added  release_year  \
0  Un

In [15]:
# Column dtypes and non-null counts after cleaning.
# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print() (that would add a stray "None" line to the output).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6234 entries, 0 to 6233
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   show_id         6234 non-null   int64   
 1   type            6234 non-null   object  
 2   title           6234 non-null   object  
 3   director        6234 non-null   object  
 4   cast            6234 non-null   object  
 5   country         5758 non-null   object  
 6   date_added      6223 non-null   object  
 7   release_year    6234 non-null   int64   
 8   rating          6234 non-null   category
 9   duration        6234 non-null   object  
 10  listed_in       6234 non-null   object  
 11  description     6234 non-null   object  
 12  duration_value  6234 non-null   int64   
 13  duration_unit   6234 non-null   object  
dtypes: category(1), int64(3), object(10)
memory usage: 640.0+ KB
None


In [16]:
# Write the cleaned frame to disk; index=False leaves the row labels out of the file.
df.to_csv(path_or_buf='cleaned_netflix_dataset.csv', index=False)