Data Cleaning and Preprocessing

In [13]:
import pandas as pd

In [14]:
# Read the raw catalogue from the working directory; the bare `df` on the last
# line makes the notebook render the frame as the cell output.
df = pd.read_csv(filepath_or_buffer='netflix_titles.csv')
df

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...
...,...,...,...,...,...,...,...,...,...,...,...,...
8802,s8803,Movie,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,"November 20, 2019",2007,R,158 min,"Cult Movies, Dramas, Thrillers","A political cartoonist, a crime reporter and a..."
8803,s8804,TV Show,Zombie Dumb,,,,"July 1, 2019",2018,TV-Y7,2 Seasons,"Kids' TV, Korean TV Shows, TV Comedies","While living alone in a spooky town, a young g..."
8804,s8805,Movie,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,"November 1, 2019",2009,R,88 min,"Comedies, Horror Movies",Looking to survive in a world taken over by zo...
8805,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,"January 11, 2020",2006,PG,88 min,"Children & Family Movies, Comedies","Dragged from civilian life, a former superhero..."


In [15]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() (that appends a stray "None" line to the output).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB
None


Checking for Missing Values

In [16]:
# Count the missing entries in each column.
print(df.isna().sum())

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64


Handling Missing Values and Replacing Them

In [17]:
# Replace missing entries, in place, with a per-column placeholder value.
# Columns that are not listed here are left untouched.
df.fillna(
    value=dict(
        director='Unknown',
        cast='Unknown',
        country='Unknown',
        rating='Not Rated',
        duration='0 min',
    ),
    inplace=True,
)

In [19]:
# Per-column count of entries that are still missing.
df.isna().sum()

show_id          0
type             0
title            0
director         0
cast             0
country          0
date_added      10
release_year     0
rating           0
duration         0
listed_in        0
description      0
dtype: int64

In [20]:
# Discard, in place, every row whose 'date_added' entry is missing.
df.dropna(axis='index', how='any', subset=['date_added'], inplace=True)


In [21]:
# Per-column count of missing entries (summed down the rows).
df.isna().sum(axis=0)

show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64

Removing Duplicates

In [23]:
# Remove rows that are identical across all columns, in place,
# keeping the first occurrence of each.
df.drop_duplicates(subset=None, keep='first', inplace=True)

Standardizing Text Columns

In [24]:
# 'country': only surrounding whitespace is removed.
df['country'] = df['country'].str.strip()

# 'type': surrounding whitespace is removed, then each word is title-cased.
df['type'] = df['type'].str.strip().str.title()

Fixing Date Format

In [27]:
# Parse 'date_added' into datetime64.
# Surrounding whitespace is trimmed first: with errors='coerce', any value that
# pandas cannot parse is silently replaced by NaT, and newer pandas versions
# infer a single format from the first value, so a stray leading space is
# enough to lose a date. The dtype guard keeps the cell safe to re-run once the
# column is already datetime (the .str accessor only exists on text columns).
if not pd.api.types.is_datetime64_any_dtype(df['date_added']):
    df['date_added'] = pd.to_datetime(df['date_added'].str.strip(), errors='coerce')

# Rows with a missing 'date_added' were dropped before this step, so any NaT
# found here is a parse failure. Report it instead of letting it slip into the
# cleaned data unnoticed.
unparsed_dates = int(df['date_added'].isna().sum())
if unparsed_dates:
    print(f"Warning: {unparsed_dates} 'date_added' value(s) could not be parsed and are now NaT")

Renaming Columns

In [29]:
# Normalise every column label: trim whitespace, lower-case it, and turn
# spaces into underscores.
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_")
)
print(df.columns)

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')


Checking and Fixing Data Types

In [46]:
# Show the dtype currently held by each column.
column_dtypes = df.dtypes
print(column_dtypes)

show_id           string[python]
type                    category
title             string[python]
director          string[python]
cast              string[python]
country                 category
date_added        datetime64[ns]
release_year               int64
rating                  category
duration                  object
listed_in         string[python]
description       string[python]
duration_value           float64
duration_unit           category
dtype: object


In [47]:
# Text columns are cast with astype(str).
for text_column in ('show_id', 'title', 'director', 'cast', 'listed_in', 'description'):
    df[text_column] = df[text_column].astype(str)

# These columns are cast to pandas' categorical dtype.
for categorical_column in ('type', 'country', 'rating'):
    df[categorical_column] = df[categorical_column].astype('category')

# Year as integer; dates as datetime (unparseable values become NaT because of
# errors='coerce').
df['release_year'] = df['release_year'].astype(int)
df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce')

# Split 'duration' into two derived columns:
#   duration_value - the first run of digits, stored as float
#   duration_unit  - the trailing run of letters/spaces, trimmed, stored as category
duration_number = df['duration'].str.extract(r'(\d+)')
df['duration_value'] = duration_number.astype(float)
duration_suffix = df['duration'].str.extract(r'([a-zA-Z ]+)$')[0]
df['duration_unit'] = duration_suffix.str.strip().astype('category')

print(df.dtypes)

show_id                   object
type                    category
title                     object
director                  object
cast                      object
country                 category
date_added        datetime64[ns]
release_year               int64
rating                  category
duration                  object
listed_in                 object
description               object
duration_value           float64
duration_unit           category
dtype: object


In [48]:
# Print the dtype of every column.
dtypes_by_column = df.dtypes
print(dtypes_by_column)

show_id                   object
type                    category
title                     object
director                  object
cast                      object
country                 category
date_added        datetime64[ns]
release_year               int64
rating                  category
duration                  object
listed_in                 object
description               object
duration_value           float64
duration_unit           category
dtype: object


In [49]:
# Preview the first ten 'show_id' values.
print(df['show_id'].iloc[:10])

0     s1
1     s2
2     s3
3     s4
4     s5
5     s6
6     s7
7     s8
8     s9
9    s10
Name: show_id, dtype: object


In [50]:
# Pull the first run of digits out of 'show_id' and store it as an integer
# column named 'show_id_num'.
show_id_digits = df['show_id'].str.extract(r'(\d+)', expand=False)
df['show_id_num'] = show_id_digits.astype(int)
print(df['show_id_num'].head(10))

0     1
1     2
2     3
3     4
4     5
5     6
6     7
7     8
8     9
9    10
Name: show_id_num, dtype: int64


In [51]:
# List each column together with its dtype.
dtype_listing = df.dtypes
print(dtype_listing)

show_id                   object
type                    category
title                     object
director                  object
cast                      object
country                 category
date_added        datetime64[ns]
release_year               int64
rating                  category
duration                  object
listed_in                 object
description               object
duration_value           float64
duration_unit           category
show_id_num                int64
dtype: object


In [52]:
# Write the frame to a CSV file in the working directory; index=False leaves
# the row index out of the file.
df.to_csv(path_or_buf='netflix_titles_cleaned.csv', index=False)