In [78]:
# Importing the libraries that we are gonna use
import pandas as pd


In [79]:
# Load csv and depicting the first 5 rows
netflix_tv_shows_df = pd.read_csv('../data/netflix_titles.csv')
netflix_tv_shows_df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [80]:
# Dropping the values that they will not be usefull for our analysis
columns_to_drop = ['description', 'cast']
netflix_tv_shows_df.drop(columns=columns_to_drop, inplace=True, axis=1)


In [81]:
# Exploding the listed_in column to display a unique category for each tv show
netflix_tv_shows_df['listed_in'] = netflix_tv_shows_df['listed_in'].str.split(',')
netflix_tv_shows_df = netflix_tv_shows_df.explode('listed_in')
# Exploding the country column to display a unique country for each tv show
netflix_tv_shows_df['country'] = netflix_tv_shows_df['country'].str.split(',')
netflix_tv_shows_df = netflix_tv_shows_df.explode('country')
netflix_tv_shows_df['country'] = netflix_tv_shows_df['country'].str.strip()
netflix_tv_shows_df['listed_in'] = netflix_tv_shows_df['listed_in'].str.strip()
# Rename 'rating' to 'age_rating'
netflix_tv_shows_df.rename(columns={'rating': 'age_rating'}, inplace=True)
# Mapping between age ratings and descriptions
age_rating_descriptions = {
    'PG-13': 'Parents Strongly Cautioned',
    'TV-MA': 'Mature Audiences',
    'PG': 'Parental Guidance Suggested',
    'TV-14': 'Parents Strongly Cautioned - 14 and older',
    'TV-PG': 'Parental Guidance Suggested',
    'TV-Y': 'All Children',
    'TV-Y7': 'Directed to Older Children',
    'R': 'Restricted',
    'TV-G': 'General Audience',
    'G': 'General Audiences',
    'NC-17': 'Adults Only',
    'NR': 'Not Rated',
    'TV-Y7-FV': 'Directed to Older Children - Fantasy Violence',
    'UR': 'Unrated'
}

# Enhance the DataFrame with age rating descriptions
netflix_tv_shows_df['age_rating_description'] = netflix_tv_shows_df['age_rating'].map(age_rating_descriptions)
netflix_tv_shows_df.head()

Unnamed: 0,show_id,type,title,director,country,date_added,release_year,age_rating,duration,listed_in,age_rating_description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,Parents Strongly Cautioned
1,s2,TV Show,Blood & Water,,South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,International TV Shows,Mature Audiences
1,s2,TV Show,Blood & Water,,South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,TV Dramas,Mature Audiences
1,s2,TV Show,Blood & Water,,South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,TV Mysteries,Mature Audiences
2,s3,TV Show,Ganglands,Julien Leclercq,,"September 24, 2021",2021,TV-MA,1 Season,Crime TV Shows,Mature Audiences


In [82]:
# Check for NA values in the dataframe
netflix_tv_shows_df.isna().sum()

show_id                      0
type                         0
title                        0
director                  6567
country                   1722
date_added                  20
release_year                 0
age_rating                   6
duration                     3
listed_in                    0
age_rating_description       9
dtype: int64

In [83]:
# Since we dont want to lose information of movies and because these columns that have NAs are not so important we have
# decided to replace the NA with No Info
netflix_tv_shows_df.fillna('No Info', inplace=True)
netflix_tv_shows_df.isna().sum()

show_id                   0
type                      0
title                     0
director                  0
country                   0
date_added                0
release_year              0
age_rating                0
duration                  0
listed_in                 0
age_rating_description    0
dtype: int64

In [85]:
# We have loaded the updated csv to a new csv
netflix_tv_shows_df.to_csv('netflix_titles_updated.csv', index=False)