# Netflix Movies and TV Shows

Imports

In [1]:
import numpy as np
import pandas as pd

Data

In [2]:
df = pd.read_csv('netflix_titles.csv')
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


Unique value counts

In [4]:
for column in df.columns:
    print(f'{column}: {df[column].nunique()}')

show_id: 8807
type: 2
title: 8807
director: 4528
cast: 7692
country: 748
date_added: 1767
release_year: 74
rating: 17
duration: 220
listed_in: 514
description: 8775


Null values

In [5]:
df.isna().sum()

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

## Data Cleaning and Preparation

In [6]:
def remove_prefix(s:str) -> str:
    return s.replace('s', '')

df['show_id'] = df['show_id'].apply(remove_prefix).astype(pd.Int16Dtype())

In [7]:
df['type'] = df['type'].astype('category')

In [8]:
cols = ['title', 'director', 'cast', 'country', 'listed_in']
for col in cols:
    df[col] = df[col].astype('string')

In [9]:
df['release_year'] = df['release_year'].astype('category')

In [10]:
df['rating'] = df['rating'].fillna('unknown')
df['rating'] = df['rating'].astype('category')

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   show_id       8807 non-null   Int16   
 1   type          8807 non-null   category
 2   title         8807 non-null   string  
 3   director      6173 non-null   string  
 4   cast          7982 non-null   string  
 5   country       7976 non-null   string  
 6   date_added    8797 non-null   object  
 7   release_year  8807 non-null   category
 8   rating        8807 non-null   category
 9   duration      8804 non-null   object  
 10  listed_in     8807 non-null   string  
 11  description   8807 non-null   object  
dtypes: Int16(1), category(3), object(3), string(5)
memory usage: 605.6+ KB


In [12]:
director_dummies = df['director'].str.get_dummies(',').astype('boolean')
country_dummies = df['country'].str.get_dummies(',').astype('boolean')
cast_dummies = df['cast'].str.get_dummies(',').astype('boolean')
listed_in_dummies = df['listed_in'].str.get_dummies(',').astype('boolean')

In [22]:
df = df.drop(columns=['director', 'country', 'cast', 'listed_in'])

In [16]:
df = df.join(country_dummies.add_prefix('In_country_'))
df = df.join(director_dummies.add_prefix('Dirc_by_'))
df = df.join(cast_dummies.add_prefix('Casts_'))
df = df.join(listed_in_dummies.add_prefix('Listed_in_'))

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Columns: 44693 entries, show_id to ls_in_Thrillers
dtypes: Int16(1), boolean(44685), category(3), object(3), string(1)
memory usage: 750.9+ MB


In [29]:
df.apply(lambda x: x['duration')

KeyError: 'duration'