In [69]:
import pandas as pd
import numpy as np

### Loading the data

In [70]:
# Read netflix_titles.csv into the raw frame and preview its first row.
df_raw = pd.read_csv(filepath_or_buffer="netflix_titles.csv")
df_raw.head(n=1)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."


### Analysing the dataset

In [71]:
# Print the schema summary: column names, non-null counts, dtypes and memory use.
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [72]:
def isNull(x):
    """Return how many entries of ``x`` are missing.

    Parameters
    ----------
    x : pandas.Series or pandas.DataFrame
        Any object exposing ``isna()``.

    Returns
    -------
    The sum of the boolean missing-value mask: a scalar count for a
    Series, per-column counts for a DataFrame.

    Notes
    -----
    When passed to ``DataFrame.agg`` the function name ("isNull") is used
    as the row label of the result, so the name is part of the output.
    """
    missing_mask = x.isna()
    return missing_mask.sum()

# Per-column overview of the raw data: dtype, non-null count, number of
# distinct values and number of missing values.
column_summaries = ['dtype', 'count', 'nunique', isNull]
df_raw.agg(column_summaries)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
dtype,object,object,object,object,object,object,object,int64,object,object,object,object
count,8807,8807,8807,6173,7982,7976,8797,8807,8803,8804,8807,8807
nunique,8807,2,8807,4528,7692,748,1767,74,17,220,514,8775
isNull,0,0,0,2634,825,831,10,0,4,3,0,0


### Cleaning the data

In [73]:
# Keep only rows where every required field is present, then discard the
# director, cast and description columns.
required_columns = ['date_added', 'rating', 'duration', 'country']
discarded_columns = ['director', 'cast', 'description']
df = df_raw.dropna(subset=required_columns).drop(columns=discarded_columns)

In [74]:
# Convert date_added from text such as "September 25, 2021" to datetimes.
# Surrounding whitespace is stripped first so the fixed format matches.
date_added_text = df['date_added'].str.strip()
df['date_added'] = pd.to_datetime(date_added_text, format='%B %d, %Y')

In [75]:
# Cast durations to int.
# The leading whitespace-separated token of every duration (e.g. "90" in
# "90 min") is its numeric part, and the extraction is the same for movies
# and TV shows, so a single vectorised assignment replaces the two identical
# per-type branches.
# NOTE(review): the resulting integers are minutes for movies and presumably
# a season count for TV shows - confirm against the raw data and avoid
# comparing durations across types.
df['duration'] = df['duration'].str.split(' ').str[0].astype(int)

In [76]:
# Count the comma-separated entries in `country` for each title.
country_lists = df['country'].str.split(',')
df['no_countries'] = country_lists.str.len()

In [77]:
# Use the first comma-separated entry of `country` as the title's main country.
# NOTE(review): entries are not whitespace-stripped; a country string that
# starts with a comma would yield an empty main_country - confirm against the data.
first_listed_country = df['country'].str.split(',').str.get(0)
df['main_country'] = first_listed_country

In [78]:
# Summarise each column of the cleaned frame: dtype, non-null count,
# distinct-value count and missing-value count.
df.agg(func=['dtype', 'count', 'nunique', isNull])

Unnamed: 0,show_id,type,title,country,date_added,release_year,rating,duration,listed_in,no_countries,main_country
dtype,object,object,object,object,datetime64[ns],int64,object,int32,object,int64,object
count,7961,7961,7961,7961,7961,7961,7961,7961,7961,7961,7961
nunique,7961,2,7961,748,1683,73,14,203,497,10,86
isNull,0,0,0,0,0,0,0,0,0,0,0


In [79]:
# Persist the cleaned frame to disk without writing the row index.
df.to_csv(path_or_buf='netflix_titles_cleaned.csv', index=False)

### Queries

In [82]:
# Reload the cleaned data from disk.
# CSV stores date_added as plain text, so without parse_dates the column
# would come back as strings (object dtype) and lose the datetime64 dtype
# produced during cleaning.
df = pd.read_csv('netflix_titles_cleaned.csv', parse_dates=['date_added'])

In [83]:
# Preview the first four rows of the reloaded frame.
df.head(n=4)

Unnamed: 0,show_id,type,title,country,date_added,release_year,rating,duration,listed_in,no_countries,main_country
0,s1,Movie,Dick Johnson Is Dead,United States,2021-09-25,2020,PG-13,90,Documentaries,1,United States
1,s2,TV Show,Blood & Water,South Africa,2021-09-24,2021,TV-MA,2,"International TV Shows, TV Dramas, TV Mysteries",1,South Africa
2,s5,TV Show,Kota Factory,India,2021-09-24,2021,TV-MA,2,"International TV Shows, Romantic TV Shows, TV ...",1,India
3,s8,Movie,Sankofa,"United States, Ghana, Burkina Faso, United Kin...",2021-09-24,1993,TV-MA,125,"Dramas, Independent Movies, International Movies",6,United States


In [84]:
# How many entries are movies and how many are TV shows?
entries_per_type = df['type'].value_counts()
entries_per_type

Movie      5687
TV Show    2274
Name: type, dtype: int64

In [87]:
# Number of movies per release year, most frequent year first.
isMovie = df['type'].eq('Movie')
movie_release_years = df.loc[isMovie, 'release_year']
movie_release_years.value_counts()

2017    729
2018    713
2016    638
2019    565
2020    461
       ... 
1959      1
1946      1
1963      1
1961      1
1966      1
Name: release_year, Length: 73, dtype: int64

In [89]:
# Number of titles for each (main_country, type) pair.
# NOTE: the count column is itself labelled `type`, the same name as the
# second index level; rename it before calling reset_index() on the result.
grouped_by_country_and_type = df.groupby(['main_country', 'type'])[['type']]
grouped_by_country_and_type.count()

Unnamed: 0_level_0,Unnamed: 1_level_0,type
main_country,type,Unnamed: 2_level_1
Argentina,Movie,56
Argentina,TV Show,20
Australia,Movie,61
Australia,TV Show,54
Austria,Movie,8
...,...,...
Uruguay,TV Show,1
Venezuela,Movie,2
Vietnam,Movie,7
West Germany,Movie,1
