# Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset

In [2]:
netflix_raw = pd.read_csv('../data/raw/netflix_titles.csv')

In [3]:
netflix_raw.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [4]:
netflix = netflix_raw.copy()

In [5]:
netflix.shape

(8807, 12)

In [6]:
netflix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [7]:
netflix.isna().sum()

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

In [8]:
#check the percentage of NaN for all columns
nulls_df = pd.DataFrame(round(netflix.isna().sum()/len(netflix),4)*100).reset_index()
nulls_df = nulls_df.rename(columns = {'index': 'header_name',0:'percent_nulls'})
nulls_df

Unnamed: 0,header_name,percent_nulls
0,show_id,0.0
1,type,0.0
2,title,0.0
3,director,29.91
4,cast,9.37
5,country,9.44
6,date_added,0.11
7,release_year,0.0
8,rating,0.05
9,duration,0.03


# Data Cleaning

### Cleaning 'date_added'

In [9]:
netflix['date_added'].dtype

dtype('O')

In [10]:
#change the type of 'date_added' into datetime
netflix['date_added'] = pd.to_datetime(netflix['date_added'])
netflix['date_added']

0      2021-09-25
1      2021-09-24
2      2021-09-24
3      2021-09-24
4      2021-09-24
          ...    
8802   2019-11-20
8803   2019-07-01
8804   2019-11-01
8805   2020-01-11
8806   2019-03-02
Name: date_added, Length: 8807, dtype: datetime64[ns]

In [12]:
#since there is only 10 missing value in 'date_added', I will just drop nan
netflix = netflix[netflix['date_added'].isna()==False]

In [13]:
netflix.isna().sum()

show_id            0
type               0
title              0
director        2624
cast             825
country          830
date_added         0
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

In [14]:
netflix.shape

(8797, 12)

### seprate movies and tvshows in the netflix dataset 

In [15]:
movies = netflix[netflix['type']=='Movie']
movies.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,2021-09-25,2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
6,s7,Movie,My Little Pony: A New Generation,"Robert Cullen, José Luis Ucha","Vanessa Hudgens, Kimiko Glenn, James Marsden, ...",,2021-09-24,2021,PG,91 min,Children & Family Movies,Equestria's divided. But a bright-eyed hero be...
7,s8,Movie,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...","United States, Ghana, Burkina Faso, United Kin...",2021-09-24,1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies","On a photo shoot in Ghana, an American model s..."
9,s10,Movie,The Starling,Theodore Melfi,"Melissa McCarthy, Chris O'Dowd, Kevin Kline, T...",United States,2021-09-24,2021,PG-13,104 min,"Comedies, Dramas",A woman adjusting to life after a loss contend...
12,s13,Movie,Je Suis Karl,Christian Schwochow,"Luna Wedler, Jannis Niewöhner, Milan Peschel, ...","Germany, Czech Republic",2021-09-23,2021,TV-MA,127 min,"Dramas, International Movies",After most of her family is murdered in a terr...


In [16]:
movies.isna().sum()

show_id           0
type              0
title             0
director        188
cast            475
country         440
date_added        0
release_year      0
rating            2
duration          3
listed_in         0
description       0
dtype: int64

In [17]:
tvshows = netflix[netflix['type']=='TV Show']
tvshows.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,2021-09-24,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,2021-09-24,2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...
5,s6,TV Show,Midnight Mass,Mike Flanagan,"Kate Siegel, Zach Gilford, Hamish Linklater, H...",,2021-09-24,2021,TV-MA,1 Season,"TV Dramas, TV Horror, TV Mysteries",The arrival of a charismatic young priest brin...


In [18]:
tvshows.isna().sum()

show_id            0
type               0
title              0
director        2436
cast             350
country          390
date_added         0
release_year       0
rating             2
duration           0
listed_in          0
description        0
dtype: int64

### cleaning 'duration' in both movies and tvshows

In [19]:
#remove strings in duration column for all movies and change the type into numerical
movies['duration'] = movies['duration'].str.replace('min','').astype(float) #due to NaN, can't change type into int
movies['duration']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies['duration'] = movies['duration'].str.replace('min','').astype(float) #due to NaN, can't change type into int


0        90.0
6        91.0
7       125.0
9       104.0
12      127.0
        ...  
8801     96.0
8802    158.0
8804     88.0
8805     88.0
8806    111.0
Name: duration, Length: 6131, dtype: float64

In [20]:
movies['duration'].dtype

dtype('float64')

In [21]:
#remove strings in duration for all tvshows
words = ['Seasons','Season']
p = '|'.join(words)

tvshows['duration'] = tvshows['duration'].str.replace(p,'').astype(int)
tvshows['duration']

  tvshows['duration'] = tvshows['duration'].str.replace(p,'').astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tvshows['duration'] = tvshows['duration'].str.replace(p,'').astype(int)


1       2
2       1
3       1
4       2
5       1
       ..
8795    2
8796    2
8797    3
8800    1
8803    2
Name: duration, Length: 2666, dtype: int64

In [22]:
tvshows['duration'].dtype

dtype('int64')

In [23]:
#imputing missing value with median for duration in movies
movies['duration']=movies['duration'].fillna(movies['duration'].median())
movies['duration']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies['duration']=movies['duration'].fillna(movies['duration'].median())


0        90.0
6        91.0
7       125.0
9       104.0
12      127.0
        ...  
8801     96.0
8802    158.0
8804     88.0
8805     88.0
8806    111.0
Name: duration, Length: 6131, dtype: float64

In [24]:
movies['duration'].isna().sum()

0

In [25]:
tvshows['duration'].isna().sum()

0

- After imputing the missing values of duration for both movies and tvshows, I will concat it into one dataframe(netflix_new)


In [26]:
netflix_new = pd.concat([movies,tvshows], axis=0,ignore_index=True)
netflix_new.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,2021-09-25,2020,PG-13,90.0,Documentaries,"As her father nears the end of his life, filmm..."
1,s7,Movie,My Little Pony: A New Generation,"Robert Cullen, José Luis Ucha","Vanessa Hudgens, Kimiko Glenn, James Marsden, ...",,2021-09-24,2021,PG,91.0,Children & Family Movies,Equestria's divided. But a bright-eyed hero be...
2,s8,Movie,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...","United States, Ghana, Burkina Faso, United Kin...",2021-09-24,1993,TV-MA,125.0,"Dramas, Independent Movies, International Movies","On a photo shoot in Ghana, an American model s..."
3,s10,Movie,The Starling,Theodore Melfi,"Melissa McCarthy, Chris O'Dowd, Kevin Kline, T...",United States,2021-09-24,2021,PG-13,104.0,"Comedies, Dramas",A woman adjusting to life after a loss contend...
4,s13,Movie,Je Suis Karl,Christian Schwochow,"Luna Wedler, Jannis Niewöhner, Milan Peschel, ...","Germany, Czech Republic",2021-09-23,2021,TV-MA,127.0,"Dramas, International Movies",After most of her family is murdered in a terr...


- Since there is no missing value in 'duration', i can change the type from float to int.

In [27]:
netflix_new['duration']=netflix_new['duration'].astype(int)

In [28]:
netflix_new['duration'].dtype

dtype('int64')

In [29]:
netflix_new.shape

(8797, 12)

In [30]:
netflix_new.isna().sum()

show_id            0
type               0
title              0
director        2624
cast             825
country          830
date_added         0
release_year       0
rating             4
duration           0
listed_in          0
description        0
dtype: int64

### Cleaning 'rating'

In [31]:
netflix_new['rating'].unique()

array(['PG-13', 'PG', 'TV-MA', 'TV-PG', 'TV-14', 'TV-Y', 'R', 'TV-G',
       'TV-Y7', 'G', 'NC-17', '74 min', '84 min', '66 min', 'NR', nan,
       'TV-Y7-FV', 'UR'], dtype=object)

In [32]:
netflix_new['rating'].value_counts(dropna=False)

TV-MA       3205
TV-14       2157
TV-PG        861
R            799
PG-13        490
TV-Y7        333
TV-Y         306
PG           287
TV-G         220
NR            79
G             41
TV-Y7-FV       6
NaN            4
NC-17          3
UR             3
74 min         1
84 min         1
66 min         1
Name: rating, dtype: int64

-  I will fisrt replace the NaN, then correct the wrong value name'74min','84min' and '66min'.

In [33]:
movies['rating'].value_counts(dropna=False)

TV-MA       2062
TV-14       1427
R            797
TV-PG        540
PG-13        490
PG           287
TV-Y7        139
TV-Y         131
TV-G         126
NR            75
G             41
TV-Y7-FV       5
NC-17          3
UR             3
NaN            2
74 min         1
84 min         1
66 min         1
Name: rating, dtype: int64

In [34]:
tvshows['rating'].value_counts(dropna=False)

TV-MA       1143
TV-14        730
TV-PG        321
TV-Y7        194
TV-Y         175
TV-G          94
NR             4
R              2
NaN            2
TV-Y7-FV       1
Name: rating, dtype: int64

In [35]:
netflix_new['rating'].mode()

0    TV-MA
Name: rating, dtype: object

In [36]:
movies['rating'].mode()

0    TV-MA
Name: rating, dtype: object

In [37]:
tvshows['rating'].mode()

0    TV-MA
Name: rating, dtype: object

- replace 'rating' missing value with the most frequent value since the most frequent values for movies and tvshows are same.

In [38]:
netflix_new['rating'] = netflix_new['rating'].fillna(netflix_new['rating'].mode()[0])

In [39]:
netflix_new['rating'].isna().sum()

0

In [40]:
netflix_new['rating'].value_counts(dropna=False)

TV-MA       3209
TV-14       2157
TV-PG        861
R            799
PG-13        490
TV-Y7        333
TV-Y         306
PG           287
TV-G         220
NR            79
G             41
TV-Y7-FV       6
NC-17          3
UR             3
74 min         1
84 min         1
66 min         1
Name: rating, dtype: int64

In [41]:
#correct wrong values in 'rating' with the most frequent value
netflix_new['rating'] = netflix_new['rating'].replace(['74 min','84 min','66 min'],'TV-MA')
netflix_new['rating']

0       PG-13
1          PG
2       TV-MA
3       PG-13
4       TV-MA
        ...  
8792    TV-Y7
8793    TV-PG
8794    TV-Y7
8795    TV-PG
8796    TV-Y7
Name: rating, Length: 8797, dtype: object

In [42]:
netflix_new['rating'].value_counts(dropna=False)

TV-MA       3212
TV-14       2157
TV-PG        861
R            799
PG-13        490
TV-Y7        333
TV-Y         306
PG           287
TV-G         220
NR            79
G             41
TV-Y7-FV       6
NC-17          3
UR             3
Name: rating, dtype: int64

### cleaning 'country'

In [43]:
netflix_new.isna().sum()

show_id            0
type               0
title              0
director        2624
cast             825
country          830
date_added         0
release_year       0
rating             0
duration           0
listed_in          0
description        0
dtype: int64

In [44]:
netflix_new['country'].mode()[0]

'United States'

In [45]:
#replace NaN with most frequent value
netflix_new['country']=netflix_new['country'].fillna(netflix_new['country'].mode()[0])

In [46]:
netflix_new.isna().sum()

show_id            0
type               0
title              0
director        2624
cast             825
country            0
date_added         0
release_year       0
rating             0
duration           0
listed_in          0
description        0
dtype: int64

### cleaning 'cast'

- Since cast is not important here for my designed EDA questions and I am also not gonna train models in this dataset, i will just replace it with 'Unknown'. If i drop rows, it might have influence on further data analysis accuracy since there are quite a lot NaN in 'cast'.

In [47]:
netflix_new['cast'] = netflix_new['cast'].fillna('Unknown')

In [48]:
netflix_new.isna().sum()

show_id            0
type               0
title              0
director        2624
cast               0
country            0
date_added         0
release_year       0
rating             0
duration           0
listed_in          0
description        0
dtype: int64

### cleaning 'director'

- As can be seen from the missing value summary, there are 2624 missing values in 'director'.I can't just drop the rows, instead i will replace NaN with 'Unknown'.

In [49]:
netflix_new['director'] = netflix_new['director'].fillna('Unknown')

In [50]:
netflix_new.isna().sum()

show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64

In [51]:
netflix_new.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,Unknown,United States,2021-09-25,2020,PG-13,90,Documentaries,"As her father nears the end of his life, filmm..."
1,s7,Movie,My Little Pony: A New Generation,"Robert Cullen, José Luis Ucha","Vanessa Hudgens, Kimiko Glenn, James Marsden, ...",United States,2021-09-24,2021,PG,91,Children & Family Movies,Equestria's divided. But a bright-eyed hero be...
2,s8,Movie,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...","United States, Ghana, Burkina Faso, United Kin...",2021-09-24,1993,TV-MA,125,"Dramas, Independent Movies, International Movies","On a photo shoot in Ghana, an American model s..."
3,s10,Movie,The Starling,Theodore Melfi,"Melissa McCarthy, Chris O'Dowd, Kevin Kline, T...",United States,2021-09-24,2021,PG-13,104,"Comedies, Dramas",A woman adjusting to life after a loss contend...
4,s13,Movie,Je Suis Karl,Christian Schwochow,"Luna Wedler, Jannis Niewöhner, Milan Peschel, ...","Germany, Czech Republic",2021-09-23,2021,TV-MA,127,"Dramas, International Movies",After most of her family is murdered in a terr...


In [52]:
#check duplicates
netflix_new.duplicated().sum()

0

In [53]:
netflix_new.shape

(8797, 12)

In [62]:
netflix_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8797 entries, 0 to 8796
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   show_id       8797 non-null   object        
 1   type          8797 non-null   object        
 2   title         8797 non-null   object        
 3   director      8797 non-null   object        
 4   cast          8797 non-null   object        
 5   country       8797 non-null   object        
 6   date_added    8797 non-null   datetime64[ns]
 7   release_year  8797 non-null   int64         
 8   rating        8797 non-null   object        
 9   duration      8797 non-null   int64         
 10  listed_in     8797 non-null   object        
 11  description   8797 non-null   object        
dtypes: datetime64[ns](1), int64(2), object(9)
memory usage: 824.8+ KB


- Save the cleaned dataset as 'netflix_cleaned'

In [57]:
netflix_new.to_csv('../data/clean/netflix_cleaned.csv', index=False)

In [64]:
# All above is about data cleaning. The EDA part is in the other jupyter notebook.