# Netflix dataset preprocessing (for visualization task only)

In [1]:
import pandas as pd
import numpy as np
from dateutil import parser

In [2]:
df_raw = pd.read_csv(r"raw_dataset.csv")

In [3]:
df = df_raw

In [4]:
df_raw.dtypes

show_id         object
type            object
title           object
director        object
cast            object
country         object
date_added      object
release_year     int64
rating          object
duration        object
listed_in       object
description     object
dtype: object

In [5]:
df_raw.isna().sum()

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

# show_id

In [6]:
df["show_id"]

0          s1
1          s2
2          s3
3          s4
4          s5
        ...  
8802    s8803
8803    s8804
8804    s8805
8805    s8806
8806    s8807
Name: show_id, Length: 8807, dtype: object

In [7]:
df["show_id"] = df["show_id"].str.replace("s", "")
df["show_id"]

0          1
1          2
2          3
3          4
4          5
        ... 
8802    8803
8803    8804
8804    8805
8805    8806
8806    8807
Name: show_id, Length: 8807, dtype: object

In [8]:
df = df.astype({"show_id" : "int64"})

In [9]:
df["show_id"]

0          1
1          2
2          3
3          4
4          5
        ... 
8802    8803
8803    8804
8804    8805
8805    8806
8806    8807
Name: show_id, Length: 8807, dtype: int64

In [10]:
df.drop_duplicates()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,25-Sep-21,2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,24-Sep-21,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,24-Sep-21,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,4,TV Show,Jailbirds New Orleans,,,,24-Sep-21,2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,24-Sep-21,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...
...,...,...,...,...,...,...,...,...,...,...,...,...
8802,8803,Movie,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,20-Nov-19,2007,R,158 min,"Cult Movies, Dramas, Thrillers","A political cartoonist, a crime reporter and a..."
8803,8804,TV Show,Zombie Dumb,,,,1-Jul-19,2018,TV-Y7,2 Seasons,"Kids' TV, Korean TV Shows, TV Comedies","While living alone in a spooky town, a young g..."
8804,8805,Movie,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,1-Nov-19,2009,R,88 min,"Comedies, Horror Movies",Looking to survive in a world taken over by zo...
8805,8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,11-Jan-20,2006,PG,88 min,"Children & Family Movies, Comedies","Dragged from civilian life, a former superhero..."


# type

In [11]:
df["type"].unique()

array(['Movie', 'TV Show'], dtype=object)

In [12]:
types = pd.Series([type.strip() for record in df["type"].str.split(",") for type in record]).value_counts()
types
types.to_csv("types.csv")

# title

In [13]:
df["title"].str.strip()

0        Dick Johnson Is Dead
1               Blood & Water
2                   Ganglands
3       Jailbirds New Orleans
4                Kota Factory
                ...          
8802                   Zodiac
8803              Zombie Dumb
8804               Zombieland
8805                     Zoom
8806                   Zubaan
Name: title, Length: 8807, dtype: object

In [14]:
df["title"].isna().sum()

0

# director

In [15]:
df["director"].unique()

array(['Kirsten Johnson', nan, 'Julien Leclercq', ..., 'Majid Al Ansari',
       'Peter Hewitt', 'Mozez Singh'], dtype=object)

In [16]:
# fill NA first before flattening, else NA will be read as numerical type and for loop cannot iterate on null values
df["director"] = df["director"].fillna("unknown")

In [17]:
directors = pd.Series([director.strip() for record in df["director"].str.split(",") for director in record]).value_counts()
# flatten list(column) of directors
directors.to_csv("directors.csv")

# cast

In [18]:
df["cast"].unique()

array([nan,
       'Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Molaba, Dillon Windvogel, Natasha Thahane, Arno Greeff, Xolile Tshabalala, Getmore Sithole, Cindy Mahlangu, Ryle De Morny, Greteli Fincham, Sello Maake Ka-Ncube, Odwa Gwanya, Mekaila Mathys, Sandi Schultz, Duane Williams, Shamilla Miller, Patrick Mofokeng',
       'Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, Geert Van Rampelberg, Bakary Diombera',
       ...,
       'Jesse Eisenberg, Woody Harrelson, Emma Stone, Abigail Breslin, Amber Heard, Bill Murray, Derek Graf',
       'Tim Allen, Courteney Cox, Chevy Chase, Kate Mara, Ryan Newman, Michael Cassidy, Spencer Breslin, Rip Torn, Kevin Zegers',
       'Vicky Kaushal, Sarah-Jane Dias, Raaghav Chanana, Manish Chaudhary, Meghna Malik, Malkeet Rauni, Anita Shabdish, Chittaranjan Tripathy'],
      dtype=object)

In [19]:
df["cast"] = df["cast"].fillna("unknown")

In [20]:
actors = pd.Series([actor.strip() for record in df["cast"].str.split(",") for actor in record]).value_counts()
actors.to_csv("actors.csv")
actors

unknown                    825
Anupam Kher                 43
Shah Rukh Khan              35
Julie Tejwani               33
Naseeruddin Shah            32
                          ... 
Melanie Straub               1
Gabriela Maria Schmeide      1
Helena Zengel                1
Daniel Valenzuela            1
Chittaranjan Tripathy        1
Name: count, Length: 36440, dtype: int64

# country

In [21]:
df["country"].unique()

array(['United States', 'South Africa', nan, 'India',
       'United States, Ghana, Burkina Faso, United Kingdom, Germany, Ethiopia',
       'United Kingdom', 'Germany, Czech Republic', 'Mexico', 'Turkey',
       'Australia', 'United States, India, France', 'Finland',
       'China, Canada, United States',
       'South Africa, United States, Japan', 'Nigeria', 'Japan',
       'Spain, United States', 'France', 'Belgium',
       'United Kingdom, United States', 'United States, United Kingdom',
       'France, United States', 'South Korea', 'Spain',
       'United States, Singapore', 'United Kingdom, Australia, France',
       'United Kingdom, Australia, France, United States',
       'United States, Canada', 'Germany, United States',
       'South Africa, United States', 'United States, Mexico',
       'United States, Italy, France, Japan',
       'United States, Italy, Romania, United Kingdom',
       'Australia, United States', 'Argentina, Venezuela',
       'United States, United Kin

In [22]:
df["country"] = df["country"].fillna("unknown")

In [23]:
countries = pd.Series([country.strip() for record in df["country"].str.split(",") for country in record])
countries = countries.replace("", np.nan).fillna("unknown").value_counts()
countries.to_csv("countries.csv")

# date_added

In [24]:
df["date_added"].dtypes

dtype('O')

In [25]:
df["date_added"].isna().sum()

10

In [26]:
df.loc[
df["date_added"].isna(),
["title", "date_added"]
]

Unnamed: 0,title,date_added
6066,A Young Doctor's Notebook and Other Stories,
6174,Anthony Bourdain: Parts Unknown,
6795,Frasier,
6806,Friends,
6901,Gunslinger Girl,
7196,Kikoriki,
7254,La Familia P. Luche,
7406,Maron,
7847,Red vs. Blue,
8182,The Adventures of Figaro Pho,


In [27]:
df.loc[6066, "date_added"] = "15-Sep-14"
df.loc[6174, "date_added"] = "1-Jun-13"
df.loc[6795, "date_added"] = "6-Apr-16"
df.loc[6806, "date_added"] = "1-Jan-15"
df.loc[6901, "date_added"] = "4-Jan-16"
df.loc[7196, "date_added"] = "6-Jul-17"
df.loc[7254, "date_added"] = "1-Jun-15"
df.loc[7406, "date_added"] = "13-Jan-17"
df.loc[7847, "date_added"] = "15-Sep-14"
df.loc[8182, "date_added"] = "15-Sep-14"

In [28]:
df["date_added"].isna().sum()

0

In [29]:
df["date_added"] = df["date_added"].apply(lambda x: parser.parse(x).strftime("%Y-%m-%d"))
df["date_added"]

0       2021-09-25
1       2021-09-24
2       2021-09-24
3       2021-09-24
4       2021-09-24
           ...    
8802    2019-11-20
8803    2019-07-01
8804    2019-11-01
8805    2020-01-11
8806    2019-03-02
Name: date_added, Length: 8807, dtype: object

In [30]:
df[
["show_id", "title", "date_added"]
].to_csv("date_added.csv", index=False)

# release_year

In [31]:
df["release_year"].dtypes

dtype('int64')

In [32]:
df["release_year"].min()

1925

In [33]:
df["release_year"].max()

2021

In [34]:
df["release_year"].isna().sum()

0

In [35]:
df["release_year"].value_counts()

release_year
2018    1147
2017    1032
2019    1030
2020     953
2016     902
        ... 
1959       1
1925       1
1961       1
1947       1
1966       1
Name: count, Length: 74, dtype: int64

In [36]:
df["release_year"].value_counts().to_csv("release year.csv")

# rating

In [37]:
df["rating"].dtypes

dtype('O')

In [38]:
df["rating"].unique()

array(['PG-13', 'TV-MA', 'PG', 'TV-14', 'TV-PG', 'TV-Y', 'TV-Y7', 'R',
       'TV-G', 'G', 'NC-17', '74 min', '84 min', '66 min', 'NR', nan,
       'TV-Y7-FV', 'UR'], dtype=object)

In [39]:
df["rating"] = df["rating"].fillna("unknown")
df["rating"].unique()

array(['PG-13', 'TV-MA', 'PG', 'TV-14', 'TV-PG', 'TV-Y', 'TV-Y7', 'R',
       'TV-G', 'G', 'NC-17', '74 min', '84 min', '66 min', 'NR',
       'unknown', 'TV-Y7-FV', 'UR'], dtype=object)

In [40]:
df["rating"].value_counts()
# ERROR: rating contains duration values

rating
TV-MA       3207
TV-14       2160
TV-PG        863
R            799
PG-13        490
TV-Y7        334
TV-Y         307
PG           287
TV-G         220
NR            80
G             41
TV-Y7-FV       6
unknown        4
NC-17          3
UR             3
74 min         1
84 min         1
66 min         1
Name: count, dtype: int64

In [41]:
df["rating"].replace(["74 min", "84 min", "66 min"], "unknown").value_counts().to_csv("ratings.csv")

# duration

In [42]:
df["duration"].dtypes

dtype('O')

In [43]:
df["duration"].unique()

array(['90 min', '2 Seasons', '1 Season', '91 min', '125 min',
       '9 Seasons', '104 min', '127 min', '4 Seasons', '67 min', '94 min',
       '5 Seasons', '161 min', '61 min', '166 min', '147 min', '103 min',
       '97 min', '106 min', '111 min', '3 Seasons', '110 min', '105 min',
       '96 min', '124 min', '116 min', '98 min', '23 min', '115 min',
       '122 min', '99 min', '88 min', '100 min', '6 Seasons', '102 min',
       '93 min', '95 min', '85 min', '83 min', '113 min', '13 min',
       '182 min', '48 min', '145 min', '87 min', '92 min', '80 min',
       '117 min', '128 min', '119 min', '143 min', '114 min', '118 min',
       '108 min', '63 min', '121 min', '142 min', '154 min', '120 min',
       '82 min', '109 min', '101 min', '86 min', '229 min', '76 min',
       '89 min', '156 min', '112 min', '107 min', '129 min', '135 min',
       '136 min', '165 min', '150 min', '133 min', '70 min', '84 min',
       '140 min', '78 min', '7 Seasons', '64 min', '59 min', '139 min',
    

In [44]:
df["duration"]

0          90 min
1       2 Seasons
2        1 Season
3        1 Season
4       2 Seasons
          ...    
8802      158 min
8803    2 Seasons
8804       88 min
8805       88 min
8806      111 min
Name: duration, Length: 8807, dtype: object

In [45]:
df["duration"].isna().sum()

3

In [46]:
df["duration"] = df["duration"].fillna("unknown")

In [47]:
duration_movies = df.loc[
df["duration"].str.contains("min"), # row / records to display
["show_id", "title", "duration"] # column / attributes to display
]
duration_movies["duration"] = duration_movies["duration"].str.replace("min", "")
duration_movies["duration"] = duration_movies["duration"].astype("float64")
duration_movies = duration_movies.sort_values(by="duration", ascending=False)
duration_movies = duration_movies.rename(columns={"duration":"minutes"})
duration_movies.to_csv("duration_movies.csv", index=False)

In [48]:
duration_tvshows = df.loc[
df["duration"].str.contains("Season"), # records to display
["show_id", "title", "duration"] # attributes to display
]
duration_tvshows["duration"] = duration_tvshows["duration"].str.replace("[^0-9]", "", regex=True)
duration_tvshows["duration"] = duration_tvshows["duration"].astype("float64")
duration_tvshows = duration_tvshows.sort_values(by="duration", ascending=False)
duration_tvshows = duration_tvshows.rename(columns={"duration":"seasons"})
duration_tvshows.to_csv("duration_tvshows.csv", index=False)

# listed_in (categories)

In [49]:
df["listed_in"].dtypes

dtype('O')

In [50]:
df["listed_in"].unique()

array(['Documentaries', 'International TV Shows, TV Dramas, TV Mysteries',
       'Crime TV Shows, International TV Shows, TV Action & Adventure',
       'Docuseries, Reality TV',
       'International TV Shows, Romantic TV Shows, TV Comedies',
       'TV Dramas, TV Horror, TV Mysteries', 'Children & Family Movies',
       'Dramas, Independent Movies, International Movies',
       'British TV Shows, Reality TV', 'Comedies, Dramas',
       'Crime TV Shows, Docuseries, International TV Shows',
       'Dramas, International Movies',
       'Children & Family Movies, Comedies',
       'British TV Shows, Crime TV Shows, Docuseries',
       'TV Comedies, TV Dramas', 'Documentaries, International Movies',
       'Crime TV Shows, Spanish-Language TV Shows, TV Dramas',
       'Thrillers',
       'International TV Shows, Spanish-Language TV Shows, TV Action & Adventure',
       'International TV Shows, TV Action & Adventure, TV Dramas',
       'Comedies, International Movies',
       'Comedies, 

In [51]:
df["listed_in"].isna().sum()

0

In [52]:
categories_movies = df.loc[
df["type"] == "Movie",
"listed_in"
]
categories_movies = pd.Series([category.strip() for record in categories_movies.str.split(",") for category in record]).value_counts()
categories_movies.to_csv("categories_movies.csv")

In [53]:
categories_tvshows = df.loc[
df["type"] == "TV Show",
"listed_in"
]
categories_tvshows = pd.Series([category.strip() for record in categories_tvshows.str.split(",") for category in record]).value_counts()
categories_tvshows.to_csv("categories_tvshows.csv")

# description
- no cleaning needed