In [1]:
import pandas as pd
import itertools

In [2]:
df = pd.read_csv('../data/netflix_titles_raw.csv') 
df.date_added = pd.to_datetime(df.date_added)
df.listed_in = df.listed_in.str.split(',')
# df.cast = df.cast.str.split(',')
# df.director = df.director.str.split(',')
df.show_id = df.show_id.str[1:]
df.show_id = 's' + df.show_id.str.zfill(4) # Padding so we can maybe use this as an ID?
df

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s0001,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,2021-09-25,2020,PG-13,90 min,[Documentaries],"As her father nears the end of his life, filmm..."
1,s0002,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,2 Seasons,"[International TV Shows, TV Dramas, TV Myste...","After crossing paths at a party, a Cape Town t..."
2,s0003,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,2021-09-24,2021,TV-MA,1 Season,"[Crime TV Shows, International TV Shows, TV ...",To protect his family from a powerful drug lor...
3,s0004,TV Show,Jailbirds New Orleans,,,,2021-09-24,2021,TV-MA,1 Season,"[Docuseries, Reality TV]","Feuds, flirtations and toilet talk go down amo..."
4,s0005,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021-09-24,2021,TV-MA,2 Seasons,"[International TV Shows, Romantic TV Shows, ...",In a city of coaching centers known to train I...
...,...,...,...,...,...,...,...,...,...,...,...,...
8802,s8803,Movie,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,2019-11-20,2007,R,158 min,"[Cult Movies, Dramas, Thrillers]","A political cartoonist, a crime reporter and a..."
8803,s8804,TV Show,Zombie Dumb,,,,2019-07-01,2018,TV-Y7,2 Seasons,"[Kids' TV, Korean TV Shows, TV Comedies]","While living alone in a spooky town, a young g..."
8804,s8805,Movie,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,2019-11-01,2009,R,88 min,"[Comedies, Horror Movies]",Looking to survive in a world taken over by zo...
8805,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,2020-01-11,2006,PG,88 min,"[Children & Family Movies, Comedies]","Dragged from civilian life, a former superhero..."


# Get all unique person entities

In [3]:
cast_members = list(df.cast.str.split(',').dropna())
directors = list(df.director.str.split(',').dropna())
persons = pd.DataFrame(set(person.strip() for person in itertools.chain(*cast_members + directors)),
                      columns=['person']
                     )
persons['person_id'] = persons.index + 1

persons

Unnamed: 0,person,person_id
0,Maria Sadowska,1
1,Vusi Mahlasela,2
2,Simon Baker,3
3,Byun Heebong,4
4,Michele Weaver,5
...,...,...
40943,Berta Castañé,40944
40944,Ludovica Martino,40945
40945,Joseph Chang,40946
40946,David Attenborough,40947


# Intersection table for cast and show

In [4]:
def show_cast_intersection_record_creation(record):
    return [(record.show_id, person.strip()) for person in record.cast.split(',')]

show_cast_intersection_records = []
for record in df[df.cast.notna()].itertuples():
    if record.cast:
        show_cast_intersection_records.extend(show_cast_intersection_record_creation(record))
    
show_cast_intersection_df = pd.DataFrame(
    show_cast_intersection_records, 
    columns=['show_id', 'person']
)
show_cast_intersection_df = show_cast_intersection_df.merge(persons)

del show_cast_intersection_df['person']
show_cast_intersection_df


Unnamed: 0,show_id,person_id
0,s0002,26280
1,s0002,30573
2,s0002,4933
3,s0002,24841
4,s1515,24841
...,...,...
64121,s8806,28085
64122,s8807,394
64123,s8807,30268
64124,s8807,29596


# Create a director_id column in the raw dataset

In [5]:
df = df.merge(persons, left_on='director', right_on='person', how='left')
df['director_id'] = df.person_id.astype('Int64')
# del df['person_id']
df

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,person,person_id,director_id
0,s0001,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,2021-09-25,2020,PG-13,90 min,[Documentaries],"As her father nears the end of his life, filmm...",Kirsten Johnson,26310.0,26310
1,s0002,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,2 Seasons,"[International TV Shows, TV Dramas, TV Myste...","After crossing paths at a party, a Cape Town t...",,,
2,s0003,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,2021-09-24,2021,TV-MA,1 Season,"[Crime TV Shows, International TV Shows, TV ...",To protect his family from a powerful drug lor...,Julien Leclercq,40622.0,40622
3,s0004,TV Show,Jailbirds New Orleans,,,,2021-09-24,2021,TV-MA,1 Season,"[Docuseries, Reality TV]","Feuds, flirtations and toilet talk go down amo...",,,
4,s0005,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021-09-24,2021,TV-MA,2 Seasons,"[International TV Shows, Romantic TV Shows, ...",In a city of coaching centers known to train I...,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8802,s8803,Movie,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,2019-11-20,2007,R,158 min,"[Cult Movies, Dramas, Thrillers]","A political cartoonist, a crime reporter and a...",David Fincher,18462.0,18462
8803,s8804,TV Show,Zombie Dumb,,,,2019-07-01,2018,TV-Y7,2 Seasons,"[Kids' TV, Korean TV Shows, TV Comedies]","While living alone in a spooky town, a young g...",,,
8804,s8805,Movie,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,2019-11-01,2009,R,88 min,"[Comedies, Horror Movies]",Looking to survive in a world taken over by zo...,Ruben Fleischer,40723.0,40723
8805,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,2020-01-11,2006,PG,88 min,"[Children & Family Movies, Comedies]","Dragged from civilian life, a former superhero...",Peter Hewitt,6276.0,6276


# Export person and show/cast intersection tables

In [6]:
persons.to_csv('../data/persons.csv', index=False)
show_cast_intersection_df.to_csv('../data/show_cast_intersection.csv', index=False)

# Create a table of unique categories

In [7]:
categories = pd.DataFrame(set(category.strip() for category in itertools.chain(*list(df.listed_in))),
                          columns=['category']
                         )
categories['category_id'] = categories.index + 1

categories

Unnamed: 0,category,category_id
0,Anime Features,1
1,Sports Movies,2
2,TV Action & Adventure,3
3,Children & Family Movies,4
4,TV Horror,5
5,Horror Movies,6
6,Kids' TV,7
7,LGBTQ Movies,8
8,Spanish-Language TV Shows,9
9,Science & Nature TV,10


# Intersection table between category and show

In [8]:
def show_category_intersection_record_creation(record):
    return [(record.show_id, category.strip()) for category in record.listed_in]

show_category_intersection_records = []
for record in df.itertuples():
    show_category_intersection_records.extend(show_category_intersection_record_creation(record))
    
show_category_intersection_df = pd.DataFrame(
    show_category_intersection_records, 
    columns=['show_id', 'category']
)

show_category_intersection_df = show_category_intersection_df.merge(categories)
del show_category_intersection_df['category']
show_category_intersection_df

Unnamed: 0,show_id,category_id
0,s0001,37
1,s0017,37
2,s0046,37
3,s0069,37
4,s0089,37
...,...,...
19318,s7722,36
19319,s8190,36
19320,s8542,36
19321,s8600,36


# Export categories and show/categories intersection tables

In [9]:
show_category_intersection_df.to_csv('../data/show_category_intersection.csv', index=False)
categories.to_csv('../data/categories.csv', index=False)

# Finally, create the show table

In [10]:
show_df = df[['show_id','type', 'title', 'director_id', 'country', 'date_added', 'release_year', 'rating', 'duration', 'description']]
show_df

Unnamed: 0,show_id,type,title,director_id,country,date_added,release_year,rating,duration,description
0,s0001,Movie,Dick Johnson Is Dead,26310,United States,2021-09-25,2020,PG-13,90 min,"As her father nears the end of his life, filmm..."
1,s0002,TV Show,Blood & Water,,South Africa,2021-09-24,2021,TV-MA,2 Seasons,"After crossing paths at a party, a Cape Town t..."
2,s0003,TV Show,Ganglands,40622,,2021-09-24,2021,TV-MA,1 Season,To protect his family from a powerful drug lor...
3,s0004,TV Show,Jailbirds New Orleans,,,2021-09-24,2021,TV-MA,1 Season,"Feuds, flirtations and toilet talk go down amo..."
4,s0005,TV Show,Kota Factory,,India,2021-09-24,2021,TV-MA,2 Seasons,In a city of coaching centers known to train I...
...,...,...,...,...,...,...,...,...,...,...
8802,s8803,Movie,Zodiac,18462,United States,2019-11-20,2007,R,158 min,"A political cartoonist, a crime reporter and a..."
8803,s8804,TV Show,Zombie Dumb,,,2019-07-01,2018,TV-Y7,2 Seasons,"While living alone in a spooky town, a young g..."
8804,s8805,Movie,Zombieland,40723,United States,2019-11-01,2009,R,88 min,Looking to survive in a world taken over by zo...
8805,s8806,Movie,Zoom,6276,United States,2020-01-11,2006,PG,88 min,"Dragged from civilian life, a former superhero..."


# Export it

In [11]:
show_df.to_csv('../data/shows.csv', index=False)