In [1]:
import pandas as pd
import itertools

In [9]:
df = pd.read_csv('../data/netflix_titles_raw.csv')
df.date_added = pd.to_datetime(df.date_added)
df.listed_in = df.listed_in.str.split(',')
df.cast = df.cast.str.split(',')
df.director = df.director.str.split(',')
df.show_id = df.show_id.str[1:]
df.show_id = 's' + df.show_id.str.zfill(4) # Padding so we can maybe use this as an ID?
df

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s0001,Movie,Dick Johnson Is Dead,[Kirsten Johnson],,United States,2021-09-25,2020,PG-13,90 min,[Documentaries],"As her father nears the end of his life, filmm..."
1,s0002,TV Show,Blood & Water,,"[Ama Qamata, Khosi Ngema, Gail Mabalane, Th...",South Africa,2021-09-24,2021,TV-MA,2 Seasons,"[International TV Shows, TV Dramas, TV Myste...","After crossing paths at a party, a Cape Town t..."
2,s0003,TV Show,Ganglands,[Julien Leclercq],"[Sami Bouajila, Tracy Gotoas, Samuel Jouy, ...",,2021-09-24,2021,TV-MA,1 Season,"[Crime TV Shows, International TV Shows, TV ...",To protect his family from a powerful drug lor...
3,s0004,TV Show,Jailbirds New Orleans,,,,2021-09-24,2021,TV-MA,1 Season,"[Docuseries, Reality TV]","Feuds, flirtations and toilet talk go down amo..."
4,s0005,TV Show,Kota Factory,,"[Mayur More, Jitendra Kumar, Ranjan Raj, Al...",India,2021-09-24,2021,TV-MA,2 Seasons,"[International TV Shows, Romantic TV Shows, ...",In a city of coaching centers known to train I...
...,...,...,...,...,...,...,...,...,...,...,...,...
8802,s8803,Movie,Zodiac,[David Fincher],"[Mark Ruffalo, Jake Gyllenhaal, Robert Downe...",United States,2019-11-20,2007,R,158 min,"[Cult Movies, Dramas, Thrillers]","A political cartoonist, a crime reporter and a..."
8803,s8804,TV Show,Zombie Dumb,,,,2019-07-01,2018,TV-Y7,2 Seasons,"[Kids' TV, Korean TV Shows, TV Comedies]","While living alone in a spooky town, a young g..."
8804,s8805,Movie,Zombieland,[Ruben Fleischer],"[Jesse Eisenberg, Woody Harrelson, Emma Ston...",United States,2019-11-01,2009,R,88 min,"[Comedies, Horror Movies]",Looking to survive in a world taken over by zo...
8805,s8806,Movie,Zoom,[Peter Hewitt],"[Tim Allen, Courteney Cox, Chevy Chase, Kat...",United States,2020-01-11,2006,PG,88 min,"[Children & Family Movies, Comedies]","Dragged from civilian life, a former superhero..."


In [3]:
persons = pd.DataFrame(set(person.strip() for person in itertools.chain(*list(df.cast.dropna()))),
                      columns=['person']
                     )
persons['person_id'] = persons.index + 1

persons

Unnamed: 0,person,person_id
0,Tony Hale,1
1,Kerrigan Mahan,2
2,Jim Boeven,3
3,Sanjay Mishra,4
4,Wayne Kramer,5
...,...,...
36434,Anna Chancellor,36435
36435,Sean Murray,36436
36436,Akemi Kanda,36437
36437,Shubhankar Tawde,36438


In [4]:
def show_person_intersection_record_creation(record):
    return [(record.show_id, person.strip()) for person in record.cast]

show_person_intersection_records = []
for record in df[df.cast.notna()].itertuples():
    if record.cast:
        show_person_intersection_records.extend(show_person_intersection_record_creation(record))
    
show_person_intersection_df = pd.DataFrame(
    show_person_intersection_records, 
    columns=['show_id', 'person']
)
show_person_intersection_df = show_person_intersection_df.merge(persons)

del show_person_intersection_df['person']
show_person_intersection_df


Unnamed: 0,show_id,person_id
0,s0002,35284
1,s0002,20960
2,s0002,25575
3,s0002,33339
4,s1515,33339
...,...,...
64121,s8806,25784
64122,s8807,24495
64123,s8807,5974
64124,s8807,35957


In [5]:
persons.to_csv('../data/persons.csv', index=False)
show_person_intersection_df.to_csv('../data/show_person_intersection.csv', index=False)

In [6]:
categories = pd.DataFrame(set(category.strip() for category in itertools.chain(*list(df.listed_in))),
                          columns=['category']
                         )
categories['category_id'] = categories.index + 1

categories

Unnamed: 0,category,category_id
0,Dramas,1
1,Romantic TV Shows,2
2,Classic & Cult TV,3
3,TV Mysteries,4
4,Faith & Spirituality,5
5,TV Sci-Fi & Fantasy,6
6,Anime Series,7
7,Stand-Up Comedy,8
8,Romantic Movies,9
9,Children & Family Movies,10


In [7]:
def show_category_intersection_record_creation(record):
    return [(record.show_id, category.strip()) for category in record.listed_in]

show_category_intersection_records = []
for record in df.itertuples():
    show_category_intersection_records.extend(show_category_intersection_record_creation(record))
    
show_category_intersection_df = pd.DataFrame(
    show_category_intersection_records, 
    columns=['show_id', 'category']
)

show_category_intersection_df = show_category_intersection_df.merge(categories)
del show_category_intersection_df['category']
show_category_intersection_df

Unnamed: 0,show_id,category_id
0,s0001,34
1,s0017,34
2,s0046,34
3,s0069,34
4,s0089,34
...,...,...
19318,s7722,3
19319,s8190,3
19320,s8542,3
19321,s8600,3


In [8]:
show_category_intersection_df.to_csv('../data/show_category_intersection.csv', index=False)
categories.to_csv('../data/categories.csv', index=False)