In [4]:
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests

In [5]:
# Load the raw tab-separated dumps. low_memory=False makes pandas read each
# file in one pass so column dtypes are inferred consistently.
# NOTE(review): `akas` is not referenced by any cell visible here — confirm
# whether it is still needed before paying for the load.
rating = pd.read_csv("data/rating.gz", sep="\t")
akas = pd.read_csv("data/akas.gz", sep="\t", low_memory=False)
basics = pd.read_csv("data/basics.tsv", sep="\t", low_memory=False)

In [8]:
def transform_data(rating, basics, min_votes = 10000, default_runtime = 95):
    """Build the movie table and the movie-genre table from the raw frames.

    Parameters
    ----------
    rating : pandas.DataFrame
        Must provide ``tconst``, ``averageRating`` and ``numVotes``.
    basics : pandas.DataFrame
        Must provide ``tconst``, ``titleType``, ``primaryTitle``,
        ``originalTitle``, ``isAdult``, ``startYear``, ``endYear``,
        ``runtimeMinutes`` and ``genres``. ``isAdult`` is compared against the
        string ``'0'`` and a missing runtime is expected as the string ``"\\N"``.
    min_votes : int, optional
        Only titles with strictly more votes than this are kept.
    default_runtime : int, optional
        Runtime (minutes) substituted where ``runtimeMinutes`` is ``"\\N"``.

    Returns
    -------
    (movie, genre) : tuple of pandas.DataFrame
        ``movie`` has columns ``tid, title, year, length, rating, votes``.
        ``genre`` has one ``(tid, genre)`` row per comma-separated entry of the
        original ``genres`` column, with a fresh 0..n-1 index.
    """
    is_movie = (basics.titleType == 'movie') & (basics.isAdult == '0')
    movie = pd.merge(basics[is_movie], rating, on = 'tconst', how = 'inner')
    movie = movie.drop(columns = ['isAdult', 'endYear', 'titleType', 'originalTitle'])

    # Filter first so the integer casts below only touch rows that are kept;
    # .copy() detaches the result so the column assignments do not write into
    # a slice of the merged frame (SettingWithCopyWarning).
    movie = movie[movie["numVotes"] > min_votes].copy()

    movie.loc[movie["runtimeMinutes"] == "\\N", "runtimeMinutes"] = str(default_runtime)
    movie["runtimeMinutes"] = movie["runtimeMinutes"].astype(int)
    movie["startYear"] = movie["startYear"].astype(int)
    movie = movie.rename(columns = {"tconst":"tid", "primaryTitle":"title", "startYear":"year",
                        "runtimeMinutes":"length", "averageRating":"rating", "numVotes":"votes"})

    # One row per (tid, genre). Splitting and exploding in a single vectorised
    # pass replaces appending to a DataFrame row by row, which re-allocates on
    # every insert.
    genre = (
        movie[["tid", "genres"]]
        .assign(genre = lambda frame: frame["genres"].str.split(','))
        .explode("genre")[["tid", "genre"]]
        .reset_index(drop = True)
    )

    movie = movie.drop(columns = "genres")
    return movie, genre
    

In [None]:
# NOTE(review): this cell is a bare triple-quoted string, so none of the code
# inside it runs. The wrapped code would PUT every `movie` row to /movie and
# POST every `genre` row to /genre on localhost:5001.
# `movie` and `genre` are first assigned by the transform_data(...) call in a
# later cell, so enabling this cell in its current position would fail on a
# fresh kernel — TODO confirm whether it should be deleted or moved.
'''
import requests
for __, row in movie.iterrows():
    body = {"tid":row["tid"],"title":row["title"], "year":row["year"], "length":row["length"], "rating":row["rating"], "votes":row["votes"]}
    url = "http://localhost:5001/movie"
    requests.put(url, body)
for __, row in genre.iterrows():
    body = {"tid":row["tid"], "genre":row["genre"]}
    url = "http://localhost:5001/genre"
    requests.post(url, body)
'''

In [9]:
# Derive the cleaned movie table and its (tid, genre) companion table.
movie, genre = transform_data(rating=rating, basics=basics)

In [10]:
# Load the tab-separated cast/crew assignments and the per-person records.
principal = pd.read_csv("data/principals.gz", sep="\t")
actor_info = pd.read_csv("data/name.gz", sep="\t")

In [11]:
# Keep only rows whose category is actress, actor or director.
KEPT_CATEGORIES = ['actress', 'actor', 'director']
principal = principal[principal['category'].isin(KEPT_CATEGORIES)]

In [12]:
# The `job` column is not used by any later step visible in this notebook.
principal = principal.drop(columns=["job"])

In [13]:
principal

Unnamed: 0,tconst,ordering,nconst,category,characters
1,tt0000001,2,nm0005690,director,\N
3,tt0000002,1,nm0721526,director,\N
5,tt0000003,1,nm0721526,director,\N
9,tt0000004,1,nm0721526,director,\N
11,tt0000005,1,nm0443482,actor,"[""Blacksmith""]"
...,...,...,...,...,...
52606336,tt9916880,1,nm1483166,actor,"[""Rude Ralph"",""Mischievous Mike"",""Jolly Josh""]"
52606337,tt9916880,2,nm0254176,actress,"[""Moody Margaret""]"
52606338,tt9916880,3,nm0286175,actor,"[""Dad"",""Aerobic Al"",""Nasty Nicola""]"
52606339,tt9916880,4,nm10535738,actress,"[""Horrid Henry""]"


In [14]:
# Distinct title ids of the movies that survived transform_data's filters.
value = movie["tid"].unique()

In [15]:
# Restrict the cast/crew rows to the retained movies.
in_kept_movies = principal['tconst'].isin(value)
principal = principal.loc[in_kept_movies]

In [16]:
# Keep people that have a birth year (not the "\N" placeholder) and a
# non-null primaryProfession.
has_birth_year = actor_info['birthYear'].ne('\\N')
has_profession = actor_info["primaryProfession"].notna()
actor_info = actor_info[has_birth_year & has_profession]

In [17]:
actor_info

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0031983,tt0072308,tt0050419,tt0053137"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0038355,tt0037382,tt0075213,tt0117057"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,music_department","tt0054452,tt0056404,tt0057345,tt0049189"
3,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0072562,tt0078723,tt0080455,tt0077975"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0083922,tt0050986,tt0060827,tt0069467"
...,...,...,...,...,...,...
12007287,nm9993379,Fanny Hozleiter,1988,\N,"director,actress,writer",tt8743182
12007333,nm9993435,William Riva,1919,1999,set_decorator,"tt0300836,tt0189339"
12007334,nm9993436,Frank J. Gaily,1915,2008,sound_department,tt0189339
12007424,nm9993535,Henry Lawfull,2006,\N,actor,"tt10187208,tt5900600"


In [18]:
# Distinct person ids that appear in the filtered cast/crew table.
val = pd.unique(principal['nconst'])

In [19]:
# Keep only people credited on at least one retained movie.
is_credited = actor_info['nconst'].isin(val)
actor_info = actor_info.loc[is_credited]

In [20]:
actor_info

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0031983,tt0072308,tt0050419,tt0053137"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0038355,tt0037382,tt0075213,tt0117057"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,music_department","tt0054452,tt0056404,tt0057345,tt0049189"
3,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0072562,tt0078723,tt0080455,tt0077975"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0083922,tt0050986,tt0060827,tt0069467"
...,...,...,...,...,...,...
11905864,nm9875384,Milena Smit,1996,\N,"actress,writer","tt4359330,tt12618926,tt7354440,tt15327522"
11907587,nm9877392,Roman Griffin Davis,2007,\N,actor,"tt11628854,tt2584384"
11930512,nm9903838,Sofia Kappel,1998,\N,actress,"tt20782072,tt8550054,tt9694594"
11951942,nm9928618,Roberto Farías,1969,\N,"actor,director,writer","tt3328948,tt8274328,tt4375438,tt1863321"


In [21]:
# The `ordering` column is not sent to the service by the upload cell below.
principal = principal.drop(columns=['ordering'])

In [22]:
principal

Unnamed: 0,tconst,nconst,category,characters
28216,tt0004972,nm0001273,actress,"[""Elsie - Stoneman's Daughter""]"
28217,tt0004972,nm0550615,actress,"[""Flora Cameron - The Pet Sister""]"
28218,tt0004972,nm0910400,actor,"[""Col. Ben Cameron aka The Little Colonel""]"
28219,tt0004972,nm0178270,actress,"[""Margaret Cameron - The Elder Sister""]"
28220,tt0004972,nm0000428,director,\N
...,...,...,...,...
52580819,tt9907782,nm2933542,actor,"[""John McBride""]"
52580820,tt9907782,nm0717709,actress,"[""Isabelle Laurent""]"
52580821,tt9907782,nm0677944,actor,"[""Seamus Laurent""]"
52580822,tt9907782,nm3646923,actress,"[""Anais""]"


In [23]:
actor_info

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0031983,tt0072308,tt0050419,tt0053137"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0038355,tt0037382,tt0075213,tt0117057"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,music_department","tt0054452,tt0056404,tt0057345,tt0049189"
3,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0072562,tt0078723,tt0080455,tt0077975"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0083922,tt0050986,tt0060827,tt0069467"
...,...,...,...,...,...,...
11905864,nm9875384,Milena Smit,1996,\N,"actress,writer","tt4359330,tt12618926,tt7354440,tt15327522"
11907587,nm9877392,Roman Griffin Davis,2007,\N,actor,"tt11628854,tt2584384"
11930512,nm9903838,Sofia Kappel,1998,\N,actress,"tt20782072,tt8550054,tt9694594"
11951942,nm9928618,Roberto Farías,1969,\N,"actor,director,writer","tt3328948,tt8274328,tt4375438,tt1863321"


In [24]:
# Two-column view (person id, comma-separated title ids) used to build the
# one-row-per-title `famousFor` table.
knownFor = actor_info.loc[:, ['nconst', 'knownForTitles']]

In [25]:
# Empty target table: one (person id, title id) pair per row.
FAMOUS_FOR_COLUMNS = ['nconst', 'known']
famousFor = pd.DataFrame(columns=FAMOUS_FOR_COLUMNS)

In [26]:
# Expand each person's comma-separated `knownForTitles` into one
# (nconst, known) row per title id, indexed 0..n-1.
# Splitting and exploding is a single vectorised pass; appending to a
# DataFrame one row at a time re-allocates on every insert and is far slower
# for tens of thousands of rows.
famousFor = (
    knownFor
    .assign(known=knownFor['knownForTitles'].str.split(','))
    .explode('known')[['nconst', 'known']]
    .reset_index(drop=True)
)

In [27]:
famousFor

Unnamed: 0,nconst,known
0,nm0000001,tt0031983
1,nm0000001,tt0072308
2,nm0000001,tt0050419
3,nm0000001,tt0053137
4,nm0000002,tt0038355
...,...,...
60471,nm9928618,tt1863321
60472,nm9979692,tt8578422
60473,nm9979692,tt9529500
60474,nm9979692,tt13045890


In [28]:
def _first_character(raw):
    """Return the first name from a JSON-array `characters` string.

    `raw` is expected to look like '["Dad","Aerobic Al"]'. Anything that is not
    a non-empty JSON list (the "\\N" placeholder, a missing value, or a name
    that has already been extracted) is returned unchanged, so re-running the
    cell is harmless.
    """
    try:
        names = json.loads(raw)
    except (TypeError, ValueError):
        # Not JSON (e.g. "\N", NaN, or an already-plain name): keep as is.
        return raw
    if isinstance(names, list) and names:
        return names[0]
    return raw


# Reduce each cast member's character list to its first entry. Director rows
# are left alone. Parsing the JSON properly, instead of slicing off two
# characters at each end and splitting on ',', avoids a stray trailing quote on
# multi-character entries and avoids truncating names that contain a comma.
principal = principal.copy()  # detach from the earlier filtered slices before assigning
is_cast = principal.category != 'director'
principal.loc[is_cast, 'characters'] = principal.loc[is_cast, 'characters'].apply(_first_character)

In [29]:
principal

Unnamed: 0,tconst,nconst,category,characters
28216,tt0004972,nm0001273,actress,Elsie - Stoneman's Daughter
28217,tt0004972,nm0550615,actress,Flora Cameron - The Pet Sister
28218,tt0004972,nm0910400,actor,Col. Ben Cameron aka The Little Colonel
28219,tt0004972,nm0178270,actress,Margaret Cameron - The Elder Sister
28220,tt0004972,nm0000428,director,\N
...,...,...,...,...
52580819,tt9907782,nm2933542,actor,John McBride
52580820,tt9907782,nm0717709,actress,Isabelle Laurent
52580821,tt9907782,nm0677944,actor,Seamus Laurent
52580822,tt9907782,nm3646923,actress,Anais


In [30]:
# `knownForTitles` has already been expanded into `famousFor`.
actor_info = actor_info.drop(columns=['knownForTitles'])

In [31]:
# Replace the "\N" placeholder in deathYear with the string '0' so the column
# can be cast to int in the next cell.
death_year_missing = actor_info['deathYear'].eq('\\N')
actor_info['deathYear'] = actor_info['deathYear'].mask(death_year_missing, '0')

In [32]:
# Both year columns are strings at this point; convert them to integers.
for year_column in ('birthYear', 'deathYear'):
    actor_info[year_column] = actor_info[year_column].astype(int)

In [33]:
actor_info

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack"
2,nm0000003,Brigitte Bardot,1934,0,"actress,soundtrack,music_department"
3,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor"
...,...,...,...,...,...
11905864,nm9875384,Milena Smit,1996,0,"actress,writer"
11907587,nm9877392,Roman Griffin Davis,2007,0,actor
11930512,nm9903838,Sofia Kappel,1998,0,actress
11951942,nm9928618,Roberto Farías,1969,0,"actor,director,writer"


In [34]:
API_BASE_URL = "http://localhost:5001"
REQUEST_TIMEOUT_S = 30  # give up on a stalled request instead of hanging the notebook


def _put_rows(session, endpoint, frame, field_map):
    """PUT every row of `frame` to `endpoint` as a form-encoded body.

    Parameters
    ----------
    session : requests.Session
        Reused for all rows so the connection is kept alive between requests.
    endpoint : str
        Path segment appended to API_BASE_URL (e.g. "crew").
    frame : pandas.DataFrame
        Source rows.
    field_map : dict
        Maps DataFrame column name -> request field name. Only these columns
        are sent, in this order.

    Raises
    ------
    requests.HTTPError
        On the first 4xx/5xx response, so a rejected row is not silently lost.
    """
    url = f"{API_BASE_URL}/{endpoint}"
    payload = frame[list(field_map)].rename(columns=field_map)
    for body in payload.to_dict("records"):
        response = session.put(url, data=body, timeout=REQUEST_TIMEOUT_S)
        response.raise_for_status()


# Push the three prepared tables to the local service.
with requests.Session() as session:
    _put_rows(session, "famous", famousFor, {"nconst": "uid", "known": "tid"})
    _put_rows(
        session,
        "crew",
        principal,
        {"nconst": "uid", "tconst": "tid", "category": "category", "characters": "characters"},
    )
    _put_rows(
        session,
        "info",
        actor_info,
        {
            "nconst": "uid",
            "primaryName": "name",
            "birthYear": "birthYear",
            "deathYear": "deathYear",
            "primaryProfession": "primaryProfession",
        },
    )

In [35]:
actor_info

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack"
2,nm0000003,Brigitte Bardot,1934,0,"actress,soundtrack,music_department"
3,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor"
...,...,...,...,...,...
11905864,nm9875384,Milena Smit,1996,0,"actress,writer"
11907587,nm9877392,Roman Griffin Davis,2007,0,actor
11930512,nm9903838,Sofia Kappel,1998,0,actress
11951942,nm9928618,Roberto Farías,1969,0,"actor,director,writer"
