# Step 3: Data Cleaning and manipulation (genres, ages, number of votes)

In [2]:
import numpy as np
import pandas as pd
import ast

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

# Load Datasets

In [150]:
movies_master = pd.read_csv('data/MoviesOnStreamingPlatforms.csv')
#Source:https://www.kaggle.com/ruchi798/tv-shows-on-netflix-prime-video-hulu-and-disney
tv_shows_master = pd.read_csv('data/tv_shows.csv') 
emmy = pd.read_csv('data/the_emmy_awards.csv')
oscar = pd.read_csv('data/the_oscar_award.csv')

type_amazon= pd.read_csv('data/amazon_prime_titles.csv')
type_netflix = pd.read_csv('data/netflix_titles.csv')

#file created by Hyunwoo (data scraped from API)
main_alt = pd.read_csv('data/main.csv') 

In [151]:
type_netflix.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [152]:
#checking the size of of all movies
movies_master.shape[0]

9515

In [153]:
# Select the columns required for the analyses & change the name of the Prime Video column to Amazon
cols = ['ID','Title', 'Year', 'Age', 'Rotten Tomatoes', 'Netflix', 'Prime Video']
movies_all = movies_master.loc[:,cols].rename(columns={'Prime Video': 'Amazon'})

In [154]:
# Drop titles that are not on Netflix or Amazon
movies = movies_all[(movies_all['Netflix']==1) | (movies_all['Amazon']==1)]

In [155]:
#Checking for the number of unique values for each columns.  
movies.nunique(axis=0)

ID                 7699
Title              7699
Year                102
Age                   5
Rotten Tomatoes      84
Netflix               2
Amazon                2
dtype: int64

In [156]:
#checking for non null values to identity columns that are can be potentially removed.
#the ratings column has a high number of null values, but this column is essential for our analysis.
#so will look to fill the null values with values from other sources

movies.isnull().sum()

ID                    0
Title                 0
Year                  0
Age                3702
Rotten Tomatoes       7
Netflix               0
Amazon                0
dtype: int64

In [157]:
movies.head()

Unnamed: 0,ID,Title,Year,Age,Rotten Tomatoes,Netflix,Amazon
0,1,The Irishman,2019,18+,98/100,1,0
1,2,Dangal,2016,7+,97/100,1,0
2,3,David Attenborough: A Life on Our Planet,2020,7+,95/100,1,0
3,4,Lagaan: Once Upon a Time in India,2001,7+,94/100,1,0
4,5,Roma,2018,18+,94/100,1,0


In [158]:
#uncomment if you want to see the the year range for which the titles are avaiable 
#sorted(movies.Year.unique())

In [159]:
#uncomment if you want to search for specific movies
#movies.loc[movies['Title'].str.contains('Barry', na=False)]

In [160]:
#note that that type is set to one for all rows. Change this to "tvshow"
tv_shows_master.head()

Unnamed: 0.1,Unnamed: 0,ID,Title,Year,Age,IMDb,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+,Type
0,0,1,Breaking Bad,2008,18+,9.4/10,100/100,1,0,0,0,1
1,1,2,Stranger Things,2016,16+,8.7/10,96/100,1,0,0,0,1
2,2,3,Attack on Titan,2013,18+,9.0/10,95/100,1,1,0,0,1
3,3,4,Better Call Saul,2015,18+,8.8/10,94/100,1,0,0,0,1
4,4,5,Dark,2017,16+,8.8/10,93/100,1,0,0,0,1


In [161]:
# Select the columns required for the analyses & change the name of the Prime Video column to Amazon
# Set the type column to tvshows
tv_shows_master.drop(['Unnamed: 0','Hulu','Disney+', 'Type'], axis=1,inplace =True)
tv_shows_master.rename(columns={"Prime Video": "Amazon"}, inplace=True)
tv_shows = tv_shows_master[(tv_shows_master['Netflix']==1) | (tv_shows_master['Amazon']==1)]
tv_shows['Type'] ="tvshow"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tv_shows['Type'] ="tvshow"


In [163]:
#add Type column to movies dataset
movies['Type'] = np.nan
movies['IMDb'] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies['Type'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies['IMDb'] = np.nan


In [164]:
#check the update
tv_shows.head()

Unnamed: 0,ID,Title,Year,Age,IMDb,Rotten Tomatoes,Netflix,Amazon,Type
0,1,Breaking Bad,2008,18+,9.4/10,100/100,1,0,tvshow
1,2,Stranger Things,2016,16+,8.7/10,96/100,1,0,tvshow
2,3,Attack on Titan,2013,18+,9.0/10,95/100,1,0,tvshow
3,4,Better Call Saul,2015,18+,8.8/10,94/100,1,0,tvshow
4,5,Dark,2017,16+,8.8/10,93/100,1,0,tvshow


In [162]:
#check the number of rows
tv_shows.shape[0]

3696

In [165]:
tv_shows.isnull().sum()

ID                    0
Title                 0
Year                  0
Age                1678
IMDb                742
Rotten Tomatoes       0
Netflix               0
Amazon                0
Type                  0
dtype: int64

In [166]:
#merge movies and tvshows datasets
main = pd.concat([movies[tv_shows.columns], tv_shows])
main.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11395 entries, 0 to 5303
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   ID               11395 non-null  int64 
 1   Title            11395 non-null  object
 2   Year             11395 non-null  int64 
 3   Age              6015 non-null   object
 4   IMDb             2954 non-null   object
 5   Rotten Tomatoes  11388 non-null  object
 6   Netflix          11395 non-null  int64 
 7   Amazon           11395 non-null  int64 
 8   Type             3696 non-null   object
dtypes: int64(4), object(5)
memory usage: 890.2+ KB


In [167]:
#Get unique movie titles
all_titles = main['Title'].unique()

In [168]:
#dowload file from https://datasets.imdbws.com/
title_info = pd.DataFrame()
chunksize = 10 ** 5
for chunk in pd.read_csv("data/title.basics.tsv", delimiter='\t', chunksize=chunksize):
    #title_info=title_info.append (chunk)
    title_info=title_info.append (chunk[chunk.originalTitle.isin(all_titles)])

  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):


In [169]:
all_tconst = title_info['tconst'].unique()

In [170]:
#download the ratings file from https://datasets.imdbws.com/
ratings = pd.DataFrame()
chunksize = 10 ** 5
for chunk in pd.read_csv("data/title.ratings.tsv", delimiter='\t', chunksize=chunksize):
    #title_info=title_info.append (chunk)
    ratings=ratings.append (chunk[chunk.tconst.isin(all_tconst)])

In [171]:
#performing checks on unique values. 
title_info.nunique(axis=0)

tconst            89968
titleType            10
primaryTitle       9536
originalTitle      9154
isAdult               4
startYear           198
endYear              73
runtimeMinutes      330
genres             1219
dtype: int64

In [172]:
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
177,tt0000181,6.8,58
217,tt0000245,6.2,48
370,tt0000565,6.3,576
414,tt0000644,5.5,63
429,tt0000668,7.1,50


In [173]:
title_info.isnull().sum()

tconst            0
titleType         0
primaryTitle      0
originalTitle     0
isAdult           0
startYear         0
endYear           0
runtimeMinutes    0
genres            0
dtype: int64

In [174]:
title_info.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
178,tt0000181,short,Cinderella,Cinderella,0,1898,\N,\N,"Fantasy,Short"
242,tt0000245,short,The Haunted House,The Haunted House,0,1899,\N,\N,"Comedy,Fantasy,Horror"
474,tt0000478,short,Behind the Scenes,Behind the Scenes,0,1904,\N,\N,"Comedy,Short"
561,tt0000565,short,The Night Before Christmas,The Night Before Christmas,0,1905,\N,9,"Animation,Family,Fantasy"
638,tt0000644,short,Behind the Scenes,Behind the Scenes,0,1908,\N,9,"Drama,Short"


In [175]:
#uncomment if you want to test movies in the title_info file
#title_info[(title_info['originalTitle'] == 'Cinderella') &(title_info['startYear'] == '1965')]


In [176]:
#uncomment if you want to compare movies from title and movies datasets
#main[main['Title'] == 'Cinderella']

In [177]:
#Replace first \N with nan value
title_info = title_info.replace('\\N',np.NaN)
title_info.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
178,tt0000181,short,Cinderella,Cinderella,0,1898,,,"Fantasy,Short"
242,tt0000245,short,The Haunted House,The Haunted House,0,1899,,,"Comedy,Fantasy,Horror"
474,tt0000478,short,Behind the Scenes,Behind the Scenes,0,1904,,,"Comedy,Short"
561,tt0000565,short,The Night Before Christmas,The Night Before Christmas,0,1905,,9.0,"Animation,Family,Fantasy"
638,tt0000644,short,Behind the Scenes,Behind the Scenes,0,1908,,9.0,"Drama,Short"


In [178]:
#Remove entries for years less than 1912 (since we know that movies dataset only has movies from 1912), 
#genres that are talkshows, and titles that are short, videos, tvepisodes, tvminiseries, videogames and tvspecials
#also removed tvepisode since they didn't seem relevent for our analysis. 
title_info_clean = title_info[((title_info['startYear'].astype(np.float)>=1912) 
                               & (title_info['genres'].str.contains('Talk-Show') == False)
                               & (title_info['titleType'].isin(['short','video','tvShort','tvEpisode', 'tvSpecial', 'tvMiniSeries', 'videoGame']) == False)
                              )]

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  title_info_clean = title_info[((title_info['startYear'].astype(np.float)>=1912)


In [179]:
#testing to check the titletypes available
title_info_clean.titleType.unique()

array(['movie', 'tvSeries', 'tvMovie'], dtype=object)

In [30]:
#movies['Year'].dtypes

In [31]:
#After removing the unwanted years, the duplicates reduced from 68716 to 50444
#but if you just check the unique values, there are about 28470
#uncomment to check unique values
#title_info_clean.loc[title_info_clean.originalTitle.duplicated(),['originalTitle']].originalTitle.unique().size
#title_info_clean.loc[title_info_clean.originalTitle.duplicated(),['originalTitle']].originalTitle.unique


In [32]:
#I checked a couple of them and the duplicates seem to be legitimate (Oliver Twist for example). Meaning some are movies that came in different years or shows
#The other point is that the movies database doesn't seem to be comprehesive enough. 
#also remove entries which are talk shows. 
#Strange that some of the movies are not in this file downloaded from IMDB: Along with the Gods: The Two Worlds, The First King 
#Though you can find IMDB ratings for them when you search for the movie names. 

#uncomment to test movies with names that are in movies database and not in IMDB database
#title_info.loc[title_info['originalTitle'].str.contains('I Am Jonas', na=False)]
#title_info_clean.loc[title_info_clean['originalTitle'].str.contains('Along with the Gods: The Two Worlds', na=False)]
#title_info_clean.loc[((title_info_clean['originalTitle'].str.contains('Kill Chain', na=False)) & (title_info_clean['startYear'] == '2019'))]
#movies.loc[movies['Title'].str.contains('Oliver Twist', na=False)]

In [180]:
#since there are duplicate movie names with some being movies and other shows. Take the example of Oliver Twist above. 
#To exactly match the type, we also need to match the year of the movie/shows

title_info_clean_small = title_info_clean[['tconst','titleType','originalTitle','startYear','runtimeMinutes','genres']]
title_info_clean_small = title_info_clean_small[title_info_clean_small['originalTitle'].isin(all_titles)]
title_info_clean_small = title_info_clean_small.drop_duplicates(
  subset = ['originalTitle', 'startYear','titleType']).reset_index(drop = True)
title_info_clean_small = title_info_clean_small.rename(columns={"originalTitle": "Title","titleType":"type","startYear":"Year"})


In [34]:
#uncomment if you want to test match for a movie and year combination
#title= 'Terminator: Dark Fate'
#year = 2019
#title_info_clean.loc[((title_info_clean['originalTitle']== title) & (title_info_clean['startYear'].astype(int) == year)),'titleType']

In [287]:
#testing the code in the function
#uncomment for testing purposes
#testing matching with three files, IMDB file, netflix title file and Amazon title file
#Use the below title for testing titles in type_amazon file 
#title= 'Danger Close'
#title ='Terminator: Dark Fate' year = 2019
# title = 'Psycho'
# year = 2020
# runtime = np.nan
# ctry =""
# imdb_rating = np.nan
# age = np.nan
# votes =np.nan
# genres = np.nan
# old_age = '18+'

# print("title:" + title + " Year: " + str(year))
# #get type from title_info (IMDB file)
# smatch_info = title_info_clean_small.loc[((title_info_clean_small['Title']== title) & (title_info_clean_small['Year'].astype(int) == year))]
# print(smatch_info)
# #if there is a match, get type, genres, runtime minutes
# if(smatch_info.empty == False):
#     if(smatch_info['type'].empty == False):
#         stype = smatch_info['type'].values[0]
#     sconst = smatch_info['tconst'].values[0]
#     runtime = smatch_info['runtimeMinutes'].values[0]
#     genres = smatch_info['genres'].values[0]
    
#     print("match")
#     print(stype)
#     print(sconst)
    

#     #get IMDB rating and number of votes from IMDB ratings file
#     smatch_ratings = ratings.loc[(ratings['tconst']== sconst)]
    
#     #if there is a match
#     if (smatch_ratings.empty == False):
#         imdb_rating = smatch_ratings['averageRating'].values[0]
#         votes = smatch_ratings['numVotes'].values[0]
#         print(IMDB_rating)
#     #if not match, check in types_amazon and types_netflix files
# elif((type_amazon.loc[((type_amazon['title']== title) & (type_amazon['release_year'].astype(int) == year)),'type']).empty == False):
#         print("match in amazon file")
#         stype = (type_amazon.loc[((type_amazon['title']== title)),'type']).values[0].lower()
#         ctry = (type_amazon.loc[((type_amazon['title']== title)),'country']).values[0]
#         runtime = type_amazon.loc[((type_amazon['title']== title)),'duration'].values[0]
#         age = type_amazon.loc[((type_amazon['title']== title)),'rating'].values[0]
#         genres = type_amazon.loc[((type_amazon['title']== title)),'listed_in'].values[0]
#         print("In type" + stype)
#         print(runtime)
# elif((type_netflix.loc[(type_netflix['title']== title)]).empty == False):
#         print("match in netflix file")
#         print(type_netflix.loc[(type_netflix['title']== title)])
#         stype = type_netflix.loc[((type_netflix['title']== title)),'type'].values[0].lower()
#         ctry = type_netflix.loc[((type_netflix['title']== title)),'country'].values[0]
#         runtime = type_netflix.loc[((type_netflix['title']== title)),'duration'].values[0]
#         age = type_netflix.loc[((type_netflix['title']== title)),'rating'].values[0]
#         genres = type_netflix.loc[((type_netflix['title']== title)),'listed_in'].values[0]
#         print(stype)
# else:
#          print("no value")
# print(ctry)
# if(str(imdb_rating) == "" or str(genres) == "" or str(age) == "" or str(ctry) == ""):
#     print("do something")
# if(type(age) == float):
#    if(np.isnan(age) == True):
#         age = old_age
# print((stype, imdb_rating, runtime, ctry,age,votes,genres))


        
        

title:Psycho Year: 2020
           tconst      type   Title  Year runtimeMinutes          genres
7435   tt12827674  tvSeries  Psycho  2020             60           Drama
16349   tt9109976     movie  Psycho  2020            134  Crime,Thriller
match
tvSeries
tt12827674
6.2

do something
('tvSeries', 7.1, '60', '', '18+', 355, 'Drama')


In [285]:
## Add additonal features number of votes, country and duration 
main['votes'] =np.nan
main['country']=np.nan
main['runtime']=np.nan
main['Age'] =np.nan
main['genres'] =np.nan

In [183]:
#prior to mapping with IMDB database, the null type values are 9515 and IMDB = 10477
main.isnull().sum()

ID                     0
Title                  0
Year                   0
Age                11395
IMDb                8441
Rotten Tomatoes        7
Netflix                0
Amazon                 0
Type                7699
votes              11395
country            11395
runtime            11395
genres             11395
dtype: int64

In [368]:
#main_small  = main.head(10)
main_small.head()

Unnamed: 0,ID,Title,Year,Age,IMDb,Rotten Tomatoes,Netflix,Amazon,Type,votes,country,runtime,genres
0,1,The Irishman,2019,R,7.8,98/100,1,0,movie,360756,United States,209,"Biography,Crime,Drama"
1,2,Dangal,2016,TV-PG,8.4,97/100,1,0,movie,176090,India,161,"Action,Biography,Drama"
2,3,David Attenborough: A Life on Our Planet,2020,PG,9.0,95/100,1,0,movie,30235,"United States, United Kingdom",83,"Biography,Documentary"
3,4,Lagaan: Once Upon a Time in India,2001,,8.1,94/100,1,0,movie,110501,,224,"Drama,Musical,Sport"
4,5,Roma,2018,,7.7,94/100,1,0,movie,151255,,135,Drama


In [455]:
#Function to set the type for all entries in the movies dataset
#title, year,oldtype, old_IMDB_rating, old_age
def get_type(movies_df):
    
    title = movies_df['Title']
    year  = movies_df['Year']
    oldtype = movies_df['Type']
    old_IMDB_rating = movies_df['IMDb']
    old_age = movies_df['Age']
    
    #print("title:" + title + " Year: " + str(year) + " Old Type:" + str(oldtype) +  " IMDB_rating:" +str(old_IMDB_rating))
    stype =""
    imdb_rating = ""
    runtime = ""
    ctry =""
    age = ""
    votes =np.nan
    genres = ""
    
    #get type from title_info (IMDB file)
    smatch_info = title_info_clean_small.loc[((title_info_clean_small['Title']== title) & (title_info_clean_small['Year'].astype(int) == year))]
    #print(smatch_info)
    
    #if there is a match, get type, genres, runtime minutes
    if(smatch_info.empty == False):

        if(smatch_info['type'].empty == False):
            stype = smatch_info['type'].values[0]
        sconst = smatch_info['tconst'].values[0]
        runtime = smatch_info['runtimeMinutes'].values[0]
        genres = smatch_info['genres'].values[0]

        #print("match")
        #print(stype)
        #print(sconst)


        #get IMDB rating and number of votes from IMDB ratings file
        smatch_ratings = ratings.loc[(ratings['tconst']== sconst)]

        #if there is a match
        if (smatch_ratings.empty == False):
            imdb_rating = smatch_ratings['averageRating'].values[0]
            votes = smatch_ratings['numVotes'].values[0]
            #print(imdb_rating)
    #if not match, check in types_amazon and types_netflix files. not matching years because of year mismatches
    elif((type_amazon.loc[((type_amazon['title']== title))]).empty == False):
        #print("match in amazon file")
        stype = (type_amazon.loc[((type_amazon['title']== title)),'type']).values[0].lower()
        ctry = (type_amazon.loc[((type_amazon['title']== title)),'country']).values[0]
        runtime = type_amazon.loc[((type_amazon['title']== title)),'duration'].values[0]
        age = type_amazon.loc[((type_amazon['title']== title)),'rating'].values[0]
        genres = type_amazon.loc[((type_amazon['title']== title)),'listed_in'].values[0]
        #print("In type" + stype)
        #print(runtime)
    elif((type_netflix.loc[((type_netflix['title']== title))]).empty == False):
        #print("match in netflix file")
        stype = type_netflix.loc[((type_netflix['title']== title)),'type'].values[0].lower()
        ctry = type_netflix.loc[((type_netflix['title']== title)),'country'].values[0]
        runtime = type_netflix.loc[((type_netflix['title']== title)),'duration'].values[0]
        age = type_netflix.loc[((type_netflix['title']== title)),'rating'].values[0]
        genres = type_netflix.loc[((type_netflix['title']== title)),'listed_in'].values[0]
        #print(stype)
    elif((main_alt.loc[((main_alt['Title']== title) & (main_alt['Year'].astype(int) == year))]).empty == False):
        #print("here")
        mmatch = main_alt.loc[((main_alt['Title']== title) & (main_alt['Year'].astype(int) == year))]
        stype = mmatch['Type'].values[0]
        imdb_rating = mmatch['IMDb'].values[0]
        age = mmatch['Age'].values[0]
        tempgenres = ast.literal_eval(mmatch['Genres'].values[0])
        genres = ",".join(tempgenres)
        #print(genres)
        #print(stype)
    else:
        stype = oldtype
    
#check if rating,runtime or genre is empty
    if(str(runtime) == "" or str(genres) == "" or str(age) == "" or str(ctry) == ""):
        #print("here")
        smatch = type_amazon.loc[((type_amazon['title']== title))]
    
        if(smatch.empty == False):
            #print("match in amazon file")
            if(str(ctry)==""):
                ctry = (type_amazon.loc[((type_amazon['title']== title)),'country']).values[0]
                if(str(runtime) ==""):
                    runtime = type_amazon.loc[((type_amazon['title']== title)),'duration'].values[0]
                if(str(age) == ""):
                    age = type_amazon.loc[((type_amazon['title']== title)),'rating'].values[0]
                if(genres == ""):
                    genres = type_amazon.loc[((type_amazon['title']== title)),'listed_in'].values[0]
        elif((type_netflix.loc[((type_netflix['title']== title))]).empty == False):
            smatch = type_netflix.loc[((type_netflix['title']== title))]
            #print("match in netflix file")
            if(smatch.empty == False):
                if(str(ctry)==""):
                    ctry = type_netflix.loc[((type_netflix['title']== title)),'country'].values[0]
                if(str(runtime) == ""):
                    runtime = type_netflix.loc[((type_netflix['title']== title)),'duration'].values[0]
                if(str(age) == ""):
                    age = type_netflix.loc[((type_netflix['title']== title)),'rating'].values[0]
                if(genres == ""):
                    genres = type_netflix.loc[((type_netflix['title']== title)),'listed_in'].values[0]
        else:
            mmatch = main_alt.loc[((main_alt['Title']== title) & (main_alt['Year'].astype(int) == year))]
            if(mmatch.empty == False):
                if(stype ==""):
                    stype = mmatch['Type'].values[0].lower()
                if(imdb_rating ==""):
                    imdb_rating = mmatch['IMDb'].values[0]
                if(str(age) == ""):
                    age = mmatch['Age'].values[0]
                if(genres == ""):
                    tempgenres = ast.literal_eval(mmatch['Genres'].values[0])
                    genres = ",".join(tempgenres)
                    #print(genres)

    if(stype == 'tv show' or stype =='tvSeries' or stype == 'tv series' or stype == 'tv mini series'):
        stype = "tvshow"
    if(stype == 'tvMovie' or stype == 'tv movie'):
        stype = 'movie'
    
    #print("IMDB new rating" + str(imdb_rating))
    #print("IMDB old rating" + str(old_IMDB_rating))
    
    if(str(imdb_rating) == ""):
#         if(np.isnan(imdb_rating) == True):
            imdb_rating = old_IMDB_rating
    
    if(str(age) ==""):
        age = old_age
        
    if(ctry == ""):
        ctry = np.nan
    if(runtime == ""):
        runtime = np.nan
    if(str(age) ==""):
        age = np.nan
    if(stype == ""):
        stype = np.nan
    if(genres == ""):
        genres = np.nan
    
    
    #print all values
    #print(stype, imdb_rating, runtime, ctry,age,votes,genres)
    
    return (stype, imdb_rating, runtime, ctry,age,votes,genres)


In [456]:
#testing on a small set
main_small[['Type','IMDb','runtime','country','Age','votes','genres']] \
 =main_small.apply(get_type , axis=1,result_type="expand")

In [457]:
main[['Type','IMDb','runtime','country','Age','votes','genres']] \
 =main.apply(get_type , axis=1,result_type="expand")

In [437]:
#post mapping with IMDB database, the null type values are 9515 and IMDB = 10477
main.isnull().sum()

ID                    0
Title                 0
Year                  0
Age                3009
IMDb               2568
Rotten Tomatoes       7
Netflix               0
Amazon                0
Type                111
votes              4610
country            6745
runtime            2540
genres              941
dtype: int64

In [438]:
#checking the values for movies and tvshows
main.groupby(['Type']).agg('count')

Unnamed: 0_level_0,ID,Title,Year,Age,IMDb,Rotten Tomatoes,Netflix,Amazon,votes,country,runtime,genres
Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
episode,63,63,63,28,24,63,63,63,0,0,0,63
movie,7521,7521,7521,6181,5784,7514,7521,7521,4988,3160,6660,7521
music video,1,1,1,1,1,1,1,1,0,0,0,1
podcast episode,3,3,3,1,0,3,3,3,0,0,0,3
tv short,8,8,8,5,8,8,8,8,0,0,0,8
tvshow,3616,3616,3616,2107,2942,3616,3616,3616,1797,1490,2195,2675
video movie,72,72,72,34,68,72,72,72,0,0,0,72


In [439]:
main[main['Type'].isin(['video movie'])]

Unnamed: 0,ID,Title,Year,Age,IMDb,Rotten Tomatoes,Netflix,Amazon,Type,votes,country,runtime,genres
1456,1457,Dragons: Dawn Of The Dragon Racers,2014,,7.0,57/100,1,0,video movie,,,,"Animation,Short,Adventure,Comedy,Family,Fantasy"
1492,1493,American Pie Presents: Girls' Rules,2020,18+,3.8,57/100,1,0,video movie,,,,Comedy
1546,1547,Katt Williams: American Hustle,2007,18+,6.7,56/100,1,0,video movie,,,,"Documentary,Comedy"
1955,1956,Barbie: Dolphin Magic,2017,all,5.7,52/100,1,0,video movie,,,,"Animation,Adventure,Family,Fantasy"
2091,2092,Hulk: Where Monsters Dwell,2016,7+,5.4,51/100,1,0,video movie,,,,"Animation,Action,Fantasy,Sci-Fi"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8548,8549,Guilty by Association,2003,18+,2.4,30/100,0,1,video movie,,,,"Action,Crime,Drama"
8553,8554,Air Rage,2001,18+,3.2,29/100,0,1,video movie,,,,"Action,Adventure"
8559,8560,Feeders 2: Slay Bells,1998,18+,2.5,28/100,0,1,video movie,,,,"Comedy,Horror,Sci-Fi"
8565,8566,1313: Giant Killer Bees!,2011,16+,2.0,26/100,0,1,video movie,,,,"Horror,Sci-Fi"


# Add emmy data to the main dataset

In [440]:
# Get a subset of the emmy dataset where titles & years are matched
emmy_1 = emmy.loc[emmy['nominee'].isin(main.Title) & emmy['year'].isin(main.Year)][['nominee', 'year', 'win']]
emmy_1.shape[0]

2403

In [441]:
# Get the number of nominations for each title in the emmy dataset
emmy_nomination_count = emmy_1.groupby(['nominee', 'year']).count()
emmy_nomination_count.rename_axis(index={'nominee': 'Title', 'year': 'Year'}, inplace=True)
emmy_nomination_count.rename(columns={'win': 'emmy_nomination'}, inplace=True)
#uncomment if you want to see the dataset
#emmy_nomination_count.sort_values('emmy_nomination', ascending=False).head(60)

In [442]:
# Add the number of nominations for each title to the main dataset
main_1 = main.set_index(['Title', 'Year']).join(emmy_nomination_count)
main_1.sort_values('emmy_nomination', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,ID,Age,IMDb,Rotten Tomatoes,Netflix,Amazon,Type,votes,country,runtime,genres,emmy_nomination
Title,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
The Pacific,2010,3611,,8.3/10,83/100,0,1,tvshow,,,,,24.0
When They See Us,2019,44,TV-MA,8.9/10,86/100,1,0,tvshow,,United States,1 Season,"Crime TV Shows, TV Dramas",16.0
Russian Doll,2019,90,TV-MA,7.8,82/100,1,0,tvshow,77962.0,United States,30,"Adventure,Comedy,Drama",13.0
The Night Manager,2016,3601,18+,8.1/10,86/100,0,1,tvshow,,,1 Season,"Action, Drama, Suspense",12.0
Genius,2017,2152,TV-14,8.3,74/100,1,1,tvshow,17948.0,India,43,"Biography,Drama,History",10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
Lego Set Builds Chima - Artifex,2017,5375,,,10/100,0,1,tvshow,,,,,
The Bad Batch,2021,5385,R,8.2/10,82/100,1,0,movie,,United States,119 min,"Dramas, Independent Movies, Thrillers",
The Muppets,2015,5425,PG,7.4/10,64/100,1,0,movie,,United States,104 min,"Children & Family Movies, Comedies",
Pick of the Litter,2019,5575,TV-PG,8.1/10,47/100,1,0,movie,,United States,81 min,"Children & Family Movies, Documentaries",


In [443]:
# Get the number of wins for each title in the emmy dataset
emmy_win_count = emmy_1.groupby(['nominee', 'year']).sum()
emmy_win_count.rename_axis(index={'nominee': 'Title', 'year': 'Year'}, inplace=True)
emmy_win_count.rename(columns={'win': 'emmy_win'}, inplace=True)
#uncomment to see the results
#emmy_win_count.sort_values('emmy_win', ascending=False).head(60)

In [444]:
# Add the number of wins for each title to the main dataset
main_2 = main_1.join(emmy_win_count)
main_2.sort_values('emmy_win', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,ID,Age,IMDb,Rotten Tomatoes,Netflix,Amazon,Type,votes,country,runtime,genres,emmy_nomination,emmy_win
Title,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
The Pacific,2010,3611,,8.3/10,83/100,0,1,tvshow,,,,,24.0,8.0
"Love, Death & Robots",2019,45,TV-MA,8.5,86/100,1,0,tvshow,129141.0,United States,15,"Action,Adventure,Animation",6.0,5.0
House of Cards,2013,28,TV-MA,8.7,88/100,1,0,tvshow,486771.0,United States,51,Drama,7.0,3.0
Queer Eye,2018,140,TV-14,8.5,78/100,1,0,tvshow,17109.0,United States,45,Reality-TV,4.0,3.0
Star Trek: Deep Space Nine,1993,105,TV-PG,8.0,80/100,1,1,tvshow,59093.0,,45,"Action,Adventure,Drama",7.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Lego Set Builds Chima - Artifex,2017,5375,,,10/100,0,1,tvshow,,,,,,
The Bad Batch,2021,5385,R,8.2/10,82/100,1,0,movie,,United States,119 min,"Dramas, Independent Movies, Thrillers",,
The Muppets,2015,5425,PG,7.4/10,64/100,1,0,movie,,United States,104 min,"Children & Family Movies, Comedies",,
Pick of the Litter,2019,5575,TV-PG,8.1/10,47/100,1,0,movie,,United States,81 min,"Children & Family Movies, Documentaries",,


# Add oscar data to the main dataset

In [445]:
# Get a subset of the emmy dataset where titles & years are matched
oscar_1 = oscar.loc[oscar.film.isin(main.Title) & oscar.year_film.isin(main.Year)][['year_film', 'film', 'winner']]
oscar_1.shape[0]

1150

In [446]:
# Get the number of nominations for each title in the oscar dataset
oscar_nomination_count = oscar_1.groupby(['film', 'year_film']).count()
oscar_nomination_count.rename_axis(index={'film': 'Title', 'year_film': 'Year'}, inplace=True)
oscar_nomination_count.rename(columns={'winner': 'oscar_nomination'}, inplace=True)
#uncomment to see results
#oscar_nomination_count.sort_values('oscar_nomination', ascending=False).head(60)

In [447]:
# Add the number of nominations for each title to the main dataset
main_3 = main_2.join(oscar_nomination_count)
main_3.sort_values('oscar_nomination', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,ID,Age,IMDb,Rotten Tomatoes,Netflix,Amazon,Type,votes,country,runtime,genres,emmy_nomination,emmy_win,oscar_nomination
Title,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
My Fair Lady,1964,285,G,7.8,75/100,1,0,movie,92512.0,United States,170,"Drama,Family,Musical",,,12.0
On the Waterfront,1954,4757,7+,8.1,79/100,0,1,movie,150281.0,,108,"Crime,Drama,Thriller",,,12.0
Hugo,2011,174,PG,7.5,79/100,1,0,movie,316144.0,"United Kingdom, United States, France",126,"Adventure,Drama,Family",,,11.0
Chinatown,1974,112,18+,8.2,82/100,1,0,movie,313879.0,,130,"Drama,Mystery,Thriller",,,11.0
The Pride of the Yankees,1942,5106,,7.7,65/100,0,1,movie,10623.0,,128,"Biography,Drama,Romance",,,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Lego Set Builds Chima - Artifex,2017,5375,,,10/100,0,1,tvshow,,,,,,,
The Bad Batch,2021,5385,R,8.2/10,82/100,1,0,movie,,United States,119 min,"Dramas, Independent Movies, Thrillers",,,
The Muppets,2015,5425,PG,7.4/10,64/100,1,0,movie,,United States,104 min,"Children & Family Movies, Comedies",,,
Pick of the Litter,2019,5575,TV-PG,8.1/10,47/100,1,0,movie,,United States,81 min,"Children & Family Movies, Documentaries",,,


In [448]:
# Get the number of wins for each title in the emmy dataset
oscar_win_count = oscar_1.groupby(['film', 'year_film']).sum()
oscar_win_count.rename_axis(index={'film': 'Title', 'year_film': 'Year'}, inplace=True)
oscar_win_count.rename(columns={'winner': 'oscar_win'}, inplace=True)
#uncomment to see results
#oscar_win_count.sort_values('oscar_win', ascending=False).head(60)

In [449]:
# Add the number of wins for each title to the main dataset
main_4 = main_3.join(oscar_win_count)
main_4.sort_values('oscar_win', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,ID,Age,IMDb,Rotten Tomatoes,Netflix,Amazon,Type,votes,country,runtime,genres,emmy_nomination,emmy_win,oscar_nomination,oscar_win
Title,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
On the Waterfront,1954,4757,7+,8.1,79/100,0,1,movie,150281.0,,108,"Crime,Drama,Thriller",,,12.0,8.0
My Fair Lady,1964,285,G,7.8,75/100,1,0,movie,92512.0,United States,170,"Drama,Family,Musical",,,12.0,8.0
Patton,1970,4768,7+,7.9,77/100,0,1,movie,99677.0,,172,"Biography,Drama,War",,,10.0,7.0
A Place in the Sun,1951,4912,13+,7.8,70/100,0,1,movie,21926.0,,122,"Drama,Romance",,,9.0,6.0
The French Connection,1971,4775,18+,7.7,77/100,0,1,movie,117825.0,,104,"Action,Crime,Drama",,,8.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Lego Set Builds Chima - Artifex,2017,5375,,,10/100,0,1,tvshow,,,,,,,,
The Bad Batch,2021,5385,R,8.2/10,82/100,1,0,movie,,United States,119 min,"Dramas, Independent Movies, Thrillers",,,,
The Muppets,2015,5425,PG,7.4/10,64/100,1,0,movie,,United States,104 min,"Children & Family Movies, Comedies",,,,
Pick of the Litter,2019,5575,TV-PG,8.1/10,47/100,1,0,movie,,United States,81 min,"Children & Family Movies, Documentaries",,,,


In [450]:
main_final = main_4.reset_index()
main_final.head()

Unnamed: 0,Title,Year,ID,Age,IMDb,Rotten Tomatoes,Netflix,Amazon,Type,votes,country,runtime,genres,emmy_nomination,emmy_win,oscar_nomination,oscar_win
0,The Irishman,2019,1,R,7.8,98/100,1,0,movie,360756.0,United States,209,"Biography,Crime,Drama",,,10.0,0.0
1,Dangal,2016,2,TV-PG,8.4,97/100,1,0,movie,176090.0,India,161,"Action,Biography,Drama",,,,
2,David Attenborough: A Life on Our Planet,2020,3,PG,9.0,95/100,1,0,movie,30235.0,"United States, United Kingdom",83,"Biography,Documentary",,,,
3,Lagaan: Once Upon a Time in India,2001,4,7+,8.1,94/100,1,0,movie,110501.0,,224,"Drama,Musical,Sport",,,,
4,Roma,2018,5,18+,7.7,94/100,1,0,movie,151255.0,,135,Drama,,,10.0,3.0


In [451]:
main_final.isnull().sum()

Title                   0
Year                    0
ID                      0
Age                  3009
IMDb                 2568
Rotten Tomatoes         7
Netflix                 0
Amazon                  0
Type                  111
votes                4610
country              6745
runtime              2540
genres                941
emmy_nomination     11336
emmy_win            11336
oscar_nomination    11148
oscar_win           11148
dtype: int64

In [454]:
main_final.to_csv("data/main_final.csv",index=False,encoding='utf-8')

 # Other general tests

In [315]:
type_netflix.isnull().sum()

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

In [249]:
type_amazon.groupby("type").agg("count")

Unnamed: 0_level_0,show_id,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Movie,7814,7814,7586,7050,569,16,7814,7483,7814,7814,7814
TV Show,1854,1854,0,1385,103,139,1854,1848,1854,1854,1854


In [231]:
movies[movies['Title'] == 'Psycho']

Unnamed: 0,ID,Title,Year,Age,Rotten Tomatoes,Netflix,Amazon,Type,IMDb
1774,1775,Psycho,2020,18+,54/100,1,0,,


In [258]:
#list of titles that got tagged as tvshows from the movies dataset
#main_5[((main_5['Title'].isin(movie_names)) & (main_5['Type'] == 'tvshow'))]

In [357]:
main_final.loc[main_final['Type'].isnull()]

Unnamed: 0,Title,Year,ID,Age,IMDb,Rotten Tomatoes,Netflix,Amazon,Type,votes,country,runtime,genres,emmy_nomination,emmy_win,oscar_nomination,oscar_win
11,Jim & Andy: The Great Beyond- Featuring a Very...,2017,12,,,92/100,1,0,,,,,,,,,
48,The Guernsey Literary & Potato Peel Pie Society,2018,49,,,87/100,1,0,,,,,,,,,
99,Adam Sandler: 100% Fresh,2018,100,,,83/100,1,0,,,,,,,,,
101,A Twelve-Year Night,2018,102,,,83/100,1,0,,,,,,,,,
114,Springsteen On Broadway,2018,115,,,82/100,1,0,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7691,The Amazing Adventure of Marchello the Cat,2017,8608,,,13/100,0,1,,,,,,,,,
7693,Little Pim: Let's Count - Spanish for Kids,2010,8610,,,13/100,0,1,,,,,,,,,
7695,Rollin: The Fall of the Auto Industry and the ...,2016,8612,,,13/100,0,1,,,,,,,,,
7696,The Fairies: Fairy Beach,2007,8613,,,12/100,0,1,,,,,,,,,


In [354]:
null_types = main_final.loc[main_final['Type'].isnull(),'Title'].unique()

In [252]:
main_final[main_final['Amazon']==1].groupby(['Amazon','Type']).agg('count')

Unnamed: 0_level_0,Unnamed: 1_level_0,Title,Year,ID,Age,IMDb,Rotten Tomatoes,Netflix,votes,country,runtime,genres,emmy_nomination,emmy_win,oscar_nomination,oscar_win
Amazon,Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,movie,3529,3529,3529,489,2966,3529,3529,2950,54,3498,3529,3,3,146,146
1,tvshow,1786,1786,1786,187,1195,1786,1786,776,30,801,1032,20,20,1,1


In [253]:
main_final[main_final['Netflix']==1].groupby(['Netflix','Type']).agg('count')

Unnamed: 0_level_0,Unnamed: 1_level_0,Title,Year,ID,Age,IMDb,Rotten Tomatoes,Amazon,votes,country,runtime,genres,emmy_nomination,emmy_win,oscar_nomination,oscar_win
Netflix,Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,movie,3313,3313,3313,1150,2143,3306,3313,2116,1009,3271,3313,4,4,100,100
1,tvshow,1923,1923,1923,614,1836,1923,1923,1095,528,1475,1730,36,36,0,0


In [347]:
main_alt.head()

Unnamed: 0.1,Unnamed: 0,Title,Year,Age,Rotten Tomatoes,Netflix,Amazon,emmy_nomination,emmy_win,oscar_nomination,oscar_win,Type,IMDb,Genres
0,0,The Irishman,2019,18+,98/100,1,0,0,0,10,0,movie,7.8,"['Biography', 'Crime', 'Drama']"
1,1,Dangal,2016,7+,97/100,1,0,0,0,0,0,movie,8.4,"['Action', 'Biography', 'Drama', 'Sport']"
2,2,David Attenborough: A Life on Our Planet,2020,7+,95/100,1,0,0,0,0,0,movie,9.0,"['Documentary', 'Biography']"
3,3,Lagaan: Once Upon a Time in India,2001,7+,94/100,1,0,0,0,0,0,movie,8.1,"['Drama', 'Musical', 'Sport']"
4,4,Roma,2018,18+,94/100,1,0,0,0,10,3,movie,7.7,['Drama']


In [348]:
main_alt.nunique(axis=0)

Unnamed: 0          7699
Title               7699
Year                 102
Age                    5
Rotten Tomatoes       84
Netflix                2
Amazon                 2
emmy_nomination       13
emmy_win               7
oscar_nomination      13
oscar_win              9
Type                  11
IMDb                  82
Genres              1139
dtype: int64

In [349]:
main_alt.isnull().sum()

Unnamed: 0             0
Title                  0
Year                   0
Age                 3702
Rotten Tomatoes        7
Netflix                0
Amazon                 0
emmy_nomination        0
emmy_win               0
oscar_nomination       0
oscar_win              0
Type                 212
IMDb                 492
Genres                 0
dtype: int64

In [350]:
main_alt.size

107786

In [355]:
main_alt[main_alt['Title'].isin(null_types)]

Unnamed: 0.1,Unnamed: 0,Title,Year,Age,Rotten Tomatoes,Netflix,Amazon,emmy_nomination,emmy_win,oscar_nomination,oscar_win,Type,IMDb,Genres
11,11,Jim & Andy: The Great Beyond- Featuring a Very...,2017,18+,92/100,1,0,0,0,0,0,movie,7.7,['Documentary']
48,48,The Guernsey Literary & Potato Peel Pie Society,2018,16+,87/100,1,0,0,0,0,0,movie,7.4,"['Drama', 'Romance', 'War']"
99,99,Adam Sandler: 100% Fresh,2018,18+,83/100,1,0,1,0,0,0,movie,7.6,"['Comedy', 'Music']"
101,101,A Twelve-Year Night,2018,18+,83/100,1,0,0,0,0,0,movie,7.5,"['Adventure', 'Biography', 'Crime', 'Drama', '..."
114,114,Springsteen On Broadway,2018,,82/100,1,0,2,1,0,0,tv movie,8.5,"['Documentary', 'Music']"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7691,8607,The Amazing Adventure of Marchello the Cat,2017,,13/100,0,1,0,0,0,0,,,['nan']
7693,8609,Little Pim: Let's Count - Spanish for Kids,2010,,13/100,0,1,0,0,0,0,,,['nan']
7695,8611,Rollin: The Fall of the Auto Industry and the ...,2016,,13/100,0,1,0,0,0,0,movie,6.5,"['Documentary', 'Crime', 'News']"
7696,8612,The Fairies: Fairy Beach,2007,all,12/100,0,1,0,0,0,0,tv movie,,['Family']


In [351]:
main_alt[main_alt['Amazon']==1].groupby(['Amazon','Type']).agg('count')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,Title,Year,Age,Rotten Tomatoes,Netflix,emmy_nomination,emmy_win,oscar_nomination,oscar_win,IMDb,Genres
Amazon,Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,episode,95,95,95,56,95,95,95,95,95,95,39,95
1,movie,3519,3519,3519,1852,3519,3519,3519,3519,3519,3519,3463,3519
1,music video,1,1,1,1,1,1,1,1,1,1,1,1
1,podcast episode,9,9,9,5,9,9,9,9,9,9,0,9
1,tv mini series,4,4,4,1,4,4,4,4,4,4,4,4
1,tv movie,268,268,268,165,268,268,268,268,268,268,265,268
1,tv series,16,16,16,4,16,16,16,16,16,16,13,16
1,tv short,3,3,3,1,3,3,3,3,3,3,3,3
1,video game,1,1,1,0,1,1,1,1,1,1,1,1
1,video movie,115,115,115,59,115,115,115,115,115,115,106,115


In [352]:
main_alt[main_alt['Netflix']==1].groupby(['Netflix','Type']).agg('count')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,Title,Year,Age,Rotten Tomatoes,Amazon,emmy_nomination,emmy_win,oscar_nomination,oscar_win,IMDb,Genres
Netflix,Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,episode,103,103,103,43,103,103,103,103,103,103,47,103
1,movie,3241,3241,3241,1714,3238,3241,3241,3241,3241,3241,3180,3241
1,podcast episode,10,10,10,2,10,10,10,10,10,10,0,10
1,podcast series,2,2,2,0,2,2,2,2,2,2,0,2
1,tv mini series,4,4,4,2,4,4,4,4,4,4,3,4
1,tv movie,120,120,120,51,120,120,120,120,120,120,114,120
1,tv series,13,13,13,5,13,13,13,13,13,13,10,13
1,tv short,10,10,10,7,10,10,10,10,10,10,10,10
1,video game,3,3,3,3,3,3,3,3,3,3,3,3
1,video movie,58,58,58,36,58,58,58,58,58,58,50,58


In [68]:
main_alt.groupby(['Netflix','Amazon']).agg('count')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,Title,Year,Age,Rotten Tomatoes,emmy_nomination,emmy_win,oscar_nomination,oscar_win,Type,IMDb,Genres
Netflix,Amazon,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,1,4004,4004,4004,2099,4004,4004,4004,4004,4004,3923,3790,4004
1,0,3586,3586,3586,1832,3579,3586,3586,3586,3586,3456,3312,3586
1,1,109,109,109,66,109,109,109,109,109,108,105,109


In [69]:
main_alt.groupby(['Type']).agg('count')

Unnamed: 0_level_0,Unnamed: 0,Title,Year,Age,Rotten Tomatoes,Netflix,Amazon,emmy_nomination,emmy_win,oscar_nomination,oscar_win,IMDb,Genres
Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
episode,197,197,197,98,197,197,197,197,197,197,197,86,197
movie,6658,6658,6658,3503,6655,6658,6658,6658,6658,6658,6658,6543,6658
music video,1,1,1,1,1,1,1,1,1,1,1,1,1
podcast episode,19,19,19,7,19,19,19,19,19,19,19,0,19
podcast series,2,2,2,0,2,2,2,2,2,2,2,0,2
tv mini series,8,8,8,3,8,8,8,8,8,8,8,7,8
tv movie,384,384,384,214,384,384,384,384,384,384,384,375,384
tv series,29,29,29,9,29,29,29,29,29,29,29,23,29
tv short,13,13,13,8,13,13,13,13,13,13,13,13,13
video game,4,4,4,3,4,4,4,4,4,4,4,4,4


In [70]:
 main_alt.loc[main['Type'].isnull(),'Title'].unique()

array(['Nick Kroll & John Mulaney: Oh, Hello on Broadway', 'Screwball',
       'Errementari: The Blacksmith and the Devil', 'Kill the Irishman',
       'SampleThis', 'Rust Creek', 'Master Z: Ip Man Legacy',
       'The Blue Elephant Part 2', 'Wasp Network', 'Retablo',
       'Bleach the Movie: Fade to Black', 'Wedding Association',
       'Birdshot',
       'Pretty Guardians Sailor Moon Eternal The MOVIE - Part 1',
       'Tim Minchin and the Heritage Orchestra: Live at the Royal Albert Hall',
       'Sillu Karupatti', 'Loev', 'MFKZ', 'Freedom at Midnight',
       'Tim Minchin: So F**king Rock Live',
       'Cracked Up: The Darrell Hammond Story', 'J.T. LeRoy',
       'Creating an Army of the Dead',
       'Merata: How Mum Decolonised the Screen', 'Chaman Bahar',
       'Toll Booth', '1000 Rupee Note', 'Sarvam Thaala Mayam',
       'Poeslief: een ode aan de kat',
       "The B-Side: Elsa Dorfman's Portrait Photography",
       'Bill Hicks: One Night Stand', 'Lang Tong',
       'Wedding

In [71]:
#number of rows
main_alt.shape[0]

7699

In [72]:
#number of rows
main.shape[0]

9515

In [73]:
#https://en.wikipedia.org/wiki/Motion_Picture_Association_film_rating_system
#what is 7+? and what is the difference between 18+ and 16+
main_alt.Age.unique()

array(['18+', '7+', '13+', '16+', 'all', nan], dtype=object)

In [74]:
type_amazon.groupby(['type']).agg('count')

Unnamed: 0_level_0,show_id,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Movie,7814,7814,7586,7050,569,16,7814,7483,7814,7814,7814
TV Show,1854,1854,0,1385,103,139,1854,1848,1854,1854,1854


In [75]:
main_alt[main_alt['emmy_nomination'] > 0]

Unnamed: 0.1,Unnamed: 0,Title,Year,Age,Rotten Tomatoes,Netflix,Amazon,emmy_nomination,emmy_win,oscar_nomination,oscar_win,Type,IMDb,Genres
17,17,Virunga,2014,16+,90/100,1,0,3,2,1,0,movie,8.2,"['Documentary', 'War']"
62,62,The White Helmets,2016,16+,86/100,1,0,2,0,1,1,movie,7.5,"['Documentary', 'Short', 'War']"
83,83,The Pixar Story,2007,all,84/100,1,0,1,0,0,0,movie,7.8,['Documentary']
88,88,"What Happened, Miss Simone?",2015,,84/100,1,0,6,1,1,0,movie,7.6,"['Documentary', 'Biography', 'Music']"
99,99,Adam Sandler: 100% Fresh,2018,18+,83/100,1,0,1,0,0,0,movie,7.6,"['Comedy', 'Music']"
107,107,Hey Arnold! The Jungle Movie,2017,7+,83/100,1,0,1,1,0,0,tv movie,7.6,"['Animation', 'Adventure', 'Comedy', 'Family',..."
114,114,Springsteen On Broadway,2018,,82/100,1,0,2,1,0,0,tv movie,8.5,"['Documentary', 'Music']"
153,153,Hannah Gadsby: Nanette,2018,18+,80/100,1,0,2,1,0,0,movie,8.1,"['Documentary', 'Comedy']"
188,188,Quincy,2018,18+,79/100,1,0,5,0,0,0,movie,7.6,"['Documentary', 'Biography']"
257,257,13th,2016,18+,76/100,1,0,9,4,1,0,movie,8.2,"['Documentary', 'Crime', 'History']"


In [76]:
emmy[emmy['nominee']=='My Brother\'s Keeper']

Unnamed: 0,id,year,category,nominee,staff,company,producer,win
12174,12175,1995,OUTSTANDING INDIVIDUAL ACHIEVEMENT IN CINEMATO...,My Brother's Keeper,"Tom Del Ruth, A.S.C.,",CBS,,False
12383,12384,1995,OUTSTANDING LEAD ACTOR IN A MINISERIES OR SPECIAL,My Brother's Keeper,"John Lithgow, as Tom Bradley and Bob Bradley",CBS,,False


In [77]:
movies[movies['Title']=='Virunga']

Unnamed: 0,ID,Title,Year,Age,Rotten Tomatoes,Netflix,Amazon,Type
17,18,Virunga,2014,16+,90/100,1,0,movie
