In [1]:
# Importing modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Datasets

Box Office Mojo:
- bom.movie_gross.csv.gz

IMDB (more info [here](https://www.imdb.com/interfaces/)):
- imdb.name.basics.csv.gz
- imdb.title.akas.csv.gz
- imdb.title.basics.csv.gz
- imdb.title.crew.csv.gz
- imdb.title.principals.csv.gz
- imdb.title.ratings.csv.gz

Rotten Tomatoes:
- rt.movie_info.tsv.gz
- rt.reviews.tsv.gz

TheMovieDB:
- tmdb.movies.csv.gz

The Numbers:
- tn.movie_budgets.csv.gz

In [2]:
# Load the IMDB title basics file into genres_df and display it.
genres_df = pd.read_csv('zippedData/imdb.title.basics.csv.gz')
genres_df

Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,genres
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama"
1,tt0066787,One Day Before the Rainy Season,Ashad Ka Ek Din,2019,114.0,"Biography,Drama"
2,tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018,122.0,Drama
3,tt0069204,Sabse Bada Sukh,Sabse Bada Sukh,2018,,"Comedy,Drama"
4,tt0100275,The Wandering Soap Opera,La Telenovela Errante,2017,80.0,"Comedy,Drama,Fantasy"
...,...,...,...,...,...,...
146139,tt9916538,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,2019,123.0,Drama
146140,tt9916622,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,2015,,Documentary
146141,tt9916706,Dankyavar Danka,Dankyavar Danka,2013,,Comedy
146142,tt9916730,6 Gunn,6 Gunn,2017,116.0,


How many rows in the genres column have null values?

In [3]:
# isna() flags missing entries; summing the boolean flags counts them.
genres_null_values = genres_df['genres'].isna().sum()
print(f'Rows with Null Values in Genres Column: {genres_null_values}')

Rows with Null Values in Genres Column: 5408


These null values could in theory be given genres if time permitted.
But for the sake of time, all rows with null values in the genres column will be removed.

In [4]:
# Drop every row whose genres entry is missing. This mutates genres_df in place,
# and the surviving rows keep their original index labels.
genres_df.dropna(subset=['genres'], inplace=True)
genres_df

Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,genres
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama"
1,tt0066787,One Day Before the Rainy Season,Ashad Ka Ek Din,2019,114.0,"Biography,Drama"
2,tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018,122.0,Drama
3,tt0069204,Sabse Bada Sukh,Sabse Bada Sukh,2018,,"Comedy,Drama"
4,tt0100275,The Wandering Soap Opera,La Telenovela Errante,2017,80.0,"Comedy,Drama,Fantasy"
...,...,...,...,...,...,...
146138,tt9916428,The Secret of China,The Secret of China,2019,,"Adventure,History,War"
146139,tt9916538,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,2019,123.0,Drama
146140,tt9916622,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,2015,,Documentary
146141,tt9916706,Dankyavar Danka,Dankyavar Danka,2013,,Comedy


Each value in the genres column is currently a single comma-separated string, so I need to split it into a list of genres per title.

In [5]:
# Split each comma-separated genres string into a list of genre names.
# Values that are not strings (e.g. lists produced by an earlier run of this
# cell) are passed through unchanged, so re-running the cell does not raise.
genres_df['genres'] = genres_df['genres'].map(
    lambda x: x.split(',') if isinstance(x, str) else x
)

I would like to see how many different genres can be associated with a single title, so I know the maximum number of genres a title can have.

In [6]:
# Record how many genres each title lists.
genres_df['genres_len'] = genres_df['genres'].map(len)

In [7]:
# Preview the first 20 rows, including the list-valued genres and the genres_len column.
genres_df.head(20)

Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,genres,genres_len
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,"[Action, Crime, Drama]",3
1,tt0066787,One Day Before the Rainy Season,Ashad Ka Ek Din,2019,114.0,"[Biography, Drama]",2
2,tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018,122.0,[Drama],1
3,tt0069204,Sabse Bada Sukh,Sabse Bada Sukh,2018,,"[Comedy, Drama]",2
4,tt0100275,The Wandering Soap Opera,La Telenovela Errante,2017,80.0,"[Comedy, Drama, Fantasy]",3
5,tt0111414,A Thin Life,A Thin Life,2018,75.0,[Comedy],1
6,tt0112502,Bigfoot,Bigfoot,2017,,"[Horror, Thriller]",2
7,tt0137204,Joe Finds Grace,Joe Finds Grace,2017,83.0,"[Adventure, Animation, Comedy]",3
8,tt0139613,O Silêncio,O Silêncio,2012,,"[Documentary, History]",2
9,tt0144449,Nema aviona za Zagreb,Nema aviona za Zagreb,2012,82.0,[Biography],1


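Notice that index 16 no longer appears: rows with null genres were dropped earlier, and the remaining rows kept their original index labels.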

In [8]:
# The largest genres_len value is the most genres attached to any one title.
max_genres_for_single_title = genres_df['genres_len'].max()
print(f'Maximum Amount of Genres for a Single Title: {max_genres_for_single_title}')

Maximum Amount of Genres for a Single Title: 3


In [9]:
def unique_genres(genres_series):
    """Map consecutive integer ids to the distinct genres found in a series.

    Parameters
    ----------
    genres_series : iterable of iterables of str
        One collection of genre names per title (for example the list-valued
        ``genres`` column).

    Returns
    -------
    dict
        ``{0: genre, 1: genre, ...}`` where the distinct genre names are
        numbered in ascending sorted order. Empty input yields ``{}``.
    """
    # Flatten every title's genres into one set to remove duplicates.
    distinct_genres = {genre for title_genres in genres_series for genre in title_genres}
    # Sorting first keeps the id assigned to each genre stable between runs.
    return dict(enumerate(sorted(distinct_genres)))


# Build the integer-id -> genre-name lookup from every title's genre list and display it.
unique_genres_dict = unique_genres(genres_df['genres'])
unique_genres_dict

{0: 'Action',
 1: 'Adult',
 2: 'Adventure',
 3: 'Animation',
 4: 'Biography',
 5: 'Comedy',
 6: 'Crime',
 7: 'Documentary',
 8: 'Drama',
 9: 'Family',
 10: 'Fantasy',
 11: 'Game-Show',
 12: 'History',
 13: 'Horror',
 14: 'Music',
 15: 'Musical',
 16: 'Mystery',
 17: 'News',
 18: 'Reality-TV',
 19: 'Romance',
 20: 'Sci-Fi',
 21: 'Short',
 22: 'Sport',
 23: 'Talk-Show',
 24: 'Thriller',
 25: 'War',
 26: 'Western'}

In [10]:
def select_genre_types(unique_genres_dict, int_list):
    """Look up genre names by their integer ids.

    Parameters
    ----------
    unique_genres_dict : dict
        Mapping of integer id to genre name.
    int_list : iterable of int
        Ids of the genres to select.

    Returns
    -------
    list
        The genre names for ``int_list``, in the same order as the ids given.

    Raises
    ------
    KeyError
        If an id in ``int_list`` is not a key of ``unique_genres_dict``.
    """
    return [unique_genres_dict[index] for index in int_list]


# Ids 0, 13, 16 and 19 are the keys of Action, Horror, Mystery and Romance in unique_genres_dict.
genre_types = select_genre_types(unique_genres_dict, [0, 13, 16, 19])
print(genre_types)

['Action', 'Horror', 'Mystery', 'Romance']


In [11]:
# # test_genres_df = genres_df
# # I HIGHLY RECOMMEND NOT PERFORMING THE FOR LOOP ON ALL +140,000 RECORDS
# # This code block will take ~2 minutes to compute.
# # And the following code block will not compute under 10 minutes...

# # I recommend taking a sample of genres_df to parse through instead
# test_genres_df = genres_df[0:5000]
# movies_for_removal = []

# for movie in list(range(len(test_genres_df.index))):
#     test_df = pd.DataFrame(test_genres_df.iloc[movie]).T
#     for g_type in genre_types:
#         if g_type in test_df['genres'][test_df.index[0]]:
#             break
#         elif g_type == genre_types[len(genre_types)-1]:
#             movies_for_removal.append(test_df['tconst'][test_df.index[0]])

# print("Amount of rows to remove: ", len(movies_for_removal))
# print("Total amount of rows in test_genres_df DataFrame: ", len(test_genres_df.index))
# print("Expected amount of rows after removal: ", len(test_genres_df.index)-len(movies_for_removal))

# for movie_for_removal in movies_for_removal:
#     test_genres_df = test_genres_df[test_genres_df['tconst'] != movie_for_removal]

# test_genres_df

In [12]:
# The filter below makes a single pass over the rows and builds one boolean mask,
# so it can be applied to the full DataFrame rather than only to a small sample.

def movies_w_specified_genres(genres_df, genre_types_list):
    """Keep only the titles that have at least one of the given genres.

    Parameters
    ----------
    genres_df : pandas.DataFrame
        Must contain a ``genres`` column whose values support the ``in``
        operator (for example a list of genre names per title).
    genre_types_list : list of str
        Genres of interest. A row is kept when any of these is ``in`` the
        row's ``genres`` value.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``genres_df`` with their original index labels.
        The input frame is not modified. An empty ``genre_types_list``
        matches nothing, so an empty frame is returned.
    """
    def _has_wanted_genre(title_genres):
        # any() checks each wanted genre independently, so the result does not
        # depend on the order of, or duplicates in, genre_types_list.
        return any(g_type in title_genres for g_type in genre_types_list)

    # astype(bool) keeps the mask boolean even when genres_df has no rows.
    keep_mask = genres_df['genres'].map(_has_wanted_genre).astype(bool)
    return genres_df[keep_mask]
            

# genres_df[0:5000] is a positional slice: it takes the first 5,000 remaining rows,
# whose index labels can exceed 5000 because earlier rows were dropped.
test_df = movies_w_specified_genres(genres_df[0:5000], genre_types)
test_df

Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,genres,genres_len
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,"[Action, Crime, Drama]",3
6,tt0112502,Bigfoot,Bigfoot,2017,,"[Horror, Thriller]",2
19,tt0247643,Los pájaros se van con la muerte,Los pájaros se van con la muerte,2011,110.0,"[Drama, Mystery]",2
20,tt0249516,Foodfight!,Foodfight!,2012,91.0,"[Action, Animation, Comedy]",3
27,tt0276568,To Chase a Million,To Chase a Million,2018,97.0,"[Action, Drama]",2
...,...,...,...,...,...,...,...
5515,tt1078597,Elimination,Elimination,2010,87.0,"[Action, Horror, Thriller]",3
5517,tt1078896,Hauntsville,Hauntsville,2016,,"[Horror, Thriller]",2
5518,tt1078897,Hysteria,Hysteria,2010,,"[Horror, Thriller]",2
5519,tt1079360,The Girl from the Naked Eye,The Girl from the Naked Eye,2012,84.0,"[Action, Crime, Mystery]",3
