# IMDb

In [None]:
import pandas as pd
import numpy as np
%matplotlib widget
import matplotlib.pyplot as plt

In [None]:
# Load the two tab-separated IMDb tables: title metadata and title ratings.
data_basics = pd.read_csv("title_basics.tsv", sep="\t")
data_ratings = pd.read_csv("title_ratings.tsv", sep="\t")

In [None]:
# Inspect the size (rows, columns) of the freshly loaded basics table.
data_basics.shape

In [None]:
# Some titles in the basics table have no entry in the ratings table;
# keep only the titles whose tconst also appears in the ratings.
has_rating = data_basics['tconst'].isin(data_ratings['tconst'])
data_basics = data_basics.loc[has_rating]


In [None]:
# Restrict the rated titles to those whose titleType is 'movie'.
is_movie = data_basics['titleType'].eq('movie')
movies = data_basics.loc[is_movie]

# Keep only the rating rows that belong to one of those movies.
belongs_to_movie = data_ratings['tconst'].isin(movies['tconst'])
movie_ratings = data_ratings.loc[belongs_to_movie]

In [None]:
# Remove the movies without a start year (stored as the literal string "\N").
mask = movies['startYear'] != "\\N"
movies = movies[mask]

# Remove them from the ratings too. The mask is built from movie_ratings
# itself (not data_ratings) so that its index matches the frame it filters;
# a mask built on the larger data_ratings frame has to be realigned by pandas
# before it can be applied to movie_ratings.
mask = movie_ratings['tconst'].isin(movies['tconst'])
movie_ratings = movie_ratings[mask]

In [None]:
# Inclusive range of release years to analyse.
first_year = 2000
last_year = 2020

# Convert once so that both the minimum and the maximum are numeric
# comparisons (the raw column is compared against the string "\N" earlier,
# so it cannot be relied on to be numeric).
start_years = pd.to_numeric(movies['startYear'])
print(f"The data spans between {start_years.min()} and {start_years.max()}.")
print(f"We are interested in the data between {first_year} and {last_year}.")

In [None]:
# Keep only the movies released between first_year and last_year (inclusive).
# pd.to_numeric is applied to the whole column at once instead of element by
# element through Series.apply.
start_years = pd.to_numeric(movies['startYear'])
mask = (start_years >= first_year) & (start_years <= last_year)
movies = movies[mask]

# Remove the discarded movies from the ratings too.
mask = movie_ratings['tconst'].isin(movies['tconst'])
movie_ratings = movie_ratings[mask]

# Report the range that was actually selected, using numeric min and max.
selected_years = pd.to_numeric(movies['startYear'])
print(f"Selected data between {selected_years.min()} and {selected_years.max()}.")

In [None]:
# Discard movies with fewer than min_votes votes.
min_votes = 1000
# Vectorised conversion of the whole column (instead of per-element apply).
mask = pd.to_numeric(movie_ratings['numVotes']) >= min_votes
movie_ratings = movie_ratings[mask]

# Keep the movies table in sync with the remaining ratings.
mask = movies['tconst'].isin(movie_ratings['tconst'])
movies = movies[mask]

In [None]:
# Remove adult movies: keep only rows whose isAdult value is numerically 0.
# Vectorised conversion of the whole column (instead of per-element apply).
mask = pd.to_numeric(movies['isAdult']) == 0
movies = movies[mask]

# Keep the ratings table in sync with the remaining movies.
mask = movie_ratings['tconst'].isin(movies['tconst'])
movie_ratings = movie_ratings[mask]

In [None]:
# Display the movies table as it stands after all the filters above.
movies

In [None]:
# Histogram of the average rating over all selected movies.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.hist(movie_ratings['averageRating'], color='b')

ax.set_xlabel('Average rating')
ax.set_ylabel('Number of occurrences')
ax.set_title('Average rating histogram')
ax.grid()

In [None]:
# Genres to analyse separately.
genres = ['Comedy', 'Drama', 'Fantasy', 'Horror', 'Romance', 'Action', 'Thriller', 'Sci-Fi', 'Documentary']
# 0: drop multi-genre movies, 1: keep multi-genre movies without repetition
multi_genre = 0

if multi_genre == 0:
    # Exact match on the genres column, so only movies whose genres value
    # is exactly one of the selected genres end up in a group.
    movies_split = {}
    movie_ratings_split = {}
    for gen in genres:
        genre_movies = movies[movies['genres'] == gen]
        movies_split[gen] = genre_movies
        movie_ratings_split[gen] = movie_ratings[movie_ratings['tconst'].isin(genre_movies['tconst'])]
elif multi_genre == 1:
    # Placeholder: the multi-genre split is not implemented yet.
    print("Build later")

In [None]:
# Check the container type of the genre selection.
type(genres)

In [None]:
# One average-rating histogram per genre, laid out on a 3-column grid.
n_cols = 3
n_rows = int(np.ceil(len(genres) / n_cols))
fig, ax = plt.subplots(n_rows, n_cols, figsize=(10,7))
ax = ax.ravel()
fig.suptitle('Average rating for different genres', fontsize=15)
for axes, genre in zip(ax, genres):
    axes.hist(movie_ratings_split[genre]['averageRating'], color='b')
    # The legend entry doubles as the panel label.
    axes.legend([genre], loc="upper left", fontsize=7)
    axes.grid()
# Hide the grid cells left over when len(genres) is not a multiple of n_cols.
for unused in ax[len(genres):]:
    unused.set_visible(False)
# Shared axis labels for the whole figure.
fig.text(0.5, 0.04, 'Average rating', ha='center')
fig.text(0.04, 0.5, 'Number of occurrences', va='center', rotation='vertical')
