# IMDb

In [None]:
import pandas as pd
from scipy import stats
import pingouin as pg
import numpy as np
import statistics as stat
from operator import itemgetter
%matplotlib widget
import matplotlib.pyplot as plt

In [None]:
# Load the two tab-separated tables: title metadata first, ratings second.
data_basics, data_ratings = (
    pd.read_csv(tsv_name, sep="\t")
    for tsv_name in ("title_basics.tsv", "title_ratings.tsv")
)

In [None]:
# Display the (rows, columns) size of the title table as loaded, before any filtering.
data_basics.shape

In [None]:
# Not every title in the basics table has a rating:
# keep only the titles whose id also appears in the ratings table.
rated_ids = data_ratings['tconst']
data_basics = data_basics.loc[data_basics['tconst'].isin(rated_ids)]


In [None]:
# Among the rated titles, keep the ones typed as 'movie'.
movies = data_basics.loc[data_basics['titleType'] == 'movie']

# Keep the rating rows that belong to those movies.
movie_ratings = data_ratings.loc[data_ratings['tconst'].isin(movies['tconst'])]

In [None]:
# Remove the movies without a start date (stored as the literal string "\N").
mask = movies['startYear'] != "\\N"
movies = movies[mask]

# Remove them from the ratings too.
# The mask is built from movie_ratings itself (not data_ratings) so that its
# index matches the frame being filtered; a mask built on the full ratings
# table only works through implicit index re-alignment and triggers a warning.
mask = movie_ratings['tconst'].isin(movies.tconst)
movie_ratings = movie_ratings[mask]

In [None]:
# Inclusive range of release years kept for the analysis.
first_year = 2000
last_year = 2020

# Convert the year column once so that both the minimum and the maximum are
# computed numerically (taking the minimum of the raw string column would be
# a lexicographic comparison).
start_years = pd.to_numeric(movies['startYear'])

print(f"The data spans between {start_years.min()} and {start_years.max()}.")
print(f"We are interested in the data between {first_year} and {last_year}.")

In [None]:
# Keep only the movies released between first_year and last_year (both inclusive).
# The column is converted in one vectorised call instead of element by element.
start_years = pd.to_numeric(movies['startYear'])
mask = (start_years >= first_year) & (start_years <= last_year)
movies = movies[mask]

# Remove the discarded movies from the ratings too.
mask = movie_ratings['tconst'].isin(movies.tconst)
movie_ratings = movie_ratings[mask]

# Report the numeric bounds of what is left (min and max computed on numbers, not strings).
selected_years = pd.to_numeric(movies['startYear'])
print(f"Selected data between {selected_years.min()} and {selected_years.max()}.")

In [None]:
# Discard movies with fewer than min_votes votes.
min_votes = 1000
# Vectorised conversion: pd.to_numeric is applied to the whole column at once
# rather than once per row through .apply.
mask = pd.to_numeric(movie_ratings['numVotes']) >= min_votes
movie_ratings = movie_ratings[mask]

# Keep the movie table in sync with the remaining ratings.
mask = movies['tconst'].isin(movie_ratings.tconst)
movies = movies[mask]

In [None]:
# Remove adult movies (rows whose isAdult flag is not 0).
# Vectorised conversion of the whole column instead of a per-row .apply.
mask = pd.to_numeric(movies['isAdult']) == 0
movies = movies[mask]

# Keep the ratings table in sync with the remaining movies.
mask = movie_ratings['tconst'].isin(movies.tconst)
movie_ratings = movie_ratings[mask]

In [None]:
# Histogram of the average rating over all selected movies.
fig, ax = plt.subplots()
ax.hist(movie_ratings['averageRating'], color='b')

ax.set_title('Average rating histogram')
ax.set_xlabel('Average rating')
ax.set_ylabel('Number of ocurences')
ax.grid()

In [None]:
#Select genres of interest
genres = ['Comedy', 'Drama', 'Fantasy', 'Horror', 'Romance', 'Action', 'Thriller', 'Sci-Fi', 'Documentary']
multi_genre = 1 # 0 drop multi-genre, 1 keep multi genre without repetition

match multi_genre:
    case 0:
        movies_split = {gen: movies[movies['genres'] == gen] for gen in genres}
        movie_ratings_split = {gen: movie_ratings[movie_ratings['tconst'].isin(movies_split[gen]['tconst'])] for gen in genres}
    
    case 1:
        movies_split = {gen: movies[movies['genres'].apply(lambda x: x.split(',')[0]) == gen] for gen in genres}
        movie_ratings_split = {gen: movie_ratings[movie_ratings['tconst'].isin(movies_split[gen]['tconst'])] for gen in genres}

In [None]:
# One average-rating histogram per genre, laid out on a three-column grid.
n_rows = int(np.ceil(len(genres) / 3))
fig, ax = plt.subplots(n_rows, 3, figsize=(10,7))
ax = ax.ravel()
fig.suptitle('Average rating for different genres', fontsize=15)
for panel, genre_name in zip(ax, genres):
    panel.hist(movie_ratings_split[genre_name]['averageRating'], color='b')
    # The legend entry doubles as the panel title.
    panel.legend([genre_name], loc="upper left", fontsize=7)
    panel.grid()
# Axis captions shared by all panels.
fig.text(0.5, 0.04, 'Average rating', ha='center')
fig.text(0.04, 0.5, 'Number of ocurences', va='center', rotation='vertical')


In [None]:
# Collect the average-rating series of every genre, in the order of `genres`,
# so they can be drawn side by side in one figure.
ratings_genre = [movie_ratings_split[name]['averageRating'] for name in genres]

# Box plot of the average rating, one box per genre.
fig, ax = plt.subplots(figsize=(10,7))
fig.suptitle('Average rating for different genres', fontsize=15)

ax.boxplot(ratings_genre)
ax.set_xticklabels(genres, rotation=45, ha='right', fontsize=10)
ax.grid()

# Figure-level axis captions.
fig.text(0.5, 0.04, 'Genres', ha='center')
fig.text(0.04, 0.5, 'Average rating', va='center', rotation='vertical')
plt.tight_layout()

In [None]:
outliers = {}       # genre -> ratings lying outside the whisker limits
outliers_info = {}  # genre -> printable "count (percentage)" summary

# Share of ratings flagged as outliers in each genre.
for genre in genres:
    data = movie_ratings_split[genre]['averageRating']

    # First and third quartiles, and the interquartile range between them.
    q1, q3 = np.percentile(data, [25, 75])
    iqr = q3 - q1

    # Tukey, J. W. (1977). Exploratory Data Analysis.
    # Same 1.5 * IQR outlier limits as the ones used by matplotlib.
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    is_outlier = (data < lower_bound) | (data > upper_bound)
    outliers[genre] = data[is_outlier]

    # Percentage of the samples that are considered "outliers".
    count = len(outliers[genre])
    percentage = count / len(data) * 100
    outliers_info[genre] = f"{count} outliers ({percentage:.2f}%)"

# Print the outlier summary of every genre.
for genre in genres:
    print(f"{genre}: {outliers_info[genre]}")

In [None]:
# Evaluate the outliers: print every flagged value that looks invalid.
# Plain loops are used instead of list comprehensions, which would only build
# throwaway lists of None and echo one of them as the cell output.
outlier_checks = (
    lambda value: not isinstance(value, float),  # wrong type
    lambda value: value <= 0.0,                  # non-positive value
    lambda value: value > 10.0,                  # value bigger than 10
)

# Checks run one after the other over all genres, so the output is grouped by check.
for is_invalid in outlier_checks:
    for genre in genres:
        for outlier in outliers[genre]:
            if is_invalid(outlier):
                print(outlier)

In [None]:
parameters = {}  # genre -> {"mean", "mode", "median", "std", "var"}

# Quick check of the sample parameters of every genre.
for genre in genres:
    # Look the ratings up once instead of once per statistic.
    ratings = movie_ratings_split[genre]['averageRating']
    parameters[genre] = {
        "mean": stat.mean(ratings),
        "mode": stat.mode(ratings),
        "median": stat.median(ratings),
        "std": stat.stdev(ratings),
        # Sample variance computed directly rather than as stdev ** 2,
        # which adds a square-root/square round trip.
        "var": stat.variance(ratings),
    }

# Named `variances` so the builtin `vars` is not shadowed for the rest of the session.
variances = [genre_stats["var"] for genre_stats in parameters.values()]

print(parameters)

# Ratio between the largest and the smallest per-genre variance.
print(max(variances) / min(variances))

Possible courses of action:
- Resample so that every group has the same sample size + regular ANOVA. Pros: balanced design, simple. Cons: heteroscedasticity; a small sample size might impact normality.
- Regular ANOVA (accept the ratio as okay). Pros: quick and easy, larger sample sizes (?). Cons: heteroscedasticity, unbalanced design.
- Welch ANOVA or Brown-Forsythe (with post-hoc analysis). Pros: more robust to unbalanced sample sizes and heteroscedasticity. Cons: more effort (higher story points).

In [None]:
# One array of average ratings per genre, in the order of `genres`.
# A plain list is used: wrapping ragged groups in an object ndarray is not
# needed for unpacking and would turn into a 2-D object array if every group
# happened to have the same size.
samples = [movie_ratings_split[genre]['averageRating'].values for genre in genres]

# Since the sample ratio is higher than 2, regular ANOVA is not suitable
# (Kutner et al., Applied Linear Statistical Models).
# Welch's ANOVA test with different sample sizes.
# NOTE(review): `equal_var` is only accepted by recent SciPy releases; confirm
# the installed version supports it, otherwise this call raises TypeError.
f_welch, p_welch = stats.f_oneway(*samples, axis=0, equal_var=False)
print(f_welch)
print(p_welch)

In [None]:
# Long-format table with one (genre, averageRating) row per rated movie,
# genres appearing in the order of `genres`.
flat_rows = []
for genre in genres:
    genre_scores = movie_ratings_split[genre]['averageRating']
    flat_rows.extend((genre, score) for score in genre_scores)

movie_ratings_split_flat = pd.DataFrame(flat_rows, columns=['genre', 'averageRating'])

In [None]:
# Post-hoc analysis: which groups differ?
# Games-Howell pairwise comparison of the average rating between genres.
Games_Howell = pg.pairwise_gameshowell(
    dv='averageRating',
    between='genre',
    data=movie_ratings_split_flat,
)

print(Games_Howell)