In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import csv
import gc

# Genre Dataset

In [None]:
# Load the merged CSV (path is relative to the current working directory).
new = pd.read_csv(filepath_or_buffer='./merged_data.csv')

In [None]:
# Discard the `genres_x` column; only `genres_y` is used from here on.
new = new.drop(labels='genres_x', axis='columns')

In [None]:
# Give the columns the short names the rest of the notebook refers to.
# The result is rebound instead of using `inplace=True`, so the cell follows
# the same `new = new.<step>(...)` pattern as the neighbouring cells and no
# frame is mutated behind the reader's back.
new = new.rename(columns={'originalTitle': 'title', 'genres_y': 'genre'})

In [None]:
# Keep only the rows that have both an event name and a genre.
new = new.dropna(axis='index', how='any', subset=['eventName', 'genre'])

### Find the most common genres per event

In [None]:
# Award events the genre analysis is restricted to.
# NOTE(review): the criterion used to pick these events is not recorded in
# the notebook - confirm before extending the list.
most_famous_awards = [
    'Primetime Emmy Awards',
    'Grammy Awards',
    'Online Film & Television Association',
    'BAFTA Awards',
    'Academy Awards, USA',
    'Adult Video News Awards',
    'Young Artist Awards',
    'Academy of Science Fiction, Fantasy & Horror Films, USA',
    'Cannes Film Festival',
    'Berlin International Film Festival',
    'Leo Awards',
    'Venice Film Festival',
]

In [None]:
# Restrict the data to rows whose event is one of the selected awards.
movie_genre = new.loc[lambda frame: frame['eventName'].isin(most_famous_awards)]

In [None]:
# Number of rows for every (event, genre) combination, as a flat frame with
# the columns eventName, genre and numMovies.
movie_genre_by_event = (
    movie_genre
    .groupby(['eventName', 'genre'])
    .size()
    .reset_index(name='numMovies')
)

In [None]:
# Number of rows per event, largest first.
count_awards = (
    movie_genre
    .groupby(['eventName'])
    .size()
    .reset_index(name='numMovies')
    .sort_values(by='numMovies', ascending=False)
)

Choose the top 3 genres per event

In [None]:
# For every event keep the three genres with the highest `numMovies`.
# Indexing by genre first lets the genre survive `nlargest` as an index
# level, which `reset_index` then turns back into a column.
top_3_df = (
    movie_genre_by_event
    .set_index('genre')
    .groupby("eventName")['numMovies']
    .nlargest(3)
    .reset_index()
)
# Quick check: the genres selected for one event.
top_3_df.loc[top_3_df['eventName'] == 'Academy Awards, USA', 'genre'].tolist()

In [None]:
# Distinct genres that appear in any event's top 3, in order of first appearance.
genres_names = top_3_df['genre'].drop_duplicates().tolist()

## Plot all genres and the top 3 genres for the most famous events

In [None]:
# Stacked histogram: one bar per event, one coloured layer per genre.

# Prepare data
x_var = 'eventName'
groupby_var = 'genre'
df = movie_genre
df_agg = df.loc[:, [x_var, groupby_var]].groupby(groupby_var)

# Give every distinct x value its own integer slot, in the same sorted order
# as the tick labels, so each bar is guaranteed to sit under its own label.
categories = np.unique(df[x_var]).tolist()
slot_of = {category: slot for slot, category in enumerate(categories)}
groups, vals = [], []
for group, frame in df_agg:
    groups.append(group)
    vals.append(frame[x_var].map(slot_of).tolist())

# Draw
plt.figure(figsize=(16,9), dpi= 80)
# max(..., 1) avoids a division by zero when there is only one genre.
colors = [plt.cm.Spectral(i/float(max(len(vals)-1, 1))) for i in range(len(vals))]
# Edges at -0.5, 0.5, 1.5, ... put exactly one slot into each bin.
bin_edges = np.arange(len(categories) + 1) - 0.5
n, bins, patches = plt.hist(vals, bin_edges, stacked=True, density=False, color=colors, label=groups)

# Decoration
# Legend entries come from the `label=` passed to hist, so they always match
# the layer they describe.
plt.legend()
plt.title(f"Stacked Histogram of ${x_var}$ colored by ${groupby_var}$", fontsize=22)
plt.xlabel(x_var)
plt.ylabel("Number of movies")
# NOTE(review): fixed upper limit - confirm it suits the data being plotted.
plt.ylim(0, 100000)
# One tick per bar centre. The previous call passed the n+1 bin edges together
# with n labels, which current matplotlib rejects with a ValueError.
plt.xticks(ticks=np.arange(len(categories)), labels=categories, rotation=90, horizontalalignment='center')
plt.show()

In [None]:
# Independent copy of the event-filtered data, so the top-3 filtering in the
# next cell does not touch `movie_genre`.
plott_2 = movie_genre.copy(deep=True)

In [None]:
# Keep only the rows whose (event, genre) pair is one of that event's top 3
# genres. A single vectorised membership test replaces one DataFrame.drop
# call per (event, genre) combination, each of which rebuilt the whole frame.
top_pairs = pd.MultiIndex.from_frame(top_3_df[['eventName', 'genre']])
row_pairs = pd.MultiIndex.from_frame(plott_2[['eventName', 'genre']])
# Rows of events outside `most_famous_awards` were never dropped by the
# per-event loop this replaces, so they are kept here as well.
outside_selection = ~plott_2['eventName'].isin(most_famous_awards)
plott_2 = plott_2.loc[outside_selection | row_pairs.isin(top_pairs)]

In [None]:
# Stacked histogram restricted to each event's top 3 genres.

# Prepare data
x_var = 'eventName'
groupby_var = 'genre'
# Plot the top-3-filtered frame. This cell previously plotted `movie_genre`
# again, so `plott_2` was never used and the figure repeated the one above.
df = plott_2
df_agg = df.loc[:, [x_var, groupby_var]].groupby(groupby_var)

# Give every distinct x value its own integer slot, in the same sorted order
# as the tick labels, so each bar is guaranteed to sit under its own label.
categories = np.unique(df[x_var]).tolist()
slot_of = {category: slot for slot, category in enumerate(categories)}
groups, vals = [], []
for group, frame in df_agg:
    groups.append(group)
    vals.append(frame[x_var].map(slot_of).tolist())

# Draw
plt.figure(figsize=(16,9), dpi= 80)
# max(..., 1) avoids a division by zero when there is only one genre.
colors = [plt.cm.Spectral(i/float(max(len(vals)-1, 1))) for i in range(len(vals))]
# Edges at -0.5, 0.5, 1.5, ... put exactly one slot into each bin.
bin_edges = np.arange(len(categories) + 1) - 0.5
n, bins, patches = plt.hist(vals, bin_edges, stacked=True, density=False, color=colors, label=groups)

# Decoration
# Legend entries come from the `label=` passed to hist, so they always match
# the layer they describe.
plt.legend()
plt.title(f"Stacked Histogram of ${x_var}$ colored by ${groupby_var}$", fontsize=22)
plt.xlabel(x_var)
plt.ylabel("Number of movies")
# NOTE(review): fixed upper limit - confirm it suits the filtered data.
plt.ylim(0, 100000)
# One tick per bar centre. The previous call passed the n+1 bin edges together
# with n labels, which current matplotlib rejects with a ValueError.
plt.xticks(ticks=np.arange(len(categories)), labels=categories, rotation=90, horizontalalignment='center')
plt.show()

## Timeline of the top genres for one specific event

In [None]:
# New frame with a `count` column: for each row, the number of rows sharing
# its (eventName, genre, year) combination.
new_2 = new.assign(
    count=new.groupby(['eventName', 'genre', 'year'])['genre'].transform('count')
)
new_2

In [None]:
# Mark the rows whose `count` equals the largest `count` within their
# (eventName, year) group, then display them.
# The aggregation is named by the string 'max' rather than by passing the
# Python builtin, which newer pandas releases deprecate for groupby transforms.
idx = new_2.groupby(['eventName', 'year'])['count'].transform('max') == new_2['count']
new_2[idx]

In [None]:
# Narrow the data to a single event, drop the columns not needed for the
# timeline, and collapse rows that are identical in the remaining columns.
timeline = (
    new_2[new_2['eventName'] == 'Academy Awards, USA']
    .drop(columns=['imdb_title_id', 'title', 'eventId', 'awardName', 'country'])
    .drop_duplicates()
)

# Number of rows kept per year; the plotting cell below reads this as well.
valu = 3
top_rows = timeline.groupby(['year'])['count'].nlargest(valu)
# The last index level of the nlargest result holds the original row labels.
# Reading it directly avoids depending on the auto-generated 'level_1' column
# name that reset_index() produces only when the index is unnamed.
idx2 = list(top_rows.index.get_level_values(-1))
timeline = timeline.loc[idx2, :]
timeline

In [None]:
# Stacked histogram over years: one bar per year present in `timeline`,
# one coloured layer per genre.

# Prepare data
x_var = 'year'
groupby_var = 'genre'
df = timeline
df_agg = df.loc[:, [x_var, groupby_var]].groupby(groupby_var)

# Give every distinct year its own integer slot, in the same sorted order as
# the tick labels. Equal-width bins over the raw year range do not line up
# with the distinct years when some years are missing.
categories = np.unique(df[x_var]).tolist()
slot_of = {category: slot for slot, category in enumerate(categories)}
groups, vals = [], []
for group, frame in df_agg:
    groups.append(group)
    vals.append(frame[x_var].map(slot_of).tolist())

# Draw
plt.figure(figsize=(18,3), dpi= 80)
colors = [plt.cm.Spectral(i/float(len(vals))) for i in range(len(vals))]
# Edges at -0.5, 0.5, 1.5, ... put exactly one slot into each bin.
bin_edges = np.arange(len(categories) + 1) - 0.5
n, bins, patches = plt.hist(vals, bin_edges, stacked=True, density=False, color=colors, label=groups)

# Decoration
# Legend entries come from the `label=` passed to hist, so they always match
# the layer they describe.
plt.legend()
plt.title(f"Stacked Histogram of ${x_var}$ colored by ${groupby_var}$", fontsize=22)
plt.xlabel(x_var)
plt.ylabel(f"Top ${valu}$ winning genres")
# At most `valu` rows were kept per year, so no bar can exceed that height.
plt.ylim(0, valu)
# One tick per bar centre. The previous call passed the n+1 bin edges together
# with n labels, which current matplotlib rejects with a ValueError.
plt.xticks(ticks=np.arange(len(categories)), labels=categories, rotation=90, horizontalalignment='center')
plt.show()