In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import statistics as st

In [2]:
def df_to_dict(df, two_cols=None):
    """Group the nominees listed in an award table by year.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least a "Year" and a "Film" column. An "Actor" column
        is also read when ``two_cols == "actor"``, and a "Song" column when
        ``two_cols == "song"``. The frame itself is not modified.
    two_cols : str or None, optional
        ``"actor"`` stores ``(actor, film)`` tuples, ``"song"`` stores
        ``(song, film)`` tuples; any other value stores the film alone.

    Returns
    -------
    dict
        Maps each distinct "Year" value to the set of entries found for it.

    Raises
    ------
    KeyError
        If the frame has rows but lacks one of the columns read above.
    """
    # Strip non-breaking spaces on a copy so the caller's frame stays as it was.
    cleaned = df.replace(u'\xa0', u'', regex=True)

    if two_cols == "actor":
        companion = "Actor"
    elif two_cols == "song":
        companion = "Song"
    else:
        companion = None

    sample_dict = {}
    for _, row in cleaned.iterrows():
        film = row["Film"]
        entry = film if companion is None else (row[companion], film)
        # Look the set up by year on every row: rows of one year do not have
        # to be contiguous, and no two years ever share a set object.
        sample_dict.setdefault(row["Year"], set()).add(entry)

    return sample_dict

In [3]:
def read_file(award_names, path, applied=False, two_cols=None, applied_year=2020):
    """Load one CSV per award and convert each into a year -> nominees dict.

    Parameters
    ----------
    award_names : iterable of str
        File names without the ".csv" extension.
    path : str
        Prefix concatenated directly in front of each name (no separator is
        inserted, so include a trailing slash when passing a directory).
    applied : bool, optional
        When true, keep only the nominee set of ``applied_year`` for each
        award instead of the whole per-year dict.
    two_cols : str or None, optional
        Forwarded unchanged to ``df_to_dict``.
    applied_year : optional
        Year key extracted when ``applied`` is true. Defaults to 2020.

    Returns
    -------
    list
        One element per award name, in input order: a dict keyed by year, or
        a single set when ``applied`` is true.

    Raises
    ------
    KeyError
        If ``applied`` is true and an award has no entry for ``applied_year``.
    Exception
        Whatever the second (utf-8-sig) attempt raises when both attempts
        fail, e.g. a missing file.
    """
    awards = []

    for award_name in award_names:
        csv_path = path + award_name + ".csv"
        try:
            award_dict = df_to_dict(
                pd.read_csv(csv_path, encoding="ISO-8859-1"), two_cols
            )
        except Exception:
            # ISO-8859-1 can decode any byte sequence, so a failure here comes
            # from parsing or from df_to_dict; presumably the common case is a
            # UTF-8 BOM garbling the first header name. Retry with utf-8-sig.
            award_dict = df_to_dict(
                pd.read_csv(csv_path, encoding="utf-8-sig"), two_cols
            )

        # Append only after a successful load. Doing this in a `finally`
        # clause would run even when both attempts failed and could mask the
        # real error with an unbound or stale `award_dict`.
        awards.append(award_dict[applied_year] if applied else award_dict)

    return awards

In [4]:
def union_all(oscar, awards):
    """Merge the Oscar nominees with every other award's nominees, per year.

    Parameters
    ----------
    oscar : mapping
        Year -> set of nominees; must have an entry for each year 2010-2019.
    awards : iterable of mapping
        Other awards, each with the same year -> set layout.

    Returns
    -------
    list of set
        Ten new sets, index 0 for 2010 through index 9 for 2019. The input
        sets are left untouched.
    """
    return [
        oscar[year].union(*(other[year] for other in awards))
        for year in range(2010, 2020)
    ]

In [5]:
def calc_percentage_2(A, B, X, year, k=0):
    """Compute overlap ratios between two awards' nominee sets for one year.

    Parameters
    ----------
    A, B : mapping
        Year -> set of nominees for the two awards being compared.
    X : sequence
        Per-year candidate pools, indexed by ``year - 2010``. Only read when
        ``k`` is 0.
    year : int
        Year to evaluate.
    k : int, optional
        Explicit pool size. When non-zero it replaces ``len(X[year - 2010])``.

    Returns
    -------
    tuple of float
        ``(|A & B| / |B|, |A & B| / |A|, (|B| - |A & B|) / (pool - |A|))``.

    Raises
    ------
    ZeroDivisionError
        If any of the three denominators is zero.
    """
    nominees_a, nominees_b = A[year], B[year]
    shared = len(nominees_a & nominees_b)
    size_a, size_b = len(nominees_a), len(nominees_b)

    share_of_b_in_a = shared / size_b
    share_of_a_in_b = shared / size_a

    # The pool is either the caller-supplied size or that year's union.
    pool = k if k != 0 else len(X[year - 2010])
    share_of_rest_in_b = (size_b - shared) / (pool - size_a)

    return (share_of_b_in_a, share_of_a_in_b, share_of_rest_in_b)

In [6]:
def calc_values(oscars, others, union_all, k=0):
    """Collect 2010-2019 overlap ratios between the Oscars and each other award.

    Parameters
    ----------
    oscars : mapping
        Year -> set of Oscar nominees.
    others : iterable of mapping
        Other awards, each a year -> set of nominees mapping.
    union_all : sequence
        Per-year candidate pools, forwarded to ``calc_percentage_2``.
    k : int, optional
        Explicit pool size, forwarded to ``calc_percentage_2``.

    Returns
    -------
    tuple
        ``(percentage_oscars_if_others, mean_oscars)``. The first is one list
        per award holding the ten yearly values of the first ratio returned
        by ``calc_percentage_2``. The second is one three-element list per
        award holding the ten-year mean of each of the three ratios.
    """
    percentage_oscars_if_others = []
    mean_oscars = []

    for award in others:
        # One call per year yields all three ratios; repeating the call once
        # per ratio would redo the same set arithmetic three times.
        yearly = [
            calc_percentage_2(oscars, award, union_all, year, k)
            for year in range(2010, 2020)
        ]
        oscars_if_other, other_if_oscars, other_ifnot_oscars = (
            list(column) for column in zip(*yearly)
        )

        mean_oscars.append([
            st.mean(oscars_if_other),
            st.mean(other_if_oscars),
            st.mean(other_ifnot_oscars),
        ])
        percentage_oscars_if_others.append(oscars_if_other)

    return percentage_oscars_if_others, mean_oscars

In [7]:
def calc_bayes(awards_2020, awards_2020_union_all, awards_mean, other_films=True, num_of_nominees=5):
    """Rank candidate films by an estimated probability of an Oscar nomination.

    Parameters
    ----------
    awards_2020 : sequence
        One collection of nominees per award; membership is tested with ``in``.
    awards_2020_union_all : iterable
        All candidate films. It is copied, never modified.
    awards_mean : sequence
        Per-award statistics aligned with ``awards_2020``; indices 1 and 2 of
        each entry are used as the two likelihood factors.
    other_films : bool, optional
        When true, a catch-all ``"Others"`` candidate is scored as well.
    num_of_nominees : int, optional
        Number of Oscar slots; the prior is this value divided by the number
        of candidates (not counting ``"Others"``).

    Returns
    -------
    tuple of list
        ``(films, percentages)``, both ordered from most to least likely.
        Equal probabilities are ordered by descending film value.

    Raises
    ------
    ZeroDivisionError
        If ``awards_2020_union_all`` is empty, or both products are zero for
        some film.
    """
    P_OSCARS = num_of_nominees / len(awards_2020_union_all)
    P_NOT_OSCARS = 1 - P_OSCARS

    # Work on a private copy: appending "Others" to the caller's list would
    # change the prior and add a duplicate entry on every repeated call.
    candidates = list(awards_2020_union_all)
    if other_films:
        candidates.append("Others")

    p_final = []
    for film in candidates:
        bayes_n = P_OSCARS
        bayes_d = P_NOT_OSCARS

        for i, nominees in enumerate(awards_2020):
            if film in nominees:
                bayes_n *= awards_mean[i][1]
                bayes_d *= awards_mean[i][2]
            else:
                # NOTE(review): a miss swaps the two factors rather than using
                # their complements (1 - p); kept as-is, confirm it is intended.
                bayes_n *= awards_mean[i][2]
                bayes_d *= awards_mean[i][1]

        p_final.append(bayes_n / (bayes_n + bayes_d))

    # Sort the (probability, film) pairs once and split them afterwards so
    # both output lists are guaranteed to stay aligned.
    ranked = sorted(zip(p_final, candidates), reverse=True)
    film_final = [film for _, film in ranked]
    p_final_100 = [p * 100 for p, _ in ranked]

    return film_final, p_final_100

In [8]:
def visualize_prediction(film, probability):
    """Draw a horizontal bar chart of nomination probabilities and show it.

    Parameters
    ----------
    film : sequence
        Bar labels, one per film, passed as the y positions of ``ax.barh``.
    probability : sequence of float
        Bar widths, aligned with ``film``. The labels and axis title present
        them as percentages.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=(16, 9))
    ax.barh(film, probability, color='gold')

    # Remove axes spines
    for side in ['top', 'bottom', 'left', 'right']:
        ax.spines[side].set_visible(False)

    # Remove x, y ticks
    ax.xaxis.set_ticks_position('none')
    ax.yaxis.set_ticks_position('none')

    # Add padding between axes and labels
    ax.xaxis.set_tick_params(pad=5)
    ax.yaxis.set_tick_params(pad=10)

    # Add x, y gridlines. The on/off flag is passed positionally because its
    # keyword was renamed from `b` to `visible` in Matplotlib 3.5 and `b` was
    # later removed; the positional form works on every version.
    ax.grid(True, color='grey',
            linestyle='-.', linewidth=0.5,
            alpha=0.2)

    # Put the first entry at the top of the chart
    ax.invert_yaxis()

    # Annotate each bar with its value
    for bar in ax.patches:
        ax.text(bar.get_width() + 0.2, bar.get_y() + 0.5,
                " " + str(round((bar.get_width()), 2)) + " %",
                fontsize=10, fontweight='bold',
                color='grey')

    # Add plot title and axis labels
    ax.set_title('Probability of receiving an Oscar nomination based on other major awards',
                 loc='center')
    ax.set_ylabel('Films nominated in other major awards', labelpad=30)
    ax.set_xlabel('Probability of nominations (%)', labelpad=30)

    # Add text watermark
    fig.text(0.9, 0.15, '@caoxantb', fontsize=12,
             color='grey', ha='right', va='bottom',
             alpha=0.7)

    # Show plot
    plt.show()