# Code 3: Build and Review of Netflix Quality Scores

### Importing necessary libraries

In [None]:
import pandas as pd
import numpy as np

pd.set_option('display.max_colwidth', None)

import seaborn as sns
sns.set_theme()
sns.set(rc={'figure.figsize':(12,8)})

import warnings
warnings.filterwarnings('ignore')

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Load the pickled master dataset and preview its first five rows.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
data = pd.read_pickle(filepath_or_buffer='master_data_v2.pkl')
data.head(n=5)

In [None]:
# Take the Rotten Tomatoes columns as an independent copy. Later cells add new
# score columns to this frame; assigning into a bare slice of `data` is the
# chained-assignment pattern pandas warns about (the warning is silenced by the
# warnings.filterwarnings('ignore') call at the top of the notebook).
data_subset = data[['Title','RT_score','RT_rev_cnt','RT_comb_score']].copy()
data_subset

#### *Before*

In [None]:
# Twenty highest combined scores before any transformation.
data_subset.sort_values(by='RT_comb_score', ascending=False).head(20)

In [None]:
# Histogram of the combined score before any transformation.
data_subset['RT_comb_score'].hist();

## Identifying the right transformation

### Standardized

In [None]:
from sklearn import preprocessing

# Keep the raw combined scores as a plain list; a later cell iterates over it.
RT_comb_score = data_subset['RT_comb_score'].tolist()

# Standardize the scores with sklearn's scale() and store them as a new column.
std_score = preprocessing.scale(RT_comb_score)
data_subset['Std_score'] = std_score

data_subset.head()

In [None]:
# Largest and smallest standardized score. Both lines display because
# InteractiveShell.ast_node_interactivity is set to "all" at the top.
data_subset['Std_score'].max()
data_subset['Std_score'].min()

In [None]:
# Histogram of the standardized scores.
data_subset['Std_score'].hist();

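Hm, this looks pretty much identical to the normalized scores. That is expected: standardizing only shifts and rescales the values, so the shape of the distribution cannot change.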

### Percentiles

In [None]:
# Percentile rank (rank(pct=True)) of each film's combined score.
data_subset['Pct_score'] = data_subset['RT_comb_score'].rank(pct=True)
data_subset.head()

In [None]:
# Histogram of the percentile scores.
data_subset['Pct_score'].hist();

In [None]:
# Twenty films with the highest percentile score.
data_subset.sort_values(by='Pct_score', ascending=False).head(20)

The problem with this is that percentile ranks are uniformly distributed by construction, so they suggest there are equal numbers of high-quality, medium-quality, and low-quality films, which intuitively feels wrong. It seems like bad films should be more prevalent, and good films should be rarer. However, we don't want the steep drop-off that we get with the normalized scores.

### Square root transformation

In [None]:
import math

# Square root of every raw combined score, in the same order as RT_comb_score.
trans_score = [math.sqrt(raw_score) for raw_score in RT_comb_score]

data_subset['Trans_score'] = trans_score
data_subset['Trans_score'].hist();

In [None]:
# Thirty highest combined scores, re-indexed from 0 for easier reading.
(
    data_subset
    .sort_values(by='RT_comb_score', ascending=False)
    .head(30)
    .reset_index(drop=True)
)

This feels like a much better representation of the distribution of quality among films.

## Identifying the right formula

In [None]:
# Colour constants for the charts: a five-colour palette plus background and
# text colours (the latter two are not referenced in the cells visible here).
colors = ['#c1071e','#b33a3a','#f2b0a5','#43465e','#751a2c']
bg_color = '#fbfbfb'
txt_color = '#5c5c5c'

# Preview the palette as a row of swatches.
sns.palplot(colors)

In [None]:
#### COMBINED SCORE FORMULA ####
# Combined score = (RT score)^2 * sqrt(number of reviews).
RT_comb_score2 = (data_subset['RT_score']**2) * (data_subset['RT_rev_cnt']**0.5)
RT_comb_score2

# Min-max normalize to [0, 1], then take the square root. The minimum and the
# range are computed once for the whole column; the element-wise loop this
# replaces recomputed min() and max() for every film.
comb_min2 = RT_comb_score2.min()
comb_range2 = RT_comb_score2.max() - comb_min2
# Kept as a plain list because a later cell assigns it to `data` by position.
sqrt_trans_score2 = np.sqrt((RT_comb_score2 - comb_min2) / comb_range2).tolist()

sns.set_style("white", {'axes.grid' : False})

data_subset['Form2_score'] = sqrt_trans_score2
ax = sns.histplot(data_subset['Form2_score'], color='#c1071e');
ax.set(xlabel='Movie Quality');

In [None]:
# Ten lowest Form2 scores among films with at least 20 reviews.
# NOTE(review): ascending=True lists the weakest films first - confirm that this
# is the ranking the text below the cell refers to.
(
    data_subset.loc[data_subset['RT_rev_cnt'] >= 20]
    .sort_values(by='Form2_score', ascending=True)
    .head(10)
    .reset_index(drop=True)
)

This ranking is similar to the previous one, except it elevates some films with high scores but lower numbers of reviews (such as His House). This may be a better reflection of the "quality" metric we're after. 

In [None]:
# Store the transformed score on the master frame. The list is assigned by
# position, so it relies on data_subset having the same row order as `data`.
data['Form2_score']=sqrt_trans_score2

# Cast/Crew Quality Scores

First, we need to apply the same formula/transformations to the list of films in the cast/crew's filmographies.

In [None]:
# Load the filmography score table, dropping its first two columns by position.
all_films_RT = pd.read_csv('filmography_RTscores.csv').iloc[:,2:]
all_films_RT

In [None]:
#### COMBINED SCORE FORMULA ####
# Combined score = (score)^2 * sqrt(number of reviews), for the filmography table.
RT_comb_score3 = (all_films_RT['Score']**2) * (all_films_RT['Revs']**0.5)
RT_comb_score3

# Min-max normalize to [0, 1], then take the square root. The minimum and the
# range are computed once for the whole column; the element-wise loop this
# replaces recomputed min() and max() for every film.
comb_min3 = RT_comb_score3.min()
comb_range3 = RT_comb_score3.max() - comb_min3
sqrt_trans_score3 = np.sqrt((RT_comb_score3 - comb_min3) / comb_range3).tolist()

all_films_RT['Form2_score'] = sqrt_trans_score3
all_films_RT['Form2_score'].hist();

In [None]:
# Twenty best-scoring films in the filmography table.
all_films_RT.sort_values(by='Form2_score', ascending=False).head(20)

Now we can use this to calculate the cast/crew quality scores.

In [None]:
# Director names and director filmographies (with release year) for slots 1-3.
df_dirs = data[[f'Dir{slot}' for slot in (1, 2, 3)]]
df_dir_films = data[['Rel_year'] + [f'Dir{slot}_films' for slot in (1, 2, 3)]]

In [None]:
# Writer names and writer filmographies (with release year) for slots 1-3.
df_writer = data[[f'Writer{slot}' for slot in (1, 2, 3)]]
df_writer_films = data[['Rel_year'] + [f'Writer{slot}_films' for slot in (1, 2, 3)]]

In [None]:
# Actor names and actor filmographies (with release year) for slots 1-5.
df_actors = data[[f'Actor{slot}' for slot in range(1, 6)]]
df_actors_films = data[['Rel_year'] + [f'Actor{slot}_films' for slot in range(1, 6)]]

In [None]:
# define function to calculate quality of crew
def calc_crew_qual(df, cast_crew_col, films=None):
    """Score one crew slot (e.g. the first director) for every row of a column.

    Args:
        df: Frame the column was taken from. It is not read by the current
            formula; the parameter is kept so existing calls keep working.
        cast_crew_col: Series with one entry per film. Each entry is a dict
            mapping a filmography title to its release year, or a non-dict
            value (None / NaN) when the slot is empty.
        films: Lookup table with 'Title', 'Year' and 'Form2_score' columns.
            Defaults to the notebook-level ``all_films_RT``.

    Returns:
        A ``(cast_crew_film_scores, cast_crew_qual)`` pair of lists, one item
        per entry of ``cast_crew_col``. The first holds the list of per-title
        scores (NaN for titles missing from ``films``), or NaN for an empty
        slot. The second holds the aggregate: the sum of the available scores
        divided by the square root of the number of titles, or NaN when no
        score is available.

    Raises:
        ValueError: If a (title, year) pair matches more than one row of
            ``films`` (raised by ``Series.item``).
    """
    if films is None:
        films = all_films_RT

    cast_crew_film_scores = []
    cast_crew_qual = []

    # Iterate over the column that was passed in, not over the global `data`.
    for filmography in cast_crew_col:
        if not isinstance(filmography, dict):
            cast_crew_film_scores.append(np.nan)
            cast_crew_qual.append(np.nan)
            continue

        scores_ = []
        for title, year in filmography.items():
            # Build the match once instead of evaluating the mask twice.
            match = films.loc[(films['Title'] == title) & (films['Year'] == year), 'Form2_score']
            scores_.append(match.item() if len(match) else np.nan)

        # np.isnan rather than `is np.nan`: a NaN read from the table is a new
        # float object, so an identity test would miss it and yield 0.0.
        if all(np.isnan(score) for score in scores_):
            cast_crew_score = np.nan
        else:
            # Unmatched titles still count in the denominator.
            cast_crew_score = np.nansum(scores_) / math.sqrt(len(scores_))

        cast_crew_film_scores.append(scores_)
        cast_crew_qual.append(cast_crew_score)

    return cast_crew_film_scores, cast_crew_qual


def norm_cast_crew_qual(lst1, lst2, lst3):
    """Min-max normalize three score lists on one shared scale.

    The lists are pooled, rescaled with the pooled minimum and maximum
    (ignoring NaN), and split back using each input's own length, so the
    function works for any number of films rather than exactly 622.

    Args:
        lst1, lst2, lst3: Sequences of scores; NaN marks a missing score.

    Returns:
        Three lists of floats with the same lengths as the inputs. NaN stays
        NaN. If every available score is identical the range is zero and the
        results are NaN.
    """
    pooled = np.asarray([*lst1, *lst2, *lst3], dtype=float)

    # Skip the rescale when there is nothing to scale (empty or all-NaN input).
    if pooled.size and not np.isnan(pooled).all():
        # The extremes are computed once, not once per element.
        lowest = np.nanmin(pooled)
        spread = np.nanmax(pooled) - lowest
        with np.errstate(divide='ignore', invalid='ignore'):
            pooled = (pooled - lowest) / spread

    normed = pooled.tolist()
    cut1 = len(lst1)
    cut2 = cut1 + len(lst2)

    return normed[:cut1], normed[cut1:cut2], normed[cut2:]

In [None]:
# define function to calculate quality of cast
def calc_cast_qual(df, cast_crew_col, films=None):
    """Score one cast slot (e.g. the first actor) for every row of ``df``.

    Each film in an actor's filmography contributes its Form2 score divided
    by the cube root of ``rel_year - year + 1``, where ``rel_year`` is the
    release year of the row being scored.

    Args:
        df: Frame providing the 'Rel_year' column; its length sets the number
            of rows processed.
        cast_crew_col: Series aligned by position with ``df``. Each entry is a
            dict mapping a filmography title to a ``(year, rank)`` sequence,
            or a non-dict value (None / NaN) when the slot is empty. Only the
            year is used; rank weighting is currently disabled.
        films: Lookup table with 'Title', 'Year' and 'Form2_score' columns.
            Defaults to the notebook-level ``all_films_RT``.

    Returns:
        A ``(cast_crew_film_scores, cast_crew_qual)`` pair of lists, one item
        per row. The first holds the list of discounted per-title scores (NaN
        for titles missing from ``films``), or NaN for an empty slot. The
        second holds the sum of the available scores divided by the square
        root of the number of titles, or NaN when no score is available.

    Raises:
        ValueError: If a (title, year) pair matches more than one row of
            ``films`` (raised by ``Series.item``).
    """
    if films is None:
        films = all_films_RT

    cast_crew_film_scores = []
    cast_crew_qual = []

    # Size the loop from the frame that was passed in, not the global `data`.
    for row in range(len(df)):
        filmography = cast_crew_col.iloc[row]
        if not isinstance(filmography, dict):
            cast_crew_film_scores.append(np.nan)
            cast_crew_qual.append(np.nan)
            continue

        # .iloc keeps the numpy scalar type, so the arithmetic below follows
        # numpy rules exactly as before.
        rel_year = df['Rel_year'].iloc[row]

        scores_ = []
        for title, entry in filmography.items():
            year = entry[0]
            # Build the match once instead of evaluating the mask twice.
            match = films.loc[(films['Title'] == title) & (films['Year'] == year), 'Form2_score']
            if len(match):
                # NOTE(review): a filmography year later than rel_year makes the
                # base zero or negative (inf / NaN under numpy) - confirm such
                # entries cannot occur or should be excluded.
                scores_.append(match.item() / ((rel_year - year + 1) ** (1 / 3)))
            else:
                scores_.append(np.nan)

        # np.isnan rather than `is np.nan`: computed NaNs are new float
        # objects, so an identity test would miss them and yield 0.0.
        if all(np.isnan(score) for score in scores_):
            cast_crew_score = np.nan
        else:
            # Unmatched titles still count in the denominator.
            cast_crew_score = np.nansum(scores_) / math.sqrt(len(scores_))

        cast_crew_film_scores.append(scores_)
        cast_crew_qual.append(cast_crew_score)

    return cast_crew_film_scores, cast_crew_qual


def norm_cast_qual(lst1, lst2, lst3, lst4, lst5):
    """Min-max normalize five score lists on one shared scale.

    The lists are pooled, rescaled with the pooled minimum and maximum
    (ignoring NaN), and split back using each input's own length, so the
    function works for any number of films rather than exactly 622.

    Args:
        lst1, lst2, lst3, lst4, lst5: Sequences of scores; NaN marks a
            missing score.

    Returns:
        Five lists of floats with the same lengths as the inputs. NaN stays
        NaN. If every available score is identical the range is zero and the
        results are NaN.
    """
    inputs = (lst1, lst2, lst3, lst4, lst5)
    pooled = np.asarray([score for lst in inputs for score in lst], dtype=float)

    # Skip the rescale when there is nothing to scale (empty or all-NaN input).
    if pooled.size and not np.isnan(pooled).all():
        # The extremes are computed once, not once per element.
        lowest = np.nanmin(pooled)
        spread = np.nanmax(pooled) - lowest
        with np.errstate(divide='ignore', invalid='ignore'):
            pooled = (pooled - lowest) / spread

    normed = pooled.tolist()

    # Cut the pooled result back into pieces matching the input lengths.
    pieces = []
    start = 0
    for lst in inputs:
        stop = start + len(lst)
        pieces.append(normed[start:stop])
        start = stop

    norm_lst1, norm_lst2, norm_lst3, norm_lst4, norm_lst5 = pieces
    return norm_lst1, norm_lst2, norm_lst3, norm_lst4, norm_lst5

In [None]:
# Score each of the three director slots against the filmography table.
dir1_film_scores, dir1_qual = calc_crew_qual(df=df_dir_films, cast_crew_col=df_dir_films['Dir1_films'])
dir2_film_scores, dir2_qual = calc_crew_qual(df=df_dir_films, cast_crew_col=df_dir_films['Dir2_films'])
dir3_film_scores, dir3_qual = calc_crew_qual(df=df_dir_films, cast_crew_col=df_dir_films['Dir3_films'])

In [None]:
# Rescale the three director score lists together, using one shared min/max.
dir1_qual, dir2_qual, dir3_qual = norm_cast_crew_qual(dir1_qual, dir2_qual, dir3_qual)

In [None]:
# Assemble the per-film director table. Start from an explicit copy so the
# column assignments below write to this frame and not to a slice of `data`.
df_dir_final = data[['Title','Rel_year','RT_rev_cnt','RT_comb_score','Form2_score']].copy()

dir_results = {
    1: (dir1_film_scores, dir1_qual),
    2: (dir2_film_scores, dir2_qual),
    3: (dir3_film_scores, dir3_qual),
}

# Column order (name, films, film scores, quality - per slot) is preserved
# because later cells slice this frame by position.
for slot, (slot_film_scores, slot_qual) in dir_results.items():
    # Assign Series (single brackets) rather than one-column DataFrames.
    df_dir_final[f'Dir{slot}'] = df_dirs[f'Dir{slot}']
    df_dir_final[f'Dir{slot}_films'] = df_dir_films[f'Dir{slot}_films']
    df_dir_final[f'Dir{slot}_film_scores'] = slot_film_scores
    df_dir_final[f'Dir{slot}_qual'] = slot_qual

In [None]:
# Row-wise mean of the three director scores. DataFrame.mean skips NaN by
# default, so empty director slots are ignored.
df_dir_final['Dir_avg_qual'] = df_dir_final[['Dir1_qual','Dir2_qual','Dir3_qual']].mean(axis=1)

In [None]:
# Films ordered by first-director quality (best first), reduced to the columns
# needed for review; then show the top 25.
df_dir_final2 = df_dir_final.sort_values(by='Dir1_qual', ascending=False)[[
    'Title', 'Form2_score',
    'Dir1', 'Dir2', 'Dir3',
    'Dir1_films',
    'Dir1_qual', 'Dir2_qual', 'Dir3_qual',
]]
df_dir_final2.head(25).reset_index(drop=True)

In [None]:
# Group the films, in their current sort order, into consecutive bins of 60
# rows. The row count comes from the frame itself instead of a hard-coded 622,
# so the assignment cannot fail when the dataset size changes.
length = range(len(df_dir_final2))

bins = [position // 60 for position in length]

df_dir_final2['Bins']=bins

In [None]:
# include year variable
# Form2_score per bin; each bin's mean is drawn as a white circle.
sns.boxplot(
    data=df_dir_final2,
    x='Bins',
    y='Form2_score',
    showmeans=True,
    meanprops=dict(
        marker="o",
        markerfacecolor="white",
        markeredgecolor="black",
        markersize="10",
    ),
);

In [None]:
# include mitigated year variable
# Form2_score per bin; each bin's mean is drawn as a white circle.
sns.boxplot(
    data=df_dir_final2,
    x='Bins',
    y='Form2_score',
    showmeans=True,
    meanprops=dict(
        marker="o",
        markerfacecolor="white",
        markeredgecolor="black",
        markersize="10",
    ),
);

In [None]:
# omit year variable
# Form2_score per bin; each bin's mean is drawn as a white circle.
sns.boxplot(
    data=df_dir_final2,
    x='Bins',
    y='Form2_score',
    showmeans=True,
    meanprops=dict(
        marker="o",
        markerfacecolor="white",
        markeredgecolor="black",
        markersize="10",
    ),
);

In [None]:
# Score each of the three writer slots against the filmography table.
writer1_film_scores, writer1_qual = calc_crew_qual(df=df_writer_films, cast_crew_col=df_writer_films['Writer1_films'])
writer2_film_scores, writer2_qual = calc_crew_qual(df=df_writer_films, cast_crew_col=df_writer_films['Writer2_films'])
writer3_film_scores, writer3_qual = calc_crew_qual(df=df_writer_films, cast_crew_col=df_writer_films['Writer3_films'])

In [None]:
# Rescale the three writer score lists together, using one shared min/max.
writer1_qual, writer2_qual, writer3_qual = norm_cast_crew_qual(writer1_qual, writer2_qual, writer3_qual)

In [None]:
# Assemble the per-film writer table. Start from an explicit copy so the
# column assignments below write to this frame and not to a slice of `data`.
df_writer_final = data[['Title','Rel_year','RT_rev_cnt','RT_comb_score','Form2_score']].copy()

writer_results = {
    1: (writer1_film_scores, writer1_qual),
    2: (writer2_film_scores, writer2_qual),
    3: (writer3_film_scores, writer3_qual),
}

# Column order (name, films, film scores, quality - per slot) is preserved
# because later cells slice this frame by position.
for slot, (slot_film_scores, slot_qual) in writer_results.items():
    # Assign Series (single brackets) rather than one-column DataFrames.
    df_writer_final[f'Writer{slot}'] = df_writer[f'Writer{slot}']
    df_writer_final[f'Writer{slot}_films'] = df_writer_films[f'Writer{slot}_films']
    df_writer_final[f'Writer{slot}_film_scores'] = slot_film_scores
    df_writer_final[f'Writer{slot}_qual'] = slot_qual

In [None]:
# Row-wise mean of the three writer scores. DataFrame.mean skips NaN by
# default, so empty writer slots are ignored.
df_writer_final['Writer_avg_qual'] = df_writer_final[['Writer1_qual','Writer2_qual','Writer3_qual']].mean(axis=1)

In [None]:
# Films ordered by first-writer quality (best first), reduced to the columns
# needed for review; then show the top 25.
df_writer_final2 = df_writer_final.sort_values(by='Writer1_qual', ascending=False)[[
    'Title', 'Form2_score',
    'Writer1', 'Writer2', 'Writer3',
    'Writer1_films',
    'Writer1_qual', 'Writer2_qual', 'Writer3_qual',
]]
df_writer_final2.head(25).reset_index(drop=True)

In [None]:
# Group the films, in their current sort order, into consecutive bins of 60
# rows. The row count comes from the frame itself instead of a hard-coded 622,
# so the assignment cannot fail when the dataset size changes.
length = range(len(df_writer_final2))

bins = [position // 60 for position in length]

df_writer_final2['Bins']=bins

In [None]:
# include year variable
# Form2_score per bin; each bin's mean is drawn as a white circle.
sns.boxplot(
    data=df_writer_final2,
    x='Bins',
    y='Form2_score',
    showmeans=True,
    meanprops=dict(
        marker="o",
        markerfacecolor="white",
        markeredgecolor="black",
        markersize="10",
    ),
);

In [None]:
# include mitigated year variable
# Form2_score per bin; each bin's mean is drawn as a white circle.
sns.boxplot(
    data=df_writer_final2,
    x='Bins',
    y='Form2_score',
    showmeans=True,
    meanprops=dict(
        marker="o",
        markerfacecolor="white",
        markeredgecolor="black",
        markersize="10",
    ),
);

In [None]:
# omit year variable
# Form2_score per bin; each bin's mean is drawn as a white circle.
sns.boxplot(
    data=df_writer_final2,
    x='Bins',
    y='Form2_score',
    showmeans=True,
    meanprops=dict(
        marker="o",
        markerfacecolor="white",
        markeredgecolor="black",
        markersize="10",
    ),
);

In [None]:
# Score each of the five actor slots against the filmography table.
actor1_film_scores, actor1_qual = calc_cast_qual(df=df_actors_films, cast_crew_col=df_actors_films['Actor1_films'])
actor2_film_scores, actor2_qual = calc_cast_qual(df=df_actors_films, cast_crew_col=df_actors_films['Actor2_films'])
actor3_film_scores, actor3_qual = calc_cast_qual(df=df_actors_films, cast_crew_col=df_actors_films['Actor3_films'])
actor4_film_scores, actor4_qual = calc_cast_qual(df=df_actors_films, cast_crew_col=df_actors_films['Actor4_films'])
actor5_film_scores, actor5_qual = calc_cast_qual(df=df_actors_films, cast_crew_col=df_actors_films['Actor5_films'])

In [None]:
# Rescale the five actor score lists together, using one shared min/max.
actor1_qual,actor2_qual,actor3_qual,actor4_qual,actor5_qual = norm_cast_qual(actor1_qual,actor2_qual,actor3_qual,actor4_qual,actor5_qual)

In [None]:
# Assemble the per-film actor table. Start from an explicit copy so the
# column assignments below write to this frame and not to a slice of `data`.
df_actors_final = data[['Title','Rel_year','RT_rev_cnt','RT_comb_score','Form2_score']].copy()

actor_results = {
    1: (actor1_film_scores, actor1_qual),
    2: (actor2_film_scores, actor2_qual),
    3: (actor3_film_scores, actor3_qual),
    4: (actor4_film_scores, actor4_qual),
    5: (actor5_film_scores, actor5_qual),
}

# Column order (name, films, film scores, quality - per slot) is preserved
# because later cells slice this frame by position.
for slot, (slot_film_scores, slot_qual) in actor_results.items():
    # Assign Series (single brackets) rather than one-column DataFrames.
    df_actors_final[f'Actor{slot}'] = df_actors[f'Actor{slot}']
    df_actors_final[f'Actor{slot}_films'] = df_actors_films[f'Actor{slot}_films']
    df_actors_final[f'Actor{slot}_film_scores'] = slot_film_scores
    df_actors_final[f'Actor{slot}_qual'] = slot_qual

In [None]:
# Row-wise mean of the five actor scores. DataFrame.mean skips NaN by default,
# so empty actor slots are ignored.
df_actors_final['Actor_avg_qual'] = df_actors_final[['Actor1_qual','Actor2_qual','Actor3_qual','Actor4_qual','Actor5_qual']].mean(axis=1)
# Alternative left disabled: average only the three largest actor scores per film.
# .apply(pd.Series.nlargest, axis=1, n=3).mean(axis=1)

In [None]:
# Keep films whose first actor has a score, order them by average actor
# quality (best first) and keep the review columns; then show the top 25.
df_actors_final2 = (
    df_actors_final.loc[df_actors_final['Actor1_qual'].notnull()]
    .sort_values(by='Actor_avg_qual', ascending=False)[[
        'Title', 'Form2_score',
        'Actor1', 'Actor2', 'Actor3', 'Actor4', 'Actor5',
        'Actor1_qual', 'Actor2_qual', 'Actor3_qual', 'Actor4_qual',
        'Actor5_qual', 'Actor_avg_qual',
    ]]
)
df_actors_final2.head(25).reset_index(drop=True)

In [None]:
# Group the films, in their current sort order, into consecutive bins of 50
# rows. The row count comes from the frame itself instead of a hard-coded 443,
# so the assignment cannot fail when the number of scored films changes.
length = range(len(df_actors_final2))

bins = [position // 50 for position in length]

df_actors_final2['Bins']=bins

In [None]:
# Form2_score per bin; each bin's mean is drawn as a white circle.
sns.boxplot(
    data=df_actors_final2,
    x='Bins',
    y='Form2_score',
    showmeans=True,
    meanprops=dict(
        marker="o",
        markerfacecolor="white",
        markeredgecolor="black",
        markersize="10",
    ),
);

In [None]:
# amplify rank variable
# Form2_score per bin; each bin's mean is drawn as a white circle.
sns.boxplot(
    data=df_actors_final2,
    x='Bins',
    y='Form2_score',
    showmeans=True,
    meanprops=dict(
        marker="o",
        markerfacecolor="white",
        markeredgecolor="black",
        markersize="10",
    ),
);

In [None]:
# include rank variable
# Form2_score per bin; each bin's mean is drawn as a white circle.
sns.boxplot(
    data=df_actors_final2,
    x='Bins',
    y='Form2_score',
    showmeans=True,
    meanprops=dict(
        marker="o",
        markerfacecolor="white",
        markeredgecolor="black",
        markersize="10",
    ),
);

In [None]:
# drop rank variable
# Form2_score per bin; each bin's mean is drawn as a white circle.
sns.boxplot(
    data=df_actors_final2,
    x='Bins',
    y='Form2_score',
    showmeans=True,
    meanprops=dict(
        marker="o",
        markerfacecolor="white",
        markeredgecolor="black",
        markersize="10",
    ),
);

In [None]:
# omit year variable
# Form2_score per bin; each bin's mean is drawn as a white circle.
sns.boxplot(
    data=df_actors_final2,
    x='Bins',
    y='Form2_score',
    showmeans=True,
    meanprops=dict(
        marker="o",
        markerfacecolor="white",
        markeredgecolor="black",
        markersize="10",
    ),
);

In [None]:
# include mitigated year variable
# Form2_score per bin; each bin's mean is drawn as a white circle.
sns.boxplot(
    data=df_actors_final2,
    x='Bins',
    y='Form2_score',
    showmeans=True,
    meanprops=dict(
        marker="o",
        markerfacecolor="white",
        markeredgecolor="black",
        markersize="10",
    ),
);

In [None]:
# Preview the title, plot summary, plot keywords and genre columns.
data[['Title','Plot_summ','Plot_keywords','Genre']]

In [None]:
# List the column names of `data`; the positional slices in the cells below depend on this order.
data.columns

In [None]:
# Place the first 41 columns of `data` next to the director columns from
# position 7 onward.
# NOTE(review): both slices are positional and depend on the current column
# order - re-check them after any schema change.
df_part1 = pd.concat([data.iloc[:,:41],df_dir_final.iloc[:,7:]], axis=1)

In [None]:
# Append the writer columns; position 5 onward skips the five shared base columns.
df_part2 = pd.concat([df_part1,df_writer_final.iloc[:,5:]], axis=1)

In [None]:
# Append the actor columns; position 5 onward skips the five shared base columns.
df_part3 = pd.concat([df_part2,df_actors_final.iloc[:,5:]], axis=1)

In [None]:
# Append columns 86-87 of `data` (positional - confirm against data.columns).
df_part4 = pd.concat([df_part3,data.iloc[:,86:88]], axis=1)
# Overwrite RT_comb_score with the transformed Form2 score, assigned by position.
df_part4['RT_comb_score']=list(data['Form2_score'])

In [None]:
# Append columns 89-90 of `data` (positional - confirm against data.columns).
df_final = pd.concat([df_part4,data.iloc[:,89:91]], axis=1)

In [None]:
# Inspect the column names of the assembled frame.
df_final.columns

In [None]:
# df_final.to_pickle('master_data_v2.pkl')

In [None]:
# Load the v3 master dataset.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
new_df = pd.read_pickle('master_data_v3.pkl')

In [None]:
# new_df.iloc[:,:42]
# df_dir_final.iloc[:,5:]
# new_df.iloc[:,55:56]
# df_writer_final.iloc[:,5:]
# new_df.iloc[:,69:70]
# df_actors_final.iloc[:,5:]
# new_df.iloc[:,-6:]

In [None]:
# Rebuild the master frame by interleaving positional slices of the v3 data
# with the freshly computed director, writer and actor columns.
# NOTE(review): every slice is positional and depends on the column order of
# master_data_v3.pkl - confirm against new_df.columns before re-running.
new_df2 = pd.concat([new_df.iloc[:,:42], df_dir_final.iloc[:,5:], new_df.iloc[:,55:56], df_writer_final.iloc[:,5:], new_df.iloc[:,69:70],\
                    df_actors_final.iloc[:,5:], new_df.iloc[:,-6:]], axis=1)

In [None]:
# Inspect the column names of the rebuilt frame.
new_df2.columns

In [None]:
# new_df2.to_pickle('master_data_v4.pkl')

In [None]:
# Crew, lead cast and nomination counts, ordered by actor nominations.
# The column list previously selected 'Dir2' twice, which displayed that column
# twice and left out the third director; the second entry is now 'Dir3'.
new_df2[['Title','Dir1','Dir2','Dir3','Writer1','Actor1','Actor2','Actor3','Dir_nom_cnt','Writer_nom_cnt','Actor_nom_cnt']].\
sort_values(by='Actor_nom_cnt',ascending=False)

In [None]:
# Five films whose cast has the most nominations.
(
    new_df2[['Title', 'Actor1', 'Actor2', 'Actor3', 'Actor4', 'Actor5', 'Actor_nom_cnt']]
    .sort_values(by='Actor_nom_cnt', ascending=False)
    .head(5)
)

In [None]:
# Titles and genres ordered by RT_comb_score, highest first.
new_df2[['Title','Genre','RT_comb_score']].sort_values(by='RT_comb_score', ascending=False)