# Building content-based filtering

### Creating several categories


In [1]:
import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
import matplotlib.pyplot  as plt
import numpy as np
import feather

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import pdist,squareform 
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

from scipy.sparse import csr_matrix




from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd.options.display.max_rows = 500
pd.options.display.max_columns = 200 
pd.set_option('display.float_format', lambda x: '%.3f' % x) #display numbers as decimals instead of scientific value

data_filepath = 'movie_metadata_cleaned_v1.feather'


In [2]:
df = feather.read_dataframe(data_filepath)
movies_count = df.shape[0]
print (   movies_count ,'movies imported')

df_movies = df.copy()
df_movies['movie_id'] = df_movies.index
df_movies = df_movies.set_index(df['movie_title'])
movies_list = list(df['movie_title'])

4916 movies imported


In [3]:
df.describe(include='all')

Unnamed: 0,movie_title,title_year,imdb_score,country,language,genres,plot_keywords,director_name,director_facebook_likes,actor_1_name,actor_2_name,actor_3_name,actor_1_facebook_likes,actor_2_facebook_likes,actor_3_facebook_likes,cast_total_facebook_likes,movie_facebook_likes,num_user_for_reviews,num_critic_for_reviews,num_voted_users,gross,budget,content_rating,color,duration,aspect_ratio
count,4916,4916.0,4916.0,4916,4916,4916,4916.0,4916.0,4916.0,4916,4916,4916.0,4916.0,4916.0,4916.0,4916.0,4916.0,4916.0,4916.0,4916.0,4916.0,4916.0,4916,4916,4916.0,4916.0
unique,4916,,,66,48,914,4757.0,2398.0,,2096,3031,3520.0,,,,,,,,,,,18,2,,
top,Faithful,,,USA,English,Drama,,,,Robert De Niro,Morgan Freeman,,,,,,,,,,,,R,Color,,
freq,1,,,3710,4582,232,152.0,102.0,,48,18,23.0,,,,,,,,,,,2367,4712,,
mean,,2002.453,6.437,,,,,,676.677,,,,6485.2,1617.624,628.313,9579.733,7348.296,266.525,136.617,82645.242,47779121.547,33576517.195,,,107.092,2.096
std,,12.315,1.128,,,,,,2805.134,,,,15098.229,4006.861,1622.642,18164.359,19206.016,372.547,120.418,138322.057,61184608.962,38653472.075,,,25.247,0.289
min,,1916.0,1.6,,,,,,0.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,162.0,218.0,,,7.0,1.18
25%,,1999.0,5.8,,,,,,6.0,,,,606.0,275.0,129.0,1394.75,0.0,62.0,47.0,8361.75,8052632.5,7000000.0,,,93.0,1.85
50%,,2005.0,6.6,,,,,,44.0,,,,981.5,592.0,362.0,3049.0,159.0,152.0,106.5,33132.5,36948242.0,23000000.0,,,103.0,2.22
75%,,2010.0,7.2,,,,,,188.0,,,,11000.0,912.0,631.0,13616.75,2000.0,319.25,191.0,93772.75,50875351.25,40000000.0,,,118.0,2.35


In [4]:
def replace_spaces(feature):
    feature = feature.str.replace(' ', '_')
    feature = feature.str.replace('.', '')
    feature = feature.str.replace('|', ' ')
    return feature

df_2 = df.copy()
df_2 = df_2.set_index(df_2['movie_title'])
object_only = df_2.select_dtypes(include=['object'])
df_2[object_only.columns] = object_only.apply(replace_spaces, axis=1)


In [5]:
df_2.head()

Unnamed: 0_level_0,movie_title,title_year,imdb_score,country,language,genres,plot_keywords,director_name,director_facebook_likes,actor_1_name,actor_2_name,actor_3_name,actor_1_facebook_likes,actor_2_facebook_likes,actor_3_facebook_likes,cast_total_facebook_likes,movie_facebook_likes,num_user_for_reviews,num_critic_for_reviews,num_voted_users,gross,budget,content_rating,color,duration,aspect_ratio
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
Gods and Monsters,Gods_and_Monsters,1998.0,7.5,USA,English,Biography Drama,friendship gardener gay_interest homosexual me...,Bill_Condon,386.0,Brendan_Fraser,Lynn_Redgrave,Kevin_J_O'Connor,3000.0,258.0,248.0,4238,0,229.0,127.0,24977,6390032.0,3500000.0,R,_Black_and_White,105.0,2.35
Runner Runner,Runner_Runner,2013.0,5.6,USA,English,Crime Thriller,costa_rica fbi_agent gambling money online_gam...,Brad_Furman,65.0,Justin_Timberlake,John_Heard,David_Costabile,3000.0,697.0,681.0,5706,0,95.0,184.0,52069,19316646.0,30000000.0,R,Color,88.0,2.35
Antitrust,Antitrust,2001.0,6.1,USA,English,Action Crime Drama Thriller,boss communications computer genius job,Peter_Howitt,29.0,Tyler_Labine,Richard_Roundtree,Ned_Bellamy,779.0,240.0,91.0,1297,0,221.0,123.0,25558,10965209.0,30000000.0,PG-13,Color,109.0,2.35
Lady in White,Lady_in_White,1988.0,6.7,USA,English,Fantasy Horror Mystery Thriller,1960s boy ghost murder small_town,Frank_LaLoggia,7.0,Alex_Rocco,Lukas_Haas,Katherine_Helmond,968.0,733.0,339.0,2545,0,69.0,130.0,4695,1705139.0,4700000.0,R,Color,118.0,1.85
Dead Man Down,Dead_Man_Down,2013.0,6.5,USA,English,Action Crime Drama Thriller,crime_lord dead_body dead_body_in_a_freezer ne...,Niels_Arden_Oplev,76.0,Dominic_Cooper,Isabelle_Huppert,James_Biberi,3000.0,678.0,174.0,4253,0,127.0,217.0,54147,10880926.0,30000000.0,R,Color,118.0,2.35


In [6]:
def calculate_dist(df_, min_words_freq = 1):
    df = df_.copy()
    ######### Words columns ############
    df_words_cols = df.select_dtypes(include=['object']).copy()
    df.loc[:,'words']=''
    for column in df_words_cols.columns:
        df.loc[:,'words'] +=df.loc[:,column]+' ' #.T.head(1).T+' '
        if column in ['actor_1_name', 'director_name', 'genres']:
            df.loc[:,'words'] +=df.loc[:,column]+' '
        
    if df_words_cols.shape[1]>0 :
        vectorizer  = TfidfVectorizer(min_df = min_words_freq+1)
        words_freq_csrm = vectorizer.fit_transform(list(df['words']))
        #df_freq = pd.DataFrame(words_freq_csrm.toarray(),index=movies_list, columns=vectorizer.get_feature_names())
        #words_frequencies = pd.Series(df_freq.astype(bool).sum(axis=0))
        #keep_words_i = list(np.where(words_frequencies>min_words_freq))[0]
        #words_freq_csrm= words_freq_csrm[:,keep_words_i]
        words_features = words_freq_csrm.toarray()

    ###### Numeric columns #########
    df_numeric_cols = df.select_dtypes(include=['float64']).copy()
    
    if df_numeric_cols.shape[1]>0 :
        scaler = MinMaxScaler((0.5,2))
        numeric_features = scaler.fit_transform(df_numeric_cols)
        if df_words_cols.shape[1]>0 :
            all_features = np.hstack((words_features, numeric_features))
            n_components = int(round(words_features.shape[1]/3, 0))+numeric_features.shape[1]
        else:
            all_features = numeric_features
            n_components = numeric_features.shape[1]
    else:
        all_features = words_features
        n_components = int(round(words_features.shape[1]/3, 0))
        
    
    print('Reducing to',n_components,'principal components out of',all_features.shape[1],'features')
    ## => indiquer la variance
    pca = PCA(n_components=n_components) # We keep 1/3
    pca_features = pca.fit_transform(all_features)
    ####### Dist matrix ############
    pdist_matrix = pdist(pca_features)
    pdist_square = squareform(pdist_matrix)
    df_pdist = pd.DataFrame(pdist_square, index = movies_list, columns =movies_list )
    
    return df_pdist



In [7]:

def display_closest(movie_name, columns_names, df_pdist, n=5, min_words_freq = 1, recalc=False):
    if (recalc or df_pdist.shape[0]==0):
        df_pdist = calculate_dist(df_2[columns_names], min_words_freq=min_words_freq)
    df_display = df_2[columns_names].copy()
    df_display['distance'] = df_pdist[movie_name].copy()
    display(df_display.loc[df_display['distance'].sort_values().iloc[1:n+1].index, :])
    return df_pdist
    
df_pdist_overall = pd.DataFrame()
df_pdist_genres = pd.DataFrame()
df_pdist_cast = pd.DataFrame()
df_pdist_success= pd.DataFrame()
df_pdist_pict= pd.DataFrame()
def display_categories(movie_name, n=5):
    global df_pdist_overall
    global df_pdist_genres
    global df_pdist_cast
    global df_pdist_success
    global df_pdist_pict

    print('Similar overall')
    df_pdist_overall = display_closest(movie_name, n=n, min_words_freq=2
                                      , df_pdist = df_pdist_overall
                                      , columns_names = ['genres', 'director_name', 'actor_1_name','title_year', 'country', 'language', 'imdb_score', 'content_rating', 'color'])
    
    print('Similar genre')
    df_pdist_genres = display_closest(movie_name, n=n, min_words_freq=5
                                      , df_pdist = df_pdist_genres
                                      , columns_names = ['genres', 'plot_keywords'])
    print('Similar picture')
    df_pdist_pict = display_closest(movie_name, n=n
                                      , df_pdist = df_pdist_pict
                                      , columns_names = ['director_name', 'color', 'title_year', 'aspect_ratio', 'duration', 'country', 'language'])
    
    print('Similar cast')
    df_pdist_cast = display_closest(movie_name, n=n
                                      , df_pdist = df_pdist_cast
                                      , columns_names = ['actor_1_name', 'actor_2_name', 'actor_3_name', 'title_year'])
    print('Similar success')
    df_pdist_success = display_closest(movie_name, n=n
                                      , df_pdist = df_pdist_success
                                      , columns_names = ['imdb_score', 'imdb_score', 'gross', 'num_user_for_reviews', 'movie_facebook_likes',  'num_critic_for_reviews', 'num_voted_users'])
  


In [8]:
df.columns

Index(['movie_title', 'title_year', 'imdb_score', 'country', 'language',
       'genres', 'plot_keywords', 'director_name', 'director_facebook_likes',
       'actor_1_name', 'actor_2_name', 'actor_3_name',
       'actor_1_facebook_likes', 'actor_2_facebook_likes',
       'actor_3_facebook_likes', 'cast_total_facebook_likes',
       'movie_facebook_likes', 'num_user_for_reviews',
       'num_critic_for_reviews', 'num_voted_users', 'gross', 'budget',
       'content_rating', 'color', 'duration', 'aspect_ratio'],
      dtype='object')

In [9]:
display_categories('Fight Club')

Similar overall
Reducing to 354 principal components out of 1058 features


Unnamed: 0_level_0,genres,director_name,actor_1_name,title_year,country,language,imdb_score,content_rating,color,distance
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
The Curious Case of Benjamin Button,Drama Fantasy Romance,David_Fincher,Brad_Pitt,2008.0,USA,English,7.8,PG-13,Color,0.466
The Social Network,Biography Drama,David_Fincher,Andrew_Garfield,2010.0,USA,English,7.7,PG-13,Color,0.826
The Game,Drama Mystery Thriller,David_Fincher,Deborah_Kara_Unger,1997.0,USA,English,7.8,R,Color,0.832
Gone Girl,Crime Drama Mystery Thriller,David_Fincher,Patrick_Fugit,2014.0,USA,English,8.1,R,Color,0.833
The Tree of Life,Drama Fantasy,Terrence_Malick,Brad_Pitt,2011.0,USA,English,6.7,PG-13,Color,0.899


Similar genre
Reducing to 271 principal components out of 813 features


Unnamed: 0_level_0,genres,plot_keywords,distance
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
The Emperor's Club,Drama,classroom fund_raising headmaster reunion roman,0.452
Lost in Translation,Drama,fish_out_of_water japanese loneliness older_ma...,0.467
Requiem for a Dream,Drama,addiction diet_pill drug_addiction fast_motion...,0.474
Frost/Nixon,Drama,american_president political_drama scandal us_...,0.479
The Master,Drama,drifter past_life_regression psychological_tes...,0.497


Similar picture
Reducing to 322 principal components out of 960 features


Unnamed: 0_level_0,director_name,color,title_year,aspect_ratio,duration,country,language,distance
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
The Game,David_Fincher,Color,1997.0,2.35,129.0,USA,English,0.071
Se7en,David_Fincher,Color,1995.0,2.35,127.0,USA,English,0.092
Alien 3,David_Fincher,Color,1992.0,2.35,145.0,USA,English,0.106
Panic Room,David_Fincher,Color,2002.0,2.35,112.0,USA,English,0.122
Zodiac,David_Fincher,Color,2007.0,2.35,162.0,USA,English,0.124


Similar cast
Reducing to 790 principal components out of 2367 features


Unnamed: 0_level_0,actor_1_name,actor_2_name,actor_3_name,title_year,distance
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Sinbad: Legend of the Seven Seas,Brad_Pitt,Adriano_Giannini,Timothy_West,2003.0,0.537
Babel,Brad_Pitt,Harriet_Walter,Dermot_Crowley,2006.0,0.544
Johnny Suede,Brad_Pitt,Tina_Louise,Nick_Cave,1991.0,0.546
The Tree of Life,Brad_Pitt,Tye_Sheridan,Fiona_Shaw,2011.0,0.57
Ocean's Twelve,Brad_Pitt,Julia_Roberts,Mini_Anden,2004.0,0.657


Similar success
Reducing to 5 principal components out of 5 features


Unnamed: 0_level_0,imdb_score,imdb_score,gross,num_user_for_reviews,movie_facebook_likes,num_critic_for_reviews,num_voted_users,distance
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Memento,8.5,8.5,25530884.0,2067.0,40000,274.0,845580,0.29
Pulp Fiction,8.9,8.9,107930000.0,2195.0,45000,215.0,1324680,0.327
Donnie Darko,8.1,8.1,727883.0,2110.0,33000,283.0,580999,0.33
Kill Bill: Vol. 1,8.1,8.1,70098138.0,2105.0,13000,354.0,735784,0.332
The Matrix,8.7,8.7,171383253.0,3646.0,25000,313.0,1217752,0.334


In [10]:
display_categories('Django Unchained')

Similar overall


Unnamed: 0_level_0,genres,director_name,actor_1_name,title_year,country,language,imdb_score,content_rating,color,distance
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
The Hateful Eight,Crime Drama Mystery Thriller Western,Quentin_Tarantino,Craig_Stark,2015.0,USA,English,7.9,R,Color,0.765
The Revenant,Adventure Drama Thriller Western,Alejandro_G_Iñárritu,Leonardo_DiCaprio,2015.0,USA,English,8.1,R,Color,0.794
Kill Bill: Vol. 1,Action,Quentin_Tarantino,David_Carradine,2003.0,USA,English,8.1,R,_Black_and_White,0.832
Reservoir Dogs,Crime Drama Thriller,Quentin_Tarantino,Quentin_Tarantino,1992.0,USA,English,8.4,R,Color,0.881
The Great Gatsby,Drama Romance,Baz_Luhrmann,Leonardo_DiCaprio,2013.0,Australia,English,7.3,PG-13,Color,0.911


Similar genre


Unnamed: 0_level_0,genres,plot_keywords,distance
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Forsaken,Drama Western,,0.198
Unforgiven,Drama Western,englishman leaving_flowers_on_a_grave one_last...,0.198
The Work and the Glory II: American Zion,Drama Western,estrangement missouri mormon mormon_church mor...,0.25
Doc Holliday's Revenge,Western,1880s cowboy low_budget_film shoot_out wild_west,0.335
The Ballad of Gregorio Cortez,Western,,0.35


Similar picture


Unnamed: 0_level_0,director_name,color,title_year,aspect_ratio,duration,country,language,distance
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Inglourious Basterds,Quentin_Tarantino,Color,2009.0,2.35,153.0,USA,English,0.057
The Hateful Eight,Quentin_Tarantino,Color,2015.0,2.76,187.0,USA,English,0.231
Pulp Fiction,Quentin_Tarantino,Color,1994.0,2.35,178.0,USA,English,0.271
Kill Bill: Vol. 2,Quentin_Tarantino,_Black_and_White,2004.0,2.35,137.0,USA,English,0.319
Jackie Brown,Quentin_Tarantino,Color,1997.0,1.85,154.0,USA,English,0.349


Similar cast


Unnamed: 0_level_0,actor_1_name,actor_2_name,actor_3_name,title_year,distance
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Shutter Island,Leonardo_DiCaprio,Joseph_Sikora,Nellie_Sciutto,2010.0,0.467
The Great Gatsby,Leonardo_DiCaprio,Elizabeth_Debicki,Steve_Bisley,2013.0,0.471
Body of Lies,Leonardo_DiCaprio,Simon_McBurney,Michael_Gaston,2008.0,0.474
The Beach,Leonardo_DiCaprio,Virginie_Ledoyen,Peter_Youngblood_Hills,2000.0,0.503
The Quick and the Dead,Leonardo_DiCaprio,Pat_Hingle,Roberts_Blossom,1995.0,0.544


Similar success


Unnamed: 0_level_0,imdb_score,imdb_score,gross,num_user_for_reviews,movie_facebook_likes,num_critic_for_reviews,num_voted_users,distance
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Mad Max: Fury Road,8.1,8.1,153629485.0,1588.0,191000,739.0,552503,0.167
Black Swan,8.0,8.0,106952327.0,1140.0,106000,669.0,551363,0.249
The Wolf of Wall Street,8.2,8.2,116866727.0,1138.0,138000,606.0,780588,0.318
Argo,7.7,7.7,136019448.0,695.0,89000,656.0,452465,0.333
Skyfall,7.8,7.8,304360277.0,1498.0,80000,750.0,522030,0.35


In [11]:
df_2.sort_values('movie_facebook_likes').tail(100).index

Index(['Independence Day: Resurgence', 'The Wolverine',
       'The Secret Life of Walter Mitty', 'Evil Dead', 'Minions', 'Creed',
       'Lincoln', 'Noah', '300: Rise of an Empire', 'Oblivion',
       'Moonrise Kingdom', 'Room', 'Captain America: Civil War',
       'Pride and Prejudice and Zombies', 'Ted', 'The Social Network',
       'The Help', 'Looper', 'Into the Wild', 'Straight Outta Compton',
       'Edge of Tomorrow', 'Warrior', 'Midnight in Paris', 'Idiocracy',
       'Spotlight', 'Suicide Squad', 'Skyfall', 'Warm Bodies', 'Drive', 'Fury',
       'Dark Shadows', 'The Hunger Games: Catching Fire',
       'X-Men: Days of Future Past', 'Terminator Genisys', 'Pacific Rim',
       '12 Years a Slave', 'Lucy', 'The Hobbit: The Desolation of Smaug',
       'Rush', 'Spectre', 'Prisoners', 'Warcraft', 'Maleficent', 'Argo',
       'Into the Woods', 'The Theory of Everything', 'Boyhood',
       'Star Trek Into Darkness', 'Sharknado', 'The Fault in Our Stars',
       'Furious 7', 'Iron Man

##### Prepare export of all our reco

In [18]:
def closest_n(df_pdist, i, n=5, np_already_shown=np.full((movies_count,1), -1)):
    movie_distances = df_pdist[df.loc[i, 'movie_title']]
    np_closest = movie_distances.sort_values()[:n*5]
    # remove movie indexes which have already been suggested(from np_already_shown)
    np_closest = df_movies.loc[np_closest.index, 'movie_id']
    np_closest = np_closest[np.isin(np_closest, np_already_shown[i], invert=True)][:n]
    return (np_closest)

#initiate the list of movies to ignore with each movie's own name
np_already_shown =np.reshape(range(movies_count), (movies_count, 1))
closest_overall = np.array([closest_n(df_pdist_overall,i
                                     ,np_already_shown = np_already_shown
                                     ) for i in range(movies_count)])

np_already_shown = np.hstack((np_already_shown, closest_overall))
closest_genres = np.array([closest_n(df_pdist_genres,i
                                     ,np_already_shown = np_already_shown
                                     ) for i in range(movies_count)])

np_already_shown = np.hstack((np_already_shown, closest_genres))
closest_cast = np.array([closest_n(df_pdist_cast,i
                                     ,np_already_shown = np_already_shown
                                     ) for i in range(movies_count)])


np_already_shown = np.hstack((np_already_shown, closest_cast))
closest_pict = np.array([closest_n(df_pdist_pict,i
                                     ,np_already_shown = np_already_shown
                                     ) for i in range(movies_count)])

np_already_shown = np.hstack((np_already_shown, closest_pict))
closest_success = np.array([closest_n(df_pdist_success,i
                                     ,np_already_shown = np_already_shown
                                     ) for i in range(movies_count)])

### Trying to evaluate results


Calculate percent of chances for movies with same director to end up in our recommendation of either overall or similar pict.

In [63]:
add_percents_reco_same_director =0
nb_movies = 0
for i in range(movies_count):
    nb_common_directors = df[(df['director_name'] == df.loc[i, 'director_name'] ) & (df['director_name'] != '') ].count().max()-1
    if nb_common_directors > 0:
        nb_movies+=1
        df_close_movies =  df.loc[closest_overall[i]].append(df.loc[closest_pict[i]])
        nb_directors_in_reco = df_close_movies[df_close_movies['director_name'] == df.loc[i, 'director_name']].count().max()
        percent_reco_same_director= (nb_directors_in_reco/nb_common_directors)*100
        add_percents_reco_same_director += percent_reco_same_director
        
        
chance_reco_same_director =  add_percents_reco_same_director/nb_movies #77.6

print(chance_reco_same_director, 'chances to find same director movies')

77.67371232899231

Same test with actor_1

In [68]:
add_percents_reco_same_actor =0
nb_movies = 0
for i in range(movies_count):
    nb_common_actors = df[(df['actor_1_name'] == df.loc[i, 'actor_1_name'] ) & (df['actor_1_name'] != '') ].count().max()-1
    if nb_common_actors > 0:
        nb_movies+=1
        df_close_movies =  df.loc[closest_overall[i]].append(df.loc[closest_cast[i]])
        nb_actors_in_reco = df_close_movies[df_close_movies['actor_1_name'] == df.loc[i, 'actor_1_name']].count().max()
        percent_reco_same_actor= (nb_actors_in_reco/nb_common_actors)*100
        add_percents_reco_same_actor += percent_reco_same_actor
        
        
chance_reco_same_actor =  add_percents_reco_same_actor/nb_movies #

print(chance_reco_same_actor, 'chances to find same actor1 movies')

81.31536153012554 chances to find same actor1 movies


In [88]:
print('Average variance of imdb_score in our dataset', df['imdb_score'].var())

Average variance of imdb_score in our dataset 1.271970623049137


In [92]:
sum_variances =0

for i in range(movies_count):
    closect_ids = np.insert(closest_success[i],0,i)
    df_close_movies = df.loc[closect_ids ]
    sum_variances+=df_close_movies['imdb_score'].var()
    
avg_variance = sum_variances/movies_count
print('Average variance of imdb_score in similar success categ:', avg_variance)

Average variance of imdb_score in similar success categ: 0.010970979115812298


In [89]:
sum_variances =0

for i in range(movies_count):
    closect_ids = np.insert(closest_overall[i],0,i)
    df_close_movies = df.loc[closect_ids ]
    sum_variances+=df_close_movies['imdb_score'].var()
    
avg_variance = sum_variances/movies_count
print('Average variance of imdb_score in similar overall:', avg_variance)

Average variance of imdb_score in similar overall: 0.44534384323297943


#### Export

In [21]:
methods_list = ('closest_overall', 'closest_genres', 'closest_cast', 'closest_pict', 'closest_success')
methods={}
for i in methods_list:
    methods[i] = locals()[i]
    

In [22]:
for method in methods:
    np.save(method, methods[method])

In [24]:
for i in methods_list:
    locals()[i] = np.load(i+'.npy')

df_movies[['movie_id', 'movie_title']].to_csv('movies_titles_list.csv')


### Online API URL : http://xmontamat.pythonanywhere.com/
