In [312]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sklearn.metrics.pairwise import nan_euclidean_distances, cosine_similarity
from scipy.spatial import distance

In [313]:
movies = pd.read_csv('data/movies.dat', sep='::', names=['movie_id', 'movie_title', 'genra'], header=None )
movies.head()

Unnamed: 0,movie_id,movie_title,genra
0,8,Edison Kinetoscopic Record of a Sneeze (1894),Documentary|Short
1,10,La sortie des usines Lumière (1895),Documentary|Short
2,12,The Arrival of a Train (1896),Documentary|Short
3,25,The Oxford and Cambridge University Boat Race ...,
4,91,Le manoir du diable (1896),Short|Horror


In [476]:
movies[movies['movie_title']== 'Tenet (2020)']

Unnamed: 0_level_0,movie_title,genra
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
6723592,Tenet (2020),Action|Drama|Thriller


In [477]:
movies[movies['movie_title']== 'Looper (2012)']

Unnamed: 0_level_0,movie_title,genra
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1276104,Looper (2012),Action|Crime|Drama|Sci-Fi|Thriller


In [314]:
movies.shape[0]

36383

In [315]:
movies = movies.dropna()
movies.shape[0]

36151

In [316]:
movies.shape[0] - movies['movie_id'].unique().shape[0]

2

In [317]:
counts = movies.groupby(['movie_id'])[['movie_title']].size()
counts[counts>1]

movie_id
1979376    2
4160708    2
dtype: int64

In [318]:
movies[movies['movie_id'] == 106519]

Unnamed: 0,movie_id,movie_title,genra
8242,106519,Carlito's Way (1993),Crime|Drama|Thriller


In [319]:
movies[movies['movie_id'] == 1979376]

Unnamed: 0,movie_id,movie_title,genra
21924,1979376,Toy Story 4 (2019),Animation|Adventure|Comedy|Family|Fantasy
21925,1979376,Toy Story 4 (2019),Animation|Adventure|Comedy|Family|Fantasy


In [320]:
movies[movies['movie_id'] == 4160708]

Unnamed: 0,movie_id,movie_title,genra
29305,4160708,Don't Breathe (2016),Crime|Horror|Thriller
29306,4160708,Don't Breathe (2016),Crime|Horror|Thriller


In [321]:
movies.drop_duplicates(subset=['movie_id'], inplace=True)

In [322]:
movies.index = movies['movie_id']
movies = movies.drop(columns=['movie_id'])
movies.head()

Unnamed: 0_level_0,movie_title,genra
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
8,Edison Kinetoscopic Record of a Sneeze (1894),Documentary|Short
10,La sortie des usines Lumière (1895),Documentary|Short
12,The Arrival of a Train (1896),Documentary|Short
91,Le manoir du diable (1896),Short|Horror
131,Une nuit terrible (1896),Short|Comedy|Horror


In [323]:
ratings = pd.read_csv('data/ratings.dat', sep='::', names=['user_id', 'movie_id', 'rating', 'rating_timestamp'], header=None )
ratings.head()

Unnamed: 0,user_id,movie_id,rating,rating_timestamp
0,1,114508,8,1381006850
1,2,75314,1,1595468524
2,2,102926,9,1590148016
3,2,114369,10,1597555347
4,2,118715,8,1596006798


In [324]:
ratings.shape

(888452, 4)

In [325]:
ratings.describe()

Unnamed: 0,user_id,movie_id,rating,rating_timestamp
count,888452.0,888452.0,888452.0,888452.0
mean,34879.738435,2187479.0,7.316825,1459300000.0
std,20100.224164,2025072.0,1.853552,69025600.0
min,1.0,8.0,0.0,1362062000.0
25%,17775.0,765443.0,6.0,1396187000.0
50%,34764.5,1714206.0,8.0,1450370000.0
75%,51869.0,2883512.0,9.0,1513955000.0
max,69324.0,12920710.0,10.0,1600911000.0


In [484]:
ratings['rating'].mean()

7.316825219595431

In [326]:
ratings_new = ratings.drop(columns=['rating_timestamp'])

In [327]:
ratings_new['count_user'] = ratings_new.groupby(['user_id'])['user_id'].transform('count')

In [328]:
ratings_new.head()

Unnamed: 0,user_id,movie_id,rating,count_user
0,1,114508,8,1
1,2,75314,1,21
2,2,102926,9,21
3,2,114369,10,21
4,2,118715,8,21


In [329]:
ratings_new.describe()

Unnamed: 0,user_id,movie_id,rating,count_user
count,888452.0,888452.0,888452.0,888452.0
mean,34879.738435,2187479.0,7.316825,176.492132
std,20100.224164,2025072.0,1.853552,304.982483
min,1.0,8.0,0.0,1.0
25%,17775.0,765443.0,6.0,22.0
50%,34764.5,1714206.0,8.0,73.0
75%,51869.0,2883512.0,9.0,201.0
max,69324.0,12920710.0,10.0,2875.0


In [330]:
ratings_user = ratings_new.query('count_user >= 2')
ratings_user.describe()

Unnamed: 0,user_id,movie_id,rating,count_user
count,858457.0,858457.0,858457.0,858457.0
mean,34891.289295,2173989.0,7.28744,182.623932
std,20103.784633,2012844.0,1.835697,308.464928
min,2.0,8.0,0.0,2.0
25%,17819.0,497465.0,6.0,25.0
50%,34767.0,1707386.0,7.0,78.0
75%,51869.0,2872732.0,9.0,211.0
max,69323.0,12920710.0,10.0,2875.0


In [331]:
ratings_user.shape

(858457, 4)

In [332]:
ratings_user = ratings_user.drop(columns=['count_user'])

In [333]:
ratings_user['count_movie'] = ratings_user.groupby(['movie_id'])['movie_id'].transform('count')
ratings_user.describe()

Unnamed: 0,user_id,movie_id,rating,count_movie
count,858457.0,858457.0,858457.0,858457.0
mean,34891.289295,2173989.0,7.28744,527.442254
std,20103.784633,2012844.0,1.835697,600.505193
min,2.0,8.0,0.0,1.0
25%,17819.0,497465.0,6.0,73.0
50%,34767.0,1707386.0,7.0,301.0
75%,51869.0,2872732.0,9.0,795.0
max,69323.0,12920710.0,10.0,2904.0


In [334]:
ratings_movie = ratings_user.query('count_movie >= 15')
ratings_movie.shape

(774628, 4)

In [335]:
ratings_movie.describe()

Unnamed: 0,user_id,movie_id,rating,count_movie
count,774628.0,774628.0,774628.0,774628.0
mean,34850.393443,2170880.0,7.356214,583.892687
std,20055.977964,1938695.0,1.796465,605.801772
min,2.0,417.0,0.0,15.0
25%,17866.0,816692.0,6.0,119.0
50%,34658.0,1727824.0,8.0,359.0
75%,51842.0,2854926.0,9.0,857.0
max,69323.0,12117850.0,10.0,2904.0


In [485]:
ratings_movie['rating'].mean()

7.3562135631554755

In [336]:
user_by_movie = ratings_movie.groupby(['user_id', 'movie_id'])['rating'].max().unstack()

In [337]:
user_by_movie.shape

(39076, 6178)

In [338]:
movies_used = movies.loc[user_by_movie.columns, 'movie_title']
movies_used

movie_id
417                        A Trip to the Moon (1902)
10323            Das Cabinet des Dr. Caligari (1920)
12349                                 The Kid (1921)
13442                               Nosferatu (1922)
15324                            Sherlock Jr. (1924)
                              ...                   
11561866                   Masameer the Movie (2020)
11615290                   Beastie Boys Story (2020)
11833648                      Shams al-Maaref (2020)
11958344    Nakitai watashi wa neko wo kaburu (2020)
12117854     Jerry Seinfeld: 23 Hours to Kill (2020)
Name: movie_title, Length: 6178, dtype: object

In [339]:
movies_used.shape

(6178,)

In [340]:
user_by_movie_matrix = user_by_movie.to_numpy()

In [490]:
np.nanmean(user_by_movie_matrix)

7.3562135631554755

In [341]:
movie_genres_df = movies.loc[user_by_movie.columns, 'genra']
movie_genres_df

movie_id
417         Short|Action|Adventure|Comedy|Fantasy|Sci-Fi
10323                    Fantasy|Horror|Mystery|Thriller
12349                                Comedy|Drama|Family
13442                                     Fantasy|Horror
15324                              Action|Comedy|Romance
                                ...                     
11561866                                       Animation
11615290                                     Documentary
11833648                                          Comedy
11958344                                       Animation
12117854                                          Comedy
Name: genra, Length: 6178, dtype: object

In [342]:
movie_genres_df.isna().any()

False

In [343]:
movie_genres_df = movie_genres_df.fillna('')

In [344]:
def create_genra_list(movie_genres_df):
    genres_set = set()
    for i in range(movie_genres_df.shape[0]):
        genres_list = movie_genres_df.iloc[i].split('|')
        genres_set.update(genres_list)
    return list(genres_set)

In [345]:
genres = create_genra_list(movie_genres_df)
genres

['Sci-Fi',
 'Crime',
 'Sport',
 'Horror',
 'Action',
 'Short',
 'Biography',
 'War',
 'Documentary',
 'News',
 'Romance',
 'History',
 'Musical',
 'Western',
 'Mystery',
 'Family',
 'Film-Noir',
 'Adventure',
 'Animation',
 'Comedy',
 'Drama',
 'Thriller',
 'Music',
 'Fantasy']

In [346]:
def create_genres_matrix(movie_genres_df, genres):
    numb_movies = movie_genres_df.shape[0]
    numb_genres = len(genres)
    genres_matrix = np.zeros(shape=(numb_movies, numb_genres))
    for i in range(numb_movies):
        genres_row = set(movie_genres_df.iloc[i].split('|'))
        for j,genre in enumerate(genres):
            if genre in genres_row:
                genres_matrix[i,j] = 1
    return genres_matrix

In [347]:
genres_matrix = create_genres_matrix(movie_genres_df, genres)
genres_matrix

array([[1., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 1., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [348]:
genre_similarities = cosine_similarity(genres_matrix)
print(genre_similarities)

[[1.         0.20412415 0.23570226 ... 0.40824829 0.         0.40824829]
 [0.20412415 1.         0.         ... 0.         0.         0.        ]
 [0.23570226 0.         1.         ... 0.57735027 0.         0.57735027]
 ...
 [0.40824829 0.         0.57735027 ... 1.         0.         1.        ]
 [0.         0.         0.         ... 0.         1.         0.        ]
 [0.40824829 0.         0.57735027 ... 1.         0.         1.        ]]


In [349]:
def get_common_raters(user_by_movie_matrix):
    '''
        get_common_raters - function to calculate number of common raters (users who rated both movies) between each pair of movies based on user ratings 
        using linear kernel to speed up the process (like cosine similarity but without norming - sum of multilications of corresponding coordinates).

        Input:
        - user_by_movie_matrix - numpy matrix with movie ratings where each row represents a user and each column a movie
        Output:
        - common_raters - numpy matrix of raters where each row and column represent a movie, so score[i,j] stores number of users who scored both movies number i and number j.
    '''
    # create a copy of matrix since we will be modifying it
    temp_matrix = user_by_movie_matrix.copy()
    # fill all non nans with ones to compute number of common elements when multiplying vectors
    temp_matrix[~np.isnan(temp_matrix)] = 1
    # fill nans with zeroes to have correct matrix multiplication
    temp_matrix[np.isnan(temp_matrix)] = 0
    # compute linear kernel (like cosine similarity but without norming) - measure of similarity between movies
    # the first term is transposed as user_by_movie_matrix has movies as columns, but we need first term to row-oriented
    common_raters = np.dot(temp_matrix.T, temp_matrix)
    # minimum number of common reviewers between two movies to be considered as similar
    min_common_raters = 5
    # replace zeroes and movies that have too few common reviewers with -1 to avoid dividing by zero and give the movies with few common raters a low score
    common_raters[common_raters<min_common_raters] = -1
    # replace other large enough amounts of common raters with 1, so that dividing similarity scores by it has no effect
    # common_raters[common_raters>=min_common_raters] = 1
    return common_raters

def get_all_scores(user_by_movie_matrix):
    '''
        get_all_scores - function to calculate similarity score between each pair of movies based on user ratings 
        using linear kernel (like cosine similarity but without norming - sum of multilications of corresponding coordinates).
        This works well for several reasons:
        1. The higher the both scores by a user to a 2 different movies, the more the user liked both movies, and the higher will be the score. 
        2. The more times similar users watched some 2 movies, the higher will be the score, and so the more likely it is that
         movies are similar (think about Harry Potter or say scary movie fans).

        Input:
        - user_by_movie_matrix - numpy matrix with movie ratings where each row represents a user and each column a movie
        Output:
        - scores - numpy matrix of scores where each row and column represent a movie, so score[i,j] stores similarity score based on ratings
        between movies number i and number j.
    '''
    # create a copy of matrix since we will be modifying it
    temp_matrix = user_by_movie_matrix.copy()
    # fill nans with zeroes to have correct matrix multiplication
    temp_matrix[np.isnan(temp_matrix)] = 0.0
    # compute linear kernel (like cosine similarity but without norming) - measure of similarity between movies
    scores = np.dot(temp_matrix.T, temp_matrix)
    #scores = cosine_similarity(temp_matrix.T)
    #scores = np.multiply(scores, scores_cosine)
    # set diagonals with 0 so that movies is not declared to be most similar to itself in the web app
    np.fill_diagonal(scores, 0)
    return scores

In [350]:
from datetime import datetime

now = datetime.now()

current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)

Current Time = 23:22:02


In [399]:
scores = get_all_scores(user_by_movie_matrix)
# get number of common raters
common_raters = get_common_raters(user_by_movie_matrix)
# divide score by the number of common raters to get a more fair score
scores = np.multiply(scores, (1/common_raters))

In [400]:
scores.shape

(6178, 6178)

In [401]:
number_movies_returned  = 15
negative_n = -1*number_movies_returned
closest_movies = np.argpartition(scores, negative_n, axis=1)[:, negative_n:]

In [402]:
closest_movies.shape

(6178, 15)

In [355]:
from datetime import datetime

now = datetime.now()

current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)

Current Time = 23:25:12


In [403]:
all_closest_movies_df = pd.DataFrame()
all_closest_movies_df['movie_title'] = movies_used.values
for i in range(number_movies_returned):
    all_closest_movies_df['closest_movie_{}'.format(i+1)] = closest_movies[:,i]

In [357]:
all_closest_movies_df.head()

Unnamed: 0,movie_title,closest_movie_1,closest_movie_2,closest_movie_3,closest_movie_4,closest_movie_5,closest_movie_6,closest_movie_7,closest_movie_8,closest_movie_9,closest_movie_10,closest_movie_11,closest_movie_12,closest_movie_13,closest_movie_14,closest_movie_15
0,A Trip to the Moon (1902),246,4579,4789,4298,4529,19,209,6078,3839,4597,3288,111,405,2534,64
1,Das Cabinet des Dr. Caligari (1920),2534,4597,4039,2778,416,1761,3,3203,4529,530,4874,458,174,325,209
2,The Kid (1921),241,1012,2273,1176,376,2922,3169,1989,1178,1245,504,190,1984,2986,1626
3,Nosferatu (1922),374,132,508,1761,139,1428,458,4872,466,339,1429,3569,530,424,544
4,Sherlock Jr. (1924),94,3809,107,3868,91,190,953,4137,5521,143,83,246,1864,84,3881


In [358]:
all_closest_movies_df.shape

(6178, 16)

In [359]:
database_filename = 'movie_recommendations.db'
table_name = 'Closest_movies'
engine = create_engine('sqlite:///' + database_filename)
all_closest_movies_df.to_sql(table_name, engine, index=False, if_exists='replace')
engine.dispose()

In [360]:
# See the vectors

In [361]:
movies_used_new = movies_used.reset_index(drop=True)

In [528]:
movie_1 = "7500 (2019)"
movie_2 = "Tenet (2020)"
movie_id_1 = movies_used_new.loc[movies_used_new == movie_1].index.values[0]
movie_id_2 = movies_used_new.loc[movies_used_new == movie_2].index.values[0]
print(movie_id_1, movie_id_2)

5845 5886


In [529]:
print(scores[movie_id_1, movie_id_2])

49.285714285714285


In [530]:
all_closest_movies_df.loc[movie_id_2]

movie_title         Tenet (2020)
closest_movie_1             1206
closest_movie_2             1943
closest_movie_3             1219
closest_movie_4             2534
closest_movie_5             2571
closest_movie_6             5610
closest_movie_7             3982
closest_movie_8             1862
closest_movie_9             1562
closest_movie_10            2847
closest_movie_11            1146
closest_movie_12            5170
closest_movie_13            3160
closest_movie_14            1023
closest_movie_15            1356
Name: 5886, dtype: object

In [531]:
movie_vector1 = user_by_movie_matrix[:, movie_id_1].copy()
movie_vector2 = user_by_movie_matrix[:, movie_id_2].copy()
indexes_1 = np.where(~np.isnan(movie_vector1))
indexes_2 = np.where(~np.isnan(movie_vector2))
common_indexes = np.intersect1d(indexes_1, indexes_2)
movie_vector1[np.isnan(movie_vector1)] = 0
movie_vector2[np.isnan(movie_vector2)] = 0
vector1 = movie_vector1[common_indexes]
vector2 = movie_vector2[common_indexes]

In [532]:
print(len(common_indexes))
print(1- distance.cosine(vector1,vector2))
print(np.dot(movie_vector1, movie_vector2.T))
print(np.dot(movie_vector1, movie_vector2.T) / len(common_indexes))

7
0.9671077018960662
345.0
49.285714285714285


In [482]:
# Extract saved recommendations

In [367]:
database_filepath = 'recommendation_database/movie_recs_5000_multiply_genres_15.db'
engine = create_engine('sqlite:///'+ database_filepath)
df = pd.read_sql_table('Closest_movies', engine)
engine.dispose()

In [368]:
df.head()

Unnamed: 0,movie_title,closest_movie_1,closest_movie_2,closest_movie_3,closest_movie_4,closest_movie_5,closest_movie_6,closest_movie_7,closest_movie_8,closest_movie_9,closest_movie_10,closest_movie_11,closest_movie_12,closest_movie_13,closest_movie_14,closest_movie_15
0,A Trip to the Moon (1902),4661,2652,2698,3834,4434,2995,3742,3814,2064,3587,3608,3376,1931,4317,2644
1,Das Cabinet des Dr. Caligari (1920),3851,143,4130,3,4635,92,16,14,2656,2734,5048,4649,4463,4976,2652
2,The Kid (1921),1164,3859,3608,1151,3452,6,3014,54,31,22,13,4401,799,4019,136
3,Nosferatu (1922),270,4364,4463,1,340,14,390,3814,2371,440,4130,222,143,2656,436
4,Sherlock Jr. (1924),4515,79,19,136,22,8,319,4340,33,174,83,2770,3376,4215,2260


In [369]:
movie = df[df['movie_title'] == "Tenet (2020)"].reset_index()
movie

Unnamed: 0,index,movie_title,closest_movie_1,closest_movie_2,closest_movie_3,closest_movie_4,closest_movie_5,closest_movie_6,closest_movie_7,closest_movie_8,closest_movie_9,closest_movie_10,closest_movie_11,closest_movie_12,closest_movie_13,closest_movie_14,closest_movie_15
0,4938,Tenet (2020),1688,5098,5091,3874,2268,5116,4144,5117,5053,4791,3302,2778,5013,4941,5040


In [370]:
number_movies_returned  = 15
for i in range(number_movies_returned):
    column_name = 'closest_movie_{}'.format(i+1)
    id = movie.loc[0,column_name]
    movie_name = df.loc[id]['movie_title']
    print(movie_name)

The Departed (2006)
1917 (2019)
The Gentlemen (2019)
Jojo Rabbit (2019)
The Invisible Man (2020)
Extraction (2020)
Little Women (2019)
Knives Out (2019)
I'm Thinking of Ending Things (2020)
Uncut Gems (2019)
Ford v Ferrari (2019)
Bad Boys for Life (2020)
The Devil All the Time (2020)
Parasite (Gisaengchung) (2019)
Birds of Prey: And the Fantabulous Emancipation of One Harley Quinn (2020)


In [371]:
# experimentations with similarity metrics - not relevent, run on test data

In [372]:
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from scipy.spatial import distance
x = np.array(range(16)).reshape((4,4))
x

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])

In [373]:
y = x-np.vstack(x[:,0])
y

array([[0, 1, 2, 3],
       [0, 1, 2, 3],
       [0, 1, 2, 3],
       [0, 1, 2, 3]])

In [374]:
x = np.zeros(shape=(4,4))
x

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [375]:
x[0] = [0,0,7,9]
x[1] = [7,7,7,9]
x[2] = [1,1,8,6]
x[3] = [0,0,0,9]
x

array([[0., 0., 7., 9.],
       [7., 7., 7., 9.],
       [1., 1., 8., 6.],
       [0., 0., 0., 9.]])

In [376]:
cosine_similarity(x)

array([[1.        , 0.75509962, 0.95525853, 0.78935222],
       [0.75509962, 1.        , 0.81311912, 0.59603956],
       [0.95525853, 0.81311912, 1.        , 0.59408853],
       [0.78935222, 0.59603956, 0.59408853, 1.        ]])

In [377]:
np.dot(x, x.T)

array([[130., 130., 110.,  81.],
       [130., 228., 124.,  81.],
       [110., 124., 102.,  54.],
       [ 81.,  81.,  54.,  81.]])

In [378]:
y = y / 1.0
y[0, 1] = np.NaN
y

array([[ 0., nan,  2.,  3.],
       [ 0.,  1.,  2.,  3.],
       [ 0.,  1.,  2.,  3.],
       [ 0.,  1.,  2.,  3.]])

In [379]:
y[np.isnan(y)] = 0
y

array([[0., 0., 2., 3.],
       [0., 1., 2., 3.],
       [0., 1., 2., 3.],
       [0., 1., 2., 3.]])

In [380]:
def multiply_by_2(y):
    x = y.copy()
    x[x==0] = 1
    return x*2
multiply_by_2(y)

array([[2., 2., 4., 6.],
       [2., 2., 4., 6.],
       [2., 2., 4., 6.],
       [2., 2., 4., 6.]])

In [381]:
y

array([[0., 0., 2., 3.],
       [0., 1., 2., 3.],
       [0., 1., 2., 3.],
       [0., 1., 2., 3.]])

In [382]:
np.dot(y.T, y)

array([[ 0.,  0.,  0.,  0.],
       [ 0.,  3.,  6.,  9.],
       [ 0.,  6., 16., 24.],
       [ 0.,  9., 24., 36.]])

In [383]:
np.linalg.norm(y, axis=0)

array([0.        , 1.73205081, 4.        , 6.        ])

In [491]:
x[:,0] = [7,9,7,6]
x[:,1] = [7,9,7,8]
x[:,2] = [2,3,4,8]
x[:,3] = [1,2,2,8]
x

array([[7., 7., 2., 1.],
       [9., 9., 3., 2.],
       [7., 7., 4., 2.],
       [6., 8., 8., 8.]])

In [494]:
y = x *x
y

array([[49., 49.,  4.,  1.],
       [81., 81.,  9.,  4.],
       [49., 49., 16.,  4.],
       [36., 64., 64., 64.]])

In [385]:
x1 = x[:, 1]
x2 = x[:, 0]

In [386]:
a = 1- distance.cosine(x1,x2)
a

0.9931240301527878

In [387]:
number_movies_returned = 3
test = np.array([[7,6,np.NaN,np.NaN], [5,np.NaN,1,0], [np.NaN,5,np.NaN,3], [9,np.NaN,7,6]])
test
#np.argpartition(scores, -1*number_movies_returned, axis=1)[:, -1*number_movies_returned:]

array([[ 7.,  6., nan, nan],
       [ 5., nan,  1.,  0.],
       [nan,  5., nan,  3.],
       [ 9., nan,  7.,  6.]])

In [388]:
np.nanmean(test-np.vstack(test[:,1]), axis=0)

array([ 1.,  0., nan, -2.])

In [389]:
def get_users(movie_index, user_by_movie_matrix):
    column = user_by_movie_matrix[:,movie_index]
    users = np.where(np.isnan(column) == False)[0] 
    return users

def get_common_users(movie_index1, movie_index2, user_by_movie_matrix):
    users1 = get_users(movie_index1, user_by_movie_matrix)
    users2 = get_users(movie_index2, user_by_movie_matrix)
    common_users = np.intersect1d(users1, users2, assume_unique=True)
    return common_users

def compute_score(movie_index1, movie_index2, user_by_movie_matrix):
    common_users = get_common_users(movie_index1, movie_index2, user_by_movie_matrix)
    movie1_ratings = np.array(user_by_movie_matrix[common_users, movie_index1])
    movie2_ratings = np.array(user_by_movie_matrix[common_users, movie_index2])

    distance = np.linalg.norm(movie1_ratings - movie2_ratings)

    score = 0
    if distance != 0:
        score = 1 / distance

    return score

In [390]:
# old functions to compare with newer ones - not relevant

In [391]:
number_movies_returned  = 5
def get_closest_movies(column):
    negative_n = -1*number_movies_returned
    closest_movies = np.argpartition(column, negative_n)[negative_n:]
    
    return closest_movies

In [392]:
def get_closest_movies(number_movies, scores):
    all_closest_movies = {}
    for movie_index in range(scores.shape[0]):
        closest_movies_scores = [0]
        closest_movies_indexes = [movie_index]   
        for temp_index in range(scores.shape[0]):
            if temp_index > movie_index:
                temp_score = scores[movie_index, temp_index] 
            else:
                temp_score = scores[temp_index, movie_index]
            min_score = closest_movies_scores[-1]
            if temp_score > min_score:
                closest_movies_scores.append(temp_score)
                closest_movies_indexes.append(temp_index)
                index_sorted = len(closest_movies_scores) - 2
                next_score = closest_movies_scores[index_sorted]
                next_index = closest_movies_indexes[index_sorted]
                while (next_score < temp_score) and index_sorted >= 0 :
                    closest_movies_scores[index_sorted+1] = next_score
                    closest_movies_indexes[index_sorted+1] = next_index
                    closest_movies_scores[index_sorted] = temp_score
                    closest_movies_indexes[index_sorted] = temp_index
                    index_sorted -= 1
                if len(closest_movies_scores) > number_movies:
                    closest_movies_scores = closest_movies_scores[0:number_movies-1]
                    closest_movies_indexes = closest_movies_indexes[0:number_movies-1] 
        if closest_movies_indexes[-1] == movie_index:
            if len(closest_movies_indexes) == 1:
                closest_movies_scores = []
                closest_movies_indexes = []
            else:
                closest_movies_scores = closest_movies_scores[0:-2]
                closest_movies_indexes = closest_movies_indexes[0:-2]
        all_closest_movies[movie_index] = closest_movies_indexes

    return all_closest_movies