In [2]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sklearn.metrics.pairwise import nan_euclidean_distances, cosine_similarity

In [46]:
movies = pd.read_csv('data/movies.dat', sep='::', names=['movie_id', 'movie_title', 'genra'], header=None )
movies.head()

Unnamed: 0,movie_id,movie_title,genra
0,8,Edison Kinetoscopic Record of a Sneeze (1894),Documentary|Short
1,10,La sortie des usines Lumière (1895),Documentary|Short
2,12,The Arrival of a Train (1896),Documentary|Short
3,25,The Oxford and Cambridge University Boat Race ...,
4,91,Le manoir du diable (1896),Short|Horror


In [47]:
movies.shape[0]

36383

In [48]:
movies = movies.dropna()
movies.shape[0]

36151

In [49]:
movies.shape[0] - movies['movie_id'].unique().shape[0]

2

In [50]:
counts = movies.groupby(['movie_id'])[['movie_title']].size()
counts[counts>1]

movie_id
1979376    2
4160708    2
dtype: int64

In [51]:
movies[movies['movie_id'] == 106519]

Unnamed: 0,movie_id,movie_title,genra
8242,106519,Carlito's Way (1993),Crime|Drama|Thriller


In [52]:
movies[movies['movie_id'] == 1979376]

Unnamed: 0,movie_id,movie_title,genra
21924,1979376,Toy Story 4 (2019),Animation|Adventure|Comedy|Family|Fantasy
21925,1979376,Toy Story 4 (2019),Animation|Adventure|Comedy|Family|Fantasy


In [53]:
movies[movies['movie_id'] == 4160708]

Unnamed: 0,movie_id,movie_title,genra
29305,4160708,Don't Breathe (2016),Crime|Horror|Thriller
29306,4160708,Don't Breathe (2016),Crime|Horror|Thriller


In [54]:
movies.drop_duplicates(subset=['movie_id'], inplace=True)

In [55]:
movies.index = movies['movie_id']
movies = movies.drop(columns=['movie_id'])
movies.head()

Unnamed: 0_level_0,movie_title,genra
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
8,Edison Kinetoscopic Record of a Sneeze (1894),Documentary|Short
10,La sortie des usines Lumière (1895),Documentary|Short
12,The Arrival of a Train (1896),Documentary|Short
91,Le manoir du diable (1896),Short|Horror
131,Une nuit terrible (1896),Short|Comedy|Horror


In [11]:
users = pd.read_csv('data/users.dat', sep='::', names=['user_id', 'twitter_id'], header=None )
users.head()

Unnamed: 0,user_id,twitter_id
0,1,139564917
1,2,522540374
2,3,475571186
3,4,215022153
4,5,349681331


In [12]:
print(users.shape)

(69324, 2)


In [13]:
ratings = pd.read_csv('data/ratings.dat', sep='::', names=['user_id', 'movie_id', 'rating', 'rating_timestamp'], header=None )
ratings.head()

Unnamed: 0,user_id,movie_id,rating,rating_timestamp
0,1,114508,8,1381006850
1,2,75314,1,1595468524
2,2,102926,9,1590148016
3,2,114369,10,1597555347
4,2,118715,8,1596006798


In [14]:
ratings.shape

(888452, 4)

In [61]:
ratings.describe()

Unnamed: 0,user_id,movie_id,rating,rating_timestamp
count,888452.0,888452.0,888452.0,888452.0
mean,34879.738435,2187479.0,7.316825,1459300000.0
std,20100.224164,2025072.0,1.853552,69025600.0
min,1.0,8.0,0.0,1362062000.0
25%,17775.0,765443.0,6.0,1396187000.0
50%,34764.5,1714206.0,8.0,1450370000.0
75%,51869.0,2883512.0,9.0,1513955000.0
max,69324.0,12920710.0,10.0,1600911000.0


In [15]:
ratings_new = ratings.drop(columns=['rating_timestamp'])

In [16]:
ratings_new['count_user'] = ratings_new.groupby(['user_id'])['user_id'].transform('count')

In [62]:
ratings_new.head()

Unnamed: 0,user_id,movie_id,rating,count_user
0,1,114508,8,1
1,2,75314,1,21
2,2,102926,9,21
3,2,114369,10,21
4,2,118715,8,21


In [63]:
ratings_new.describe()

Unnamed: 0,user_id,movie_id,rating,count_user
count,888452.0,888452.0,888452.0,888452.0
mean,34879.738435,2187479.0,7.316825,176.492132
std,20100.224164,2025072.0,1.853552,304.982483
min,1.0,8.0,0.0,1.0
25%,17775.0,765443.0,6.0,22.0
50%,34764.5,1714206.0,8.0,73.0
75%,51869.0,2883512.0,9.0,201.0
max,69324.0,12920710.0,10.0,2875.0


In [64]:
ratings_user = ratings_new.query('count_user >= 2')
ratings_user.describe()

Unnamed: 0,user_id,movie_id,rating,count_user
count,858457.0,858457.0,858457.0,858457.0
mean,34891.289295,2173989.0,7.28744,182.623932
std,20103.784633,2012844.0,1.835697,308.464928
min,2.0,8.0,0.0,2.0
25%,17819.0,497465.0,6.0,25.0
50%,34767.0,1707386.0,7.0,78.0
75%,51869.0,2872732.0,9.0,211.0
max,69323.0,12920710.0,10.0,2875.0


In [19]:
ratings_user.shape

(858457, 4)

In [66]:
ratings_user = ratings_user.drop(columns=['count_user'])

In [67]:
ratings_user['count_movie'] = ratings_user.groupby(['movie_id'])['movie_id'].transform('count')
ratings_user.describe()

Unnamed: 0,user_id,movie_id,rating,count_movie
count,858457.0,858457.0,858457.0,858457.0
mean,34891.289295,2173989.0,7.28744,527.442254
std,20103.784633,2012844.0,1.835697,600.505193
min,2.0,8.0,0.0,1.0
25%,17819.0,497465.0,6.0,73.0
50%,34767.0,1707386.0,7.0,301.0
75%,51869.0,2872732.0,9.0,795.0
max,69323.0,12920710.0,10.0,2904.0


In [68]:
ratings_movie = ratings_user.query('count_movie >= 100')
ratings_movie.shape

(604326, 4)

In [69]:
ratings_movie.describe()

Unnamed: 0,user_id,movie_id,rating,count_movie
count,604326.0,604326.0,604326.0,604326.0
mean,34860.290833,2268111.0,7.463874,734.18766
std,20035.575976,1851115.0,1.758628,606.216264
min,2.0,21749.0,0.0,100.0
25%,17910.0,1152836.0,7.0,256.0
50%,34641.0,1853728.0,8.0,529.0
75%,51924.5,2883512.0,9.0,1035.0
max,69323.0,11390040.0,10.0,2904.0


In [22]:
user_by_movie = ratings_movie.groupby(['user_id', 'movie_id'])['rating'].max().unstack()

In [23]:
user_by_movie.shape

(38171, 1675)

In [24]:
movies_used = movies.loc[user_by_movie.columns, 'movie_title']
movies_used

movie_id
21749                           City Lights (1931)
31381                    Gone with the Wind (1939)
33467                          Citizen Kane (1941)
34583                            Casablanca (1942)
36775                      Double Indemnity (1944)
                             ...                  
9243946     El Camino: A Breaking Bad Movie (2019)
9495224          Black Mirror: Bandersnatch (2018)
10039344                          Countdown (2019)
10431500           Yedinci Kogustaki Mucize (2019)
11390036                  A Fall from Grace (2020)
Name: movie_title, Length: 1675, dtype: object

In [25]:
movies_used.shape

(1675,)

In [26]:
user_by_movie_matrix = user_by_movie.to_numpy()

In [56]:
movie_genres_df = movies.loc[user_by_movie.columns, 'genra']
movie_genres_df

movie_id
21749                         Comedy|Drama|Romance
31381                    Drama|History|Romance|War
33467                                Drama|Mystery
34583                            Drama|Romance|War
36775       Crime|Drama|Film-Noir|Mystery|Thriller
                             ...                  
9243946                               Action|Drama
9495224              Drama|Mystery|Sci-Fi|Thriller
10039344                           Horror|Thriller
10431500                                     Drama
11390036                                  Thriller
Name: genra, Length: 1675, dtype: object

In [57]:
movie_genres_df.isna().any()

False

In [40]:
#movie_genres_df = movie_genres_df.fillna('')

In [58]:
def create_genra_list(movie_genres_df):
    genres_set = set()
    for i in range(movie_genres_df.shape[0]):
        genres_list = movie_genres_df.iloc[i].split('|')
        genres_set.update(genres_list)
    return list(genres_set)

In [59]:
genres = create_genra_list(movie_genres_df)
genres

['Music',
 'Sport',
 'Horror',
 'Fantasy',
 'Adventure',
 'Sci-Fi',
 'Thriller',
 'Mystery',
 'Comedy',
 'Romance',
 'Animation',
 'War',
 'Musical',
 'History',
 'Biography',
 'Western',
 'Film-Noir',
 'Action',
 'Drama',
 'Crime',
 'Documentary',
 'Family']

In [70]:
def create_genres_matrix(movie_genres_df, genres):
    numb_movies = movie_genres_df.shape[0]
    numb_genres = len(genres)
    genres_matrix = np.zeros(shape=(numb_movies, numb_genres))
    for i in range(numb_movies):
        genres_row = set(movie_genres_df.iloc[i].split('|'))
        for j,genre in enumerate(genres):
            if genre in genres_row:
                genres_matrix[i,j] = 1
    return genres_matrix

In [71]:
genres_matrix = create_genres_matrix(movie_genres_df, genres)
genres_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [80]:
genre_similarities = cosine_similarity(genres_matrix)
print(genre_similarities)

[[1.         0.57735027 0.40824829 ... 0.         0.57735027 0.        ]
 [0.57735027 1.         0.35355339 ... 0.         0.5        0.        ]
 [0.40824829 0.35355339 1.         ... 0.         0.70710678 0.        ]
 ...
 [0.         0.         0.         ... 1.         0.         0.70710678]
 [0.57735027 0.5        0.70710678 ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.70710678 0.         1.        ]]


In [None]:
def get_all_scores(user_by_movie_matrix):
    movies_number = user_by_movie.shape[1]
    scores = np.zeros(shape=(movies_number, movies_number))
    for index1 in range(movies_number):
        diffs = np.subtract(user_by_movie_matrix, np.vstack(user_by_movie_matrix[:, index1]))
        diffs[np.isnan(diffs)] = 0.0
        scores[index1] = np.linalg.norm(diffs, axis=0)
    return scores

In [56]:
from datetime import datetime

now = datetime.now()

current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)

Current Time = 19:33:16


In [58]:
scores = get_all_scores(user_by_movie_matrix)

In [60]:
#scores.shape

In [62]:
scores_full = scores + scores.T - np.diag(np.diag(scores))
number_movies_returned  = 5
negative_n = -1*number_movies_returned
closest_movies = np.argpartition(scores_full, negative_n, axis=1)[:, negative_n:]

In [64]:
#closest_movies.shape

In [65]:
from datetime import datetime

now = datetime.now()

current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)

Current Time = 19:33:25


In [66]:
all_closest_movies_df = pd.DataFrame()
all_closest_movies_df['movie_title'] = movies_used.values
for i in range(number_movies_returned):
    all_closest_movies_df['closest_movie_{}'.format(i+1)] = closest_movies[:,i]

In [67]:
all_closest_movies_df.head()

Unnamed: 0,movie_title,closest_movie_1,closest_movie_2,closest_movie_3,closest_movie_4,closest_movie_5
0,The Shawshank Redemption (1994),5,147,145,67,28
1,Se7en (1995),140,105,145,122,5
2,The Secret Life of Walter Mitty (2013),144,138,67,147,145
3,Jurassic World (2015),99,144,145,67,147
4,Wonder Woman (2017),5,52,145,2,86


In [68]:
all_closest_movies_df.shape

(148, 6)

In [69]:
database_filename = 'movie_recommendations.db'
table_name = 'Closest_movies'
engine = create_engine('sqlite:///' + database_filename)
all_closest_movies_df.to_sql(table_name, engine, index=False, if_exists='replace')
engine.dispose()

In [None]:
# Extract saved recommendations

In [219]:
database_filepath = 'movie_recommendations_v4.db'
engine = create_engine('sqlite:///'+ database_filepath)
df = pd.read_sql_table('Closest_movies', engine)
engine.dispose()

In [220]:
df.head()

Unnamed: 0,movie_title,closest_movie_1,closest_movie_2,closest_movie_3,closest_movie_4,closest_movie_5
0,It's a Wonderful Life (1946),577,623,569,234,627
1,Rear Window (1954),157,627,136,630,569
2,12 Angry Men (1957),89,15,243,4,173
3,Psycho (1960),194,152,569,627,496
4,"Il buono, il brutto, il cattivo (1966)",323,627,628,169,37


In [228]:
movie = df[df['movie_title'] == "Harry Potter and the Deathly Hallows: Part 2 (2011)"].reset_index()
movie

Unnamed: 0,index,movie_title,closest_movie_1,closest_movie_2,closest_movie_3,closest_movie_4,closest_movie_5


In [224]:
number_movies_returned  = 5
for i in range(number_movies_returned):
    column_name = 'closest_movie_{}'.format(i+1)
    id = movie.loc[0,column_name]
    movie_name = df.loc[id]['movie_title']
    print(movie_name)

Hush (2016)
Kimi no na wa. (2016)
The Florida Project (2017)
A Ghost Story (2017)
Dangal (2016)


In [81]:
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from scipy.spatial import distance
x = np.array(range(16)).reshape((4,4))
x

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])

In [85]:
y = x-np.vstack(x[:,0])
y

array([[0, 1, 2, 3],
       [0, 1, 2, 3],
       [0, 1, 2, 3],
       [0, 1, 2, 3]])

In [120]:
x = np.zeros(shape=(4,4))
x

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [123]:
x[0] = [0,0,7,9]
x[1] = [7,7,7,9]
x[2] = [1,1,8,6]
x[3] = [0,0,0,9]
x

array([[0., 0., 7., 9.],
       [7., 7., 7., 9.],
       [1., 1., 8., 6.],
       [0., 0., 0., 9.]])

In [124]:
cosine_similarity(x)

array([[1.        , 0.75509962, 0.95525853, 0.78935222],
       [0.75509962, 1.        , 0.81311912, 0.59603956],
       [0.95525853, 0.81311912, 1.        , 0.59408853],
       [0.78935222, 0.59603956, 0.59408853, 1.        ]])

In [125]:
np.dot(x, x.T)

array([[130., 130., 110.,  81.],
       [130., 228., 124.,  81.],
       [110., 124., 102.,  54.],
       [ 81.,  81.,  54.,  81.]])

In [115]:
y = y / 1.0
y[0, 1] = np.NaN
y

array([[ 1., nan,  2.,  3.],
       [ 1.,  1.,  2.,  3.],
       [ 1.,  1.,  2.,  3.],
       [ 1.,  1.,  2.,  3.]])

In [116]:
y[np.isnan(y)] = 0
y

array([[1., 0., 2., 3.],
       [1., 1., 2., 3.],
       [1., 1., 2., 3.],
       [1., 1., 2., 3.]])

In [117]:
def multiply_by_2(y):
    x = y.copy()
    x[x==0] = 1
    return x*2
multiply_by_2(y)

array([[2., 2., 4., 6.],
       [2., 2., 4., 6.],
       [2., 2., 4., 6.],
       [2., 2., 4., 6.]])

In [118]:
y

array([[1., 0., 2., 3.],
       [1., 1., 2., 3.],
       [1., 1., 2., 3.],
       [1., 1., 2., 3.]])

In [101]:
np.dot(y.T, y)

array([[ 0.,  0.,  0.,  0.],
       [ 0.,  3.,  6.,  9.],
       [ 0.,  6., 16., 24.],
       [ 0.,  9., 24., 36.]])

In [87]:
np.linalg.norm(y, axis=0)

array([0., 2., 4., 6.])

In [75]:
x[:,0] = [7,9,7,6]
x[:,1] = [7,9,7,8]
x[:,2] = [2,3,4,8]
x[:,3] = [1,2,2,8]
x

array([[7, 7, 2, 1],
       [9, 9, 3, 2],
       [7, 7, 4, 2],
       [6, 8, 8, 8]])

In [171]:
x1 = x[:, 1]
x2 = x[:, 0]

In [172]:
a = 1- distance.cosine(x1,x2)
a

0.9931240301527878

In [237]:
number_movies_returned = 3
scores = np.array([[7,6,np.NaN,np.NaN], [5,np.NaN,1,0], [np.NaN,5,np.NaN,3], [9,np.NaN,7,6]])
scores
#np.argpartition(scores, -1*number_movies_returned, axis=1)[:, -1*number_movies_returned:]

array([[ 7.,  6., nan, nan],
       [ 5., nan,  1.,  0.],
       [nan,  5., nan,  3.],
       [ 9., nan,  7.,  6.]])

In [238]:
np.nanmean(scores-np.vstack(scores[:,1]), axis=0)

array([ 1.,  0., nan, -2.])

In [41]:
def get_users(movie_index, user_by_movie_matrix):
    column = user_by_movie_matrix[:,movie_index]
    users = np.where(np.isnan(column) == False)[0] 
    return users

def get_common_users(movie_index1, movie_index2, user_by_movie_matrix):
    users1 = get_users(movie_index1, user_by_movie_matrix)
    users2 = get_users(movie_index2, user_by_movie_matrix)
    common_users = np.intersect1d(users1, users2, assume_unique=True)
    return common_users

def compute_score(movie_index1, movie_index2, user_by_movie_matrix):
    common_users = get_common_users(movie_index1, movie_index2, user_by_movie_matrix)
    movie1_ratings = np.array(user_by_movie_matrix[common_users, movie_index1])
    movie2_ratings = np.array(user_by_movie_matrix[common_users, movie_index2])

    distance = np.linalg.norm(movie1_ratings - movie2_ratings)

    score = 0
    if distance != 0:
        score = 1 / distance

    return score

In [42]:
number_movies_returned  = 5
def get_closest_movies(column):
    negative_n = -1*number_movies_returned
    closest_movies = np.argpartition(column, negative_n)[negative_n:]
    
    return closest_movies

In [43]:
def get_closest_movies(number_movies, scores):
    all_closest_movies = {}
    for movie_index in range(scores.shape[0]):
        closest_movies_scores = [0]
        closest_movies_indexes = [movie_index]   
        for temp_index in range(scores.shape[0]):
            if temp_index > movie_index:
                temp_score = scores[movie_index, temp_index] 
            else:
                temp_score = scores[temp_index, movie_index]
            min_score = closest_movies_scores[-1]
            if temp_score > min_score:
                closest_movies_scores.append(temp_score)
                closest_movies_indexes.append(temp_index)
                index_sorted = len(closest_movies_scores) - 2
                next_score = closest_movies_scores[index_sorted]
                next_index = closest_movies_indexes[index_sorted]
                while (next_score < temp_score) and index_sorted >= 0 :
                    closest_movies_scores[index_sorted+1] = next_score
                    closest_movies_indexes[index_sorted+1] = next_index
                    closest_movies_scores[index_sorted] = temp_score
                    closest_movies_indexes[index_sorted] = temp_index
                    index_sorted -= 1
                if len(closest_movies_scores) > number_movies:
                    closest_movies_scores = closest_movies_scores[0:number_movies-1]
                    closest_movies_indexes = closest_movies_indexes[0:number_movies-1] 
        if closest_movies_indexes[-1] == movie_index:
            if len(closest_movies_indexes) == 1:
                closest_movies_scores = []
                closest_movies_indexes = []
            else:
                closest_movies_scores = closest_movies_scores[0:-2]
                closest_movies_indexes = closest_movies_indexes[0:-2]
        all_closest_movies[movie_index] = closest_movies_indexes

    return all_closest_movies