In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sklearn.metrics.pairwise import nan_euclidean_distances

In [31]:
movies = pd.read_csv('data/movies.dat', sep='::', names=['movie_id', 'movie_title', 'genra'], header=None )
movies.head()

Unnamed: 0,movie_id,movie_title,genra
0,8,Edison Kinetoscopic Record of a Sneeze (1894),Documentary|Short
1,10,La sortie des usines Lumière (1895),Documentary|Short
2,12,The Arrival of a Train (1896),Documentary|Short
3,25,The Oxford and Cambridge University Boat Race ...,
4,91,Le manoir du diable (1896),Short|Horror


In [32]:
movies.shape[0] - movies['movie_id'].unique().shape[0]

3

In [33]:
counts = movies.groupby(['movie_id'])[['movie_title']].size()
counts[counts>1]

movie_id
106519     2
1979376    2
4160708    2
dtype: int64

In [34]:
movies[movies['movie_id'] == 106519]

Unnamed: 0,movie_id,movie_title,genra
8241,106519,Carlito's Way (1993),
8242,106519,Carlito's Way (1993),Crime|Drama|Thriller


In [35]:
movies[movies['movie_id'] == 1979376]

Unnamed: 0,movie_id,movie_title,genra
21924,1979376,Toy Story 4 (2019),Animation|Adventure|Comedy|Family|Fantasy
21925,1979376,Toy Story 4 (2019),Animation|Adventure|Comedy|Family|Fantasy


In [36]:
movies[movies['movie_id'] == 4160708]

Unnamed: 0,movie_id,movie_title,genra
29305,4160708,Don't Breathe (2016),Crime|Horror|Thriller
29306,4160708,Don't Breathe (2016),Crime|Horror|Thriller


In [37]:
movies.drop_duplicates(subset=['movie_id'], inplace=True)

In [38]:
movies.index = movies['movie_id']
movies = movies.drop(columns=['movie_id'])
movies.head()

Unnamed: 0_level_0,movie_title,genra
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
8,Edison Kinetoscopic Record of a Sneeze (1894),Documentary|Short
10,La sortie des usines Lumière (1895),Documentary|Short
12,The Arrival of a Train (1896),Documentary|Short
25,The Oxford and Cambridge University Boat Race ...,
91,Le manoir du diable (1896),Short|Horror


In [39]:
users = pd.read_csv('data/users.dat', sep='::', names=['user_id', 'twitter_id'], header=None )
users.head()

Unnamed: 0,user_id,twitter_id
0,1,139564917
1,2,522540374
2,3,475571186
3,4,215022153
4,5,349681331


In [40]:
print(users.shape)

(69324, 2)


In [41]:
ratings = pd.read_csv('data/ratings.dat', sep='::', names=['user_id', 'movie_id', 'rating', 'rating_timestamp'], header=None )
ratings.head()

Unnamed: 0,user_id,movie_id,rating,rating_timestamp
0,1,114508,8,1381006850
1,2,75314,1,1595468524
2,2,102926,9,1590148016
3,2,114369,10,1597555347
4,2,118715,8,1596006798


In [42]:
ratings.shape

(888452, 4)

In [43]:
ratings_new = ratings.drop(columns=['rating_timestamp'])

In [44]:
ratings_new['count_user'] = ratings_new.groupby(['user_id'])['user_id'].transform('count')

In [45]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,rating_timestamp
0,1,114508,8,1381006850
1,2,75314,1,1595468524
2,2,102926,9,1590148016
3,2,114369,10,1597555347
4,2,118715,8,1596006798


In [46]:
ratings_user = ratings_new.query('count_user >= 5')

In [47]:
ratings_user.shape

(814549, 4)

In [48]:
ratings_user = ratings_user.drop(columns=['count_user'])

In [49]:
ratings_user['count_movie'] = ratings_user.groupby(['movie_id'])['movie_id'].transform('count')
ratings_movie = ratings_user.query('count_movie >= 800')
ratings_movie.shape

(181959, 4)

In [50]:
user_by_movie = ratings_movie.groupby(['user_id', 'movie_id'])['rating'].max().unstack()

In [51]:
user_by_movie.shape

(20794, 148)

In [52]:
movies_used = movies.loc[user_by_movie.columns, 'movie_title']
movies_used

movie_id
111161            The Shawshank Redemption (1994)
114369                               Se7en (1995)
359950     The Secret Life of Walter Mitty (2013)
369610                      Jurassic World (2015)
451279                        Wonder Woman (2017)
                            ...                  
7286456                              Joker (2019)
7653254                     Marriage Story (2019)
8367814                      The Gentlemen (2019)
8579674                               1917 (2019)
8946378                         Knives Out (2019)
Name: movie_title, Length: 148, dtype: object

In [53]:
movies_used.shape

(148,)

In [54]:
user_by_movie_matrix = user_by_movie.to_numpy()

In [None]:
def get_all_scores(user_by_movie_matrix):
    movies_number = user_by_movie.shape[1]
    scores = np.zeros(shape=(movies_number, movies_number))
    for index1 in range(movies_number):
        diffs = np.subtract(user_by_movie_matrix, np.vstack(user_by_movie_matrix[:, index1]))
        diffs[np.isnan(diffs)] = 0.0
        scores[index1] = np.linalg.norm(diffs, axis=0)
    return scores

In [56]:
from datetime import datetime

now = datetime.now()

current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)

Current Time = 19:33:16


In [58]:
scores = get_all_scores(user_by_movie_matrix)

In [60]:
#scores.shape

In [62]:
scores_full = scores + scores.T - np.diag(np.diag(scores))
number_movies_returned  = 5
negative_n = -1*number_movies_returned
closest_movies = np.argpartition(scores_full, negative_n, axis=1)[:, negative_n:]

In [64]:
#closest_movies.shape

In [65]:
from datetime import datetime

now = datetime.now()

current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)

Current Time = 19:33:25


In [66]:
all_closest_movies_df = pd.DataFrame()
all_closest_movies_df['movie_title'] = movies_used.values
for i in range(number_movies_returned):
    all_closest_movies_df['closest_movie_{}'.format(i+1)] = closest_movies[:,i]

In [67]:
all_closest_movies_df.head()

Unnamed: 0,movie_title,closest_movie_1,closest_movie_2,closest_movie_3,closest_movie_4,closest_movie_5
0,The Shawshank Redemption (1994),5,147,145,67,28
1,Se7en (1995),140,105,145,122,5
2,The Secret Life of Walter Mitty (2013),144,138,67,147,145
3,Jurassic World (2015),99,144,145,67,147
4,Wonder Woman (2017),5,52,145,2,86


In [68]:
all_closest_movies_df.shape

(148, 6)

In [69]:
database_filename = 'movie_recommendations.db'
table_name = 'Closest_movies'
engine = create_engine('sqlite:///' + database_filename)
all_closest_movies_df.to_sql(table_name, engine, index=False, if_exists='replace')
engine.dispose()

In [None]:
# Extract saved recommendations

In [2]:
database_filepath = 'movie_recommendations_v2.db'
engine = create_engine('sqlite:///'+ database_filepath)
df = pd.read_sql_table('Closest_movies', engine)
engine.dispose()

In [3]:
df.head()

Unnamed: 0,movie_title,closest_movie_1,closest_movie_2,closest_movie_3,closest_movie_4,closest_movie_5
0,City Lights (1931),980,667,676,455,1036
1,Gone with the Wind (1939),491,957,1478,712,1257
2,Citizen Kane (1941),925,455,667,957,1276
3,Casablanca (1942),701,1343,1115,1276,455
4,Double Indemnity (1944),1276,425,962,957,455


In [89]:
movie = df[df['movie_title'] == 'Bohemian Rhapsody (2018)'].reset_index()
movie

Unnamed: 0,index,movie_title,closest_movie_1,closest_movie_2,closest_movie_3,closest_movie_4,closest_movie_5
0,860,Bohemian Rhapsody (2018),1234,610,667,1523,1621


In [90]:
number_movies_returned  = 5
for i in range(number_movies_returned):
    column_name = 'closest_movie_{}'.format(i+1)
    id = movie.loc[0,column_name]
    movie_name = df.loc[id]['movie_title']
    print(movie_name)

Bird Box (2018)
Venom (2018)
Suicide Squad (2016)
Mother! (2017)
Once Upon a Time ...in Hollywood (2019)


In [41]:
def get_users(movie_index, user_by_movie_matrix):
    column = user_by_movie_matrix[:,movie_index]
    users = np.where(np.isnan(column) == False)[0] 
    return users

def get_common_users(movie_index1, movie_index2, user_by_movie_matrix):
    users1 = get_users(movie_index1, user_by_movie_matrix)
    users2 = get_users(movie_index2, user_by_movie_matrix)
    common_users = np.intersect1d(users1, users2, assume_unique=True)
    return common_users

def compute_score(movie_index1, movie_index2, user_by_movie_matrix):
    common_users = get_common_users(movie_index1, movie_index2, user_by_movie_matrix)
    movie1_ratings = np.array(user_by_movie_matrix[common_users, movie_index1])
    movie2_ratings = np.array(user_by_movie_matrix[common_users, movie_index2])

    distance = np.linalg.norm(movie1_ratings - movie2_ratings)

    score = 0
    if distance != 0:
        score = 1 / distance

    return score

In [42]:
number_movies_returned  = 5
def get_closest_movies(column):
    negative_n = -1*number_movies_returned
    closest_movies = np.argpartition(column, negative_n)[negative_n:]
    
    return closest_movies

In [43]:
def get_closest_movies(number_movies, scores):
    all_closest_movies = {}
    for movie_index in range(scores.shape[0]):
        closest_movies_scores = [0]
        closest_movies_indexes = [movie_index]   
        for temp_index in range(scores.shape[0]):
            if temp_index > movie_index:
                temp_score = scores[movie_index, temp_index] 
            else:
                temp_score = scores[temp_index, movie_index]
            min_score = closest_movies_scores[-1]
            if temp_score > min_score:
                closest_movies_scores.append(temp_score)
                closest_movies_indexes.append(temp_index)
                index_sorted = len(closest_movies_scores) - 2
                next_score = closest_movies_scores[index_sorted]
                next_index = closest_movies_indexes[index_sorted]
                while (next_score < temp_score) and index_sorted >= 0 :
                    closest_movies_scores[index_sorted+1] = next_score
                    closest_movies_indexes[index_sorted+1] = next_index
                    closest_movies_scores[index_sorted] = temp_score
                    closest_movies_indexes[index_sorted] = temp_index
                    index_sorted -= 1
                if len(closest_movies_scores) > number_movies:
                    closest_movies_scores = closest_movies_scores[0:number_movies-1]
                    closest_movies_indexes = closest_movies_indexes[0:number_movies-1] 
        if closest_movies_indexes[-1] == movie_index:
            if len(closest_movies_indexes) == 1:
                closest_movies_scores = []
                closest_movies_indexes = []
            else:
                closest_movies_scores = closest_movies_scores[0:-2]
                closest_movies_indexes = closest_movies_indexes[0:-2]
        all_closest_movies[movie_index] = closest_movies_indexes

    return all_closest_movies