In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sklearn.metrics.pairwise import nan_euclidean_distances

In [2]:
# The '::' delimiter is longer than one character, so pandas interprets it as
# a regular expression, which only the Python parsing engine supports.
# Requesting that engine explicitly avoids the ParserWarning that is emitted
# when pandas has to fall back to it on its own.
movies = pd.read_csv(
    'data/movies.dat',
    sep='::',
    engine='python',
    names=['movie_id', 'movie_title', 'genra'],
    header=None,
)
movies.head()

Unnamed: 0,movie_id,movie_title,genra
0,8,Edison Kinetoscopic Record of a Sneeze (1894),Documentary|Short
1,10,La sortie des usines Lumière (1895),Documentary|Short
2,12,The Arrival of a Train (1896),Documentary|Short
3,25,The Oxford and Cambridge University Boat Race ...,
4,91,Le manoir du diable (1896),Short|Horror


In [3]:
# Number of rows whose movie_id already appeared on an earlier row,
# i.e. total rows minus distinct ids.
int(movies['movie_id'].duplicated().sum())

3

In [4]:
# Rows per movie_id; keep only the ids that occur more than once.
counts = movies.groupby('movie_id').size()
counts.loc[counts.gt(1)]

movie_id
106519     2
1979376    2
4160708    2
dtype: int64

In [5]:
# Show every row recorded for the duplicated id 106519.
movies.loc[movies['movie_id'].eq(106519)]

Unnamed: 0,movie_id,movie_title,genra
8241,106519,Carlito's Way (1993),
8242,106519,Carlito's Way (1993),Crime|Drama|Thriller


In [6]:
# Show every row recorded for the duplicated id 1979376.
movies.loc[movies['movie_id'].eq(1979376)]

Unnamed: 0,movie_id,movie_title,genra
21924,1979376,Toy Story 4 (2019),Animation|Adventure|Comedy|Family|Fantasy
21925,1979376,Toy Story 4 (2019),Animation|Adventure|Comedy|Family|Fantasy


In [7]:
# Show every row recorded for the duplicated id 4160708.
movies.loc[movies['movie_id'].eq(4160708)]

Unnamed: 0,movie_id,movie_title,genra
29305,4160708,Don't Breathe (2016),Crime|Horror|Thriller
29306,4160708,Don't Breathe (2016),Crime|Horror|Thriller


In [8]:
# Duplicated ids are not always identical copies: one of the two rows for
# 106519 has no genre. Keeping simply the first occurrence would retain that
# incomplete row, so rows that have a genre are ranked ahead of rows that do
# not (stable sort, original order otherwise preserved) before de-duplicating.
# sort_index() then restores the original row order.
rows_with_genre_first = movies['genra'].isna().sort_values(kind='mergesort').index
movies = (
    movies.loc[rows_with_genre_first]
    .drop_duplicates(subset=['movie_id'])
    .sort_index()
)

In [9]:
# Promote movie_id from a regular column to the row index so that titles can
# be looked up by id.
movies = movies.set_index('movie_id')
movies.head()

Unnamed: 0_level_0,movie_title,genra
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
8,Edison Kinetoscopic Record of a Sneeze (1894),Documentary|Short
10,La sortie des usines Lumière (1895),Documentary|Short
12,The Arrival of a Train (1896),Documentary|Short
25,The Oxford and Cambridge University Boat Race ...,
91,Le manoir du diable (1896),Short|Horror


In [10]:
# '::' is a multi-character delimiter, which pandas treats as a regular
# expression handled only by the Python parsing engine; naming the engine
# explicitly avoids the ParserWarning raised on implicit fallback.
users = pd.read_csv(
    'data/users.dat',
    sep='::',
    engine='python',
    names=['user_id', 'twitter_id'],
    header=None,
)
users.head()

Unnamed: 0,user_id,twitter_id
0,1,139564917
1,2,522540374
2,3,475571186
3,4,215022153
4,5,349681331


In [11]:
# Report the size of the users table as (rows, columns).
print(users.shape)

(69324, 2)


In [12]:
# '::' is a multi-character delimiter, which pandas treats as a regular
# expression handled only by the Python parsing engine; naming the engine
# explicitly avoids the ParserWarning raised on implicit fallback.
ratings = pd.read_csv(
    'data/ratings.dat',
    sep='::',
    engine='python',
    names=['user_id', 'movie_id', 'rating', 'rating_timestamp'],
    header=None,
)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,rating_timestamp
0,1,114508,8,1381006850
1,2,75314,1,1595468524
2,2,102926,9,1590148016
3,2,114369,10,1597555347
4,2,118715,8,1596006798


In [13]:
# Size of the raw ratings table as (rows, columns).
ratings.shape

(888452, 4)

In [14]:
# Work on a copy of the ratings without the timestamp column.
ratings_new = ratings.drop('rating_timestamp', axis='columns')

In [15]:
# Attach to every rating the total number of ratings made by the same user.
ratings_new['count_user'] = ratings_new['user_id'].map(ratings_new['user_id'].value_counts())

In [16]:
# NOTE(review): this previews `ratings`, which does not carry the `count_user`
# column that was just added to `ratings_new` — confirm whether
# `ratings_new.head()` was the intended preview.
ratings.head()

Unnamed: 0,user_id,movie_id,rating,rating_timestamp
0,1,114508,8,1381006850
1,2,75314,1,1595468524
2,2,102926,9,1590148016
3,2,114369,10,1597555347
4,2,118715,8,1596006798


In [17]:
# Keep only the ratings made by users who rated at least 5 movies.
ratings_user = ratings_new[ratings_new['count_user'] >= 5]

In [18]:
# Size of the ratings table after the per-user activity filter.
ratings_user.shape

(814549, 4)

In [19]:
# The helper count column has served its purpose as a filter; remove it.
ratings_user = ratings_user.drop('count_user', axis='columns')

In [26]:
# Attach to every rating the number of ratings its movie received, then keep
# only the movies rated at least 500 times.
ratings_user['count_movie'] = ratings_user['movie_id'].map(ratings_user['movie_id'].value_counts())
ratings_movie = ratings_user[ratings_user['count_movie'] >= 500]
ratings_movie.shape

(283343, 4)

In [27]:
# Pivot to a user x movie table of ratings; when a user rated the same movie
# more than once the highest rating is kept, and unrated cells become NaN.
user_by_movie = (
    ratings_movie
    .groupby(['user_id', 'movie_id'])['rating']
    .agg('max')
    .unstack('movie_id')
)

In [28]:
# (number of users, number of movies) in the pivoted rating table.
user_by_movie.shape

(21983, 309)

In [88]:
# Titles of the movies that survived filtering, in the same order as the
# columns of the user x movie table.
movies_used = movies['movie_title'].loc[user_by_movie.columns]
movies_used

movie_id
10323       Das Cabinet des Dr. Caligari (1920)
12349                            The Kid (1921)
13442                          Nosferatu (1922)
15864                      The Gold Rush (1925)
17136                         Metropolis (1927)
                           ...                 
11388580                  Miss Americana (2020)
11390036               A Fall from Grace (2020)
11464826              The Social Dilemma (2020)
11561866              Masameer the Movie (2020)
11833648                 Shams al-Maaref (2020)
Name: movie_title, Length: 4363, dtype: object

In [89]:
# Number of movie titles selected for the similarity computation.
movies_used.shape

(4363,)

In [77]:
# Plain NumPy view of the user x movie table; from here on users and movies
# are addressed by row/column position rather than by their ids.
user_by_movie_matrix = user_by_movie.to_numpy()

In [78]:
def get_users(movie_index, user_by_movie_matrix):
    """Return the row positions of the users who rated one movie.

    Args:
        movie_index: Column position of the movie in ``user_by_movie_matrix``.
        user_by_movie_matrix: 2-D array of ratings (rows are users, columns
            are movies) in which NaN marks "not rated".

    Returns:
        1-D integer array with the ascending row positions whose entry in the
        selected column is not NaN.
    """
    has_rating = ~np.isnan(user_by_movie_matrix[:, movie_index])
    return np.nonzero(has_rating)[0]

def get_common_users(movie_index1, movie_index2, user_by_movie_matrix):
    """Return the row positions of the users who rated both movies.

    Args:
        movie_index1: Column position of the first movie.
        movie_index2: Column position of the second movie.
        user_by_movie_matrix: 2-D array of ratings with NaN for "not rated".

    Returns:
        1-D integer array of the row positions present in both movies'
        rater lists.
    """
    raters_per_movie = [
        get_users(movie_index, user_by_movie_matrix)
        for movie_index in (movie_index1, movie_index2)
    ]
    # Each rater list comes from get_users and therefore holds no repeated
    # positions, which is what assume_unique=True relies on.
    return np.intersect1d(*raters_per_movie, assume_unique=True)

def compute_score(movie_index1, movie_index2, user_by_movie_matrix):
    """Similarity of two movies based on the users who rated both of them.

    The score is ``1 / (1 + d)`` where ``d`` is the Euclidean distance between
    the two movies' ratings restricted to their common raters. It lies in
    (0, 1], equals 1 when every common rater gave both movies the same rating,
    and decreases as the ratings diverge. Earlier revisions returned ``1 / d``
    and mapped ``d == 0`` to 0, which ranked perfectly matching movies last,
    level with movies that share no raters at all; ``1 / (1 + d)`` keeps the
    same ordering for every other pair while fixing that case.

    Args:
        movie_index1: Column position of the first movie.
        movie_index2: Column position of the second movie.
        user_by_movie_matrix: 2-D array of ratings (rows are users, columns
            are movies) with NaN for "not rated".

    Returns:
        float: 0.0 when no user rated both movies, otherwise ``1 / (1 + d)``.
    """
    movie1_ratings = user_by_movie_matrix[:, movie_index1]
    movie2_ratings = user_by_movie_matrix[:, movie_index2]

    # One boolean mask selects the common raters directly; this avoids
    # building and intersecting two index arrays for every pair of movies.
    rated_both = ~(np.isnan(movie1_ratings) | np.isnan(movie2_ratings))
    if not rated_both.any():
        # No overlap means there is no evidence of similarity at all.
        return 0.0

    distance = np.linalg.norm(movie1_ratings[rated_both] - movie2_ratings[rated_both])
    return float(1 / (1 + distance))

def get_all_scores(user_by_movie_matrix, score_fn=None):
    """Compute the pairwise similarity score for every pair of movies.

    Args:
        user_by_movie_matrix: 2-D array of ratings (rows are users, columns
            are movies) with NaN for "not rated".
        score_fn: Optional callable ``score_fn(index1, index2, matrix)``
            returning the similarity of two movie columns. When omitted, the
            module-level ``compute_score`` is looked up at call time.

    Returns:
        Square float array with one row and one column per movie. Only the
        strict upper triangle (``index1 < index2``) is filled in; the diagonal
        and the lower triangle are left at 0.
    """
    if score_fn is None:
        score_fn = compute_score

    # Size the result from the matrix that was passed in; the previous version
    # read the notebook-global `user_by_movie` here and so ignored its argument.
    movies_number = user_by_movie_matrix.shape[1]
    scores = np.zeros(shape=(movies_number, movies_number))
    for index1 in range(movies_number):
        # Similarity is symmetric, so each unordered pair is scored only once.
        for index2 in range(index1 + 1, movies_number):
            scores[index1, index2] = score_fn(index1, index2, user_by_movie_matrix)
    return scores

In [79]:
from datetime import datetime

# Print the current wall-clock time as a manual timing marker.
now = datetime.now()
current_time = format(now, "%H:%M:%S")
print("Current Time =", current_time)

Current Time = 05:12:47


In [80]:
#from sklearn.metrics.pairwise import nan_euclidean_distances
#scores = nan_euclidean_distances(user_by_movie_matrix, user_by_movie_matrix)

In [81]:
# Score every pair of movies; the result has only its upper triangle filled.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt, so
# this step is presumably slow for the full matrix — consider caching the result.
scores = get_all_scores(user_by_movie_matrix)

KeyboardInterrupt: 

In [43]:
#scores[range(9), range(9)]

In [44]:
#scores.shape

In [45]:
#all_closest_movies = get_closest_movies(5, scores)

In [46]:
# get_all_scores fills only the upper triangle, so mirror it to obtain a
# symmetric matrix in which row i holds the score of movie i against all others.
scores_full = scores + scores.T - np.diag(np.diag(scores))
number_movies_returned  = 5
negative_n = -1*number_movies_returned

# Rank on a copy whose diagonal is -inf so that a movie can never be selected
# as one of its own neighbours.
ranking_scores = scores_full.copy()
np.fill_diagonal(ranking_scores, -np.inf)

# argpartition only guarantees that the last n positions hold the n largest
# scores, in no particular order. Sort the candidate indices first (so ties
# break on the lower index) and then order them by descending score, which
# makes column 0 the closest movie, column 1 the second closest, and so on.
top_unsorted = np.sort(
    np.argpartition(ranking_scores, negative_n, axis=1)[:, negative_n:], axis=1
)
best_first = np.argsort(
    -np.take_along_axis(ranking_scores, top_unsorted, axis=1), axis=1, kind='stable'
)
closest_movies = np.take_along_axis(top_unsorted, best_first, axis=1)

In [47]:
#closest_movies

In [48]:
#closest_movies.shape

In [49]:
# Disabled experiment kept as a string literal: it would render each row of
# `closest_movies` as a comma-separated string.
# NOTE(review): the snippet refers to `numpy`, while this notebook imports the
# package as `np`, so it would need adjusting before being re-enabled.
'''
closest_movies_strings = []
for row in closest_movies:
    string_array = numpy.array2string(row, separator=',')
    closest_movies_strings.append(string_array)
closest_movies_strings
'''

"\nclosest_movies_strings = []\nfor row in closest_movies:\n    string_array = numpy.array2string(row, separator=',')\n    closest_movies_strings.append(string_array)\nclosest_movies_strings\n"

In [50]:
from datetime import datetime

# Print the current wall-clock time as a manual timing marker.
now = datetime.now()
current_time = format(now, "%H:%M:%S")
print("Current Time =", current_time)

Current Time = 05:06:10


In [51]:
# One row per movie: its title followed by one column per neighbour slot,
# each holding the column position of a neighbouring movie.
neighbour_columns = {
    'closest_movie_{}'.format(slot + 1): closest_movies[:, slot]
    for slot in range(number_movies_returned)
}
all_closest_movies_df = pd.DataFrame({'movie_title': movies_used.values, **neighbour_columns})

In [52]:
# Preview the first rows of the neighbours table.
all_closest_movies_df.head()

Unnamed: 0,closest_movies,movie_title,closest_movie_0,closest_movie_1,closest_movie_2,closest_movie_3,closest_movie_4
0,,12 Angry Men (1957),119,276,144,28,34
1,,The Godfather (1972),242,119,96,154,70
2,,One Flew Over the Cuckoo's Nest (1975),100,113,154,28,164
3,,Taxi Driver (1976),72,39,96,100,205
4,,Goodfellas (1990),96,129,305,72,154


In [53]:
# Size of the neighbours table as (rows, columns).
all_closest_movies_df.shape

(309, 7)

In [54]:
# Persist the neighbours table to a local SQLite file, replacing any table of
# the same name left over from a previous run.
database_filename = 'movie_recommendations.db'
table_name = 'Closest_movies'
engine = create_engine('sqlite:///' + database_filename)
try:
    all_closest_movies_df.to_sql(table_name, engine, index=False, if_exists='replace')
finally:
    # Release the connection pool even when the write fails.
    engine.dispose()

In [55]:
# Read the neighbours table back from the SQLite file to check the round trip.
database_filepath = 'movie_recommendations.db'
engine = create_engine('sqlite:///'+ database_filepath)
try:
    df = pd.read_sql_table('Closest_movies', engine)
finally:
    # Release the connection pool even when the read fails.
    engine.dispose()

In [56]:
# Preview the table as read back from the database.
df.head()

Unnamed: 0,closest_movies,movie_title,closest_movie_0,closest_movie_1,closest_movie_2,closest_movie_3,closest_movie_4
0,,12 Angry Men (1957),119,276,144,28,34
1,,The Godfather (1972),242,119,96,154,70
2,,One Flew Over the Cuckoo's Nest (1975),100,113,154,28,164
3,,Taxi Driver (1976),72,39,96,100,205
4,,Goodfellas (1990),96,129,305,72,154


In [57]:
number_movies_returned  = 5
def get_closest_movies(column):
    """Return the positions of the highest-scoring entries of one score vector.

    The number of positions returned is taken from the module-level
    ``number_movies_returned``. Note that a later cell of this notebook
    defines another function with the same name and a different signature,
    which replaces this one once that cell has run.

    Args:
        column: 1-D array-like of similarity scores, one per movie.

    Returns:
        1-D integer array with at most ``number_movies_returned`` positions,
        ordered from the highest score to the lowest (ties broken by the lower
        position). Fewer positions are returned when ``column`` is shorter
        than ``number_movies_returned``.
    """
    column = np.asarray(column)
    wanted = min(number_movies_returned, column.shape[0])
    if wanted <= 0:
        return np.array([], dtype=np.intp)

    # argpartition puts the `wanted` largest scores in the last positions but
    # in no defined order, so they are ordered explicitly afterwards.
    candidates = np.sort(np.argpartition(column, -wanted)[-wanted:])
    best_first = np.argsort(-column[candidates], kind='stable')
    closest_movies = candidates[best_first]

    return closest_movies

In [58]:
def get_closest_movies(number_movies, scores):
    """Find, for every movie, the movies with the highest similarity to it.

    Only the upper triangle of ``scores`` is read (entry ``[i, j]`` with
    ``i < j`` is the similarity of movies ``i`` and ``j``); the diagonal and
    the lower triangle are ignored. A movie is never listed as its own
    neighbour, and movies whose similarity is not strictly positive are not
    listed at all.

    The previous hand-written insertion sort never refreshed the element it
    was shifting, cut the list to ``number_movies - 1`` entries, dropped two
    entries instead of one when removing its sentinel, and skipped candidates
    scoring below the current minimum even while the list was not yet full.
    This version ranks each row directly instead.

    Args:
        number_movies: Maximum number of neighbours to return per movie.
        scores: Square 2-D array of pairwise similarity scores.

    Returns:
        dict mapping each movie position to a list of up to ``number_movies``
        neighbour positions, ordered from most to least similar (ties broken
        by the lower position). The list is empty when no other movie has a
        positive score.
    """
    scores = np.asarray(scores, dtype=float)
    # Mirror the strict upper triangle so that row i holds the score of movie
    # i against every other movie, with 0 on the diagonal.
    upper = np.triu(scores, k=1)
    symmetric = upper + upper.T
    limit = max(int(number_movies), 0)

    all_closest_movies = {}
    for movie_index in range(symmetric.shape[0]):
        row = symmetric[movie_index]
        candidates = np.flatnonzero(row > 0)
        # Stable sort on the negated scores: highest score first, and equal
        # scores keep their ascending position order.
        best_first = candidates[np.argsort(-row[candidates], kind='stable')]
        all_closest_movies[movie_index] = [int(index) for index in best_first[:limit]]

    return all_closest_movies

In [59]:
# save movie_id - movie_title - list of n closest movie_ids to db

In [60]:
# create function that extracts n closest movie titles based on movie_ids as int for a given movie from db