In [58]:
import pandas as pd
import numpy as np

In [31]:
movies = pd.read_csv('data/movies.dat', sep='::', names=['movie_id', 'movie_title', 'genra'], header=None )
movies.head()

Unnamed: 0,movie_id,movie_title,genra
0,8,Edison Kinetoscopic Record of a Sneeze (1894),Documentary|Short
1,10,La sortie des usines Lumière (1895),Documentary|Short
2,12,The Arrival of a Train (1896),Documentary|Short
3,25,The Oxford and Cambridge University Boat Race ...,
4,91,Le manoir du diable (1896),Short|Horror


In [32]:
users = pd.read_csv('data/users.dat', sep='::', names=['user_id', 'twitter_id'], header=None )
users.head()

Unnamed: 0,user_id,twitter_id
0,1,139564917
1,2,522540374
2,3,475571186
3,4,215022153
4,5,349681331


In [33]:
print(users.shape)

(69324, 2)


In [34]:
ratings = pd.read_csv('data/ratings.dat', sep='::', names=['user_id', 'movie_id', 'rating', 'rating_timestamp'], header=None )
ratings.head()

Unnamed: 0,user_id,movie_id,rating,rating_timestamp
0,1,114508,8,1381006850
1,2,75314,1,1595468524
2,2,102926,9,1590148016
3,2,114369,10,1597555347
4,2,118715,8,1596006798


In [35]:
ratings.shape

(888452, 4)

In [36]:
ratings_new = ratings.drop(columns=['rating_timestamp'])

In [37]:
ratings_new['count_user'] = ratings_new.groupby(['user_id'])['user_id'].transform('count')

In [38]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,rating_timestamp
0,1,114508,8,1381006850
1,2,75314,1,1595468524
2,2,102926,9,1590148016
3,2,114369,10,1597555347
4,2,118715,8,1596006798


In [39]:
ratings_user = ratings_new.query('count_user >= 5')

In [40]:
ratings_user.shape

(814549, 4)

In [41]:
ratings_user = ratings_user.drop(columns=['count_user'])

In [52]:
ratings_user['count_movie'] = ratings_user.groupby(['movie_id'])['movie_id'].transform('count')
ratings_movie = ratings_user.query('count_movie >= 5')
ratings_movie.shape

(775513, 4)

In [53]:
user_by_movie = ratings_movie.groupby(['user_id', 'movie_id'])['rating'].max().unstack()

In [54]:
user_by_movie.head()

movie_id,417,439,10323,12349,12364,13257,13427,13442,14341,14429,...,11958344,11987296,12078990,12117854,12133722,12567088,12588160,12724622,12749596,12875782
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,


In [55]:
user_by_movie = user_by_movie.reset_index(drop=True)
user_by_movie.head()

movie_id,417,439,10323,12349,12364,13257,13427,13442,14341,14429,...,11958344,11987296,12078990,12117854,12133722,12567088,12588160,12724622,12749596,12875782
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [56]:
user_by_movie.shape

(22905, 11416)

In [67]:
def get_users(movie, user_by_movie):
    users = user_by_movie[ user_by_movie[movie].isnull() == False ].index.values
    return users

def get_common_users(movie1, movie2, user_by_movie):
    users1 = get_users(movie1, user_by_movie)
    users2 = get_users(movie2, user_by_movie)
    common_users = np.intersect1d(users1, users2, assume_unique=True)
    return common_users

def compute_score(movie1, movie2, user_by_movie):
    common_users = get_common_users(movie1, movie2, user_by_movie)
    movie1_ratings = np.array(user_by_movie.loc[common_users, movie1])
    movie2_ratings = np.array(user_by_movie.loc[common_users, movie2])

    distance = np.linalg.norm(movie1_ratings - movie2_ratings)

    score = 0
    if distance != 0:
        score = 1 / distance

    return score
def get_all_scores(user_by_movie):
    movies_number = user_by_movie.shape[1]
    scores = np.zeros(shape=(movies_number, movies_number))
    for i, movie1 in enumerate(user_by_movie.columns):
        for j, movie2 in enumerate(user_by_movie.columns[i+1:]):
            index1 = i
            index2 = j + (i + 1)
            scores[index1, index2] = compute_score(movie1, movie2, user_by_movie)
    scores_df = pd.DataFrame(scores, columns = user_by_movie.columns, index = user_by_movie.columns)
    return scores_df

In [65]:
scores_df = get_all_scores(user_by_movie)
scores_df.head()

KeyboardInterrupt: 

In [None]:
scores_df.shape

In [None]:
def get_closest_movies(number_movies, scores_df):
    all_closest_movies = {}
    for i, movie in enumerate(scores_df.index):
        closest_movies_scores = [0]
        closest_movies_ids = [movie]   
        for j, temp_id in enumerate(scores_df.index):
            if j > i:
                temp_score = scores_df[i, j] 
            else:
                temp_score = scores_df[j, i]
            min_score = closest_movies_scores[-1]
            if temp_score > min_score:
                closest_movies_scores.append(temp_score)
                closest_movies_ids.append(temp_id)
                index_sorted = len(closest_movies_scores) - 2
                next_score = closest_movies_scores[index_sorted]
                next_id = closest_movies_ids[index_sorted]
                while (next_score < temp_score) and index_sorted >= 0 :
                    closest_movies_scores[index_sorted+1] = next_score
                    closest_movies_ids[index_sorted+1] = next_id
                    closest_movies_scores[index_sorted] = temp_score
                    closest_movies_ids[index_sorted] = temp_id
                    index_sorted -= 1
        if closest_movies_ids[-1] == movie:
            if len(closest_movies_ids) == 1:
                closest_movies_scores = []
                closest_movies_ids = []
            else:
                closest_movies_scores = [0:-2]
                closest_movies_ids = [0:-2]
        if len(closest_movies_ids) > number_movies:
            closest_movies_ids = closest_movies_ids[0:number_movies-1]
        all_closest_movies[movie] = closest_movies_ids

    return all_closest_movies

In [None]:
all_closest_movies = get_closest_movies(number_movies=7, scores_df)
all_closest_movies_df = pd.DataFrame(columns = ['movie_id', 'closest_movies'])
all_closest_movies_df['movie_id'] = list(all_closest_movies.keys())
all_closest_movies_df['closest_movies'] = all_closest_movies_df['closest_movies'].astype(object)
all_closest_movies_df['closest_movies'] = list(all_closest_movies.values())
all_closest_movies_df.head()

In [None]:
# save movie_id - movie_title - list of n closest movie_ids to db

In [None]:
# create function that extracts n closest movie titles based on movie_ids as int for a given movie from db