***
* Author: Amin Mollaei     
* Last modified: 18 July 2022
***           

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive. The later cells
# read the parquet inputs from, and write the CSV results to, paths under
# '/content/drive/My Drive'.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Strings are stored as bytes, we need to convert them
def bytes_to_string(df, n=0):
  """Decode the byte-string id/title columns of ``df`` to ``str``, in place.

  ``movie_id`` and ``movie_title`` are always processed; ``user_id`` is
  processed as well when ``n == 1``. Each column is decoded as UTF-8, and any
  entry that comes back as NaN from the decode is filled with its original
  value.

  Args:
    df: DataFrame containing the columns named above; it is modified in place.
    n: pass 1 to also decode ``user_id``; any other value leaves it untouched.

  Returns:
    None.
  """
  targets = ['movie_id', 'movie_title']
  if n == 1:
    targets.append('user_id')
  for column in targets:
    original = df[column]
    df[column] = original.str.decode('utf-8').fillna(original)

In [None]:
# Load the two input tables with pandas.
# The parquet files are expected in the root of the mounted Google Drive; if they
# sit next to the notebook instead, remove the directory part of the paths.
import pandas as pd
# Ratings are previewed exactly as loaded (their id/title columns are not decoded yet).
ratings_df = pd.read_parquet('/content/drive/My Drive/ratings.parquet.gzip')
display(ratings_df.head())

# Movies: rows containing any missing value are dropped right after loading.
movies_df = pd.read_parquet('/content/drive/My Drive/movies.parquet.gzip').dropna()

# Decode movie_id / movie_title from bytes to str (with the default n, user_id is not touched).
bytes_to_string(movies_df)
display(movies_df.head())

Unnamed: 0,bucketized_user_age,movie_genres,movie_id,movie_title,timestamp,user_gender,user_id,user_occupation_label,user_occupation_text,user_rating,user_zip_code
0,35.0,"[0, 7]",b'3107',b'Backdraft (1991)',977432193,True,b'130',18,b'technician/engineer',5.0,b'50021'
1,25.0,[7],b'2114',"b'Outsiders, The (1983)'",965932967,False,b'3829',0,b'academic/educator',4.0,b'22307'
2,18.0,"[4, 15]",b'256',b'Junior (1994)',1012103552,False,b'1265',21,b'writer',1.0,b'49321'
3,18.0,"[0, 10]",b'1389',b'Jaws 3-D (1983)',972004605,True,b'2896',14,b'sales/marketing',5.0,b'60073'
4,18.0,[0],b'3635',"b'Spy Who Loved Me, The (1977)'",961180111,True,b'5264',17,b'college/grad student',4.0,b'15217'


Unnamed: 0,movie_genres,movie_id,movie_title
0,"[5, 7]",1729,Jackie Brown (1997)
1,[7],1486,"Quiet Room, The (1996)"
2,[4],3086,March of the Wooden Soldiers (a.k.a. Laurel & ...
3,[0],2965,"Omega Code, The (1999)"
4,[10],2853,"Communion (a.k.a. Alice, Sweet Alice/Holy Terr..."


In [None]:
# Free some memory: these columns are not needed by the recommender.
# errors='ignore' keeps the cell re-runnable; columns that a previous run already
# removed are skipped instead of raising KeyError.
ratings_df = ratings_df.drop(
  columns=[
    'bucketized_user_age',
    'timestamp',
    'user_gender',
    'user_occupation_label',
    'user_occupation_text',
    'user_zip_code',
  ],
  errors='ignore',
)

# Drop incomplete rows only after the unused columns are gone, so a missing
# value in a discarded column does not cost us a rating.
ratings_df = ratings_df.dropna()

# Decode movie_id, movie_title and (because n=1) user_id from bytes to str.
bytes_to_string(ratings_df, 1)

display(ratings_df)

Unnamed: 0,movie_genres,movie_id,movie_title,user_id,user_rating
0,"[0, 7]",3107,Backdraft (1991),130,5.0
1,[7],2114,"Outsiders, The (1983)",3829,4.0
2,"[4, 15]",256,Junior (1994),1265,1.0
3,"[0, 10]",1389,Jaws 3-D (1983),2896,5.0
4,[0],3635,"Spy Who Loved Me, The (1977)",5264,4.0
...,...,...,...,...,...
1000204,"[4, 7]",2290,Stardust Memories (1980),4508,4.0
1000205,"[0, 1, 15]",2643,Superman IV: The Quest for Peace (1987),4167,1.0
1000206,"[7, 16]",593,"Silence of the Lambs, The (1991)",5799,5.0
1000207,[7],1693,Amistad (1997),3224,3.0


In [None]:
# Compare the largest movie label with the total number of labels.
# The output shows that the labels are almost contiguous, so each user's ratings
#  can live in a plain array indexed by movie id with very little unused space:
#  (3952-3883)*8 = 552 wasted bytes per user, i.e. roughly 3 MB in total for the
#  ~6,000 users of this dataset. In exchange, a rating can be looked up
#  directly by its movie id, which lowers the time complexity considerably.
import numpy as np

# Labels are stored as strings; convert to int so they sort numerically.
movie_lbls = np.sort([int(label) for label in movies_df['movie_id']])
print(len(movie_lbls))
print(movie_lbls)
del movies_df

3883
[   1    2    3 ... 3950 3951 3952]


In [None]:
# We now have our observation and can free the memory.
# movie_lbls was sorted in ascending order, so its last element is the largest
# movie label; that single value is all that is kept.
last_label = movie_lbls[-1]
del movie_lbls

In [None]:
# Build the per-user lookup tables.
#  users[u]            -> float array of length last_label + 1 indexed by movie id:
#                         users[u][10] is the rating user 'u' gave movie '10',
#                         or np.inf if 'u' has not rated it.
#  users_top_movies[u] -> list of (title, genres) for every movie 'u' rated >= 4.5,
#                         in the order the ratings appear in ratings_df.
users = {}
users_top_movies = {}
# Template row of "not rated" markers; kept as a global because a later cell deletes it.
infinities = np.full(last_label + 1, np.inf)

# Allocate once per *user*, not once per rating row. unique() returns ids in
# order of first appearance, so the dict order matches a row-by-row scan.
for uid in ratings_df['user_id'].unique():
  users[uid] = infinities.copy()
  users_top_movies[uid] = []

# Walk the needed columns in lockstep; this yields the same values as
# iterrows() without building a Series object for every row.
for uid, mid, ur, title, genres in zip(
  ratings_df['user_id'],
  ratings_df['movie_id'],
  ratings_df['user_rating'],
  ratings_df['movie_title'],
  ratings_df['movie_genres'],
):
  users[uid][int(mid)] = ur
  if ur >= 4.5:
    users_top_movies[uid].append((title, genres))

In [None]:
# Convert the dictionary to a list of (user_id, ratings_array) pairs to iterate faster.
# The isinstance guard keeps the cell re-runnable: on a second run `users` is
# already a list and has no .items() method.
if isinstance(users, dict):
  users = list(users.items())

users[:5]

[('130', array([inf, inf, inf, ..., inf, inf, inf])),
 ('3829', array([inf, inf, inf, ..., inf, inf, inf])),
 ('1265', array([inf,  4.,  1., ..., inf, inf, inf])),
 ('2896', array([inf,  4., inf, ..., inf, inf, inf])),
 ('5264', array([inf,  5., inf, ..., inf, inf, inf]))]

In [None]:
# The ratings have been copied into `users` and `users_top_movies`, so release
# the ratings DataFrame and the template array of infinities.
del ratings_df
del infinities

In [None]:
def find_distance(list_a, list_b):
  """Mean absolute rating difference over the movies both users have rated.

  Each input is one user's rating vector indexed by movie id, with np.inf
  marking a movie the user has not rated. Only positions where both users
  have a rating contribute. The sum of differences grows with the number of
  shared movies, so it is divided by that number to keep users with large
  and small overlaps comparable. (A cosine distance could be used instead.)

  Args:
    list_a: ratings of the first user (array-like of floats).
    list_b: ratings of the second user (array-like of floats).

  Returns:
    The mean of |a - b| over the shared movies, or NaN when the two users
    have no rated movie in common.
  """
  # inf - inf yields NaN and makes numpy emit an "invalid value" RuntimeWarning;
  # those positions are discarded below, so the warning is only noise.
  with np.errstate(invalid='ignore'):
    diff = np.abs(np.asarray(list_a, dtype=float) - np.asarray(list_b, dtype=float))
  # Drops NaN (neither user rated) and inf (only one of them rated) in one step.
  diff = diff[np.isfinite(diff)]
  if diff.size == 0:
    # No overlap: return NaN explicitly instead of relying on 0 / 0.
    return np.nan
  return diff.sum() / diff.size

In [None]:
# Compare each user with all others, so this is O(n^2) in the number of users.
# For every user:
#  1. find the N_NEIGHBOURS users with the smallest find_distance();
#  2. collect at least N_RECOMMENDATIONS distinct movies from those neighbours'
#     entries in users_top_movies: first entry 0 of every neighbour, then
#     entry 1, and so on, for at most MAX_ITERATIONS rounds. With 10 neighbours
#     that each have an entry, the first round already yields 10 movies;
#  3. store the result as <user_id>.csv in Google Drive.
# Because of the O(n^2) comparison and one file write per user, this cell takes
#  a long time.
# NOTE(review): movies the source user has already rated are not filtered out;
#  users_top_movies only holds (title, genres), so that needs a title -> id map.
import os

MAX_ITERATIONS = 20
MAX_ITERATOINS = MAX_ITERATIONS  # original (misspelled) name, kept as an alias
N_NEIGHBOURS = 10
N_RECOMMENDATIONS = 10
OUTPUT_DIR = '/content/drive/My Drive/recomended_movies'

# to_csv does not create missing directories.
os.makedirs(OUTPUT_DIR, exist_ok=True)

cnt = 0
for cnt, (source_user, source_ratings) in enumerate(users, start=1):
  distances = {}
  for target_user, target_ratings in users:
    if target_user == source_user:
      continue
    d = find_distance(source_ratings, target_ratings)
    # NaN means the two users share no rated movie; they cannot be compared.
    if not np.isnan(d):
      distances[target_user] = d
  nearest_users = sorted(distances.items(), key=lambda item: item[1])[:N_NEIGHBOURS]

  movies = []
  genres = []
  seen_titles = set()  # avoids listing a movie twice when several neighbours like it
  rank = 0

  # The limit is checked once per round, so a round may overshoot the target.
  while len(movies) < N_RECOMMENDATIONS and rank < MAX_ITERATIONS:
    for near_user, _ in nearest_users:
      favourites = users_top_movies.get(near_user, [])
      if rank >= len(favourites):
        # This neighbour has no entry at this rank.
        continue
      title, movie_genres = favourites[rank]
      if title in seen_titles:
        continue
      seen_titles.add(title)
      movies.append(title)
      genres.append(movie_genres)
    rank += 1

  df = pd.DataFrame({'Movies': movies, 'Genres': genres})
  df.to_csv(f'{OUTPUT_DIR}/{source_user}.csv')

  if cnt % 10 == 0:
    print(f'{cnt} out of {len(users)}')


10 out of 6040
20 out of 6040
30 out of 6040
40 out of 6040
50 out of 6040
60 out of 6040
70 out of 6040
80 out of 6040
90 out of 6040
100 out of 6040
110 out of 6040
120 out of 6040
130 out of 6040
140 out of 6040
150 out of 6040
160 out of 6040
170 out of 6040
180 out of 6040
190 out of 6040
200 out of 6040
210 out of 6040
220 out of 6040
230 out of 6040
240 out of 6040
250 out of 6040
260 out of 6040
270 out of 6040
280 out of 6040
290 out of 6040
300 out of 6040
310 out of 6040
320 out of 6040
330 out of 6040
340 out of 6040
350 out of 6040
360 out of 6040
370 out of 6040
380 out of 6040
390 out of 6040
400 out of 6040
410 out of 6040
420 out of 6040
430 out of 6040
440 out of 6040
450 out of 6040
460 out of 6040
470 out of 6040
480 out of 6040
490 out of 6040
500 out of 6040
510 out of 6040
520 out of 6040
530 out of 6040
540 out of 6040
550 out of 6040
560 out of 6040
570 out of 6040
580 out of 6040
590 out of 6040
600 out of 6040
610 out of 6040
620 out of 6040
630 out of 6040
6