In [0]:
!pip install surprise

In [2]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

import numpy as np
import pandas as pd

# Read the ratings, movies and tags tables from the mounted Drive folder.
# Every file sits under the same prefix, so the paths are built in one loop;
# drivePrefix / driveSuffix stay defined with the values they ended up with before.
drivePrefix = '/content/gdrive/My Drive/Recsys ML/'
csv_tables = []
for driveSuffix in (
    '1301164011_MF_MovieLens/ml-latest-small/ratings.csv',
    '1301164011_MF_MovieLens/ml-latest-small/movies.csv',
    '1301164011_MF_MovieLens/ml-latest-small/tags.csv',
):
    csv_tables.append(pd.read_csv(drivePrefix + driveSuffix, sep=','))

# Unpack in the same order the files were listed above.
ratingsdf, moviesdf, tagsdf = csv_tables

# R = ratingsdf.pivot("userId","movieId","rating").values
# R = np.nan_to_num(R)

KeyboardInterrupt: ignored

In [0]:
from surprise.model_selection import ShuffleSplit
from surprise import Reader, Dataset, NMF, accuracy
from collections import defaultdict
import numpy as np

def CF(R, biased):
  """Evaluate an NMF recommender and build per-user top-150 recommendations.

  The model is first scored on 5 shuffled 80/20 splits (seeded with 7) using
  the mean per-user recall@150 at a relevance threshold of 4, then refitted
  on the full dataset to rank every (user, item) pair absent from it.

  Args:
    R: dataset object exposing ``build_full_trainset`` and accepted by
      ``ShuffleSplit.split``.
    biased: forwarded to ``NMF(biased=...)``.

  Returns:
    ``(top_n, recall)``: the mapping produced by ``get_top_n`` and the list
    of mean recall values, one per split. The list is also printed.
  """
  model = NMF(biased=biased)
  splitter = ShuffleSplit(n_splits=5, test_size=0.2, random_state=7)

  fold_recalls = []
  for train_part, test_part in splitter.split(R):
    model.fit(train_part)
    per_user = recall_at_k(model.test(test_part), 150, 4)
    # Average the per-user recalls into a single score for this split.
    fold_recalls.append(sum(per_user.values()) / len(per_user))

  # Refit on everything, then score only the pairs missing from the training data.
  full_train = R.build_full_trainset()
  model.fit(full_train)
  unseen_pairs = full_train.build_anti_testset()
  recommendations = get_top_n(model.test(unseen_pairs), 150)

  print(fold_recalls)
  return recommendations, fold_recalls
  
def get_top_n(predictions, n):
    """Group predictions by user and keep each user's ``n`` best estimates.

    Args:
        predictions: iterable of 5-tuples ``(uid, iid, true_r, est, details)``.
        n: maximum number of items kept per user.

    Returns:
        A ``defaultdict(list)`` mapping each uid to a list of ``(iid, est)``
        pairs ordered by ``est`` descending; pairs with equal estimates keep
        their input order.
    """
    per_user = defaultdict(list)
    for user, item, _actual, estimate, _details in predictions:
        per_user[user].append((item, estimate))

    # Rank each user's candidates and truncate to the requested length.
    for user in per_user:
        ranked = sorted(per_user[user], key=lambda pair: pair[1], reverse=True)
        per_user[user] = ranked[:n]

    return per_user

def recall_at_k(predictions, k, threshold):
    """Compute recall@k for every user that appears in ``predictions``.

    An item is relevant when its true rating is >= ``threshold``; it counts
    as recommended when it is among the user's ``k`` highest estimates and
    its estimate is also >= ``threshold``.

    Args:
        predictions: iterable of 5-tuples ``(uid, iid, true_r, est, details)``.
        k: number of top-ranked items considered per user.
        threshold: rating cut-off applied to both true and estimated ratings.

    Returns:
        dict mapping uid to recall. Users with no relevant item get ``1``.
    """
    by_user = defaultdict(list)
    for user, _item, actual, guess, _details in predictions:
        by_user[user].append((guess, actual))

    recalls = {}
    for user, pairs in by_user.items():
        # Highest estimates first.
        ranked = sorted(pairs, key=lambda pair: pair[0], reverse=True)

        relevant_total = sum(actual >= threshold for _guess, actual in ranked)
        hits_in_top_k = sum(
            (actual >= threshold) and (guess >= threshold)
            for guess, actual in ranked[:k]
        )

        if relevant_total != 0:
            recalls[user] = hits_in_top_k / relevant_total
        else:
            recalls[user] = 1
    return recalls

# Train the biased and unbiased NMF variants on the same ratings and keep the
# recommendations of whichever one achieves the higher mean recall.
reader = Reader(rating_scale=(0.5, 5.0))
R = Dataset.load_from_df(ratingsdf.drop(columns=['timestamp']), reader)
top_n_b, recall_b = CF(R, True)
print('bias :',np.mean(recall_b))
top_n, recall = CF(R, False)
print('non bias :',np.mean(recall))
# Compare the mean recalls: comparing the two lists directly would be a
# lexicographic comparison decided by the first split alone.
if np.mean(recall_b) >= np.mean(recall):
  top_n = top_n_b

[0.3809324622497426, 0.4378726060821973, 0.45844366977705664, 0.4149808099012928, 0.37556467178053865]
bias : 0.41355884395816556
[0.3993591097114339, 0.3869849590525422, 0.37727551668219744, 0.3950299726650992, 0.38205906511236637]
non bias : 0.3881417246447278


In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Index the movie table by movieId while keeping movieId as a regular column.
tempDF = moviesdf.set_index('movieId', drop=False)

def count_word(df, ref_col, liste):
  """Count how often each keyword occurs in a '|'-separated text column.

  Args:
    df: DataFrame holding the column to scan.
    ref_col: name of the column whose values are '|'-joined keywords.
    liste: iterable of the keywords to count; every keyword found in the
      column must be present here, otherwise a KeyError is raised.

  Returns:
    ``(keyword_occurences, keyword_count)``: a list of ``[keyword, count]``
    pairs sorted by count descending, and the dict mapping keyword -> count.
  """
  tally = dict.fromkeys(liste, 0)
  for tokens in df[ref_col].str.split('|'):
      # A missing cell comes through as a float NaN instead of a list.
      if type(tokens) is float and pd.isnull(tokens):
          continue
      for token in tokens:
          if pd.notnull(token):
              tally[token] += 1

  # Rank the keywords from most to least frequent.
  ranked = sorted(
      ([word, hits] for word, hits in tally.items()),
      key=lambda pair: pair[1],
      reverse=True,
  )
  return ranked, tally

# Collect every distinct genre label found in the '|'-separated genres column.
genre_labels = set()
for labels in tempDF['genres'].str.split('|').dropna():
    genre_labels.update(labels)

keyword_occurences, dum = count_word(tempDF, 'genres', genre_labels)

# One-hot encode the genres (one 0/1 float column per label, most frequent first).
# regex=False matches the label literally: labels such as "(no genres listed)"
# contain regex metacharacters and must not be interpreted as patterns.
for (key, cnt) in keyword_occurences:
    tempDF[key] = tempDF['genres'].str.contains(key, regex=False, na=False).astype(float)

# Cluster the movies on their genre indicators only.
tempDF = tempDF.drop(columns=['title','genres','movieId'])
n_cluster = 20
# Fixed seed so the cluster assignment is reproducible between runs.
kmeans = KMeans(n_clusters=n_cluster, random_state=7)
tempDF['cluster'] = kmeans.fit_predict(tempDF)

# For every user, find the genre cluster with the highest mean rating.
# Position p of best_cluster / genre_rec belongs to userId p + 1.
movie_cluster = tempDF['cluster']
n_users = int(ratingsdf['userId'].max())
best_cluster = []
for i in range(1, n_users + 1):
  ur = ratingsdf[ratingsdf['userId'] == i]
  # Cluster of every movie this user rated (KeyError if a movie is unknown).
  rated_clusters = movie_cluster.loc[ur['movieId'].to_numpy()].to_numpy()
  rating_sum = np.bincount(rated_clusters, weights=ur['rating'].to_numpy(), minlength=n_cluster)
  rating_cnt = np.bincount(rated_clusters, minlength=n_cluster)
  # Clusters the user never rated get -inf rather than 0/0 = NaN: argsort puts
  # NaN last, so after reversing the order an unrated cluster would rank first.
  mean_rating = np.where(rating_cnt > 0, rating_sum / np.maximum(rating_cnt, 1), -np.inf)
  ranking = np.argsort(mean_rating)[::-1]
  best_cluster.append(ranking[0])

# Recommend every movie of the user's best cluster; the member arrays are built
# once per cluster and shared between users that ended up in the same cluster.
cluster_members = {
    c: np.array(tempDF.index[movie_cluster == c]) for c in range(n_cluster)
}
genre_rec = [cluster_members[int(c)] for c in best_cluster]

  return func(self, *args, **kwargs)


In [0]:
# Build one 0/1 indicator column per tag for every movie in the catalogue.
temp = moviesdf.set_index(moviesdf.movieId).drop(columns=['title','genres','movieId'])
temp_df = tagsdf.copy()
# crosstab counts (movie, tag) pairs; clip turns the counts into indicators and
# reindex adds all-zero rows for movies that were never tagged. The 'tag_'
# prefix keeps tag columns from colliding with genre columns or 'cluster'.
tag_flags = (
    pd.crosstab(temp_df['movieId'], temp_df['tag'])
      .clip(upper=1)
      .reindex(temp.index, fill_value=0)
      .astype(float)
      .add_prefix('tag_')
)

# Cluster on genre + tag indicators. The genre cluster label stored in tempDF
# is an arbitrary id, not a feature, so it is left out of the feature matrix.
genre_features = tempDF.drop(columns=['cluster'], errors='ignore')
temp = pd.concat([genre_features, tag_flags], axis=1)

n_cluster = 30
# Fixed seed so the cluster assignment is reproducible between runs.
km = KMeans(n_clusters=n_cluster, random_state=7)
temp['cluster'] = km.fit_predict(temp)

# For every user, pick the cluster that collects the most "votes": one vote per
# tag the user applied and one per rating >= 4 the user gave.
# Position p of best_cluster / tag_rec belongs to userId p + 1.
movie_cluster = temp['cluster']
n_users = int(ratingsdf['userId'].max())
best_cluster = []
for i in range(1, n_users + 1):
  tagged_ids = temp_df.loc[temp_df['userId'] == i, 'movieId'].to_numpy()
  liked_ids = ratingsdf.loc[
      (ratingsdf['userId'] == i) & (ratingsdf['rating'] >= 4), 'movieId'
  ].to_numpy()
  voted_ids = np.concatenate([tagged_ids, liked_ids])
  # Cluster of every voted movie (KeyError if a movie is unknown).
  voted_clusters = movie_cluster.loc[voted_ids].to_numpy()
  votes = np.bincount(voted_clusters, minlength=n_cluster).astype(float)
  ranking = np.argsort(votes)[::-1]
  best_cluster.append(ranking[0])

# Recommend every movie of the user's best cluster; the member arrays are built
# once per cluster and shared between users that ended up in the same cluster.
cluster_members = {
    c: np.array(temp.index[movie_cluster == c]) for c in range(n_cluster)
}
tag_rec = [cluster_members[int(c)] for c in best_cluster]

In [1]:
# Merge the three recommenders by voting: a movie earns one vote for each list
# (collaborative filtering, genre cluster, tag cluster) that contains it, and
# the 20 most-voted movies become the user's final recommendation.
# NOTE(review): the cluster lists are not filtered for movies the user has
# already rated, so the final list may contain them — confirm this is intended.
count = moviesdf.set_index(moviesdf.movieId).drop(columns=['title','genres','movieId'])
final_rec = []
n_users = min(len(genre_rec), len(tag_rec))
for i in range(n_users):
  # genre_rec / tag_rec are positional (position i -> userId i + 1) whereas
  # top_n is keyed by the userId itself.
  user_id = i + 1
  count['counter'] = 0
  cf_ids = [iid for iid, _ in top_n.get(user_id, [])]
  for source in (cf_ids, genre_rec[i], tag_rec[i]):
    if len(source):
      count.loc[source, 'counter'] += 1
  # Stable sort on the catalogue-ordered frame: ties are always broken by
  # catalogue order, independent of how earlier users were ranked.
  ranked = count.sort_values('counter', ascending=False, kind='mergesort')
  final_rec.append(ranked.index[:20].tolist())
# final_rec
# One row per user, 20 movie ids per row, written as plain integers.
np.savetxt("User_Recomendation.csv", final_rec, delimiter=",", fmt='%d')

NameError: ignored