In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

data_base_dir = '../../../datasets/Movielens/'
# data_dir = data_base_dir + 'serendipity-sac2018/'
data_dir = data_base_dir + 'ml-20m/'

genome_scores = data_dir + 'genome-scores.csv'
genome_tags = data_dir + 'genome-tags.csv'
movies = data_dir + 'movies.csv'
ratings = data_dir + 'ratings.csv'
# ratings = data_dir + 'training.csv'
tags = data_dir + 'tags.csv'
# answers = data_dir + 'answers.csv'

genomes_df = pd.read_csv(genome_scores).pivot(index='movieId', columns='tagId', values='relevance')
genome_score_movies = genomes_df.index.values
ratings_df = pd.read_csv(ratings, usecols=range(3), dtype={'userId':np.int64, 'movieId':np.int64, 'rating':np.float64}, low_memory=False)
ratings_df = ratings_df[ratings_df['movieId'].isin(genome_score_movies)]
ratings_df

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5
5,1,112,3.5
6,1,151,4.0
7,1,223,4.0
8,1,253,4.0
9,1,260,4.0


In [2]:
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
    
def get_users_best_silhouette_score(user_id, genome_scores_df):
    user_movie_tags_df = pd.DataFrame()
    user_movies = ratings_df[ratings_df['userId'] == user_id]['movieId'].values
    user_movie_tags_df = genome_scores_df[genome_scores_df.index.isin(user_movies)]
    n_movies = user_movie_tags_df.index.size
    
    highest_score = 0
    optimal_cluster_size = 2

    user_movies_matrix = np.nan_to_num(user_movie_tags_df.values[:n_movies])
    
    for cluster_size in range(2, n_movies-1, 1):
        result = AgglomerativeClustering(n_clusters=cluster_size, affinity='euclidean', linkage='ward').fit_predict(user_movies_matrix)
        score = silhouette_score(user_movies_matrix, result, metric='cosine')
#         cluster_size_silhouette_score_d[cluster_size] = score
        
        if score >= highest_score:
            highest_score = score
            optimal_cluster_size = cluster_size
    
    return highest_score, optimal_cluster_size
            
def get_best_silhouette_score_for_all_movies(genome_scores_df):
    score_history = []
    cluster_size_hist = []
    best_score = -1
    best_cluster_size = 0
    
    for cluster_size in range(2, 30, 5):
#         result = KMeans(n_clusters=cluster_size, random_state=171450, n_jobs=-1).fit_predict(genome_scores_df.values)
        result = AgglomerativeClustering(n_clusters=cluster_size, affinity='euclidean', linkage='ward').fit_predict(genome_scores_df.values)

        score = silhouette_score(genome_scores_df.values, result, metric='cosine')
        
        score_history.append(score)
        cluster_size_hist.append(cluster_size)
        
        if best_score < score:
            best_score = score
            best_cluster_size = cluster_size
            
    return score_history, cluster_size_hist, best_score, best_cluster_size

In [3]:
def get_all_users_mean_best_scores(genomes_df, test_users):
    plt.clf()
    score_history = list()
    optimal_cluster_size_hist = list()

    best_user = 0
    best_score = 0

    for user in test_users:
        highest_score, optimal_cluster_size = get_users_best_silhouette_score(user, genomes_df)
        score_history.append(highest_score)

        if best_score < highest_score:
            best_score = highest_score
            best_user = user

        optimal_cluster_size_hist.append(optimal_cluster_size)
    
    print('highest_score', best_score)
    print('best_user', best_user)

    return np.array(score_history).mean()

In [4]:
tag_genome_df = pd.read_csv(genome_scores).pivot(index='movieId', columns='tagId', values='relevance')

def get_results_for_all_dfs(test_users):
    genome_df_list = [tag_genome_df, 'movies_lemmatized_genome_vector_df_bz2',\
                'threshold_0.25_float_movie_genomes_bz2', \
                'movies_lemmatized_threshold_0.25_float_movie_genomes_bz2',\
#                 'threshold_0.3_float_movie_genomes_bz2', \
#                 'movies_lemmatized_threshold_0.3_float_movie_genomes_bz2',\
#                 'threshold_0.35_float_movie_genomes_bz2', \
#                 'movies_lemmatized_threshold_0.35_float_movie_genomes_bz2',\
                'threshold_0.4_float_movie_genomes_bz2', \
                'movies_lemmatized_threshold_0.4_float_movie_genomes_bz2', \
#                 'threshold_0.6_float_movie_genomes_bz2',\
#                 'movies_lemmatized_threshold_0.6_float_movie_genomes_bz2', \
                'threshold_0.7_float_movie_genomes_bz2', \
                'movies_lemmatized_threshold_0.7_float_movie_genomes_bz2']
    df_names = ['full_genomes', 'movies_lemmatized_genome_vector_df_bz2',\
                'threshold_0.25_float_movie_genomes_bz2', \
                'movies_lemmatized_threshold_0.25_float_movie_genomes_bz2',\
#                 'threshold_0.3_float_movie_genomes_bz2', \
#                 'movies_lemmatized_threshold_0.3_float_movie_genomes_bz2',\
#                 'threshold_0.35_float_movie_genomes_bz2', \
#                 'movies_lemmatized_threshold_0.35_float_movie_genomes_bz2',\
                'threshold_0.4_float_movie_genomes_bz2', \
                'movies_lemmatized_threshold_0.4_float_movie_genomes_bz2', \
#                 'threshold_0.6_float_movie_genomes_bz2',\
#                 'movies_lemmatized_threshold_0.6_float_movie_genomes_bz2', \
                'threshold_0.7_float_movie_genomes_bz2', \
                'movies_lemmatized_threshold_0.7_float_movie_genomes_bz2']

    scores_df = pd.DataFrame()

    for index, genomes_df in enumerate(genome_df_list):
        print('processing for: ', df_names[index])
        if type(genomes_df) is str:
            genomes_df = pd.read_pickle(data_dir + 'output/' + genomes_df, compression='bz2')

        mean_best_score = get_all_users_mean_best_scores(genomes_df, test_users)
        print('mean_best_score', mean_best_score, '\n')
        ser = pd.Series()
        ser.name = df_names[index]
        ser['mean_best_score'] = mean_best_score
        scores_df = scores_df.append(ser)

    scores_df.plot(kind='barh')

In [5]:
# only select users who have answered for 5 recommendations,
#  at max there are 5 recommendations per user
all_user_ids = ratings_df['userId'].unique()
all_user_ids.size


138493

In [6]:
# only select users who watched more than 1 movie
# minimum_watched_movie_threshold = 1
# count_df = ratings_df[ratings_df['userId'].isin(all_user_ids)].groupby('userId').count()
# users_below_min_threshold = count_df[count_df['movieId'] <= minimum_watched_movie_threshold].index.values
# all_user_ids = np.setdiff1d(all_user_ids, users_below_min_threshold)
# print(all_user_ids.size)
# all_user_ids
count_df = ratings_df.groupby('userId').count()
# count_df
count_df.describe()

Unnamed: 0,movieId,rating
count,138493.0,138493.0
mean,142.970713,142.970713
std,222.220455,222.220455
min,13.0,13.0
25%,34.0,34.0
50%,67.0,67.0
75%,154.0,154.0
max,6590.0,6590.0


In [7]:
# # divide user groups into 4 based on the number of movies watched by them
threshold1 = count_df['movieId'] <= 34
threshold2 = count_df['movieId'] <= 67
threshold3 = count_df['movieId'] <= 154
threshold4 = count_df['movieId'] > 154

user_group_1 = count_df[threshold1].index.values
user_group_2 = count_df[threshold2].index.values
user_group_3 = count_df[threshold3].index.values
user_group_4 = count_df[threshold4].index.values

user_group_2 = np.setdiff1d(user_group_2, user_group_1)
user_group_3 = np.setdiff1d(user_group_3, user_group_2)
user_group_4 = np.setdiff1d(user_group_4, user_group_3)

print(user_group_1.size)
print(user_group_2.size)
print(user_group_3.size)
print(user_group_4.size)

34763
34545
69462
34486


In [9]:
# states the number of movies watched by each user
count_df

Unnamed: 0_level_0,movieId,rating
userId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,174,174
2,61,61
3,187,187
4,28,28
5,66,66
6,24,24
7,276,276
8,70,70
9,35,35
10,38,38


In [None]:
from time import time

test_users = user_group_1

start = time()
get_results_for_all_dfs(test_users)
finish = time() - start
print('Total time: %f seconds' % finish)

processing for:  full_genomes


In [None]:
test_users = user_group_2

start = time()
get_results_for_all_dfs(test_users)

finish = time() - start
print('Total time: %f seconds' % finish)

In [None]:
test_users = user_group_3
start = time()

get_results_for_all_dfs(test_users)

finish = time() - start
print('Total time: %f seconds' % finish)

In [None]:
test_users = user_group_4
start = time()

get_results_for_all_dfs(test_users)

finish = time() - start
print('Total time: %f seconds' % finish)

In [None]:
# file_name = 'movies_lemmatized_threshold_0.25_float_movie_genomes_bz2'
# genomes_df = pd.read_pickle(data_dir + '/output/' + file_name, compression='bz2')
# genomes_df

# known_best_users = [1110, 1050, 391, 71, 81, 41, 31, 1]
# known_best_users = [114756, 127031, 145675, 160486, 200683, 191484, 192643, 206554, 200754]

# start = time()

# get_results_for_all_dfs(test_users)

# finish = time() - start
# print('Total time: %f seconds' % finish)