In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# --- dataset locations -------------------------------------------------------
data_base_dir = '../../../datasets/Movielens/'
data_dir2 = data_base_dir + 'Movielens Latest/ml-latest/'
data_dir = data_base_dir + 'serendipity-sac2018/'

output_dir = data_dir + 'output4/'

answers = data_dir + 'answers.csv'
recommendations = data_dir + 'recommendations.csv'

# Alternative genome-score files that can be swapped in here:
#   data_dir + 'tag_genome.csv'
#   data_dir + 'ml20mgenome-scores.csv'
genome_scores = data_dir + 'mlLatestgenome-scores.csv'

genome_tags = data_dir + 'genome-tags.csv'
movies = data_dir + 'movies.csv'
training = data_dir + 'training.csv'
tags = data_dir + 'tags.csv'

# --- load raw tables ---------------------------------------------------------
movies_df = pd.read_csv(movies)
answers_df = pd.read_csv(answers)
ratings_df = pd.read_csv(training)
recommendations_df = pd.read_csv(recommendations)

# --- select users ------------------------------------------------------------
# Movie ids are taken from the *unfiltered* training ratings.
all_movie_ids = ratings_df['movieId'].unique()

# Keep only recommendation rows whose movie appears in the training ratings,
# then keep the users that still have exactly 8 such rows.
cut_recommendations = recommendations_df[recommendations_df['movieId'].isin(all_movie_ids)]
count_df = cut_recommendations.groupby('userId').count()
all_user_ids = count_df[count_df['movieId'] == 8].index.values

# --- tag-genome matrix: one row per movieId, one column per tagId ------------
genome_scores_df = pd.read_csv(genome_scores).pivot(index='movieId', columns='tagId', values='relevance')
tag_genome_movies = genome_scores_df.index.values

# --- restrict ratings to the selected users and to movies with genome scores -
ratings_df = ratings_df[
    ratings_df['userId'].isin(all_user_ids)
    & ratings_df['movieId'].isin(tag_genome_movies)
]

In [22]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from time import time

def get_best_clusters(user_movie_tags_df):
    """Cluster one user's movies by tag-genome vector and pick the best cluster count.

    Parameters
    ----------
    user_movie_tags_df : pandas.DataFrame
        One row per movie; the row values are used as the feature vector.

    Returns
    -------
    (labels, n_clusters) : tuple
        ``labels`` is the array of cluster labels (one per row) and
        ``n_clusters`` is the cluster count that produced it.

        * fewer than 2 movies: nothing to cluster, so every row gets label 0
          and ``n_clusters`` equals the number of rows (0 or 1);
        * 2 or 3 movies: a single ward clustering with 2 clusters;
        * otherwise: even cluster counts 2, 4, ... below ``min(n_movies - 1, 100)``
          are tried and the one with the highest cosine silhouette score wins
          (the first one on ties).
    """
    features = user_movie_tags_df.values
    n_movies = user_movie_tags_df.index.size

    # Agglomerative clustering needs at least two samples; return a trivial
    # labelling instead of letting the whole batch fail on such a user.
    if n_movies < 2:
        return np.zeros(n_movies, dtype=int), n_movies

    # Ward linkage always uses Euclidean distances, which is also the
    # estimator's default, so the distance keyword is left out on purpose:
    # its name differs between scikit-learn releases.
    if n_movies <= 3:
        labels = AgglomerativeClustering(n_clusters=2,
                                         linkage='ward').fit_predict(features)
        return labels, 2

    highest_score = -99
    best_clustering_result = None
    best_size = 2

    # Check the silhouette score of each candidate cluster count (even counts
    # only) and keep the count with the highest score.
    max_cluster_size = min(n_movies - 1, 100)
    for cluster_size in range(2, max_cluster_size, 2):
        clustering_result = AgglomerativeClustering(n_clusters=cluster_size,
                                                    linkage='ward').fit_predict(features)
        score = silhouette_score(features, clustering_result, metric='cosine')

        if highest_score < score:
            best_clustering_result = clustering_result
            highest_score = score
            best_size = cluster_size

    return best_clustering_result, best_size

            
def get_users_watched_movies(user_id):
    """Return the ``movieId`` values of all rows in ``ratings_df`` whose ``userId`` equals ``user_id``."""
    rated_by_user = ratings_df['userId'] == user_id
    return ratings_df.loc[rated_by_user, 'movieId'].values

import concurrent

def get_clustering_results_for_users(user_ids, genome_df=None, max_workers=4):
    """Run ``get_best_clusters`` for every user in parallel and collect the results.

    Parameters
    ----------
    user_ids : iterable
        User ids to process.
    genome_df : pandas.DataFrame, optional
        Movie-by-tag matrix indexed by movie id. Defaults to the module-level
        ``genome_scores_df`` (looked up at call time).
    max_workers : int, optional
        Number of worker processes (default 4).

    Returns
    -------
    pandas.DataFrame
        One row per user, indexed by user id, with the columns
        ``best_clustering_result`` and ``n_clusters``.
    """
    # Imported here because a bare `import concurrent` does not load the
    # `futures` submodule.
    from concurrent.futures import ProcessPoolExecutor

    if genome_df is None:
        genome_df = genome_scores_df

    start_time = time()
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        # Keep each future paired with the user it belongs to, so results are
        # labelled with the right id when they are collected.
        submitted = []
        for user_id in user_ids:
            # extract list of movies watched by this user
            user_movies_d = get_users_watched_movies(user_id)

            # extract tag-genomes for movies watched by user
            user_movie_tags_df = genome_df[genome_df.index.isin(user_movies_d)]

            submitted.append(
                (user_id, executor.submit(get_best_clusters, user_movie_tags_df)))

        row_labels = []
        records = []
        for user_id, future in submitted:
            best_clustering_result, n_clusters = future.result()
            row_labels.append(user_id)
            records.append({'best_clustering_result': best_clustering_result,
                            'n_clusters': n_clusters})

    # Build the frame once instead of appending row by row.
    user_clustering_results_df = pd.DataFrame(
        records,
        index=row_labels,
        columns=['best_clustering_result', 'n_clusters'])

    finish_time = time() - start_time

    print('total time: %f seconds' % finish_time)
    return user_clustering_results_df

In [27]:
thresholds = [0.25, 0.4, 0.7]

# Prefix/suffix pairs: (l1, l2) for the 'threshold_' names and (l3, l4) for the
# 'movies_lemmatized_threshold_' names.
l1, l2 = 'threshold_', '_float_movie_genomes_bz2'
l3, l4 = 'movies_lemmatized_threshold_', '_float_movie_genomes_bz2'

lemmatized_threshold_df_names = [f'{l3}{threshold}{l4}' for threshold in thresholds]

# Final list: the 'threshold_' names first, then the lemmatized ones.
thresholded_full_df_names = (
    [f'{l1}{threshold}{l2}' for threshold in thresholds]
    + lemmatized_threshold_df_names
)
thresholded_full_df_names

['threshold_0.25_float_movie_genomes_bz2',
 'threshold_0.4_float_movie_genomes_bz2',
 'threshold_0.7_float_movie_genomes_bz2',
 'movies_lemmatized_threshold_0.25_float_movie_genomes_bz2',
 'movies_lemmatized_threshold_0.4_float_movie_genomes_bz2',
 'movies_lemmatized_threshold_0.7_float_movie_genomes_bz2']

In [30]:
# Names of all genome-score dataframes to cluster. The first name refers to the
# in-memory `genome_scores_df`; every other name is a bz2 pickle in `output_dir`.
genome_df_names = [
    'genome_scores_df_original_full',
    'movies_lemmatized_genome_vector_df_bz2',
]
genome_df_names.extend(thresholded_full_df_names)

genome_dfs = [genome_scores_df]
# other loaded df's go here
for df_name in genome_df_names[1:]:
    genome_dfs.append(
        pd.read_pickle(output_dir + df_name, compression='bz2')
    )

# compute clustering df for each
# get_clustering_results_for_users reads the module-level `genome_scores_df`,
# so that name is pointed at the dataframe being processed for the duration of
# each call; otherwise every pass would cluster the original dataframe again.
# The original binding is restored afterwards, even if the run is interrupted.
original_genome_scores_df = genome_scores_df
try:
    for df_name, genome_df in zip(genome_df_names, genome_dfs):
        print('computing cluster sizes for df: ', df_name)
        genome_scores_df = genome_df
        clustering_result_df = get_clustering_results_for_users(all_user_ids)

        # save pickle for each with additional name as 'clustering_results_' + df_original_name
        clustering_result_df.to_pickle(output_dir + 'clustering_results_' + df_name + '_bz2', compression='bz2')
finally:
    genome_scores_df = original_genome_scores_df

computing cluster sizes for df:  genome_scores_df_original_full


KeyboardInterrupt: 

Process ForkProcess-31:
Traceback (most recent call last):
  File "/usr/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/usr/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.7/concurrent/futures/process.py", line 226, in _process_worker
    call_item = call_queue.get(block=True)
  File "/usr/lib/python3.7/multiprocessing/queues.py", line 94, in get
    res = self._recv_bytes()
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes
    buf = self._recv_bytes(maxlength)
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 411, in _recv_bytes
    return self._recv(size)
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 379, in _recv
    chunk = read(handle, remaining)
KeyboardInterrupt
Process ForkProcess-32:
