# In [26]:
import numpy as np
import pandas as pd
from sklearn.metrics import pairwise_distances

# Common root; every path below is built from it and keeps a trailing slash.
base_dir = '../../datasets/Movielens/'

# One directory per dataset (each value already starts with base_dir).
ml20m = '{}ml-20m/'.format(base_dir)
serendipity2018 = '{}serendipity-sac2018/'.format(base_dir)

# Alias for the serendipity directory, and the answers file inside it.
data_dir2 = serendipity2018
answers = '{}answers.csv'.format(serendipity2018)

# 'output/' sub-directory of base_dir.
data_output_dir = '{}output/'.format(base_dir)

# CSV file names, looked up first by dataset key and then by table name.
dataset_files = dict(
    ml20m=dict(
        genome_scores='genome-scores.csv',
        movies='movies.csv',
        ratings='ratings.csv',
    ),
    serendipity2018=dict(
        genome_scores='tag_genome.csv',
        movies='movies.csv',
        ratings='training.csv',
    ),
)

# Dataset key selected by default.
dataset = 'ml20m'

def init_file_names(dataset):
    """Return the CSV paths of the requested dataset.

    Args:
        dataset: Dataset key, either ``'ml20m'`` or ``'serendipity2018'``.

    Returns:
        Tuple ``(genome_scores, movies, ratings)`` of path strings, each made
        of the dataset directory followed by the matching file name from
        ``dataset_files``.

    Raises:
        KeyError: If ``dataset`` is not a supported key.
    """
    # The module-level directory constants already start with base_dir, so
    # they are used as-is; prepending base_dir again would duplicate the
    # prefix in every returned path.
    dataset_dirs = {
        'serendipity2018': serendipity2018,
        'ml20m': ml20m,
    }

    # Look the key up by value. Identity comparison (`is`) on strings only
    # succeeds when both sides happen to be the same interned object.
    if dataset not in dataset_dirs:
        raise KeyError(
            'unknown dataset {!r}; expected one of {}'.format(
                dataset, sorted(dataset_dirs)
            )
        )

    dataset_dir = dataset_dirs[dataset]
    file_names = dataset_files[dataset]

    genome_scores = dataset_dir + file_names['genome_scores']
    movies = dataset_dir + file_names['movies']
    ratings = dataset_dir + file_names['ratings']

    return genome_scores, movies, ratings


def load_and_preprocess_data(dataset):
    """Read the three CSV tables of ``dataset`` and keep a shared movie set.

    A movie ID is kept when it occurs in the tag-genome table and is not
    listed in the movies table with ``genres`` equal to
    ``'(no genres listed)'``. All three tables are then restricted to rows
    whose ``movieId`` is in that set.

    Args:
        dataset: Dataset key passed on to ``init_file_names``.

    Returns:
        Tuple ``(ratings_df, genome_scores_df, movies_df)`` of the filtered
        DataFrames.
    """
    genome_path, movies_path, ratings_path = init_file_names(dataset)

    # Read the raw tables: movies, tag-genome scores, ratings.
    movies_df = pd.read_csv(movies_path)
    genome_scores_df = pd.read_csv(genome_path)
    ratings_df = pd.read_csv(ratings_path)

    # IDs that have at least one tag-genome row.
    tagged_ids = genome_scores_df['movieId'].unique()

    # IDs whose genres column holds the "no genres" placeholder.
    lacks_genre = movies_df['genres'] == '(no genres listed)'
    genreless_ids = movies_df.loc[lacks_genre, 'movieId'].unique()

    # Final ID list: tagged movies minus the genre-less ones.
    kept_ids = np.setdiff1d(tagged_ids, genreless_ids)

    def _only_kept(frame):
        # Keep the rows of `frame` whose movieId is in the final ID list.
        return frame[frame['movieId'].isin(kept_ids)]

    return (
        _only_kept(ratings_df),
        _only_kept(genome_scores_df),
        _only_kept(movies_df),
    )