In [3]:
import os
import pandas as pd
import numpy as np
import scipy
from nonnegfac.nmf import NMF_ANLS_BLOCKPIVOT
from sklearn.preprocessing import normalize
import h5py
from scipy import sparse
import shutil
from pathlib import Path
import wget

In [2]:
def download_and_unpack_zip_file(url, path, targetdir):
    wget.download(url, path)
    shutil.unpack_archive(path, targetdir)

def prepare_directories():
    directories_to_create = ['downloaded', 'raw', 'dataframes', 'tmp']
    for directory in directories_to_create:
        dirname = '/'.join(['../data', directory])
        Path(dirname).mkdir(parents=True, exist_ok=True)

def download_datasets():
    lfm1b_dataset_url = 'http://drive.jku.at/ssf/s/readFile/share/1056/266403063659030189/publicLink/LFM-1b.zip'
    lfm1b_ugp_dataset_url = 'http://www.cp.jku.at/datasets/LFM-1b/LFM-1b_UGP.zip'
    lfm1b_social_dataset_url = 'https://zenodo.org/record/5585638/files/LFM-1b_social.zip?download=1'

    targetdir = '../data/raw'
    print('Downloading and unpacking LFM-1b dataset (~8GB)...')
    download_and_unpack_zip_file(lfm1b_dataset_url, '../data/downloaded/LFM-1b.zip', targetdir)
    print('Finished.')

    print('Downloading and unpacking LFM-1b_UGP dataset (~166MB)...')
    download_and_unpack_zip_file(lfm1b_ugp_dataset_url, '../data/downloaded/LFM-1b_UGP.zip', targetdir)
    print('Finished.')

    print('Downloading and unpacking LFM-1b_social dataset (~2MB)...')
    download_and_unpack_zip_file(lfm1b_social_dataset_url, '../data/downloaded/LFM-1b_social.zip', targetdir)
    print('Finished.')


def create_lfm1b_users_df(lfm1b_users_filepath):
    lfm1b_users_df = pd.read_csv(lfm1b_users_filepath, sep='\t')
    lfm1b_users_df.to_csv('../data/raw/LFM-1b_social/LFM-1b_users.txt', sep = '\t', index=False)
    return lfm1b_users_df


def create_users_df(edgelist_df,
                    lfm1b_user_info_filepath,
                    lfm1b_user_additional_info_filepath,
                    lfm1b_user_genres_allmusic_filepath,
                    replace_missing_values_with_mean=False):
    users_list = list(edgelist_df.user1_id.unique())
    users_list.extend(list(edgelist_df.user2_id.unique()))
    users_list = list(set(users_list))
    users_df = pd.DataFrame(users_list).rename(columns={0:'user_id'})
    lfm1b_user_info_df = pd.read_csv(lfm1b_user_info_filepath, sep='\t').drop(columns=['registered_unixtime'])
    users_df = users_df.merge(lfm1b_user_info_df, how='left', on='user_id')
    users_df['age_group'] = users_df['age'].apply(get_age_group)
    users_df.loc[users_df['gender'].isnull(), 'gender'] = 'n'
    users_df.loc[users_df['country'].isnull(), 'country'] = 'N/A'
    users_df.loc[users_df['playcount'] == 0, 'playcount'] = 1
    users_df['playcount_lognorm'] = np.log(users_df['playcount'])
    users_df = users_df.drop(columns=['playcount'])
    lfm1b_user_additional_info_df = pd.read_csv(lfm1b_user_additional_info_filepath, sep='\t').rename(columns={'user-id':'user_id'}).replace('?', None).astype({'novelty_artist_avg_month': 'float64',
                            'novelty_artist_avg_year': 'float64',
                            'mainstreaminess_avg_6months':'float64',
                            'relative_le_per_weekday1':'float64',
                            'relative_le_per_weekday2':'float64',
                            'relative_le_per_weekday3':'float64',
                            'relative_le_per_weekday4':'float64',
                            'relative_le_per_weekday5':'float64',
                            'relative_le_per_weekday6':'float64',
                            'relative_le_per_weekday7':'float64',
                            'relative_le_per_hour0':'float64',
                            'relative_le_per_hour1':'float64',
                            'relative_le_per_hour2':'float64',
                            'relative_le_per_hour3':'float64',
                            'relative_le_per_hour4':'float64',
                            'relative_le_per_hour5':'float64',
                            'relative_le_per_hour6':'float64',
                            'relative_le_per_hour7':'float64',
                            'relative_le_per_hour8':'float64',
                            'relative_le_per_hour9':'float64',
                            'relative_le_per_hour10':'float64',
                            'relative_le_per_hour11':'float64',
                            'relative_le_per_hour12':'float64',
                            'relative_le_per_hour13':'float64',
                            'relative_le_per_hour14':'float64',
                            'relative_le_per_hour15':'float64',
                            'relative_le_per_hour16':'float64',
                            'relative_le_per_hour17':'float64',
                            'relative_le_per_hour18':'float64',
                            'relative_le_per_hour19':'float64',
                            'relative_le_per_hour20':'float64',
                            'relative_le_per_hour21':'float64',
                            'relative_le_per_hour22':'float64',
                            'relative_le_per_hour23':'float64'})
    users_df = users_df.merge(lfm1b_user_additional_info_df, how='left', on='user_id')
    users_df.loc[users_df['cnt_listeningevents'] == 0, 'cnt_listeningevents'] = 1
    users_df['cnt_listeningevents_lognorm'] = np.log(users_df['cnt_listeningevents'])
    users_df = users_df.drop(columns=['cnt_listeningevents'])
    users_df.loc[users_df['cnt_distinct_tracks'] == 0, 'cnt_distinct_tracks'] = 1
    users_df['cnt_distinct_tracks_lognorm'] = np.log(users_df['cnt_distinct_tracks'])
    users_df = users_df.drop(columns=['cnt_distinct_tracks'])
    users_df.loc[users_df['cnt_distinct_artists'] == 0, 'cnt_distinct_artists'] = 1
    users_df['cnt_distinct_artists_lognorm'] = np.log(users_df['cnt_distinct_artists'])
    users_df = users_df.drop(columns=['cnt_distinct_artists'])
    users_df.loc[users_df['cnt_listeningevents_per_week'] == 0, 'cnt_listeningevents_per_week'] = 1
    users_df['cnt_listeningevents_per_week_lognorm'] = np.log(users_df['cnt_listeningevents_per_week'])
    users_df = users_df.drop(columns=['cnt_listeningevents_per_week'])
    dummies = pd.get_dummies(users_df.country, prefix='country')
    users_df =  pd.concat([users_df, dummies.set_index(users_df.index)], axis=1)
    dummies = pd.get_dummies(users_df.gender, prefix='gender')
    users_df = pd.concat([users_df, dummies.set_index(users_df.index)], axis=1)
    dummies = pd.get_dummies(users_df.age_group, prefix='age_group')
    users_df = pd.concat([users_df, dummies.set_index(users_df.index)], axis=1)

    lfm1b_user_genres_allmusic_df = pd.read_csv(lfm1b_user_genres_allmusic_filepath, sep='\t')
    if os.path.isfile('../data/tmp/allmusic_diversity_df.csv'):
        allmusic_diversity_df = pd.read_csv('../data/tmp/allmusic_diversity_df.csv', index_col=0)
    else:
        allmusic_diversity_df = create_allmusic_diversity_df(lfm1b_user_genres_allmusic_df, users_df)
    users_df = users_df.merge(allmusic_diversity_df, how='left', on='user_id')

    lfm1b_user_genres_allmusic_df_user_ids = pd.DataFrame(lfm1b_user_genres_allmusic_df.user_id)
    lfm1b_user_genres_allmusic_df_genres = pd.DataFrame(lfm1b_user_genres_allmusic_df.drop(['user_id'], axis=1))
    lfm1b_user_genres_allmusic_df_genres = lfm1b_user_genres_allmusic_df_genres.div(lfm1b_user_genres_allmusic_df_genres.max(axis=1), axis=0).add_prefix('allmusic_')
    lfm1b_user_genres_allmusic_df = pd.concat([lfm1b_user_genres_allmusic_df_user_ids, lfm1b_user_genres_allmusic_df_genres.set_index(lfm1b_user_genres_allmusic_df_user_ids.index)], axis=1)
    users_df = users_df.merge(lfm1b_user_genres_allmusic_df, how='left', on='user_id')

    lfm1b_user_genres_freebase_df = pd.read_csv(lfm1b_user_genres_freebase_filepath, sep='\t')
    if os.path.isfile('../data/tmp/freebase_diversity_df.csv'):
        freebase_diversity_df = pd.read_csv('../data/tmp/freebase_diversity_df.csv', index_col=0)
    else:
        freebase_diversity_df = create_freebase_diversity_df(lfm1b_user_genres_freebase_df, users_df)
    users_df = users_df.merge(freebase_diversity_df, how='left', on='user_id')

    if os.path.isfile('../data/tmp/freebase_genres_matrix_nmf_df.csv'):
        freebase_genres_matrix_nmf_df = pd.read_csv('../data/tmp/freebase_genres_matrix_nmf_df.csv', index_col=0)
    else:
        freebase_genres_matrix_nmf_df = create_freebase_genres_matrix_nmf_df(lfm1b_user_genres_freebase_df)
    users_df = users_df.merge(freebase_genres_matrix_nmf_df, how='left', on='user_id')

    if os.path.isfile('../data/tmp/UAM_normalized_nmf_df.csv'):
        UAM_normalized_nmf_df = pd.read_csv('../data/tmp/UAM_normalized_nmf_df.csv', index_col=0)
    else:
        UAM_normalized_nmf_df = create_UAM_normalized_nmf_df(lfm1b_user_artists_LEs_filepath)
    users_df = users_df.merge(UAM_normalized_nmf_df, how='left', on='user_id')

    if replace_missing_values_with_mean == True:
        users_df = fill_missing_values(users_df)

    column_to_convert_to_user_groups = [
        'freebase_weighted_average_diversity',
        'freebase_genre_coverage_diversity',
        'freebase_entropy_diversity',
        'allmusic_weighted_average_diversity',
        'allmusic_genre_coverage_diversity',
        'allmusic_entropy_diversity',
        'cnt_listeningevents_lognorm',
        'cnt_distinct_tracks_lognorm',
        'cnt_distinct_artists_lognorm',
        'cnt_listeningevents_per_week_lognorm',
        'playcount_lognorm',
        'novelty_artist_avg_month',
        'novelty_artist_avg_6months',
        'novelty_artist_avg_year',
        'mainstreaminess_avg_month',
        'mainstreaminess_avg_6months',
        'mainstreaminess_avg_year',
        'mainstreaminess_global']
    for column in column_to_convert_to_user_groups:
        users_df = divide_column_into_user_groups_and_merge(users_df, column)
        dummies = pd.get_dummies(users_df['user_groups_'+column], prefix='user_groups_'+column)
        users_df = pd.concat([users_df, dummies.set_index(users_df.index)], axis=1)

    outdir = '../data/dataframes/users_dfs'
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    if replace_missing_values_with_mean == True:
        users_df.to_csv('../data/dataframes/users_dfs/users_df_no_missing_values.csv', index=False)
    else:
        users_df.to_csv('../data/dataframes/users_dfs/users_df.csv', index=False)

def create_UAM_normalized_nmf_df(lfm1b_user_artists_LEs_filepath):
    UAM, UAM_user_idx, UAM_artist_idx, user_ids, artist_ids = read_UAM(lfm1b_user_artists_LEs_filepath)
    user_ids_df = pd.DataFrame(user_ids).rename(columns={0:'user_id'})
    UAM = sparse.csr_matrix(UAM)
    UAM_normalized = normalize(UAM, norm='max', axis=1)
    UAM_normalized_nmf = create_NMF_embeddings(UAM_normalized, 20)
    UAM_normalized_nmf_df = pd.DataFrame(UAM_normalized_nmf).add_prefix('UAM_nmf_')
    UAM_normalized_nmf_df = pd.concat([user_ids_df, UAM_normalized_nmf_df], axis=1)
    UAM_normalized_nmf_df.to_csv('../data/tmp/UAM_normalized_nmf_df.csv', index=False)
    return UAM_normalized_nmf_df

def create_freebase_genres_matrix_nmf_df(lfm1b_user_genres_freebase_df):
    lfm1b_user_genres_freebase_df_user_ids = pd.DataFrame(lfm1b_user_genres_freebase_df.user_id)
    lfm1b_user_genres_freebase_df_genres = pd.DataFrame(lfm1b_user_genres_freebase_df.drop(['user_id'], axis=1))
    lfm1b_user_genres_freebase_df_genres = lfm1b_user_genres_freebase_df_genres.add_prefix('freebase_')
    lfm1b_user_genres_freebase_df_genres = pd.concat([lfm1b_user_genres_freebase_df_user_ids, lfm1b_user_genres_freebase_df_genres.set_index(lfm1b_user_genres_freebase_df_user_ids.index)], axis=1)
    freebase_genre_matrix = scipy.sparse.csr_matrix(lfm1b_user_genres_freebase_df_genres.fillna(0).values)
    freebase_genre_matrix_normalized = normalize(freebase_genre_matrix, norm='max', axis=1)
    freebase_genres_matrix_nmf = create_NMF_embeddings(freebase_genre_matrix_normalized, 20)
    freebase_genres_matrix_nmf_df = pd.DataFrame(freebase_genres_matrix_nmf).add_prefix('freebase_nmf_')
    freebase_genres_matrix_nmf_df = pd.concat([freebase_genres_matrix_nmf_df, lfm1b_user_genres_freebase_df_user_ids], axis=1)
    freebase_genres_matrix_nmf_df.to_csv('../data/tmp/freebase_genres_matrix_nmf_df.csv', index=False)
    return freebase_genres_matrix_nmf_df

def create_freebase_diversity_df(lfm1b_user_genres_freebase_df, users_df):
    user_ids = list(users_df.user_id.values)
    diversity_weighted_average_list = []
    diversity_genre_coverage_list = []
    entropy_diversity_list = []
    for user_id in user_ids:
        try:
            user_genres = lfm1b_user_genres_freebase_df[lfm1b_user_genres_freebase_df.user_id==user_id].drop(['user_id'], axis=1).values[0]
            user_genres_normalized = user_genres/np.max(user_genres)
            diversity_weighted_average = np.sum(user_genres_normalized)/len(user_genres_normalized)

            diversity_genre_coverage = len(user_genres[user_genres > 0])/len(user_genres)
            user_genre_counts = dict(zip(list(lfm1b_user_genres_freebase_df[lfm1b_user_genres_freebase_df.user_id==user_id].drop(['user_id'], axis=1).columns),
                                     list(lfm1b_user_genres_freebase_df[lfm1b_user_genres_freebase_df.user_id==user_id].drop(['user_id'], axis=1).values[0])))
            labels = []
            for key in user_genre_counts:
                labels.extend([key] * user_genre_counts[key])
            entropy_diversity = entropy_label_distribution(labels)
        except:
            diversity_weighted_average = None
            diversity_genre_coverage = None
            entropy_diversity = None
        diversity_weighted_average_list.append(diversity_weighted_average)
        diversity_genre_coverage_list.append(diversity_genre_coverage)
        entropy_diversity_list.append(entropy_diversity)
    freebase_diversity_df = pd.DataFrame(
        {'user_id': user_ids,
         'freebase_weighted_average_diversity': diversity_weighted_average_list,
         'freebase_genre_coverage_diversity': diversity_genre_coverage_list,
         'freebase_entropy_diversity': entropy_diversity_list
        })
    freebase_diversity_df.to_csv('../data/tmp/freebase_diversity_df.csv', index=False)
    return freebase_diversity_df

def create_allmusic_diversity_df(lfm1b_user_genres_allmusic_df, users_df):
    user_ids = list(users_df.user_id.values)
    diversity_weighted_average_list = []
    diversity_genre_coverage_list = []
    entropy_diversity_list = []
    for user_id in user_ids:
        try:
            user_genres = lfm1b_user_genres_allmusic_df[lfm1b_user_genres_allmusic_df.user_id==user_id].drop(['user_id'], axis=1).values[0]
            user_genres_normalized = user_genres/np.max(user_genres)
            diversity_weighted_average = np.sum(user_genres_normalized)/len(user_genres_normalized)

            diversity_genre_coverage = len(user_genres[user_genres > 0])/len(user_genres)
            user_genre_counts = dict(zip(list(lfm1b_user_genres_allmusic_df[lfm1b_user_genres_allmusic_df.user_id==user_id].drop(['user_id'], axis=1).columns),
                                     list(lfm1b_user_genres_allmusic_df[lfm1b_user_genres_allmusic_df.user_id==user_id].drop(['user_id'], axis=1).values[0])))
            labels = []
            for key in user_genre_counts:
                labels.extend([key] * user_genre_counts[key])
            entropy_diversity = entropy_label_distribution(labels)
        except:
            diversity_weighted_average = None
            diversity_genre_coverage = None
            entropy_diversity = None
        diversity_weighted_average_list.append(diversity_weighted_average)
        diversity_genre_coverage_list.append(diversity_genre_coverage)
        entropy_diversity_list.append(entropy_diversity)
    allmusic_diversity_df = pd.DataFrame(
        {'user_id': user_ids,
         'allmusic_weighted_average_diversity': diversity_weighted_average_list,
         'allmusic_genre_coverage_diversity': diversity_genre_coverage_list,
         'allmusic_entropy_diversity': entropy_diversity_list
        })
    allmusic_diversity_df.to_csv('../data/tmp/allmusic_diversity_df.csv', index=False)
    return allmusic_diversity_df

def get_age_group(age):
    if age == -1:
        return '-1'
    elif age < 5:
        return '0-4'
    elif age < 10:
        return '5-9'
    elif age < 15:
        return '10-14'
    elif age < 20:
        return '15-19'
    elif age < 25:
        return '20-24'
    elif age < 30:
        return '25-29'
    elif age < 35:
        return '30-34'
    elif age < 40:
        return '35-39'
    elif age < 45:
        return '40-44'
    elif age < 50:
        return '45-49'
    elif age < 55:
        return '50-54'
    elif age < 60:
        return '55-59'
    elif age < 65:
        return '60-64'
    elif age < 70:
        return '65-69'
    elif age < 75:
        return '70-74'
    elif age < 80:
        return '75-79'
    elif age >=80:
        return '80+'

def create_NMF_embeddings(input_matrix, dim):
    W, _, info = NMF_ANLS_BLOCKPIVOT().run(input_matrix, dim, max_iter=100)
    return W

def read_UAM(m_file):
    mf = h5py.File(m_file, 'r')
    user_ids = np.array(mf.get('idx_users')).astype(np.int64)
    artist_ids = np.array(mf.get('idx_artists')).astype(np.int64)
    UAM = sparse.csr_matrix((mf['/LEs/']["data"],
                             mf['/LEs/']["ir"],
                             mf['/LEs/']["jc"])).transpose() 
    UAM_user_idx = UAM.indices 
    UAM_artist_idx = UAM.indptr 
    return UAM, UAM_user_idx, UAM_artist_idx, user_ids, artist_ids

def entropy_label_distribution(labels):
    n_labels = len(labels)
    if n_labels <= 1:
        return 0
    value, counts = np.unique(labels, return_counts=True)
    probs = counts / np.float32(n_labels)
    n_classes = np.count_nonzero(probs)
    if n_classes <= 1:
        return 0.0
    ent = 0.0
    for p in probs:
        ent -= p * np.log(p)
    return ent

def divide_column_into_user_groups_and_merge(df, column_name):
    sorted_values = np.sort(df[column_name].values)
    third_of_a_sum = sum(sorted_values[~np.isnan(sorted_values)])/3
    is_first_third_threshold_set = False
    is_second_third_threshold_set = False
    cummulative_sum = 0
    for value in sorted_values:
        if np.isnan(value):
            continue
        if cummulative_sum > third_of_a_sum and is_first_third_threshold_set==False:
            first_third_threshold = value
            is_first_third_threshold_set = True
        if cummulative_sum > 2*third_of_a_sum and is_second_third_threshold_set==False:
            second_third_threshold = value
            is_second_third_threshold_set = True
        cummulative_sum += value
    user_groups = []
    for value in df[column_name].values:
        if np.isnan(value):
            user_groups.append(None)
        elif value <= first_third_threshold:
            user_groups.append('low')
        elif first_third_threshold < value <= second_third_threshold:
            user_groups.append('medium')
        elif value > second_third_threshold:
            user_groups.append('high')
    user_groups_df = pd.DataFrame(user_groups).rename(columns={0:'user_groups_'+column_name})
    df = pd.concat([df, user_groups_df], axis=1)
    return df

def fill_missing_values(users_df):
    columns_to_be_replaced_with_mean = []
    for column in users_df.columns:
        if users_df[column].isna().any():
            users_df[column] = users_df[column].fillna(users_df[column].mean())
    return users_df

In [3]:
lfm1b_user_info_filepath = '../data/raw/LFM-1b_social/LFM-1b_users.txt'
lfm1b_user_additional_info_filepath = '../data/raw/LFM-1b/LFM-1b_users_additional.txt'
lfm1b_user_genres_allmusic_filepath = '../data/raw/LFM-1b_UGP/LFM-1b_UGP_weightedPC_allmusic.txt'
lfm1b_user_genres_freebase_filepath = '../data/raw/LFM-1b_UGP/LFM-1b_UGP_weightedPC_freebase.txt'
input_edgelist_csv_filepath = '../data/raw/LFM-1b_social/LFM-1b_social_ties.txt'
lfm1b_user_artists_LEs_filepath = '../data/raw/LFM-1b/LFM-1b_LEs.mat'

In [4]:
prepare_directories()
download_datasets()

Downloading and unpacking LFM-1b dataset (~8GB)...
100% [....................................................................] 8672428478 / 8672428478Finished.
Downloading and unpacking LFM-1b_UGP dataset (~166MB)...
100% [......................................................................] 170935289 / 170935289Finished.
Downloading and unpacking LFM-1b_social dataset (~2MB)...
100% [..........................................................................] 1786815 / 1786815Finished.


In [5]:
edgelist_df = pd.read_csv('../data/raw/LFM-1b_social/LFM-1b_social_ties.txt', sep='\t')
users_df_without_missing_values = create_users_df(edgelist_df,
                                                  lfm1b_user_info_filepath,
                                                  lfm1b_user_additional_info_filepath,
                                                  lfm1b_user_genres_allmusic_filepath,
                                                  True)

users_df_without_missing_values = create_users_df(edgelist_df,
                                                  lfm1b_user_info_filepath,
                                                  lfm1b_user_additional_info_filepath,
                                                  lfm1b_user_genres_allmusic_filepath,
                                                  False)

  result = getattr(ufunc, method)(*inputs, **kwargs)


[NMF] Running: 
{
    "A_dim_1": 120175,
    "A_dim_2": 1999,
    "A_type": "<class 'scipy.sparse.csr.csr_matrix'>",
    "alg": "<class 'nonnegfac.nmf.NMF_ANLS_BLOCKPIVOT'>",
    "init": "uniform_random",
    "k": 20,
    "max_iter": 100,
    "max_time": Infinity,
    "verbose": 0
}
[NMF] Completed: 
{
    "elapsed": 334.2760293483734,
    "iterations": 100,
    "norm_A": 346.868402972796,
    "rel_error": 0.0031916893330365516
}
[NMF] Running: 
{
    "A_dim_1": 120175,
    "A_dim_2": 585095,
    "A_type": "<class 'scipy.sparse.csr.csr_matrix'>",
    "alg": "<class 'nonnegfac.nmf.NMF_ANLS_BLOCKPIVOT'>",
    "init": "uniform_random",
    "k": 20,
    "max_iter": 100,
    "max_time": Infinity,
    "verbose": 0
}
[NMF] Completed: 
{
    "elapsed": 1317.9680819511414,
    "iterations": 100,
    "norm_A": 833.7120940073838,
    "rel_error": 0.9458134092364059
}


In [4]:
users_df = pd.read_csv('../data/dataframes/users_dfs/users_df.csv')
users_df_without_missing_values = pd.read_csv('../data/dataframes/users_dfs/users_df_no_missing_values.csv')