In [20]:
#libraries
import numpy as np
import random
import pandas as pd
import json

In [21]:
#load the artist catalogue; the file is semicolon-separated
data = pd.read_csv('final_artist_data.csv', sep=';')
#keep only the columns whose name does not start with 'Unnamed'
unnamed_cols = data.columns.str.contains('^Unnamed')
data = data.loc[:, ~unnamed_cols]
#show the distinct genre labels to spot values that need cleaning
data['Genre'].unique()

array(['Indie', 'Pop', 'Hip Hop/ Rap', nan, 'Rock', 'Rock ', 'Techno'],
      dtype=object)

In [30]:
#merge the 'Rock ' label (trailing whitespace, see unique genres) into 'Rock'
has_trailing_space = data['Genre'] == 'Rock '
data.loc[has_trailing_space, 'Genre'] = 'Rock'
#keep only the rows that actually have a genre (drops the nan entries)
has_genre = data['Genre'].notna()
data = data[has_genre]
#renumber the rows; the previous row labels are kept in a new 'index' column
data = data.reset_index()

In [31]:
#Settings per persona, keyed by the persona id ('1', '2', '3')

#[min, max] number of simulated plays (both bounds are passed to random.randint)
song_range = {
    '1': [1000, 3000],
    '2': [1000, 3000],
    '3': [200, 500],
}

#genre weights, in the order of the genre list used in Persona():
#Indie, Pop, Hip Hop/ Rap, Rock, Techno
weights_gen = {
    '1': [40, 5, 5, 40, 10],
    '2': [0, 40, 40, 10, 10],
    '3': [30, 30, 5, 5, 30],
}

#popularity weights, in the order: wellknown, lesser-known
weights_known = {
    '1': [20, 80],
    '2': [70, 30],
    '3': [80, 20],
}

In [32]:
#pass which persona you want to have returned (input: '1','2','3')
def Persona(persona):
    """Simulate one user's listening history for the given persona.

    The number of plays is drawn from ``song_range[persona]``. Every play
    picks a genre (weighted by ``weights_gen[persona]``) and a popularity
    class (weighted by ``weights_known[persona]``), then one uniformly
    random song among the rows of ``data`` that match both.

    Parameters
    ----------
    persona : str
        Key into ``song_range``, ``weights_gen`` and ``weights_known``
        ('1', '2' or '3').

    Returns
    -------
    pandas.DataFrame
        One row per distinct song played, with the columns of ``data`` plus
        'Listening frequency' (number of plays), sorted by that count in
        descending order.

    Raises
    ------
    KeyError
        If ``persona`` is not a key of the persona settings.
    ValueError
        If a genre/popularity combination this persona can draw has no
        songs in ``data``.
    """
    genres = ['Indie', 'Pop', 'Hip Hop/ Rap', 'Rock', 'Techno']
    popularity = ['wellknown', 'lesser-known']
    #a score exactly at the threshold counts as well known, so every scored
    #song belongs to one of the two classes and can actually be played
    popularity_threshold = 30

    #Retrieve number of songs and weights for respective persona
    n_songs = random.randint(song_range[persona][0],
                             song_range[persona][1])
    my_weights_gen = weights_gen[persona]
    my_weights_known = weights_known[persona]

    #the candidate songs per (genre, popularity class) do not change between
    #plays, so build the pools of row labels once instead of on every play
    is_wellknown = data['Popularity'] >= popularity_threshold
    is_lesser_known = data['Popularity'] < popularity_threshold
    pools = {}
    for genre in genres:
        in_genre = data['Genre'] == genre
        pools[(genre, 'wellknown')] = data[in_genre & is_wellknown].index.tolist()
        pools[(genre, 'lesser-known')] = data[in_genre & is_lesser_known].index.tolist()

    #fail early and clearly instead of at a random point of the simulation
    for genre, genre_weight in zip(genres, my_weights_gen):
        for known, known_weight in zip(popularity, my_weights_known):
            if genre_weight > 0 and known_weight > 0 and not pools[(genre, known)]:
                raise ValueError(
                    "no " + known + " songs of genre '" + genre
                    + "' available for persona " + str(persona))

    #each iteration simulates one song listened to
    picked = []
    for _ in range(n_songs):
        #pick genre and if song is popular or not based on Persona odds
        genre = random.choices(genres, weights=my_weights_gen, k=1)[0]
        known = random.choices(popularity, weights=my_weights_known, k=1)[0]
        #pick one random song from the matching pool
        picked.append(random.choice(pools[(genre, known)]))

    #fetch all played songs at once; a song played k times appears k times
    songs = data.loc[picked]

    #count identical rows (eliminates duplicates but keeps the play count)
    #NOTE: groupby skips rows with a missing value in any column by default
    history = songs.groupby(songs.columns.tolist(), as_index=False).size()
    history = history.rename(columns={'size': 'Listening frequency'})
    return history.sort_values(by=['Listening frequency'], ascending=False)

Note: Generating the personas takes some time, since we simulate 100 users per persona. We will include the resulting file 'user_data.csv' in our hand-in. Feel free to lower 'n_user_per_persona' when you want to test the code.

In [33]:
#define personas to iterate through
personas = ['1','2','3']

#define number of simulations per persona
n_user_per_persona = 100

#seed the generator so that re-running this cell reproduces the same users
SEED = 42
random.seed(SEED)

#offset to make user ids unique across personas
user_id = 1

#collect one frame per user and concatenate once at the end;
#concatenating inside the loop would copy all previous rows on every step
user_frames = []
for persona in personas:
    for i in range(n_user_per_persona):
        user = Persona(persona)
        user['User-ID'] = i + user_id
        user_frames.append(user)
    user_id = user_id + n_user_per_persona

user_data = pd.concat(user_frames)

#normalize listening frequency to the percentage of each user's total plays;
#transform keeps the row order and labels of user_data, so the result lines up
#row by row even though the row labels repeat across users
plays_per_user = user_data.groupby('User-ID')['Listening frequency'].transform('sum')
user_data['%_Frequency'] = user_data['Listening frequency'] * 100 / plays_per_user

In [34]:
#preview the simulated listening data (long frames are shown truncated)
user_data

Unnamed: 0,index,Artist,Top Track,Popularity,Genre,",",Listening frequency,User-ID,%_Frequency
37,37,Oceanator,Bad Brain Daze,24.0,Indie,",",26,1,1.403130
30,30,Go Cactus,No Money,7.0,Indie,",",23,1,1.241230
206,321,Gauche,Flash,8.0,Rock,",",22,1,1.187264
225,345,Graveland,Black Metal War!,29.0,Rock,",",21,1,1.133297
224,344,Rockettothesky,Mothering Silence,22.0,Rock,",",20,1,1.079331
...,...,...,...,...,...,...,...,...,...
95,172,Beyoncé,Halo,89.0,Pop,",",1,300,0.223714
93,169,Lady Gaga,Just Dance,88.0,Pop,",",1,300,0.223714
91,166,Lady Gaga,Shallow,88.0,Pop,",",1,300,0.223714
90,165,Lady Gaga,Bloody Mary,88.0,Pop,",",1,300,0.223714


In [35]:
#save the simulated user data as a CSV file in the working directory
user_data.to_csv('user_data.csv')