In [83]:
import pandas as pd
from collections import Counter
import re

Reading files

In [159]:
full_male = pd.read_excel('male_complete.xlsx')
full_female = pd.read_excel('female_complete.xlsx')

In [160]:
# Creating subsets from the data to work with
male = full_male[['day', 'time', 'hub name','playlist name', 'type']]
female = full_female[['day', 'time', 'hub name','playlist name', 'type']]

In [115]:
print(len(male), len(female))

4247 4859


In [161]:
def clean(string):
    '''Cleans texts for semantic comparison'''
    string = str(string).lower().strip()

    # Remove any characters that are not alpha numerical or white spaces or equal sign (because of Ed Sheeran album)
    string = re.sub(r'[^a-z\s\=A-Z0-9]', '', string)
    return string

In [162]:
# Clean all data by removing trailing whitespaces and lowercasing all letters
female = female.copy()
female['hub name'] = female['hub name'].apply(clean)
female['playlist name'] = female['playlist name'].apply(clean)

male = male.copy()
male['hub name'] = male['hub name'].apply(clean)
male['playlist name'] = male['playlist name'].apply(clean)

Now that data is clean, textual analysis can begin

In [163]:
# Remove last row which is added as NAN row with playlist name as index
print(male.tail(1), female.tail(1))
# male.drop(male.tail(1).index,inplace=True)
# female.drop(female.tail(1).index,inplace=True)
# print(male.tail(1), female.tail(1))

            day      time                       hub name  \
4246 2022-04-20  22:00:00  discover more from pink floyd   

                  playlist name   type  
4246  the dark side of the moon  album               day      time hub name playlist name      type
4858 2022-04-20  22:00:00     mood    lofi beats  playlist


In [164]:
# Check which playlists in total appeared the most for both users
similar_playlists = male[male['playlist name'].isin(female['playlist name'])]
similar_playlists = similar_playlists[similar_playlists.type == 'playlist']
similar_playlists

Unnamed: 0,day,time,hub name,playlist name,type
0,2022-04-14,10:00:00,to get you started,taylor swift mix,playlist
1,2022-04-14,10:00:00,to get you started,bad bunny mix,playlist
2,2022-04-14,10:00:00,to get you started,bts mix,playlist
4,2022-04-14,10:00:00,try something else,top 50 netherlands,playlist
5,2022-04-14,10:00:00,try something else,top 2000,playlist
...,...,...,...,...,...
4212,2022-04-20,22:00:00,current hits and the 10s,de hits uit 2019,playlist
4214,2022-04-20,22:00:00,current hits and the 10s,de hits uit 2016,playlist
4219,2022-04-20,22:00:00,women of pop,women of pop,playlist
4221,2022-04-20,22:00:00,women of pop,this is adele,playlist


In [165]:
m_playlist = male[male.type == 'playlist']
f_playlist = female[female.type == 'playlist']

In [166]:
# Looking for the most common recommended playlists per user
m = dict(Counter(m_playlist['playlist name']).most_common())
mdf = pd.DataFrame({'male': m.keys(), 'count': m.values()})

f = dict(Counter(f_playlist['playlist name']).most_common())
fdf = pd.DataFrame({'female': f.keys(), 'count': f.values()})

display(mdf.head(20), fdf.head(20))

Unnamed: 0,male,count
0,je moerstaal,79
1,t koffiehuis,51
2,fresh pop,49
3,all out 2010s,48
4,stay tuned,47
5,all out 2000s,38
6,beste van nl,37
7,chill hits,37
8,soft pop hits,37
9,rock classics,37


Unnamed: 0,female,count
0,songs to sing in the shower,84
1,lofi beats,71
2,peaceful piano,64
3,top songs global,58
4,top 50 global,57
5,fresh pop,57
6,instrumental study,54
7,hit rewind,54
8,summer 22,48
9,vibes,47


In [167]:
most_common_playlists = {'male': dict(Counter(m_playlist['playlist name']).most_common()), 'female': dict(Counter(f_playlist['playlist name']).most_common())}
new = pd.DataFrame(most_common_playlists).fillna(0)
new['diff'] = abs(new['male'] - new['female'])
new.sort_values(by = 'diff', ascending=False, inplace=True)
new.head(60)

Unnamed: 0,male,female,diff
lofi beats,11.0,71.0,60.0
top songs global,2.0,58.0,56.0
top 50 global,2.0,57.0,55.0
songs to sing in the shower,30.0,84.0,54.0
peaceful piano,11.0,64.0,53.0
instrumental study,1.0,54.0,53.0
je moerstaal,79.0,31.0,48.0
todays top hits,3.0,43.0,40.0
top 50 netherlands,2.0,40.0,38.0
techno bunker,2.0,40.0,38.0


### Differences in playlist recommendations between both users

In [168]:
# Looking for top playlists both users were recommended that the other user was not
male_count = dict(Counter(m_playlist['playlist name'][~m_playlist['playlist name'].isin(f_playlist['playlist name'])]).most_common())
female_count = dict(Counter(f_playlist['playlist name'][~f_playlist['playlist name'].isin(m_playlist['playlist name'])]).most_common())

display(
pd.DataFrame({'male': male_count.keys(), 'count' : male_count.values()}).head(20),
pd.DataFrame({'female': female_count.keys(), 'count' : female_count.values()}).head(20)
)

Unnamed: 0,male,count
0,big on the internet,35
1,mood booster,31
2,happy mix,26
3,moody mix,23
4,reggaeton mix,23
5,pop mix,23
6,rock mix,22
7,soft rock,20
8,this is taylor swift,20
9,70s mix,20


Unnamed: 0,female,count
0,top songs nederland,31
1,viral hits nl,28
2,classic pop picks,27
3,are be,27
4,hot hits nl,25
5,zomerhits 20102020,19
6,homework motivation,15
7,singled out,15
8,all new all now,15
9,sing along kpop,14


Tagging times after morning, afternoon and evening

In [169]:
copy = male.copy()
copy['time'] = copy.time.astype(str).apply(lambda x: int(x[:2]))

# Translate the times to dayphases
for i, time in copy.time.iteritems():
    if time < 12:
        copy.iloc[i] = 'morning'
    elif time >=12<19:
        copy.iloc[i] = 'afternoon'
    else:
        copy.iloc[i] = 'evening'
print(copy.time)

0         morning
1         morning
2         morning
3         morning
4         morning
          ...    
4242    afternoon
4243    afternoon
4244    afternoon
4245    afternoon
4246    afternoon
Name: time, Length: 4247, dtype: object


In [170]:
male[copy['time'] == 'morning']

Unnamed: 0,day,time,hub name,playlist name,type
0,2022-04-14,10:00:00,to get you started,taylor swift mix,playlist
1,2022-04-14,10:00:00,to get you started,bad bunny mix,playlist
2,2022-04-14,10:00:00,to get you started,bts mix,playlist
3,2022-04-14,10:00:00,try something else,hot hits,playlist
4,2022-04-14,10:00:00,try something else,top 50 netherlands,playlist
...,...,...,...,...,...
3757,2022-04-20,09:40:00,made for male name,daily mix 6,playlist
3758,2022-04-20,09:40:00,made for male name,daily drive,playlist
3759,2022-04-20,09:40:00,new and rising artists,radar global,playlist
3760,2022-04-20,09:40:00,new and rising artists,radar canada,playlist


In [171]:
# Calculating how many times the woman received dinner hubs
Counter(female['playlist name'][female['hub name'] == 'dinner'])

Counter({'dinner with friends': 5,
         'dinner music': 5,
         'feel good dinner': 5,
         'kitchen swagger': 5,
         'latin dinner': 5,
         'jazzy dinner': 5,
         'bossa nova dinner': 5,
         'the perfect italian dinner': 4,
         'dinner lounge': 5,
         'dinner entre amis': 5})

In [178]:
# Counting workout playlists
display(pd.DataFrame(Counter(female['playlist name'][female['hub name'] == 'workout']).most_common()))
display(pd.DataFrame(Counter(male['playlist name'][male['hub name'] == 'workout']).most_common()))

Unnamed: 0,0,1
0,power hour,32
1,top hits workout,31
2,beast mode,31
3,fun run,31
4,motivation mix,31
5,workout,31
6,cardio,31
7,run this town,31
8,power workout,29
9,yoga mediation,13


Unnamed: 0,0,1
0,motivation mix,10
1,fun run,10
2,power hour,10
3,run this town,10
4,top hits workout,9
5,beast mode,9
6,power workout,9
7,cardio,9
8,workout,8
9,fast pop run,6


In [179]:
print(dict(Counter(male['playlist name'][male['hub name'] == 'mood']).most_common()).keys(),'\n',
dict(Counter(female['playlist name'][female['hub name'] == 'mood']).most_common()).keys())

dict_keys(['t koffiehuis', 'you  me', 'calm before the storm', 'easy on sunday', 'feelin myself', 'feeling good feeling great', 'happy tunes', 'chill hits', 'peaceful piano', 'life sucks', 'lofi beats', 'songs to sing in the shower', 'techno bunker', 'broken heart', 'latin pop classics']) 
 dict_keys(['life sucks', 'broken heart', 'peaceful piano', 'instrumental study', 'vibes', 't koffiehuis', 'happy tunes', 'chill hits', 'songs to sing in the shower', 'lofi beats', 'easy 80s', 'techno bunker', 'all the feels', 'intense studying', 'soft pop hits'])


Differences between how often female vs male received lo-fi beats and life sucks

In [180]:
print(len(male[male['playlist name'] =='life sucks']), len(female[female['playlist name'] =='life sucks']))
print(len(male[male['playlist name'] =='lofi beats']), len(female[female['playlist name'] =='lofi beats']))

12 42
11 71


Checking for hub names hinting towards sentiment playlists, ignoring 'more like' or similar playlists as they are targeted towards music discovery for artists, similar playlists or algorithmically generated, sentiment neutral playlists

In [181]:
female_hubs = list(f_playlist['hub name'])
female_hubs = [e for e in female_hubs if not e.startswith('more like') and not e.startswith('recommended')]
set(female_hubs)

{'artists you like',
 'chill',
 'dinner',
 'featured charts',
 'fresh new music',
 'instrumental',
 'made for female name',
 'mood',
 'party',
 'singalong',
 'suggested artists',
 'throwback',
 'to get you started',
 'todays biggest hits',
 'try something else',
 'workout'}

In [184]:
male_hubs = list(m_playlist['hub name'])
male_hubs = [e for e in male_hubs 
if not e.startswith('more') and 
not e.startswith('recommended') and 
not e.startswith('for fans') and 
not e.startswith('discover') and
not e.endswith('name') and
not e.endswith('albums') and
not e.endswith('charts') and
not e.endswith('artists') and

# no personalized playlists
'you' not in e]

Counter(male_hubs).most_common()

[('throwback', 274),
 ('blijf op de hoogte', 266),
 ('global cultures on the main stage', 171),
 ('fresh new music', 167),
 ('dream playlist rotation', 132),
 ('dutch language pop', 131),
 ('hip hop', 117),
 ('sad songs', 110),
 ('chill', 98),
 ('pop', 97),
 ('workout', 93),
 ('happy', 86),
 ('rock', 86),
 ('mood', 66),
 ('singalong', 65),
 ('party', 27),
 ('the best dance playlists', 10),
 ('playlists celebrating women', 10),
 ('love', 10),
 ('current hits and the 10s', 10),
 ('women of pop', 10),
 ('try something else', 9),
 ('todays biggest hits', 9),
 ('instrumental', 8),
 ('sad love', 8),
 ('celebrating 5 years of american teen', 2),
 ('weeknd vibes', 2)]

Activity or mood related playlists extracted from here are
'chill,', 'dinner,', 'instrumental,', 'mood,', 'party,', 'singalong,', 'throwback,', 'workout'

In [187]:
# looking for mood playlists
Counter(f_playlist['playlist name'][f_playlist['hub name'].isin(['mood','vibes'])]).most_common()

[('life sucks', 31),
 ('broken heart', 31),
 ('peaceful piano', 28),
 ('instrumental study', 28),
 ('vibes', 15),
 ('t koffiehuis', 14),
 ('happy tunes', 14),
 ('chill hits', 14),
 ('songs to sing in the shower', 14),
 ('lofi beats', 13),
 ('easy 80s', 4),
 ('techno bunker', 1),
 ('all the feels', 1),
 ('intense studying', 1),
 ('soft pop hits', 1)]

Checking how often the user was recommended certain mood playlists

In [158]:
f_playlist['day'][f_playlist['playlist name'] == 'life sucks']
set(f_playlist['day'][f_playlist['playlist name'] == 'songs to sing in the shower'])

{Timestamp('2022-04-14 00:00:00'),
 Timestamp('2022-04-15 00:00:00'),
 Timestamp('2022-04-16 00:00:00'),
 Timestamp('2022-04-17 00:00:00'),
 Timestamp('2022-04-18 00:00:00'),
 Timestamp('2022-04-19 00:00:00'),
 Timestamp('2022-04-20 00:00:00')}

In [188]:
# Male mood playlists
Counter(male['playlist name'][male['hub name'].isin(['mood','vibes','sad songs','sad vibes','happy'])]).most_common()

[('feeling good feeling great', 18),
 ('life sucks', 12),
 ('broken heart', 12),
 ('happy beats', 11),
 ('factor happy', 11),
 ('vibes', 11),
 ('beautiful day', 11),
 ('a walk alone', 11),
 ('the midnight hour', 11),
 ('sad beats', 11),
 ('sad classical', 11),
 ('dark  stormy', 11),
 ('tender', 11),
 ('t koffiehuis', 10),
 ('you  me', 10),
 ('calm before the storm', 10),
 ('sad covers', 10),
 ('songs to sing in the shower', 9),
 ('easy on sunday', 9),
 ('feelin myself', 9),
 ('preparty mood', 9),
 ('confidence boost', 8),
 ('rainy day jazz', 7),
 ('happy tunes', 6),
 ('deep dark indie', 4),
 ('je moerstaal', 3),
 ('classic road trip songs', 2),
 ('chill hits', 1),
 ('peaceful piano', 1),
 ('lofi beats', 1),
 ('techno bunker', 1),
 ('melancholy', 1),
 ('latin pop classics', 1)]

### Workout playlists

In [191]:
pd.DataFrame(Counter(male['playlist name'][male['hub name'].isin(['workout'])]).most_common())

Unnamed: 0,0,1
0,motivation mix,10
1,fun run,10
2,power hour,10
3,run this town,10
4,top hits workout,9
5,beast mode,9
6,power workout,9
7,cardio,9
8,workout,8
9,fast pop run,6


In [192]:
pd.DataFrame(Counter(female['playlist name'][female['hub name'].isin(['workout'])]).most_common())

Unnamed: 0,0,1
0,power hour,32
1,top hits workout,31
2,beast mode,31
3,fun run,31
4,motivation mix,31
5,workout,31
6,cardio,31
7,run this town,31
8,power workout,29
9,yoga mediation,13
