In [104]:
import pandas as pd
from collections import Counter
import re

In [105]:
# Raw export for the male test account (all columns, untouched).
full_male = pd.read_excel(io='male_complete.xlsx')

In [106]:
# Raw export for the female test account (all columns, untouched).
full_female = pd.read_excel(io='female_complete.xlsx')

In [107]:
# Keep only the columns used in the comparison below.
male = full_male.loc[:, ['day', 'time', 'hub name', 'playlist name', 'type']]

In [108]:
# Keep only the columns used in the comparison below.
female = full_female.loc[:, ['day', 'time', 'hub name', 'playlist name', 'type']]

In [109]:
# Number of rows per user (male first, then female).
print(male.shape[0], female.shape[0])

4265 4859


In [110]:
def clean(string):
    '''Normalise a text value for semantic comparison.

    The value is converted with ``str()``, lower-cased, reduced to the
    characters ``a-z``, ``0-9``, whitespace and ``=``, and finally stripped
    of leading/trailing whitespace.

    Parameters
    ----------
    string : any
        Value to normalise. Non-string values are passed through ``str()``
        first (so a missing value such as ``float('nan')`` becomes ``'nan'``).

    Returns
    -------
    str
        The normalised text.
    '''
    # Lower-case first so the character filter only needs the a-z range.
    string = str(string).lower()

    # Remove any characters that are not alpha numerical or white spaces or equal sign (because of Ed Sheeran album)
    string = re.sub(r'[^a-z0-9\s=]', '', string)

    # Strip last: removing punctuation at either end (e.g. "Stay Tuned !")
    # can expose whitespace that an earlier strip would have missed.
    return string.strip()

In [111]:
# Clean all data by removing trailing whitespaces and lowercasing all letters
female = female.copy()
female['hub name'] = female['hub name'].apply(clean)
female.loc['playlist name'] = female['playlist name'].apply(clean)

male = male.copy()
male['hub name'] = male['hub name'].apply(clean)
male.loc['playlist name'] = male['playlist name'].apply(clean)

In [112]:
male.replace(to_replace = 'Kris Kross Amsterdam', value = 'Kriss Kross Amsterdam', inplace = True)
female.replace(to_replace = 'Kris Kross Amsterdam', value = 'Kriss Kross Amsterdam', inplace = True)

In [114]:
# Remove last row which is added as NAN row with playlist name as index
print(male.tail(1), female.tail(1))
male.drop(male.tail(1).index,inplace=True)
female.drop(female.tail(1).index,inplace=True)

              day time hub name playlist name type
playlist name NaT  NaN      NaN           NaN  NaN               day time hub name playlist name type
playlist name NaT  NaN      NaN           NaN  NaN


In [115]:
# Show the final row of each frame (male first, then female).
print(male.iloc[-1:], female.iloc[-1:])

            day      time                       hub name  \
4264 2022-04-20  22:00:00  discover more from pink floyd   

                  playlist name   type  
4264  The Dark Side of the Moon  album               day      time hub name playlist name      type
4858 2022-04-20  22:00:00     mood   lofi beats   playlist


In [116]:
# Rows of the male user whose hub also occurs for the female user.
similar_hubs = male.loc[male['hub name'].isin(female['hub name'])]

In [101]:
# Hubs that both users share, most frequent first
# (frequencies are counted over the male user's rows in `similar_hubs`).
Counter(similar_hubs['hub name'].tolist()).most_common()

[('throwback', 275),
 ('fresh new music', 167),
 ('recommended radio', 125),
 ('more like harry styles', 111),
 ('recommended for you today', 101),
 ('chill', 98),
 ('workout', 95),
 ('more like charlie puth', 83),
 ('suggested artists', 76),
 ('mood', 68),
 ('singalong', 65),
 ('more like jack harlow', 44),
 ('party', 27),
 ('more like imagine dragons', 25),
 ('featured charts', 11),
 ('recommended for today', 10),
 ('try something else', 9),
 ('todays biggest hits', 9),
 ('instrumental', 8),
 ('to get you started', 3)]

In [117]:
# Rows of the male user whose hub never occurs for the female user,
# followed by the 25 most frequent of those hubs.
male_unique = male.loc[~male['hub name'].isin(female['hub name'])]
Counter(male_unique['hub name'].tolist()).most_common(25)

[('blijf op de hoogte', 266),
 ('your top mixes', 260),
 ('discover something new', 252),
 ('made for male name', 179),
 ('global cultures on the main stage', 171),
 ('dream playlist rotation', 132),
 ('dutch language pop', 131),
 ('hip hop', 117),
 ('sad songs', 110),
 ('pop', 97),
 ('popular new releases', 90),
 ('based on your recent listening', 89),
 ('popular albums', 89),
 ('rock', 87),
 ('happy', 86),
 ('discover more from pink floyd', 79),
 ('trending now', 77),
 ('for fans of taylor swift', 73),
 ('more of what you like', 56),
 ('new and rising artists', 48),
 ('best of artists', 33),
 ('more like queen', 30),
 ('more like lauren spencersmith', 30),
 ('popular artists', 28),
 ('for fans of bad bunny', 24)]

In [118]:
# Rows of the female user whose hub never occurs for the male user,
# followed by the 25 most frequent of those hubs.
female_unique = female.loc[~female['hub name'].isin(male['hub name'])]
Counter(female_unique['hub name'].tolist()).most_common(25)

[('more like pop rising', 145),
 ('more like todays top hits', 136),
 ('more like camila cabello', 86),
 ('more like ed sheeran', 81),
 ('more like jax jones', 60),
 ('more like latto', 60),
 ('dinner', 49),
 ('made for female name', 47),
 ('more like kris kross amsterdam', 40),
 ('more like antoon', 40),
 ('more like coldplay', 40),
 ('more like hot hits nl', 30),
 ('artists you like', 24)]

In [None]:
# Top-25 hubs of each user side by side, ranked by frequency.
# The second positional argument of pd.DataFrame is `index`, so passing the male
# counts there would turn them into row labels instead of data. Build one small
# frame per user and place them next to each other instead; concat aligns on the
# rank (0..24) and pads with NaN if a user has fewer than 25 distinct hubs.
pd.concat(
    {
        user: pd.DataFrame(Counter(frame['hub name']).most_common(25), columns=['hub name', 'count'])
        for user, frame in (('female', female), ('male', male))
    },
    axis=1,
)


In [121]:
# Number of distinct hubs seen by only one of the users (male first, then female).
print(*(len(set(frame['hub name'])) for frame in (male_unique, female_unique)))

46 13


In [138]:
# Playlist frequencies per user: {'male': {playlist: count, ...}, 'female': {...}},
# each inner dict ordered from most to least frequent.
most_common_playlists = {
    user: dict(Counter(frame['playlist name']).most_common())
    for user, frame in (('male', male), ('female', female))
}


In [139]:
# One row per playlist, one count column per user (NaN where a user never got it).
new = pd.DataFrame(data=most_common_playlists)

In [143]:
# The 50 playlists shown most often to the female user, with the male counts alongside.
new.sort_values('female', ascending=False).iloc[:50]

Unnamed: 0,male,female
Songs to Sing in the Shower,31.0,72.0
lofi beats,6.0,64.0
Top Songs Global,2.0,58.0
Fresh Pop,49.0,57.0
Top 50 Global,2.0,57.0
Instrumental Study,1.0,54.0
Summer '22,25.0,48.0
t Koffiehuis,50.0,44.0
Stay Tuned!,47.0,43.0
Today's Top Hits,5.0,43.0


In [135]:
# Number of distinct playlist names shown to the male user.
len({*male['playlist name']})

601

In [136]:
# Number of distinct playlist names shown to the female user.
len({*female['playlist name']})

500

In [137]:
# Rows of the female user whose playlist was never shown to the male user.
female.loc[~female['playlist name'].isin(male['playlist name'])]

Unnamed: 0,day,time,hub name,playlist name,type
1,2022-04-14,10:00:00,featured charts,Top Songs Nederland,playlist
10,2022-04-14,10:00:00,throwback,Salsa Classics,playlist
13,2022-04-14,10:00:00,throwback,10s Latino,playlist
14,2022-04-14,10:00:00,throwback,Leyendas Urbanas,playlist
22,2022-04-14,10:00:00,chill,Deep House Rules,playlist
...,...,...,...,...,...
4842,2022-04-20,22:00:00,more like coldplay,Spotify Session - London,EP
4843,2022-04-20,22:00:00,more like coldplay,Fast Car,single
4850,2022-04-20,22:00:00,singalong,Sing-along: 90's to Now,playlist
4852,2022-04-20,22:00:00,singalong,Sing along Indie Hits,playlist
