In [2]:
import pandas as pd
import numpy as np

In [3]:
## Data File Path
# Relative path of the input feature table (parquet) loaded by this notebook.
FEATURES_PARQUET  = '../../data/features/featuresv2.parquet'

In [185]:
# Read the feature table from disk, then show its (rows, columns) size.
features = pd.read_parquet(path=FEATURES_PARQUET)
features.shape

(578618, 305)

In [5]:
# Inspect the column labels; they are two-level (group, field) tuples.
features.columns

MultiIndex([(      'track_id',                 ''),
            ('audio_features',     'acousticness'),
            ('audio_features',     'danceability'),
            ('audio_features',           'energy'),
            ('audio_features', 'instrumentalness'),
            ('audio_features',         'liveness'),
            ('audio_features',      'speechiness'),
            ('audio_features',            'tempo'),
            ('audio_features',          'valence'),
            (      'metadata',       'album_date'),
            ...
            (         'track',          'license'),
            (         'track',          'listens'),
            (         'track',         'lyricist'),
            (         'track',           'number'),
            (         'track',        'publisher'),
            (         'track',             'tags'),
            (         'track',            'title'),
            (         'track',              'tid'),
            (         'track',           'artist

In [6]:
# Top-level column groups (first level of the column MultiIndex).
features.columns.levels[0]

Index(['album', 'artist', 'audio_features', 'metadata', 'ranks', 'set',
       'social_features', 'temporal_features', 'track', 'track_id'],
      dtype='object')

# Physical Activities

We need to select the relevant tags for these physical activities:

1 – lying \
2 – sitting \
3 – standing \
4 – walking \
5 – running \
6 – cycling \
7 – Nordic walking \
9 – watching TV \
10 – computer work \
11 – car driving \
12 – ascending stairs \
13 – descending stairs \
16 – vacuum cleaning \
17 – ironing \
18 – folding laundry \
19 – house cleaning \
20 – playing soccer \
24 – rope jumping

Several of these activities demand a similar energy level and mental load, so they can share the same music recommendations. One possible clustering is:

**TODO:** We could possibly build a clustering algorithm for the data.

Cluster 1: Relaxing Activities

lying \
sitting \
watching TV (no longer needs a music recommendation)

Cluster 2: Light Physical Activity with Low Mental Demand

standing \
walking \
Nordic walking \
ascending stairs \
descending stairs

Cluster 3: Light Activity with Moderate Mental Demand (Chores)

vacuum cleaning \
ironing \
folding laundry \
house cleaning

Cluster 4: Moderate to Vigorous Physical Activity with Moderate Mental Demand (Exercise)

playing soccer \
cycling \
running \
rope jumping

Cluster 5: Moderately Focused Mental Work

car driving


Cluster 6: Focused Mental Work

computer work


# Tag Selection

Next, we identify the tags that are relevant to each activity cluster.

In [7]:
# Frequency table of every tag, most frequent first, as a two-column
# (tag, count) frame.
# The previous `.rename(columns={"index": "value", 0: "count"})` matched none of
# the columns produced by reset_index() (the displayed columns were already
# `tag` / `count`), so it silently did nothing. Naming the index and the value
# column explicitly gives the same labels without relying on the defaults of
# the installed pandas version.
unique_tags = (
    features.track.tag
    .value_counts()
    .rename_axis("tag")
    .reset_index(name="count")
)
unique_tags

Unnamed: 0,tag,count
0,rock,6643
1,pop,4741
2,alternative,4179
3,electronic,3511
4,indie,3168
...,...,...
70534,StudyMusic,1
70535,malcolm,1
70536,lifehouse storm,1
70537,calmative,1


## Relaxing Activities

In [117]:
# Explore tags for the relaxing cluster. str.match tests the pattern at the
# START of each tag (case-insensitively), so every alternative acts as a prefix.
tag = "lying|sitting|relaxing|chill"
mask = features.track.tag.str.match(pat=tag, case=False)
relaxing_titles = features.loc[mask]
print(len(relaxing_titles))                                # number of matching rows
print(relaxing_titles.track.title.nunique(dropna=False))  # distinct titles among them
relaxing_titles.track.tag.value_counts()

7103
861


tag
chillout                   2364
chill                      1849
relaxing                    721
chill out                   341
chilled                     222
                           ... 
relaxing calm and fresh       1
chillout dnb                  1
Chill Out Sinistro            1
chill aout                    1
chillout seen live            1
Name: count, Length: 273, dtype: int64

## Light Activities

In [170]:
# Explore tags for light physical activity. The pattern is applied with
# str.match, i.e. anchored at the start of the tag and case-insensitive.
tag = "standing|walking|walk|^stand$"
mask = features.track.tag.str.match(pat=tag, case=False)
light_acts_titles = features.loc[mask]
print(len(light_acts_titles))                                # number of matching rows
print(light_acts_titles.track.title.nunique(dropna=False))  # distinct titles among them
light_acts_titles.track.tag.value_counts()

84
40


tag
walkman music                                           11
Walking down the street smilin                           8
Walking tunes                                            7
walking to                                               6
Walking in the streets                                   5
walk on                                                  4
walker                                                   4
walk on the wild side                                    4
Walkabout                                                4
walking blues                                            4
walking                                                  3
walking songs                                            3
walking home alone                                       2
walk through the stars                                   2
walking song                                             2
standing tone                                            2
walking the streets at night with your headphones on

## Chores

In [165]:
# Explore tags for household chores (prefix match from the start of the tag,
# case-insensitive).
tag = "cleaning|chores|housework|clean"
mask = features.track.tag.str.match(pat=tag, case=False)
chores_titles = features.loc[mask]
print(len(chores_titles))                                # number of matching rows
print(chores_titles.track.title.nunique(dropna=False))  # distinct titles among them
chores_titles.track.tag.value_counts()

118
55


tag
clean                 72
Cleaning mood         16
cleanup soundtrack     5
clean lyrics           5
Houseworks             4
Cleaning Day lol       4
cleaning               3
clean riffs            2
clean vocals           2
cleaninghouse          1
clean vocal metal      1
cleanE                 1
Housework helper       1
cleansing              1
Name: count, dtype: int64

## Exercise

In [154]:
# Broad search for exercise-related tags (prefix match, case-insensitive).
# Only "workout", "running" and "workout songs" occur more than 10 times in
# the result; those are the candidates for a narrower pattern.
tag = "workout|running|gym|exercise|work-out|working out|exercising|jogging|fitness|cardio"
mask = features.track.tag.str.match(pat=tag, case=False)
exercise_titles = features.loc[mask]
print(len(exercise_titles))                                # number of matching rows
print(exercise_titles.track.title.nunique(dropna=False))  # distinct titles among them
exercise_titles.track.tag.value_counts()

274
105


tag
Workout                                       112
running                                        44
workout songs                                  11
running eyes                                   10
exercise                                        9
gym                                             7
fitness music                                   7
Exercise muzak                                  6
workout tunes                                   6
workoutdance                                    6
workout dance                                   5
Workout music                                   5
running down hills and dancing in the rain      5
running out of the church                       4
running music                                   4
jogging                                         4
gym music                                       3
gymidegym                                       3
gymicrae                                        3
running in circles                            

In [159]:
# Narrow pattern: keep just the tags with more than 10 songs
# (workout, running, workout songs), each matched as a whole tag.
tag = r"^workout$|^running$|^workout songs$"
mask = features.track.tag.str.match(pat=tag, case=False)
exercise_titles = features.loc[mask]
exercise_titles.track.tag.value_counts()

tag
Workout          112
running           44
workout songs     11
Name: count, dtype: int64

## Driving

In [161]:
# Broad search for driving-related tags (prefix match, case-insensitive).
# Candidates for a narrower pattern: Driving, driving music, driving songs.
# Note that "drive" and "roadtrip" also occur more than 10 times here.
tag = 'driving|roadtrip|drive|road-trip'
mask = features.track.tag.str.match(pat=tag, case=False)
driving_titles = features.loc[mask]
print(len(driving_titles))                                # number of matching rows
print(driving_titles.track.title.nunique(dropna=False))  # distinct titles among them
driving_titles.track.tag.value_counts()

577
237


tag
Driving                             299
drive                                41
driving music                        40
roadtrip                             39
Driving Songs                        11
                                   ... 
driving at night in the eighties      1
driver                                1
driver parallel lines                 1
driving down the interstate           1
driving on a summer night             1
Name: count, Length: 65, dtype: int64

In [156]:
# Narrow pattern: the exact tag "driving", plus tags that start with
# "driving music" or "driving songs" (case-insensitive).
tag = r'^driving$|driving music|driving songs'
mask = features.track.tag.str.match(pat=tag, case=False)
driving_titles = features.loc[mask]
driving_titles.track.tag.value_counts()

tag
Driving          299
driving music     40
Driving Songs     11
Name: count, dtype: int64

## Work

In [164]:
# Broad search for focused-work tags (prefix match, case-insensitive).
# Because matching starts at the beginning of the tag, `work$` only hits the
# tag "work" itself.
# Shortlist for a narrower pattern: focusing, studying, focused, study, study music.
tag = "focus|study|office|work$|non-vocal|no lyrics"
mask = features.track.tag.str.match(pat=tag, case=False)
work_titles = features.loc[mask]
print(len(work_titles))                                # number of matching rows
print(work_titles.track.title.nunique(dropna=False))  # distinct titles among them
work_titles.track.tag.value_counts()

310
112


tag
work                      191
focusing                   26
no lyrics                  18
Studying                   14
focused                    14
study                      11
study music                10
study piano                 4
Office Music                4
Focus                       4
Study Background Music      3
office                      3
study and sleep             2
studytime                   2
Study Mix                   2
StudyMusic                  1
study fiona apple           1
Name: count, dtype: int64

In [108]:
# Every tag that starts with "work" (case-insensitive). This also pulls in
# workout-style tags, so the shortlist to keep is: work, work music, working music.
tag = "work"
mask = features.track.tag.str.match(pat=tag, case=False)
work_titles = features.loc[mask]
work_titles.track.tag.value_counts()

tag
work                                   191
Workout                                112
worktronica                             22
work music                              19
working music                           14
workout songs                           11
work tunes                               8
Work Mix                                 8
workie                                   7
workoutdance                             6
work hardcore                            6
workout tunes                            6
work relax                               6
working with pixels                      6
workin music                             5
workout dance                            5
Workout music                            5
working                                  5
workdays                                 4
workworkwork                             3
Work Toons                               3
work songs                               3
workforce1                               3
working

In [158]:
# Narrow pattern combining both shortlists:
#   focus/study side: focusing, studying, focused, study, study music
#   work side:        work, working music, work music
tag = r"focusing|studying|focused|^study$|^study music$|^work$|^working music$|^work music$"
mask = features.track.tag.str.match(pat=tag, case=False)
work_titles = features.loc[mask]
work_titles.track.tag.value_counts()

tag
work             191
focusing          26
work music        19
Studying          14
working music     14
focused           14
study             11
study music       10
Name: count, dtype: int64

## Final Tags

In [179]:
# Regex patterns that assign a tag to an activity cluster. They are applied
# with `Series.str.match(..., case=False)`, which tests the pattern at the
# START of the tag, so each alternative behaves as a case-insensitive prefix
# and `work$` only hits the tag "work" itself.
#
# Earlier versions are kept as comments for reference only. The v6 values used
# to be live assignments that v7 overwrote straight away (dead stores), so
# they are now commented out like v5.

#v5
# relax_tag = "sitting|lying"
# light_act_tag = "standing|walking"
# chores_tag = "cleaning|chores|housework"
# exercise_tag = r"^workout$|^running$|^workout songs$|gym|exercise"
# driving_tag = r'^driving$|driving music|driving songs|road trip|roadtrip'
# work_tag = r"focusing|studying|focused|^study$|^study music$|^work$|^working music$|^work music$"

#v6
# relax_tag = "lying|sitting|relaxing"
# chores_tag = "cleaning|chores|housework|clean"
# exercise_tag = "workout|running|gym|exercise"
# driving_tag = "driving"
# work_tag = "focus|study|office|work$"

# Defined in v6 and not redefined by v7. The v7 clustering no longer uses it,
# but the name is kept so that any cell still referring to it keeps working.
light_act_tag = "standing|walking|walk"

#v7 (current)
relax_tag = "lying|sitting|relaxing"
chores_tag = "cleaning|chores|housework|clean"
exercise_tag = "workout|running|gym|exercise|work-out|working out|exercising|jogging|fitness|cardio"
driving_tag = 'driving|roadtrip|drive|road-trip'
work_tag = "focus|study|office|work$|non-vocal|no lyrics"

In [180]:
# One boolean mask per activity cluster: True where the row's tag starts with
# one of the cluster's keywords (case-insensitive).
# No light-activity mask is built; that cluster was removed in v7.
track_tags = features.track.tag
relax_mask, chores_mask, exercise_mask, driving_mask, work_mask = (
    track_tags.str.match(pattern, case=False)
    for pattern in (relax_tag, chores_tag, exercise_tag, driving_tag, work_tag)
)


In [181]:
# Cluster ids: 1 = relaxing, 2 = chores, 3 = exercise, 4 = driving, 5 = work.
# Rows whose tag matches none of the patterns get 0.
# np.select uses the first condition that holds, so a tag matching several
# patterns receives the lowest-numbered cluster.
masks_by_cluster = {
    1: relax_mask,
    2: chores_mask,
    3: exercise_mask,
    4: driving_mask,
    5: work_mask,
}
condlist = list(masks_by_cluster.values())
choicelist = list(masks_by_cluster.keys())
features[('track', 'tag_cluster')] = np.select(condlist, choicelist, default=0)
features.track.tag_cluster.value_counts()

tag_cluster
0    576427
1       912
4       577
5       310
3       274
2       118
Name: count, dtype: int64

In [182]:
# Keep only the rows that were assigned to a cluster (tag_cluster other than 0).
has_cluster = features[('track', 'tag_cluster')].ne(0)
songs_w_tag_cluster = features.loc[has_cluster]

In [183]:
# Rows per cluster in the filtered frame (cluster 0 is no longer present).
songs_w_tag_cluster.track.tag_cluster.value_counts()

tag_cluster
1    912
4    577
5    310
3    274
2    118
Name: count, dtype: int64

In [184]:
# Write the clustered rows to a parquet file.
TAG_CLUSTERS_PARQUET = '../../data/features/featuresv7_w_tagclusters.parquet'
songs_w_tag_cluster.to_parquet(TAG_CLUSTERS_PARQUET)

In [178]:
#songs_w_tag_cluster.track.tag_cluster.value_counts()