In [1]:
# Standard libraries
from itertools import chain
from ast import literal_eval

# Scientific libraries
import pandas as pd
from numpy import array

# Parallelization library
from joblib import Parallel, delayed

# Utility functions
from helpers.helpers_spotify import get_enao_genre_data
from helpers.helpers_pandas import contains_any

Load the data

In [2]:
df = pd.read_pickle('../Data/pkl/01_spotify_data_(notna).pkl')

# Import EveryNoise data

  
<img src="../Images/Enao-genres-map.jpg" alt="Every Noise" style="height: 1000px;"/>

## Get smilar Genres

### scrapper for *genremap*

Goes to the website and enters each genre link and scraps the html. Grabs the related genres, opposite genres, and uses the font size for each as a  indicator of edge 'weight'. Also gets the Spotify playlist links.

Source:
- [https://github.com/sofielange98/every-noise-network-analysis](https://github.com/sofielange98/every-noise-network-analysis)
- [https://github.com/aweitz/EveryNoise](https://github.com/aweitz/EveryNoise)

In [3]:
def genre_scraper(genres):
    """Scrap EveryNoise data associated to a list of genres.

    Args:
        genres (list): list of genres for which to scrap ENAO data

    Returns:
        pd.DataFrame: the info scrapped for each genre, organized in a DF
    """
    if isinstance(genres, str):
        genres = [genres]
    genres_data = Parallel(n_jobs=-1)(delayed(get_enao_genre_data)(genre) for genre in genres)
    return pd.DataFrame(genres_data)

In [4]:
my_unique_genres = list(set(chain.from_iterable(df['genres'].to_list())))
print(f"The spotify data includes {len(my_unique_genres)} different genres:")
my_unique_genres

The spotify data includes 930 different genres:


['musique peule',
 'basel indie',
 'west african jazz',
 'puglia indie',
 'slack-key guitar',
 'french hip hop',
 'scandipop',
 'arab pop',
 'binaural',
 'traprun',
 'organic house',
 'modern cello',
 'neo-psychedelic',
 'indie r&b',
 'pop urbaine',
 'bow pop',
 'swedish americana',
 'bebop',
 'abstract idm',
 'trova',
 'cumbia funk',
 'solo wave',
 'deconstructed club',
 'double drumming',
 'persian neo-traditional',
 'nz electronic',
 'freakbeat',
 'psybass',
 'ambient guitar',
 'choro',
 'munich electronic',
 'funk rock',
 'jazz colombiano',
 'art rock',
 'circuit',
 'persian electronic',
 'jazztronica',
 'bomba y plena',
 'vapor twitch',
 'zouglou',
 'latin classical',
 'aussietronica',
 'deep tropical house',
 'ethnotronica',
 'experimental vocal',
 'balafon',
 'brass band',
 'greek jazz',
 'dark pop',
 'hard rock',
 'pibroch',
 'cuban rumba',
 'smooth jazz',
 'operatic pop',
 'arab folk',
 'intelligent dance music',
 'psydub',
 'kora',
 'wave',
 'turkish folk',
 'dark wave',
 'oa

In [5]:
# Extract ENAO data for each genre
enao_df = genre_scraper(my_unique_genres)
enao_df

Unnamed: 0,genre,sim_genres,sim_weights,opp_genres,opp_weights,main_artists,artists_weights,spotify_url
0,musique peule,"[traditional soul, malian blues, world, musiqu...","[101, 101, 104, 100, 104, 102, 100, 107, 100, ...","[derby indie, italian post punk, gothic post-p...","[115, 122, 105, 100, 130, 116, 100, 105, 160, ...","[Ambah Barry, Aamadu-Burayma, Moussa Baho, Bab...","[100, 100, 100, 100, 100, 100, 100, 100, 100, ...",https://open.spotify.com/playlist/3PyovaGk3JYK...
1,basel indie,"[stuttgart indie, fort worth indie, indie rock...","[102, 103, 102, 102, 103, 104, 103, 102, 103, ...","[rumba, malagasy folk, nova musica paulista, g...","[144, 107, 123, 127, 135, 130, 134, 102, 110, ...","[Nobody Reads, Static Frames, Combineharvester...","[100, 100, 100, 100, 103, 100, 101, 100, 123, ...",https://open.spotify.com/playlist/4nwdZdimMWgw...
2,west african jazz,"[oberkrainer, telugu devotional, folklore para...","[100, 100, 102, 102, 102, 100, 100, 100, 101, ...","[derby indie, shanghai indie, deconstructed cl...","[127, 110, 128, 118, 153, 123, 103, 106, 136, ...","[Le Palm-Jazz de Macenta, Djoli Band, Orchestr...","[101, 100, 101, 100, 100, 114, 114, 102, 100, ...",https://open.spotify.com/playlist/0WRIEExyzsjt...
3,puglia indie,"[baltic post-punk, indietronica, uae indie, ch...","[100, 100, 100, 101, 101, 102, 101, 100, 102, ...","[eastern bloc groove, musica andina chilena, c...","[120, 160, 117, 105, 112, 108, 118, 102, 116, ...","[AcomeandromedA, Fonokit, Ugo Busatto, Bundamo...","[100, 102, 100, 104, 100, 101, 101, 100, 160, ...",https://open.spotify.com/playlist/0u7nZalGkBE0...
4,slack-key guitar,"[hammered dulcimer, hawaiian, gamelan, string ...","[101, 123, 107, 102, 101, 100, 101, 105]","[neue deutsche harte, chilean hardcore, rap me...","[114, 139, 118, 106, 112, 160, 105, 104, 104, ...","[Keola & Kapono Beamer, Ken Emerson, James ""Bl...","[115, 110, 105, 120, 124, 119, 108, 133, 105, ...",https://open.spotify.com/playlist/3AcB5ov4dAX8...
...,...,...,...,...,...,...,...,...
925,avant-garde jazz,"[straight-ahead jazz, jazz, free jazz, classic...","[101, 119, 144, 102, 100, 100, 110, 120, 100, ...","[k-pop reality show, hands up, dance-punk, t-p...","[103, 100, 116, 127, 103, 121, 100, 111, 149, ...","[Bobby Hutcherson, Eric Dolphy Quintet, Ed Bla...","[129, 102, 104, 108, 104, 116, 100, 104, 119, ...",https://open.spotify.com/playlist/72YNCrIywbtP...
926,rain,"[shush, sound, environmental, sleep, water]","[105, 115, 108, 100, 107]","[bachata dominicana, funk viral, indonesian vi...","[102, 125, 122, 112, 133, 112, 112, 134, 100, ...","[Attono, Orage et pluit, PDC RAIN, Dream Wonde...","[139, 118, 124, 120, 140, 152, 123, 122, 121, ...",https://open.spotify.com/playlist/4NVJ5W2V7pga...
927,flamenco guitar,"[french jazz, gong, jazz catala, nuevo flamenc...","[100, 101, 100, 120, 123, 115, 118]","[neue deutsche harte, neon pop punk, indie roc...","[149, 125, 139, 107, 108, 100, 126, 105, 127, ...","[Esteban De Sanlucar, Paco Serrano, Pepe MartÃ...","[101, 101, 101, 102, 104, 100, 106, 101, 111, ...",https://open.spotify.com/playlist/0IqRtRiN7mb3...
928,south sudanese pop,"[reunion pop, african reggae, ugandan pop, hip...","[101, 101, 100, 103, 101, 103, 100, 101, 101, ...","[neofolk, japanese psychedelic rock, bagpipe, ...","[153, 130, 155, 148, 149, 128, 131, 138, 104, ...","[Dau Dau, Alijoma, General Paolino, Sevena The...","[116, 102, 100, 105, 100, 102, 104, 101, 103, ...",https://open.spotify.com/playlist/2domD3pH2tbu...


In [6]:
# check that we every genre has been scrapped
if len(enao_df) != len(my_unique_genres):
    raise ValueError("Some genres were not found")

### Check for incomplete data

In [7]:
# Check which genres do not have similar genres associated
no_sim_genres = ~enao_df['sim_genres'].astype(bool)
enao_df[no_sim_genres]

Unnamed: 0,genre,sim_genres,sim_weights,opp_genres,opp_weights,main_artists,artists_weights,spotify_url
199,classical organ,[],[],"[ragga jungle, trival, italo dance, jersey clu...","[122, 101, 109, 110, 101, 102, 110, 143, 160, ...","[Flor Peeters, Rosalinde Haas, Gunther Rost, F...","[101, 100, 101, 106, 101, 103, 102, 108, 102, ...",https://open.spotify.com/playlist/56PPF7Jg6OuD...
487,rominimal,[],[],"[police band, scottish smallpipe, zampogna, po...","[129, 130, 138, 117, 107, 110, 106, 100, 137, ...","[Lowris, Priku, Melle-J, Silat Beksi, Anam Nes...","[105, 125, 101, 120, 102, 100, 114, 111, 103, ...",https://open.spotify.com/playlist/6tbzdys5a2LP...


In [8]:
# Collect genres that do not have similar genres associated
genres_wihout_sim_genres = enao_df[no_sim_genres]['genre'].to_list()
genres_wihout_sim_genres

['classical organ', 'rominimal']

In [9]:
# Check which tracks are associated with those genres
df[contains_any(df['genres'], genres_wihout_sim_genres)]

Unnamed: 0,spotify_id,title,artists,artists_id,genres,popularity,energy,valence,danceability,acousticness,loudness,speechiness,instrumentalness,liveness,tempo,key,mode,time_signature,duration_ms
1551,1iUpKFWs8Uskzc1VXhLdLk,The Lament for Ten (People),[Cunla],[2E5WOH3t67FGeRjJcjkhZc],"[alternative roots rock, anglican liturgy, bag...",0,0.0391,0.405,0.173,0.996,-15.739,0.0356,0.925,0.117,66.656,5,0,5,216450
1609,7fbHdEoEiGFaQW8VtZKd4S,Don't Let Go,[Markus Homm],[6lU2Le0VfhyLnb8x0bOqil],"[minimal tech house, rominimal]",10,0.494,0.316,0.808,0.00857,-15.485,0.0622,0.647,0.0363,124.001,11,0,4,373722


In [10]:
# approximate similar genres 
for val in genres_wihout_sim_genres:
    mask = contains_any(df['genres'], val)  # tracks associated with this genre in out spotify liked songs
    lst = df[mask]['genres'].to_list()      # get +/- similar genres
    sim_genres = set(chain(*lst))           # flatten sim genres list
    sim_genres.discard(val)                 # discard the genre itself
    print(f"Similar to {val}: {list(sim_genres)}")
    idx = enao_df.index[enao_df['genre'] == val].tolist()  # find this genre back in ENAO df
    if len(idx) != 1:                       # there should be only one row 
        raise ValueError(f"Error! Non unique row with genre {val} (found at {idx})")
    enao_df.at[idx[0], 'sim_genres'] = list(sim_genres)            # replace sim_genres by approximation
    enao_df.at[idx[0], 'sim_weights'] = [100] * len(sim_genres)    # default value of sim_weights = 100

Similar to classical organ: ['scottish smallpipe', 'chanson paillarde', 'contrabass', 'uilleann pipes', 'spanish jazz', 'bagpipe', 'italian classical guitar', 'shanty', 'anglican liturgy', 'flemish folk', 'classical guitar', 'bagpipe marching band', 'breton folk', 'celtic harp', 'string folk', 'police band', 'fingerstyle', 'alternative roots rock', 'irish folk', 'celtic']
Similar to rominimal: ['minimal tech house']


In [11]:
# Double check that it was replaced properly
enao_df[no_sim_genres]

Unnamed: 0,genre,sim_genres,sim_weights,opp_genres,opp_weights,main_artists,artists_weights,spotify_url
199,classical organ,"[scottish smallpipe, chanson paillarde, contra...","[100, 100, 100, 100, 100, 100, 100, 100, 100, ...","[ragga jungle, trival, italo dance, jersey clu...","[122, 101, 109, 110, 101, 102, 110, 143, 160, ...","[Flor Peeters, Rosalinde Haas, Gunther Rost, F...","[101, 100, 101, 106, 101, 103, 102, 108, 102, ...",https://open.spotify.com/playlist/56PPF7Jg6OuD...
487,rominimal,[minimal tech house],[100],"[police band, scottish smallpipe, zampogna, po...","[129, 130, 138, 117, 107, 110, 106, 100, 137, ...","[Lowris, Priku, Melle-J, Silat Beksi, Anam Nes...","[105, 125, 101, 120, 102, 100, 114, 111, 103, ...",https://open.spotify.com/playlist/6tbzdys5a2LP...


In [12]:
# Check if we still have missing data
if enao_df['sim_genres'].astype(bool).any() == False:
    raise ValueError("Some similar genres are still missing")

## Get genre position from ENAO map

In [13]:
# download latest genre map from https://github.com/ben-tanen/spotify-genre-map/tree/master/data
# make sure to click on 'raw' button, before copying the URL
url = 'https://raw.githubusercontent.com/ben-tanen/spotify-genre-map/master/data/enao-genres-20220402.csv'
enao_df_pos = pd.read_csv(url)

In [14]:
# Extract position from the dataframe
enao_df_pos.drop(columns=enao_df_pos.columns.difference(['genre', 'top', 'left', 'color']), inplace=True)
# for dim in ['top', 'left']: 
#     enao_df_pos['top'] = enao_df_pos['top'].str[:-2].astype(int).rename('y')    # remove px + convert to int

enao_df_pos['x'] = enao_df_pos['left'].str[:-2].astype(int)
enao_df_pos['y'] = enao_df_pos['top'].str[:-2].astype(int)
enao_df_pos['y'] = enao_df_pos['y'].max() - enao_df_pos['y']
enao_df_pos['color'] = enao_df_pos['color'].str[3:].apply(lambda x: array(literal_eval(str(x))) / 255)

enao_df_pos.drop(columns=['top', 'left'], inplace=True)
enao_df_pos.head()

Unnamed: 0,genre,color,x,y
0,rap tico,"[0.6941176470588235, 0.5294117647058824, 0.133...",1110,11820
1,lovers rock,"[0.5568627450980392, 0.5686274509803921, 0.050...",1180,14390
2,progressive uplifting trance,"[0.8862745098039215, 0.45098039215686275, 0.62...",525,19699
3,kompa chretien,"[0.6509803921568628, 0.5490196078431373, 0.047...",733,13041
4,piano blues,"[0.4, 0.5333333333333333, 0.17647058823529413]",920,4116


In [15]:
# Only keep the genres that are present in our spotify liked songs
mask = enao_df_pos['genre'].isin(my_unique_genres)
pos_dct = enao_df_pos[mask].set_index('genre').to_dict()
pos_dct

{'color': {'lovers rock': array([0.55686275, 0.56862745, 0.05098039]),
  'folk rock italiano': array([0.51764706, 0.54117647, 0.04313725]),
  'cha-cha-cha': array([0.49803922, 0.52156863, 0.24313725]),
  'blues': array([0.41176471, 0.5254902 , 0.12941176]),
  'musica afroperuana': array([0.56470588, 0.54117647, 0.10980392]),
  'musica indigena latinoamericana': array([0.56470588, 0.56078431, 0.05490196]),
  'rap latina': array([0.6745098 , 0.56078431, 0.02352941]),
  'contemporary vocal jazz': array([0.2745098 , 0.60392157, 0.04705882]),
  'fingerstyle': array([0.29803922, 0.65882353, 0.69411765]),
  'mambo': array([0.54901961, 0.50196078, 0.27058824]),
  'dutch classical piano': array([0.13333333, 0.64313725, 0.85490196]),
  'malagasy folk': array([0.55294118, 0.5372549 , 0.17254902]),
  'ecuadorian pop': array([0.65882353, 0.55294118, 0.01960784]),
  'venda pop': array([0.62745098, 0.56470588, 0.06666667]),
  'reggae en espanol': array([0.66666667, 0.53333333, 0.03529412]),
  'classi

In [16]:
# Add columns "x" and "y" in ou ENAO df
for col in ['x', 'y', 'color']:
    enao_df[col] = enao_df['genre'].map(pos_dct[col])
enao_df

Unnamed: 0,genre,sim_genres,sim_weights,opp_genres,opp_weights,main_artists,artists_weights,spotify_url,x,y,color
0,musique peule,"[traditional soul, malian blues, world, musiqu...","[101, 101, 104, 100, 104, 102, 100, 107, 100, ...","[derby indie, italian post punk, gothic post-p...","[115, 122, 105, 100, 130, 116, 100, 105, 160, ...","[Ambah Barry, Aamadu-Burayma, Moussa Baho, Bab...","[100, 100, 100, 100, 100, 100, 100, 100, 100, ...",https://open.spotify.com/playlist/3PyovaGk3JYK...,950.0,8464.0,"[0.5647058823529412, 0.5254901960784314, 0.141..."
1,basel indie,"[stuttgart indie, fort worth indie, indie rock...","[102, 103, 102, 102, 103, 104, 103, 102, 103, ...","[rumba, malagasy folk, nova musica paulista, g...","[144, 107, 123, 127, 135, 130, 134, 102, 110, ...","[Nobody Reads, Static Frames, Combineharvester...","[100, 100, 100, 100, 103, 100, 101, 100, 123, ...",https://open.spotify.com/playlist/4nwdZdimMWgw...,484.0,13405.0,"[0.6549019607843137, 0.4980392156862745, 0.243..."
2,west african jazz,"[oberkrainer, telugu devotional, folklore para...","[100, 100, 102, 102, 102, 100, 100, 100, 101, ...","[derby indie, shanghai indie, deconstructed cl...","[127, 110, 128, 118, 153, 123, 103, 106, 136, ...","[Le Palm-Jazz de Macenta, Djoli Band, Orchestr...","[101, 100, 101, 100, 100, 114, 114, 102, 100, ...",https://open.spotify.com/playlist/0WRIEExyzsjt...,948.0,8129.0,"[0.6392156862745098, 0.5372549019607843, 0.156..."
3,puglia indie,"[baltic post-punk, indietronica, uae indie, ch...","[100, 100, 100, 101, 101, 102, 101, 100, 102, ...","[eastern bloc groove, musica andina chilena, c...","[120, 160, 117, 105, 112, 108, 118, 102, 116, ...","[AcomeandromedA, Fonokit, Ugo Busatto, Bundamo...","[100, 102, 100, 104, 100, 101, 101, 100, 160, ...",https://open.spotify.com/playlist/0u7nZalGkBE0...,624.0,15558.0,"[0.6509803921568628, 0.5333333333333333, 0.298..."
4,slack-key guitar,"[hammered dulcimer, hawaiian, gamelan, string ...","[101, 123, 107, 102, 101, 100, 101, 105]","[neue deutsche harte, chilean hardcore, rap me...","[114, 139, 118, 106, 112, 160, 105, 104, 104, ...","[Keola & Kapono Beamer, Ken Emerson, James ""Bl...","[115, 110, 105, 120, 124, 119, 108, 133, 105, ...",https://open.spotify.com/playlist/3AcB5ov4dAX8...,803.0,5133.0,"[0.3215686274509804, 0.6588235294117647, 0.470..."
...,...,...,...,...,...,...,...,...,...,...,...
925,avant-garde jazz,"[straight-ahead jazz, jazz, free jazz, classic...","[101, 119, 144, 102, 100, 100, 110, 120, 100, ...","[k-pop reality show, hands up, dance-punk, t-p...","[103, 100, 116, 127, 103, 121, 100, 111, 149, ...","[Bobby Hutcherson, Eric Dolphy Quintet, Ed Bla...","[129, 102, 104, 108, 104, 116, 100, 104, 119, ...",https://open.spotify.com/playlist/72YNCrIywbtP...,760.0,3710.0,"[0.32941176470588235, 0.5607843137254902, 0.39..."
926,rain,"[shush, sound, environmental, sleep, water]","[105, 115, 108, 100, 107]","[bachata dominicana, funk viral, indonesian vi...","[102, 125, 122, 112, 133, 112, 112, 134, 100, ...","[Attono, Orage et pluit, PDC RAIN, Dream Wonde...","[139, 118, 124, 120, 140, 152, 123, 122, 121, ...",https://open.spotify.com/playlist/4NVJ5W2V7pga...,64.0,5199.0,"[0.6509803921568628, 0.0, 0.7333333333333333]"
927,flamenco guitar,"[french jazz, gong, jazz catala, nuevo flamenc...","[100, 101, 100, 120, 123, 115, 118]","[neue deutsche harte, neon pop punk, indie roc...","[149, 125, 139, 107, 108, 100, 126, 105, 127, ...","[Esteban De Sanlucar, Paco Serrano, Pepe MartÃ...","[101, 101, 101, 102, 104, 100, 106, 101, 111, ...",https://open.spotify.com/playlist/0IqRtRiN7mb3...,946.0,3996.0,"[0.4549019607843137, 0.5686274509803921, 0.682..."
928,south sudanese pop,"[reunion pop, african reggae, ugandan pop, hip...","[101, 101, 100, 103, 101, 103, 100, 101, 101, ...","[neofolk, japanese psychedelic rock, bagpipe, ...","[153, 130, 155, 148, 149, 128, 131, 138, 104, ...","[Dau Dau, Alijoma, General Paolino, Sevena The...","[116, 102, 100, 105, 100, 102, 104, 101, 103, ...",https://open.spotify.com/playlist/2domD3pH2tbu...,1091.0,15406.0,"[0.6784313725490196, 0.5529411764705883, 0.054..."


## Save the results

In [17]:
# Save the data
enao_df.to_pickle('../Data/pkl/02_everynoise_data.pkl')
enao_df.to_csv('../Data/csv/02_everynoise_data.csv')  # For visualization