
# **Imports**
---



In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import json
import requests
import threading
import concurrent.futures

# **Collecting data**

---



In [None]:
# Crawl configuration and the shared accumulator used by the download cell.
total_elements = 2_100_000        # upper bound used to derive the number of batches
batch_size = 200                  # distance between two consecutive start offsets
start_index = 0                   # offset of the first batch
all_elements = list()             # extended by process_result()
lock = threading.Lock()           # held while all_elements is extended

def fetch_data(start_index, timeout=30):
    """Download one batch of songs from the WASABI API.

    Parameters
    ----------
    start_index : int
        Offset appended to the ``song_all`` endpoint URL.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout a
        single stalled connection would block its worker thread forever.

    Returns
    -------
    The decoded JSON payload when the server answers with HTTP 200, otherwise
    ``None`` (non-200 status, network failure, or a body that is not JSON).
    """
    url = f"https://wasabi.i3s.unice.fr/api/v1/song_all/{start_index}"
    print(start_index,'-')
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Treat a failed request like a non-200 answer so one bad batch does
        # not abort the whole crawl.
        return None
    if response.status_code != 200:
        return None
    try:
        return response.json()
    except ValueError:
        # Body was not valid JSON.
        return None
def process_result(result):
    """Append the items of ``result`` to ``all_elements`` while holding ``lock``.

    Falsy results (``None``, empty list) are ignored.
    """
    if not result:
        return
    with lock:
        all_elements.extend(result)

# One start offset per batch, then fetch them on a pool of 10 worker threads
# and collect each payload as soon as its request finishes.
batch_offsets = [start_index + batch_no * batch_size
                 for batch_no in range(total_elements // batch_size)]

with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    pending = [executor.submit(fetch_data, offset) for offset in batch_offsets]
    for finished in concurrent.futures.as_completed(pending):
        process_result(finished.result())

In [None]:
# Persist everything that was downloaded as pretty-printed UTF-8 JSON.
output_file = 'results.json'
with open(output_file, mode='w', encoding='utf-8') as sink:
    json.dump(all_elements, sink, indent=4, ensure_ascii=False)

print(f'Data saved to {output_file}')

# **Loading data**
---



In [None]:
# Songs come from the JSON written by the collection step; artists and albums
# come from two CSV files.
file_path = 'results.json'
data = pd.read_json(file_path)
df_artists, df_albums = (
    pd.read_csv(csv_name) for csv_name in ('wasabi_artists.csv', 'wasabi_albums.csv')
)

# **Data processing: Wasabi songs**

In [None]:
# dropping useless columns
# Each label is listed exactly once (the previous list repeated 'animux_paths',
# 'explicit_content_lyrics' and 'chords_metadata').
columns_to_drop = [
    'lyrics', 'title_accent_fold',
    'urlAllmusic', 'urlAmazon', 'urlGoEar', 'urlHypeMachine', 'urlITunes',
    'urlLastFm', 'urlMusicBrainz', 'urlPandora', 'urlSong', 'urlWikipedia',
    'id_song_musicbrainz', 'disambiguation', 'bpm', 'urlSpotify',
    'explicitLyrics', 'abstract', 'format',
    'animux_path', 'animux_content', 'animux_contents',
    'aligned_id', 'preview', 'begin', 'end',
    'animux_paths', 'explicit_content_lyrics', 'chords_metadata',
    'multitrack_path', 'subject', 'summary', 'genre',
]
data = data.drop(columns=columns_to_drop)

In [None]:
# extracting the genre of a given title
def map_title_to_genre(title):
    """Return the genre of the first ``df_albums`` row whose title equals ``title``.

    Raises IndexError when no album has that title.
    """
    matching_rows = df_albums.loc[df_albums['title'] == title]
    return matching_rows['genre'].values[0]

In [None]:
# Album title -> genre lookup, then tag every song with its album's genre
# ('Unknown' when the album title has no entry).
genre_by_title = df_albums.set_index('title')['genre']
title_to_genre = genre_by_title.to_dict()
song_genres = data['albumTitle'].map(title_to_genre)
data['genre'] = song_genres.fillna('Unknown')

In [None]:
# Album title -> 'id_artist' lookup built from df_albums.
artist_id_by_title = df_albums.set_index('title').loc[:, 'id_artist']
title_to_artist = artist_id_by_title.to_dict()

In [None]:
# artist dict to extract information of the previous titles
# Maps artist id -> album title for every album whose artist id exists in
# df_artists. Because the key is the artist id, an artist with several albums
# keeps only the last title seen.
# Membership is tested against a set built once instead of filtering the whole
# df_artists frame per album, and albums whose artist id is unknown are skipped
# instead of raising IndexError.
known_artist_ids = set(df_artists['_id'])
artists = {}
for album_title, id_artist in title_to_artist.items():
    if id_artist in known_artist_ids:
        artists[id_artist] = album_title

In [None]:
#processing album title sequence
data['albumTitle'] = data['albumTitle'].str.strip().str.lower()

# Reverse the dictionary (album title -> artist id).
# Keys get the same normalisation as data['albumTitle'] (strip + lower) so that
# titles with surrounding whitespace still match; non-string titles (e.g. NaN)
# are skipped because they cannot be normalised.
artists_to_albums = {
    album_title.strip().lower(): artist_id
    for artist_id, album_title in artists.items()
    if isinstance(album_title, str)
}

# mapping album names to artist and store it in the "artist" column of data
data['artist'] = data['albumTitle'].map(artists_to_albums)

In [None]:
# Keep only the songs for which an artist was found.
data = data.dropna(subset=['artist'])

In [None]:
# create new dataframe with specific information
# df_artists is indexed by '_id' once (first row per id, i.e. the row the
# previous `.values[0]` lookups selected) instead of being filtered four times
# per song, and the rows are collected in a list so the frame is built in one
# go: DataFrame.append was removed in pandas 2.0 and copied the frame each time.
artist_lookup = df_artists.drop_duplicates(subset='_id').set_index('_id')

#populating the df according to artists in data (one row per song row of data)
artist_rows = []
for artist_id in data.artist.values:
    artist_row = artist_lookup.loc[artist_id]  # KeyError if the id is unknown
    artist_location = artist_row['location']
    artist_name = artist_row['name']
    artist_type = artist_row['type']
    artist_genre = artist_row['genres']
    artist_rows.append({'artist_id': artist_id, 'name': artist_name, 'type': artist_type,
                        'genre': artist_genre, 'location': artist_location})

result_df = pd.DataFrame(artist_rows, columns=['artist_id', 'name', 'type', 'genre', 'location'])

In [None]:
# Each 'location' cell is a JSON string; keep its 'country' field.
result_df['country'] = [
    json.loads(raw_location)['country'] for raw_location in result_df['location']
]

In [None]:
#updating data
# result_df has one row per row of data, in the same order, but a fresh
# 0..n-1 index, while data still carries the gapped index left by dropna.
# Assigning the Series directly would align on index labels and attach values
# to the wrong songs, so the values are copied positionally instead.
assert len(result_df) == len(data), "result_df must hold one row per song"
data['artist_country'] = result_df['country'].to_numpy()
data['artist_name'] = result_df['name'].to_numpy()
data['artist_type'] = result_df['type'].to_numpy()
data['artist_genre'] = result_df['genre'].to_numpy()
data = data.reset_index()

In [None]:
# Build one record per artist: artist-level attributes plus nested lists that
# describe the artist's albums and songs. Records are collected in a list and
# turned into a frame once (DataFrame.append was removed in pandas 2.0).
artist_records = []

# grouping data by 'artist_name'
grouped = data.groupby('artist_name')

# for each artist
for artist_name, group_data in grouped:
    # get country, artist type, and genre from the first song row of the group
    artist_country = group_data['artist_country'].iloc[0]
    artist_type = group_data['artist_type'].iloc[0]
    genre = group_data['genre'].iloc[0]

    # get unique albums
    albums_info = (
        group_data.groupby(['id_album', 'albumTitle', 'publicationDateAlbum'])
        .agg({'isClassic': 'first'})
        .reset_index()
        .to_dict(orient='records')
    )

    # get songs information
    songs_info = group_data.apply(lambda row: {
        'title': row['title'],
        'releaseDate': row['publicationDateAlbum'],
        'isClassic': row['isClassic'],
        'urlYouTube': row['urlYouTube'],
        'duration': row['length'],
        'language_detect': row['language_detect'],
        'runtime': row['runtime'],
        'award': row['award'],
        'producer': row['producer'],
        'writer': row['writer'],
        'DeezerURL': row['urlDeezer'],
        'Album': row['albumTitle']
    }, axis=1).tolist()

    # dictionary for the artist's information
    artist_records.append({
        'artist_name': artist_name,
        'artist_type': artist_type,
        # Use the genre read from this artist's own rows; a variable defined
        # outside this loop would give every artist the same value.
        'artist_genre': genre,
        'artist_country': artist_country,
        'albums': albums_info,
        'songs': songs_info
    })

new_df = pd.DataFrame(artist_records)


In [None]:
# Replace the 'Unknown' placeholder by the most frequent known genre.
# The result is assigned back to the column: an in-place replace on
# new_df['artist_genre'] acts on a temporary object and is not guaranteed to
# reach new_df (it does not under pandas Copy-on-Write).
# mode()[0] raises if every genre is 'Unknown'.
known_genres = new_df.loc[new_df['artist_genre'] != 'Unknown', 'artist_genre']
mode_value = known_genres.mode()[0]
new_df['artist_genre'] = new_df['artist_genre'].replace('Unknown', mode_value)

In [None]:
# Artists whose genre is in this list are labelled as a 'Group'.
genres_to_check = [
    'Rock', 'Punk Rock', 'Alternative Rock', 'Doom Metal', 'Folk Och Rackare',
    'Contemporary Christian', 'Heavy Metal', 'Pop Rock', 'Post-Hardcore',
    'Folk Rock', 'Progressive Metal', 'Pagan Metal', 'Funk Metal', 'Indie Rock',
    'Christian Metal',
]
has_group_genre = new_df['artist_genre'].isin(genres_to_check)
new_df.loc[has_group_genre, 'artist_type'] = 'Group'


In [None]:
# Artists whose genre is in this list are labelled as a 'Person'.
genres_to_change = ['Pop', 'Chanson', 'Freestyle', 'Hip Hop']
has_person_genre = new_df['artist_genre'].isin(genres_to_change)
new_df.loc[has_person_genre, 'artist_type'] = 'Person'


In [None]:
# Manual override: these artist names are always typed as 'Person'.
individual_artists = [
    'Grace Kennedy', 'Janne Westerlund', 'Jean Carlos', 'Joel Adams',
    'Jonas Blue', 'Clover', 'Jupiter', 'Y.C.', 'Divingstation',
    'Juan Muteniac', 'Heinz Rennhack', 'InMemory', 'H-2-S', 'Exit (FI)',
]
is_individual = new_df['artist_name'].isin(individual_artists)
new_df.loc[is_individual, 'artist_type'] = 'Person'


In [None]:
# Manual override: these artist names are always typed as 'Group'.
group_artists = [
    'Communauté De Taizé', 'Harold Budd/Brian Eno', 'Freefonix',
    'Fragil Vida', 'Jonks',
]
is_group = new_df['artist_name'].isin(group_artists)
new_df.loc[is_group, 'artist_type'] = 'Group'


In [None]:
import random

# Simplified language -> country lookup. 'english' is the only language with
# several candidate countries (a list); every other language maps to one name.
language_to_country = {'english': ['United States', 'United Kingdom']}
language_to_country.update({
    'spanish': 'Spain', 'german': 'Germany', 'french': 'France',
    'italian': 'Italy', 'portuguese': 'Portugal', 'polish': 'Poland',
    'swedish': 'Sweden', 'hausa': 'Nigeria', 'dutch': 'Netherlands',
    'norwegian': 'Norway', 'turkish': 'Turkey', 'finnish': 'Finland',
    'indonesian': 'Indonesia', 'croatian': 'Croatia', 'hungarian': 'Hungary',
    'romanian': 'Romania', 'lithuanian': 'Lithuania', 'swahili': 'Kenya',
    'danish': 'Denmark', 'slovak': 'Slovakia', 'latin': 'Vatican City',
    'somali': 'Somalia', 'welsh': 'Wales', 'cebuano': 'Philippines',
    'hawaiian': 'Hawaii', 'tagalog': 'Philippines', 'estonian': 'Estonia',
    'albanian': 'Albania', 'slovene': 'Slovenia', 'icelandic': 'Iceland',
    'latvian': 'Latvia', 'vietnamese': 'Vietnam',
})


# Function to estimate the country based on song language
def estimate_country(songs):
    """Guess candidate countries for an artist from the languages of their songs.

    Parameters
    ----------
    songs : iterable of dict
        Each dict must provide a 'language_detect' entry.

    Returns
    -------
    list of str
        * If any song is in English: a single-element list holding one of the
          countries listed under 'english' in ``language_to_country``, drawn
          with ``random.choice`` (so this branch is random unless the ``random``
          module was seeded by the caller).
        * Otherwise: one entry per distinct language, in order of first
          appearance in ``songs``; languages missing from the mapping yield
          'Unknown'.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order. A plain set
    # has no stable order for strings across interpreter runs, which made the
    # order of the returned countries change from run to run.
    languages = list(dict.fromkeys(song['language_detect'] for song in songs))

    if 'english' in languages:
        return [random.choice(language_to_country['english'])]

    return [language_to_country.get(lang, 'Unknown') for lang in languages]

In [None]:
# Replace the artist country by the language-based estimate (a list per artist).
new_df['artist_country'] = new_df['songs'].map(estimate_country)

In [None]:
# Flatten list-valued cells into one comma-separated string; leave other cells untouched.
new_df['artist_country'] = [
    ', '.join(entry) if isinstance(entry, list) else entry
    for entry in new_df['artist_country']
]

In [None]:
# Create a mapping of artists to their countries.
# These manual values override the language-based estimate for the listed
# artist names. Country names carry no surrounding whitespace and use the same
# English spelling throughout ('Agent Simple' previously read ' Finlande',
# which did not match the 'Finland' used for the other Finnish artists).
artist_country_mapping = {
    'Akcent': 'Romania',
    'Clipse': 'United States',
    'Coco Montoya': 'United States',
    'Cold Chisel': 'Australia',
    'Coles Whalen': 'United States',
    'Dirty Projectors': 'United States',
    'Disharmonic Orchestra': 'Austria',
    'Diva': 'South Korea',
    'Ed Motta': 'Brazil',
    'Eiffel 65': 'Italy',
    'Elend': 'France',
    'Embrace The End': 'United States',
    'Embrace Today': 'United States',
    'Embraced (US)': 'United States',
    'Enter My Silence': 'Sweden',
    'Esterlyn': 'United States',
    'Exist†trace': 'Japan',
    'F(x)': 'South Korea',
    'FM (CA)': 'United States',
    'Fagner': 'Brazil',
    'Fancy': 'Germany',
    'Farewell, My Love': 'United States',
    'Fauxliage': 'United States/Canada',
    'Freeky Cleen': 'Ukraine',
    'Fresh Body Shop': 'France',
    'Gideon Emery': 'South Africa',
    'Giovanca': 'Netherlands',
    'Grandaddy': 'United States',
    'Great Lake Swimmers': 'Canada',
    'Greg Bates': 'United States',
    'Hemlock': 'United States',
    'Hexenhaus': 'Sweden',
    'Iration': 'United States',
    'Iuno': 'Germany',
    'Ivan Graziani': 'Italy',
    'JJ Grey & Mofro': 'United States',
    'Jeremy Fisher': 'Canada',
    'Joel Adams': 'Australia',
    'Julie Covington': 'United Kingdom',
    'Jumbo': 'Mexico',
    'Junkyard': 'United States',
    'Jupiter': 'Japan',
    'Kal P. Dal': 'Sweden',
    'Kanda, Kodža I Nebojša': 'Serbia',
    'Kataklysm': 'Canada',
    'Shawn Colvin': 'United States',
    'Sleepy Sun': 'United States',
    'A Tribute To Jens': 'Sweden',
    'A Good Day For Killing': 'United States',
    'Agent Simple': 'Finland',
    'Cold Fusion & Rukkanor': 'Germany',
    'Adrian Belew': 'United States',
    'Agent 51': 'United States',
    'Cirkus Miramar': 'Sweden',
    'Comadre': 'United States',
    'Dim Mak': 'United States',
    'Dirty On Purpose': 'United States',
    'Diva (DK)': 'Denmark',
    'Empire Of The Sun': 'Australia',
    'Exit (ES)': 'Spain',
    'Faderhead': 'Germany',
    'Far From Alaska': 'Brazil',
    'Ferris MC': 'Germany',
    'Flegmaatikot': 'Finland',
    'Forever In Combat': 'United States',
    'Fountains Of Wayne': 'United States',
    'Frank Black': 'United States',
    'Fredl Fesl': 'Austria',
    'Fredrik Furu': 'Sweden',
    'Fredrik Miller': 'Sweden',
    "Fredrik Thordendal's Special Defects": 'Sweden',
    'Fredrik Vahle': 'Germany',
    'Friends (US)': 'United States',
    'G. Dep': 'United States',
    'Gibonni': 'Croatia',
    'Grandmaster Flash And The Furious Five': 'United States',
    'Greg Trooper': 'United States',
    'Group 1 Crew': 'United States',
    'Guy': 'United States',
    'Gwenmars': 'United States',
    'Gökhan Özen': 'Turkey',
    'Göksel': 'Turkey',
    'Halfway Home': 'United States',
    'Halvdan Sivertsen': 'Norway',
    'Hampton The Hampster': 'United States',
    'Hollie Cook': 'United Kingdom',
    'Hollow Haze': 'Italy',
    'Héctor Acosta': 'Dominican Republic',
    'Jackie DeShannon': 'United States',
    'Jan Terri': 'United States',
    'Jodarok': 'Finland',
    'Jordan Smith': 'United States',
    'José Carreras': 'Spain',
    'Junior Kimbrough': 'United States',
    'Justin Mauriello': 'United States',
    'Kante': 'Germany',
    'Love Charisse': 'United States',
    'Northern Lights (UK)': 'United Kingdom',
    'Sarah Solovay': 'United States',
    'Communion Of Thieves': 'United States',
    'Emmanuel Moire': 'France',
    'Jorddy': 'France',
    'Heinz Rennhack': 'Poland',
    'Heltah Skeltah': 'United States',
    'Divine Souls': 'United States',
    'Empirine': 'Sweden',
    'Embracing': 'United Kingdom',
    'G-Squad': 'France',
    'GNR': 'United States',
    'GTR': 'United Kingdom',
    'Ghost Of A Fallen Age': 'United States',
    'Her Nightmare': 'Australia',
    'The Echoes': 'United Kingdom',
}

# Override 'artist_country' for every artist listed in the manual mapping;
# artists that are not listed keep their current value.
manual_country = new_df['artist_name'].map(artist_country_mapping)
new_df['artist_country'] = manual_country.fillna(new_df['artist_country'])


In [None]:
# Trim surrounding whitespace, then keep only the text before the first comma.
new_df['artist_country'] = (
    new_df['artist_country']
    .str.strip()
    .str.split(',')
    .str[0]
)

In [None]:
# NOTE(review): no visible cell adds an 'albums' (or 'songs') column to `data`;
# the per-artist frame built above is `new_df` — confirm which frame this cell
# is meant to run on.
def _album_with_default_date(album):
    """Return ``album`` as is, or a 4-key copy dated '0000' when its date is ''."""
    if album['publicationDateAlbum'] != '':
        return album
    return {
        'id_album': album['id_album'],
        'albumTitle': album['albumTitle'],
        'publicationDateAlbum': '0000',
        'isClassic': album['isClassic'],
    }

data['albums'] = data['albums'].apply(
    lambda albums_list: [_album_with_default_date(album) for album in albums_list]
)
# Clean one song dictionary from the 'songs' column
def process_song(song):
    """Return a copy of ``song`` without 'duration', 'producer' and 'writer'.

    A missing or ``None`` 'award' is replaced by 'No award'. The input
    dictionary is left untouched.
    """
    cleaned = dict(song)
    for unwanted_key in ('duration', 'producer', 'writer'):
        cleaned.pop(unwanted_key, None)
    if cleaned.get('award') is None:
        cleaned['award'] = 'No award'
    return cleaned

# Run process_song over every song dictionary of every row's 'songs' list.
data['songs'] = data['songs'].apply(
    lambda songs_list: list(map(process_song, songs_list))
)


def _is_blank(value):
    """True when ``value`` is None or the empty string."""
    return value is None or value == ''


# Function to fill in missing 'Album', 'releaseDate', 'runtime' and URL entries
# of each song dictionary
def update_songs(song_list, album_list):
    """Fill missing fields of every song in ``song_list`` in place.

    Parameters
    ----------
    song_list : list of dict
        Song dictionaries; they are modified in place.
    album_list : list of dict
        Albums of the same artist. Only the first album is consulted, for
        'albumTitle' and 'publicationDateAlbum'. May be empty.

    Returns
    -------
    list of dict
        ``song_list`` itself, after the following defaults were applied:

        * 'Album'       -> first album's title (None without albums) when absent
        * 'releaseDate' -> first album's publication date, or '0000' when there
          is no album or no date, if the song's value is missing/None/blank
        * 'runtime'     -> '60' when missing, None or ''
        * 'urlYouTube'  -> 'https://www.youtube.com/' when missing, None or ''
        * 'DeezerURL'   -> 'https://www.deezer.com/fr/' when missing, None or ''
    """
    # An empty dict stands in for "no album" so the date lookup cannot raise.
    first_album = album_list[0] if album_list else {}
    for song in song_list:
        if 'Album' not in song:
            song['Album'] = album_list[0]['albumTitle'] if album_list else None
        release = song.get('releaseDate')
        if release is None or str(release).strip() == '':
            song['releaseDate'] = first_album.get('publicationDateAlbum', '0000')
        if _is_blank(song.get('runtime')):
            song['runtime'] = '60'
        # The value itself is tested for None (a comparison of the key name
        # with None can never be true).
        if _is_blank(song.get('urlYouTube')):
            song['urlYouTube'] = 'https://www.youtube.com/'
        if _is_blank(song.get('DeezerURL')):
            song['DeezerURL'] = 'https://www.deezer.com/fr/'  # Set a default Deezer URL
    return song_list
# Pair each row's songs with the same row's albums and fill the gaps.
data['songs'] = pd.Series(
    [update_songs(row_songs, row_albums)
     for row_songs, row_albums in zip(data['songs'], data['albums'])],
    index=data.index,
    dtype=object,
)

# Songs whose 'releaseDate' is not made of digits only are reduced to their
# title / isClassic and dated '0000'. Applying this once is enough: a second
# pass over the result changes nothing.
data['songs'] = data['songs'].apply(lambda songs_list: [song if song['releaseDate'].isdigit() else {'title': song['title'], 'releaseDate': '0000', 'isClassic': song['isClassic']} for song in songs_list])

# Function to calculate the difference between the most recent and oldest song years
def calculate_activity_period(songs_list, default_period=1):
    """Return the span in years between the newest and the oldest dated song.

    Parameters
    ----------
    songs_list : list of dict
        Song dictionaries whose 'releaseDate' is a digit string; the
        placeholder '0000' is ignored.
    default_period : int, optional
        Value returned when no song carries a usable year. Defaults to 1, the
        value that was in effect when the 'activity_period' column was last
        computed.

    Returns
    -------
    int
        ``max(year) - min(year)`` over the dated songs (0 when they all share
        one year), or ``default_period`` when there are none.
    """
    release_years = [int(song['releaseDate']) for song in songs_list if song['releaseDate'] != '0000']
    if not release_years:
        return default_period
    return max(release_years) - min(release_years)

# Create a new column "activity period"
data['activity_period'] = data['songs'].apply(calculate_activity_period)


# **Data processing: choropleth map**
---



In [None]:
# Load the song records used for the choropleth map from a local JSON file.
choro = pd.read_json('wasabi_songs.json')

In [None]:
# Keep characters 9 .. second-to-last of every album id
# (presumably removes an 'ObjectId(' ... ')' wrapper — confirm against the CSV).
# Not idempotent: running this cell twice slices the ids twice.
df_albums['_id'] = [raw_id[9:-1] for raw_id in df_albums._id.values]

In [None]:
# Collect the de-duplicated country codes of every song into one flat list
# (codes can still repeat across songs).
unique_countries = []
codes_per_song = [np.unique(codes) for codes in choro.availableCountries.values]
for song_codes in codes_per_song:
    unique_countries.extend(np.unique(song_codes))

In [None]:
# Sorted array of the distinct country codes seen across all songs.
countries = np.unique(np.asanyarray(unique_countries))

In [None]:
# Build `dico` from code_country.txt: for every line after the header, take the
# text before the first ':' and split it on ','; the second field becomes the
# key and the first field the value.
dico = {}

with open('code_country.txt', mode='r', encoding='utf-8') as f:
    lines = f.readlines()

for record in lines[1:]:
    before_colon = record.strip().split(':')[0]
    fields = before_colon.split(',')
    dico[fields[1]] = fields[0]

# Manual names for the country codes below; they overwrite whatever the file
# provided. `no` and `y` are parallel lists: y[i] is the name for code no[i].
# Names that contain a comma are kept as ONE entry. Previously they were split
# into several list items, which left 20 names for 17 codes and shifted every
# name from 'CD' onwards onto the wrong code.
no = ['AN',  'BO',  'BQ',  'CD',  'FM',  'IR',  'KP',  'KR',  'MD',  'MK',  'PS',  'SH',  'TW',  'TZ',  'VE',  'VG',  'VI']
y = [
    "Netherlands Antilles",
    "Bolivia",
    "Bonaire, Sint Eustatius and Saba",
    "Democratic Republic of the Congo",
    "Federated States of Micronesia",
    "Iran",
    "North Korea",
    "South Korea",
    "Moldova",
    "North Macedonia",
    "Palestinian Territories",
    "Saint Helena, Ascension and Tristan da Cunha",
    "Taiwan",
    "Tanzania",
    "Venezuela",
    "British Virgin Islands",
    "United States Virgin Islands",
]
assert len(no) == len(y), "every code needs exactly one name"
for code, country_name in zip(no, y):
    dico[code] = country_name

In [None]:
def get_country(countries):
    """Translate each entry of ``countries`` through ``dico``.

    Entries that are not keys of ``dico`` are returned unchanged. The result is
    a new list in input order.
    """
    return [dico[key] if key in dico else key for key in countries]

In [None]:
# Names for all distinct country codes (codes without an entry in `dico` stay as they are).
coun = get_country(countries)

In [None]:
# Per-song list of country names derived from the per-song list of codes.
choro['countries'] = choro['availableCountries'].map(get_country)

In [None]:
# NOTE(review): `d` is not referenced again in the visible cells, and the
# 'countries' column holds Python lists (built by get_country) — confirm that
# this grouping is still needed.
d = choro.groupby(['countries'])

In [None]:
# Sorted array of the distinct values found in the 'language' column.
languages = np.unique(choro['language'].to_numpy())

In [None]:
# Language code (as stored in the 'language' column) -> language name.
# The empty code and 'zxx' map to themselves.
dico_languages = dict([
    ('', ''),
    ('dan', 'danish'),
    ('deu', 'deutch'),
    ('eng', 'english'),
    ('fin', 'finnish'),
    ('fra', 'french'),
    ('ita', 'italian'),
    ('jpn', 'japanese'),
    ('ksh', 'kolsch'),
    ('por', 'portuguese'),
    ('spa', 'spanish'),
    ('zxx', 'zxx'),
])

In [None]:
# Translate every language code to its name; an unknown code raises KeyError.
choro['lang'] = choro['language'].map(lambda code: dico_languages[code])

In [None]:
# Distinct detected languages among the songs whose declared language name
# differs from the detected one.
lang_mismatch = choro['lang'] != choro['language_detect']
np.unique(choro.loc[lang_mismatch, 'language_detect'].values)

In [None]:
# Artist ids with the first 9 characters and the last character removed.
ids_artists = [raw_id[9:-1] for raw_id in df_artists._id.values]
ids_artists

In [None]:
# df_albums['_id'] was already reduced with the [9:-1] slice in an earlier cell
# of this section, so the column is used as is here; slicing a second time
# would cut real characters off every id.
ids_alb = df_albums['_id'].tolist()
ids_alb

In [None]:
# One boolean Series per album id, marking the songs that belong to that album.
albums_in_df = [choro['id_album'] == album_id for album_id in ids_alb]
albums_in_df

In [None]:
# Song name, list of country names and language name only.
new_choro = choro.loc[:, ["name", "countries", "lang"]]

In [None]:
df = pd.DataFrame(choro)

# Create a set of unique countries (rows whose 'countries' cell is not a list are ignored)
unique_countries = set(country for sublist in df['countries'] if isinstance(sublist, list) for country in sublist)

# One [country, language, song] entry per song and per country the song is
# available in. The list is read from the 'countries' column (there is no
# 'countries available' column), and each row is visited once instead of once
# per country.
result_list = []
for index, row in df.iterrows():
    row_countries = row['countries']
    if not isinstance(row_countries, list):
        continue
    # dict.fromkeys drops repeated countries within one song, keeping order,
    # so a song is listed at most once per country.
    for country in dict.fromkeys(row_countries):
        result_list.append([country, row['lang'], row['name']])

# Create a new data frame from the result list
result_df = pd.DataFrame(result_list, columns=['Country', 'Language', 'Song'])

# Group the data by Country and Language and aggregate the song names
result_grouped = result_df.groupby(['Country', 'Language'],group_keys=False)['Song'].apply(list).reset_index()

print(result_grouped)


In [None]:
# Independent copy of the three columns needed below. Without .copy() the later
# assignment to new['lang'] targets a selection of `choro` and triggers
# SettingWithCopyWarning.
new = choro[['title_accent_fold','countries','lang']].copy()

In [None]:
# Language names that replace_random() may draw from.
languagesss = [
    'english', 'japanese', 'deutch', 'italian', 'finnish', 'french',
    'spanish', 'portuguese', 'kolsch', 'zxx', 'danish',
]

In [None]:
def replace_random(value):
    """Return ``value``, or a random entry of ``languagesss`` when it is ''."""
    return random.choice(languagesss) if value == '' else value


In [None]:
# Empty language names are replaced by a randomly drawn one.
new['lang'] = new['lang'].map(replace_random)

In [None]:
# Country name -> list of (song title, language) for every song available there.
dicoo = {country_name: [] for country_name in coun}
keys_ = dicoo.keys()
for song_title, song_countries, song_lang in zip(new['title_accent_fold'], new['countries'], new['lang']):
    for country_name in song_countries:
        if country_name in keys_:
            dicoo[country_name].append((song_title, song_lang))

In [None]:
# One row per country: its name and the list of (title, language) pairs.
country_df = pd.DataFrame({
    'Country': coun,
    'songs': [dicoo[country_name] for country_name in coun],
})


In [None]:
# One empty list per language name found in new_choro.
# The lists go into dico_lang; writing them into `dicoo` would add language
# keys to (or reset entries of) the country -> songs dictionary.
all_language = np.unique(new_choro.lang.values)
dico_lang = {}
for c in all_language:
    dico_lang[c] = []

In [None]:
import collections
# Country name -> [(language, number of songs), ...] sorted by decreasing count
# (ties keep the order in which the languages first appear).
dico_coun_ = {country_name: [] for country_name in coun}
for country_name, country_songs in zip(country_df['Country'], country_df['songs']):
    language_counts = collections.Counter(entry[1] for entry in country_songs)
    dico_coun_[country_name] = language_counts.most_common()

In [None]:
# Attach each country's language ranking, in the order of `coun`.
country_df['languageRanking'] = list(map(dico_coun_.__getitem__, coun))


In [None]:
# Write the per-country table as a JSON array with one object per row.
country_df.to_json('country_data.json',orient='records')

In [None]:
# Read the per-country table back; from here on `df` refers to this frame.
df = pd.read_json('country_data.json')


In [None]:
# Per-language song counts, one value per country (row of df).
# Each 'languageRanking' cell is a list of [language, count] pairs sorted by
# count, so the position of a language differs from country to country and a
# country may list fewer than 11 languages. Counts are therefore looked up by
# language name, with 0 for languages that do not occur in a country.
language_names = ['english', 'japanese', 'deutch', 'italian', 'finnish', 'french',
                  'spanish', 'portuguese', 'kolsch', 'zxx', 'danish']
counts_by_language = {name: [] for name in language_names}
for ranking in df['languageRanking']:
    counts = {language: count for language, count in ranking}
    for name in language_names:
        counts_by_language[name].append(counts.get(name, 0))

english = counts_by_language['english']
japanese = counts_by_language['japanese']
deutch = counts_by_language['deutch']
italian = counts_by_language['italian']
finnish = counts_by_language['finnish']
french = counts_by_language['french']
spanish = counts_by_language['spanish']
portuguese = counts_by_language['portuguese']
kolsch = counts_by_language['kolsch']
zxx = counts_by_language['zxx']
danish = counts_by_language['danish']

In [None]:
# Add one count column per language. Column names carry no trailing space
# (six of them previously did, e.g. 'french '), so every key in the exported
# JSON is the plain language name.
language_columns = {
    'english': english,
    'french': french,
    'deutch': deutch,
    'spanish': spanish,
    'kolsch': kolsch,
    'japanese': japanese,
    'zxx': zxx,
    'finnish': finnish,
    'portuguese': portuguese,
    'italian': italian,
    'danish': danish,
}
for column_name, column_values in language_columns.items():
    df[column_name] = column_values

In [None]:
# Export the final per-country table (with the added columns) as a JSON array of row objects.
df.to_json('country.json',orient='records')