## Project 3 - Group 7 ##

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

**Extracting Data From Last.FM**

In [None]:
# WARNING: Do not run this cell unless you want to replace the CSV file
import time
from IPython.core.display import clear_output
import requests

API_KEY = '3b7edf89f3e5d5af2746988a93b5c39e'
headers = {'user-agent': 'machinelearning'}
url = 'https://ws.audioscrobbler.com/2.0/'

responses = []
page = 1
total_pages = 99999

while page <= 2: # set the number of pages we want to download. replace with "total_pages" to download everything.
    payload = {'method': 'chart.gettoptracks', 'api_key': API_KEY, 'format': 'json', 'page' : page}

    print("Requesting page {}/{}".format(page, total_pages)) # print some output so we can see the status
    clear_output(wait = True)

    response = requests.get(url, headers=headers, params=payload)
    if response.status_code != 200: # if we get an error, print the response and halt the loop
        print(response.text)
        break

    page = int(response.json()['tracks']['@attr']['page']) # extract pagination info
    total_pages = int(response.json()['tracks']['@attr']['totalPages'])

    responses.append(response) # append response
    time.sleep(0.1) #avoid requesting too fast. I don't know if we will actually get banned without this
    page += 1 # increment the page number

frames = [pd.DataFrame(r.json()['tracks']['track']) for r in responses]
df = pd.concat(frames)
df.drop(['duration', 'playcount', 'listeners', 'streamable', 'image', 'url', 'mbid'], axis=1, inplace=True)
df = df.reset_index(drop=True)
df.rename(columns = {'name':'track'}, inplace = True)

# clean artist column and get artist_list[]
temp = []
artist_list = []
key = 'name'
for i in df.artist:
    temp.append(i)
for d in temp:
    artist_list.append(d[key])
df.artist = artist_list

#get track_list
track_list = []
for i in df.track:
    track_list.append(i)

responses = []
counter = 0
while counter < len(track_list):
    payload = {'method': 'track.gettoptags', 'artist': artist_list[counter], 'track': track_list[counter], 'api_key': API_KEY, 'format': 'json', 'page' : page}
    print("Requesting tag {}/{}".format(counter, len(track_list))) # print some output so we can see the status
    clear_output(wait = True)
    response = requests.get(url, headers=headers, params=payload)
    if response.status_code != 200: # if we get an error, print the response and halt the loop
        print(response.text)
        break
    responses.append(response)
    time.sleep(0.1) #avoid requesting too fast. I don't know if we will actually get banned without this
    counter +=1

genres = []
for r in responses:
    genres.append(r.json()['toptags']['tag'][0]['name'])
df['genre'] = genres

# save df as csv file
df.to_csv('dataset.csv', index=False)
clear_output(wait = True)
print("CSV file written")

In [3]:
dataset = pd.read_csv('dataset.csv')
dataset

Unnamed: 0,track,artist,genre
0,As It Was,Harry Styles,indie pop
1,N95,Kendrick Lamar,rap
2,United In Grief,Kendrick Lamar,jazz rap
3,Worldwide Steppers,Kendrick Lamar,kodak black
4,Die Hard,Kendrick Lamar,rnb
...,...,...,...
145,King Kunta,Kendrick Lamar,Hip-Hop
146,Golden,Harry Styles,indie pop
147,Softcore,The Neighbourhood,alternative
148,Heartless,Kanye West,Kanye West


**Simplify Genres**


In [4]:
# Simplify Genres

# pop <- noise pop <- pop rap <- indie pop <- dream pop <- chamber pop <- synthpop
# Hip-Hop <- Conscious Hip Hop <- experimental hip hop
# rap <- Kanye West <- uh huh honey <- Baby Keem <- jazz rap

# we can add more to the lists as we extract the final dataframe

pop = ['noise pop', 'pop rap', 'indie pop', 'dream pop', 'chamber pop', 'synthpop', 'pop rock']
hip_hop = ['Conscious Hip Hop', 'experimental hip hop']
rap = ['Kanye West', 'uh huh honey', 'Baby Keem', 'jazz rap', 'kodak black']
rock = ['indie rock', 'Grunge']
soul = ['Neo-Soul']

dataset.genre = dataset.genre.replace(pop, 'pop')
dataset.genre = dataset.genre.replace(hip_hop, 'Hip-Hop')
dataset.genre = dataset.genre.replace(rap, 'rap')
dataset.genre = dataset.genre.replace(rock, 'rock')
dataset.genre = dataset.genre.replace(soul, 'soul')
dataset

Unnamed: 0,track,artist,genre
0,As It Was,Harry Styles,pop
1,N95,Kendrick Lamar,rap
2,United In Grief,Kendrick Lamar,rap
3,Worldwide Steppers,Kendrick Lamar,rap
4,Die Hard,Kendrick Lamar,rnb
...,...,...,...
145,King Kunta,Kendrick Lamar,Hip-Hop
146,Golden,Harry Styles,pop
147,Softcore,The Neighbourhood,alternative
148,Heartless,Kanye West,rap


**Classification Learning**

In [75]:
genre_list = dataset.genre.unique()
# Note: *will make a function if genre list gets to big*
mapping = { 'pop'          : 1,
            'rap'          : 2,
            'rnb'          : 3,
            'Hip-Hop'      : 4,
            'trap'         : 5,
            'rock'         : 6,
            'indie'        : 7,
            'synthwave'    : 8,
            'psychedelic'  : 9,
            'Disco'        : 10,
            'alternative'  : 11,
            'Ballad'       : 12,
            'soul'         : 13,
            'fip'          : 14,
            'post-hardcore': 15
            }
# map the tags
dataset['genre_tag'] = dataset.genre.map(mapping)
dataset

Unnamed: 0,track,artist,genre,genre_tag
108,traitor,Olivia Rodrigo,Ballad,12.0
87,About Damn Time,Lizzo,Disco,10.0
36,About Damn Time,Lizzo,Disco,10.0
139,Nobody,Mitski,Disco,10.0
44,Family Ties (with Kendrick Lamar),Baby Keem,Hip-Hop,4.0
...,...,...,...,...
141,PUFFIN ON ZOOTIEZ,Future,trap,5.0
9,Count Me Out,Kendrick Lamar,trap,5.0
121,Get Into It (Yuh),Doja Cat,trap,5.0
143,712PM,Future,trap,5.0


In [76]:
# sort
dataset.sort_values(by=['genre'], inplace=True)

# group the datset
group = dataset.groupby(dataset.genre_tag)

# allocation
pop_set = group.get_group(1); hiphop_set = group.get_group(4); indie_set = group.get_group(7); disco_set = group.get_group(10); soul_set = group.get_group(13); 
rap_set = group.get_group(2); trap_set = group.get_group(5); synthwave_set = group.get_group(8); alternative_set = group.get_group(11); fip_set = group.get_group(14);
rnb_set = group.get_group(3); rock_set = group.get_group(6); psychedelic_set = group.get_group(9); Ballad_set = group.get_group(12); posthardcore_set = group.get_group(15);

In [77]:
from math import floor, ceil

# jaro_distance function compares two strings similarity
def jaro_distance(s1, s2):    
    # If the s are equal
    if (s1 == s2):
        return 1.0
 
    # Length of two s
    len1 = len(s1)
    len2 = len(s2)
 
    # Maximum distance upto which matching is allowed
    max_dist = floor(max(len1, len2) / 2) - 1
    match = 0  # Count of matches
 
    # Hash for matches
    hash_s1 = [0] * len(s1)
    hash_s2 = [0] * len(s2)
 
    # Traverse through the first
    for i in range(len1):
        # Check if there is any matches
        for j in range(max(0, i - max_dist), min(len2, i + max_dist + 1)):
            # If there is a match
            if (s1[i] == s2[j] and hash_s2[j] == 0):
                hash_s1[i] = 1
                hash_s2[j] = 1
                match += 1
                break
    # If there is no match
    if (match == 0):
        return 0.0
 
    # Number of transpositions
    t = 0
    point = 0
 
    # Count number of occurrences where two characters match but
    # there is a third matched character in between the indices
    for i in range(len1):
        if (hash_s1[i]):
            # Find the next matched character in second
            while (hash_s2[point] == 0):
                point += 1
            if (s1[i] != s2[point]):
                t += 1
            point += 1
    t = t//2
    
    # Return the Jaro Similarity
    return (match/ len1 + match / len2 + (match - t) / match)/ 3.0

# df to list
def df_to_list(df):
    value_list = []
    for i in df:
        value_list.append(i)
    return value_list

In [79]:
# jaro_scores are calculated by taking each track in the genre list and comparing it to the other tracks in the genre
def jaro_scores(value_list, iterator):
    jaro_scores = []
    i = 0
    while i <= len(value_list) - 1:
        jaro_scores.append(jaro_distance(value_list[iterator], value_list[i]))
        i += 1
    import statistics
    return statistics.mean(jaro_scores)

# jaro_means will take the list of scores and average them for the final score
def jaro_means(value_list):
    jaro_mean = [] 
    for i in range(len(value_list)):
        jaro_mean.append(jaro_scores(value_list, i))
    return jaro_mean

# this step will need to be repeated across all genres
poplist = df_to_list(pop_set.track)
pop_means = jaro_means(poplist)
pop_set['track_scores'] = pop_means
pop_set

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pop_set['track_scores'] = pop_means


Unnamed: 0,track,artist,genre,genre_tag,track_scores
125,Style,Taylor Swift,pop,1.0,0.391684
21,good 4 u,Olivia Rodrigo,pop,1.0,0.484544
149,Cruel Summer,Taylor Swift,pop,1.0,0.47599
105,Space Song,Beach House,pop,1.0,0.49585
88,deja vu,Olivia Rodrigo,pop,1.0,0.486648
146,Golden,Harry Styles,pop,1.0,0.416805
83,This Love (Taylor’s Version),Taylor Swift,pop,1.0,0.535748
71,good 4 u,Olivia Rodrigo,pop,1.0,0.484544
0,As It Was,Harry Styles,pop,1.0,0.503678
89,Happier Than Ever,Billie Eilish,pop,1.0,0.525073
