In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as img
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, classification_report, confusion_matrix
from sklearn.decomposition import PCA
from sklearn import svm
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from collections import Counter
import seaborn as sns

In [2]:
# helper functions
def isNaN(value) :
    """Return True when ``value`` converts to a float that is NaN.

    Anything that cannot be converted to a float (``None``, ordinary text,
    containers, integers too large for a float) is reported as "not NaN".
    Note that the strings ``"nan"`` / ``"NaN"`` do convert to a NaN float and
    therefore return True.

    Parameters
    ----------
    value : object
        Any value, typically a cell read from a DataFrame.

    Returns
    -------
    bool
    """
    import math
    try:
        return math.isnan(float(value))
    # Only conversion failures mean "not a number"; a bare ``except`` would
    # also hide KeyboardInterrupt and genuine programming errors.
    except (TypeError, ValueError, OverflowError):
        return False

<h2>Penjelasan Atribut</h2>

<table>
  <tr>
    <th>Num</th>
    <th>Attribute Name</th>
    <th>Description</th>
  </tr>
  <tr>
    <td>1</td>
    <td>title</td>
    <td>Title</td>
  </tr>
  <tr>
    <td>2</td>
    <td>artist</td>
    <td>Artist</td>
  </tr>
  <tr>
    <td>3</td>
    <td>genre</td>
    <td>Genre of the song</td>
  </tr>
  <tr>
    <td>4</td>
    <td>year</td>
    <td>Year of the song (due to re-releases, the year might not correspond to the release year of the original song)</td>
  </tr>
  <tr>
    <td>5</td>
    <td>bpm</td>
    <td>Beats per minute</td>
  </tr>
  <tr>
    <td>6</td>
    <td>nrgy</td>
    <td>Energy of a song, the higher the value the more energetic the song is</td>
  </tr>
  <tr>
    <td>7</td>
    <td>dnce</td>
    <td>The higher the value, the easier it is to dance to this song.</td>
  </tr>
  <tr>
    <td>8</td>
    <td>dB</td>
    <td>The higher the value, the louder the song</td>
  </tr>
  <tr>
    <td>9</td>
    <td>live</td>
    <td>The higher the value, the more likely the song is a live recording.</td>
  </tr>
  <tr>
    <td>10</td>
    <td>val</td>
    <td>The higher the value, the more positive the mood for the song.</td></tr>
  <tr>
    <td>11</td>
    <td>dur</td>
    <td>The duration of the song</td>
  </tr>
  <tr>
    <td>12</td>
    <td>acous</td>
    <td>The higher the value the more acoustic the song is.</td>
  </tr>
  <tr>
    <td>13</td>
    <td>spch</td>
    <td>The higher the value the more spoken words the song contains.</td>
  </tr>
  <tr>
    <td>14</td>
    <td>popularity</td>
    <td>The higher the value the more popular the song is.</td>
  </tr>
  <tr>
    <td>15</td>
    <td>has_win_award</td>
    <td>Boolean value to indicate if the song has won an award or not. Value of 1 if the song has already won one or more awards otherwise 0 if the song hasn’t won any awards.</td>
  </tr>
</table>

load semua csv

In [3]:
# One CSV per decade plus the top-10 list, all read from the working directory.
df_1950 = pd.read_csv("1950.csv")
df_1960 = pd.read_csv("1960.csv")
df_1970 = pd.read_csv("1970.csv")
df_1980 = pd.read_csv("1980.csv")
df_1990 = pd.read_csv("1990.csv")
df_2000 = pd.read_csv("2000.csv")
df_2010 = pd.read_csv("2010.csv")
df_top10 = pd.read_csv("top10s.csv")
# NOTE(review): 'all_no_duplicates.csv' is written by the merge cell further
# down, so this read only succeeds when that file already exists on disk.
df_all = pd.read_csv('all_no_duplicates.csv')

In [None]:
# Merge every per-decade frame and the top-10 frame into one data frame
df_all = pd.concat(
    [df_1950, df_1960, df_1970, df_1980,
     df_1990, df_2000, df_2010, df_top10],
    ignore_index=True, sort=False)

df_all.to_csv('all_sort_from_1950_to_top10.csv', index=False)

# Drop duplicate rows and renumber the index so that labels and positions
# agree again (the same shape the frame has when re-read from the CSV below).
df_all = df_all.drop_duplicates().reset_index(drop=True)
# Keep only the rows without any missing value
df_all_clear = df_all.dropna()
# Convert has_win_award from float to boolean
# NOTE(review): astype(bool) turns NaN into True; if has_win_award can be
# missing in df_all, those rows end up flagged as award winners — confirm.
df_all = df_all.astype({"has_win_award": bool})
df_all_clear = df_all_clear.astype({"has_win_award": bool})

df_all.to_csv('all_no_duplicates.csv', index=False)

Preprocessing

In [22]:
# Super genre -> list of sub-genre names that belong to it. Built once at
# definition time instead of on every call; insertion order determines the
# order of the names in a combined 'a|b' label.
_SUPER_GENRE_MAP = {
    'blues': ['blues','african blues','blues rock','blues shouter','british blues','canadian blues','chicago blues','classic female blues','contemporary r&b','country blues','delta blues','detroit blues','electric blues','gospel blues','hill country blues','hokum blues','jump blues','kansas city blues','louisiana blues','memphis blues','new orleans blues','piedmont blues','punk blues','rhythm and blues','doo-wop','soul blues','st. louis blues','swamp blues','texas blues','west coast blues','acoustic blues'],
    'country': ['country','alternative country','cowpunk','americana','australian country','bakersfield sound','bluegrass','progressive bluegrass','reactionary bluegrass','country blues','country pop','country rap','country rock','cajun','cajun fiddle','christian country','close harmony','dansband','hokum','honky tonk','instrumental country','nashville sound','neotraditional country','new mexico music','outlaw country','progressive country','red dirt','rockabilly','hellbilly','psychobilly','sertanejo','tejano','texas country','traditional country music','truck-driving country','western (cowboy)','cowboy pop','new mexico','texas country','tejano','western swing','zydeco','cowboy western','contemporary country'],
    'easy listening': ['easy listening','background music','elevator music (muzak)','barococo','beautiful music','chill-out','furniture music','light music','lounge music','middle of the road','new-age music'],
    'electronic': ['electronic','ambient','ambient dub','dark ambient','ambient industrial','dungeon synth','isolationism','drone','illbient','new-age','andean new-age','neoclassical new-age','space music','reductionism','lowercase','onkyokei','bass music','footwork','future bass','kawaii future bass','jungle terror','uk bass','wave','hardwave','breakbeat','acid breaks','baltimore club','jersey club','philly club','big beat','breakbeat hardcore','4-beat','darkcore','hardcore breaks','broken beat','florida breaks','nu skool breaks','progressive breaks','psychedelic breakbeat','chill-out','downtempo','psybient','psydub','trip hop','trip rock','chopped and screwed','disco','afro/cosmic music','electro-disco','hi-nrg','eurobeat','eurodance','italo dance','italo disco','spacesynth','space disco','eurodisco','nu-disco','post-disco','boogie','city pop','disco edits','drum and bass','darkstep','drumfunk','drumstep','hardstep','intelligent drum and bass','atmospheric drum and bass','jazzstep','jump-up','liquid funk','neurofunk','sambass','techstep','dub','dub poetry','dubtronica','electroacoustic music','acousmatic music','electroacoustic improvisation','live electronics','musique concrète','soundscape','electronic rock','dance-rock','alternative dance','madchester','baggy','new rave','dance-punk','krautrock','new wave','cold wave','dark wave','neoclassical dark wave','neue deutsche todeskunst','ethereal wave','nu-gaze','minimal wave','neue deutsche welle','new romantic','sophisti-pop','synth-pop','dance-pop','disco polo','electroclash','electropop','hyperpop','indietronica','post-rock','space rock','synth-metal','electrogrind','electronicore','synth-punk','electronica','folktronica','jazztronica','laptronica','livetronica','progressive electronic','kosmische musik','ethnic electronica / regional edm','asian underground','african electronic dance music','afrobeats','azonto','coupé-décalé','kuduro','mahraganat','shangaan electro','budots','changa tuki','dancehall pop','funk carioca','funk ostentação','melodic funk','proibidão','rasteirinha','merenhouse','nortec','rabòday','rara tech','russ music','shamstep','tribal guarachero','worldbeat','manila sound','funk fusion genres','acid jazz','funktronica','synth-funk','jungle','ragga jungle','hardcore','bouncy techno','breakcore','raggacore','digital hardcore','doomcore','frenchcore','gabber','early hardcore','mainstream hardcore','happy hardcore','uk hardcore','industrial hardcore','j-core','lento violento','mákina','speedcore','extratone','flashcore','hypertone','splittercore','hardstyle','dubstyle','euphoric frenchcore','euphoric hardstyle','jumpstyle','rawstyle','trapstyle','hauntology','chillwave','hypnagogic pop','synthwave','darksynth','sovietwave','vaporwave','dreampunk','future funk','hardvapour','mallsoft','hip hop fusion genres','afroswing','alternative hip hop','hipster hop','cloud rap','crunk','crunkcore','snap music','electro','latin freestyle','emo rap','instrumental hip hop','lofi hip hop','miami bass','mumble rap','trap','afro trap','drill','uk drill','latin trap','phonk','trap (edm)','uk trap','house music','acid house','afro house','amapiano','gqom','kidandali','kwaito','ambient house','balearic beat','ballroom','bass house','blog house','brazilian bass','slap house','chicago hard house','chicago house','deep house','disco house','diva house','hardbag','electro house','big room house','complextro','dutch house','fidget house','melbourne bounce','electro swing','eurohouse','french house','funky house','future house','garage house','ghetto house','ghettotech','juke house','hip house','italo house','jazz house','latin house','melodic house','microhouse','moombahcore','moombahton','moombahsoul','new jersey sound','outsider house','progressive house','soulful house','tech house','tribal house','tropical house','uk hard house','hard nrg','pumping house','hardbass','scouse house','industrial / post-industrial','deconstructed club','electro-industrial','dark electro','aggrotech','electronic body music (ebm)','futurepop','new beat','industrial hip hop','industrial metal','cyber metal','neue deutsche härte','industrial rock','martial industrial','witch house','intelligent dance music (idm)',"drill 'n' bass",'glitch','glitch hop','neo soul','nightcore','noise music','danger music','harsh noise','harsh noise wall','japanoise','power electronics','death industrial','power noise','plunderphonics','sampledelia','techno','acid techno','ambient techno','birmingham sound','bleep techno','detroit techno','dub techno','hardtechno','free tekno','jungletek','raggatek','industrial techno','minimal techno','schaffel','toytown techno','tecno brega','trance music','acid trance','balearic trance','dream trance','eurotrance','hands up','goa trance','nitzhonot','hard trance','progressive trance','psychedelic trance','dark psytrance','full-on','minimal psytrance','progressive psytrance','suomisaundi','zenonesque','tech trance','uplifting trance','vocal trance','uk garage','2-step garage','bassline','breakstep','dubstep','brostep','post-dubstep','reggaestep','riddim','future garage','grime','grindie','speed garage','uk funky','funkstep','wonky','video game music','chiptune','bitpop','nintendocore','skweee','fm synthesis','sequencer music','afrobeat','german dance','big room','bubble trance','house','australian dance','belgian dance','belgian edm','aussietronica','brit funk','new wave pop','electronic trap','edm','metropopolis','bubblegum dance'],
    'contemporary folk': ['contemporary folk','american folk revival','americana','anti-folk','british folk revival','celtic music','chalga','corrido','filk','folk rock','celtic rock','freak folk','indie folk','industrial folk','mariachi','ranchera','neofolk','progressive folk','protest song','psychedelic folk','singer-songwriter','nueva canción','skiffle','sung poetry','yodeling','canadian folk','appalachian folk','drone folk','british folk','irish singer-songwriter'],
    'hip hop': ['hip hop','alternative hip hop','hipster hop','boom bap','bounce','british hip hop','road rap','chopped and screwed','chopper','cloud rap','comedy hip hop','crunk','crunkcore','country rap','east coast hip hop','emo rap','freestyle rap','g-funk','hardcore hip hop','dirty rap','gangsta rap','mafioso rap','horrorcore','memphis rap','hip hop soul','hyphy',"jerkin'",'industrial hip hop','instrumental hip hop','jazz rap','latin hip hop','chicano rap','lofi hip hop','miami bass','mumble rap','nerdcore','chap hop','new jack swing','political hip hop','conscious hip hop','pop rap','progressive rap','punk rap','ragga hip hop','rap opera','rap rock','rap metal','trap metal','rapcore','religious hip hop','christian hip hop','jewish hip hop','snap','southern hip hop','trap','drill','brooklyn drill','uk drill','latin trap','phonk','tread','turntablism','underground hip hop','west coast hip hop','bronx hip hop','g funk','dirty south rap','dfw rap','atl hip hop','chicago rap','australian hip hop','detroit hip hop','canadian hip hop'],
    'jazz': ['jazz','acid jazz','afro-cuban jazz','alt-jazz','avant-garde jazz','bebop','boogie-woogie','bossa nova','brazilian jazz','british dance band','cape jazz','chamber jazz','continental jazz','cool jazz','crossover jazz','dixieland','ethno jazz','european free jazz','free funk','free improvisation','free jazz','gypsy jazz','hard bop','jazz blues','jazz-funk','jazz fusion','jazz rap','jazz rock','kansas city jazz','latin jazz','livetronica','m-base','mainstream jazz','modal jazz','neo-bop jazz','neo-swing','novelty ragtime','nu jazz','orchestral jazz','post-bop','progressive jazz','punk jazz','ragtime','samba-jazz','shibuya-kei','ska jazz','smooth jazz','soul jazz','straight-ahead jazz','stride jazz','swing','trad jazz','third stream','vocal jazz','west coast jazz'],
    'pop': ['pop','adult album alternative','adult contemporary','ambient pop','arabic pop','art pop','baroque pop','bedroom pop','brill building','britpop','bubblegum pop','c-pop','cantopop','hokkien pop','mandopop','canción','canzone','chalga','chamber pop','chanson','christian pop','classic hits','classical crossover','country pop','cringe pop','dance-pop','disco polo','electropop','europop','austropop','eurobeat','french pop','italo dance','italo disco','laïkó','nederpop','neomelodic music','russian pop','fado','folk pop','hyperpop','indie pop','twee pop','indian pop','iranian pop','j-pop','city pop','shibuya-kei','jangle pop','jazz pop','k-pop','korean hip hop','korean rock',"t'ong guitar",'trot','latin ballad','latin pop','mexican pop','new romantic','oldies','operatic pop','opm','pinoy pop','pop rap','pop rock','pop punk','emo pop','neon pop','power pop','soft rock','surf pop','yacht rock','pop soul','progressive pop','psychedelic pop','rebetiko','schlager','sophisti-pop','space age pop','sunshine pop','swamp pop','synthpop','teen pop','traditional pop','turbo-folk','turkish pop','vispop','wonky pop','worldbeat','yé-yé','adult standards','brill building pop','deep adult standards','british comedy','classic uk pop','afropop','classic danish pop','neo mellow','barbadian pop','bow pop','moroccan pop','indie poptimism','danish pop','belgian pop','italian pop','irish pop','colombian pop','acoustic pop','candy pop','folk-pop','dance pop','canadian pop','australian pop','australian talent show','french indie pop'],
    'r&b and soul': ['r&b and soul','alternative r&b','contemporary r&b','disco','freestyle','go-go','funk','deep funk','minneapolis sound','psychedelic funk','gospel music','new jack swing','post-disco','boogie','rhythm and blues','doo-wop','soul','blue-eyed soul','hip hop soul','neo soul','northern soul','psychedelic soul','southern soul','classic soul','beach music','boogaloo','canadian contemporary r&b','british soul','chicago soul','r&b','classic girl group','escape room','hip pop'],
    'rock': ['rock','afro rock','alternative rock','alternative dance','britpop','post-britpop','dream pop','goth rock','shoegaze','blackgaze','grunge','post-grunge','indie rock','dunedin sound','math rock','post-punk revival','madchester','baggy','noise pop','sadcore','slowcore','beat','british invasion','freakbeat','nederbeat','blues rock','boogie rock','british rhythm and blues','chamber pop','christian rock','classic rock','comedy rock','country rock','dark cabaret','electronic rock','electronicore','new wave','cold wave','dark wave','ethereal wave','experimental rock','art rock','industrial rock','post-punk','dance-punk','gothic rock','no wave','noise rock','post-rock','post-metal','folk rock','british folk rock','celtic rock','medieval folk rock','funk rock','garage rock','glam rock','hard rock','heartland rock','instrumental rock','jazz fusion','jazz rock','latin rock','mangue bit','metal','paisley underground','desert rock','pop rock','jangle pop','power pop','soft rock','yacht rock','progressive rock','art rock','avant-prog','rock in opposition','canterbury scene','flamenco rock','krautrock','neo-progressive rock','new prog','post-progressive','space rock','symphonic rock','psychedelic rock','acid rock','neo-psychedelia','raga rock','pub rock (australia)','pub rock (united kingdom)','punk rock','rap rock','rapcore','reggae rock','rock and roll','rockabilly','rock opera','roots rock','southern rock','stoner rock','swamp rock','sufi rock','surf rock','tropical rock','visual kei','nagoya kei','wizard rock','worldbeat','world fusion','merseybeat','album rock','rock-and-roll','australian rock','dance rock','glam punk','german alternative rock','modern rock','mellow gold','permanent wave'],
    'metal': ['metal','alternative metal','funk metal','nu metal','rap metal','avant-garde metal','black metal','blackened death metal','atmospheric black metal','blackgaze','melodic black metal','national socialist black metal','symphonic black metal','viking metal','war metal','christian metal','unblack metal','death metal',"death 'n' roll",'deathgrind','melodic death metal','technical death metal','doom metal','death-doom','drone metal','folk metal','celtic metal','medieval metal','pagan metal','glam metal','gothic metal','heavy metal','industrial metal','kawaii metal','latin metal','metalcore','deathcore','mathcore','melodic metalcore','neoclassical metal','neue deutsche härte','nintendocore','pirate metal','post-metal','power metal','progressive metal','djent','sludge metal','speed metal','symphonic metal','thrash metal','crossover thrash','groove metal'],
    'punk': ['punk','anarcho punk','crust punk','d-beat','art punk','christian punk','deathrock','digital hardcore','folk punk','celtic punk','cowpunk','gypsy punk','garage punk','grindcore','crustgrind','goregrind','noisegrind','pornogrind','hardcore punk','crossover thrash','melodic hardcore','post-hardcore','emo','emo pop','screamo','powerviolence','street punk','thrashcore','horror punk','nazi punk','oi!','pop punk','easycore','neon pop','psychobilly','punk pathetique','riot grrrl','ska punk','skate punk'],
}


def super_genre_helper(genre):
    """Map a sub-genre name to the super genre(s) that list it.

    Parameters
    ----------
    genre : object
        Sub-genre name exactly as stored in the data (lower-case text).
        Non-text values such as NaN match nothing.

    Returns
    -------
    object
        The single super-genre name, several names joined with ``'|'`` in the
        mapping's definition order (e.g. ``'blues|rock'``), or ``genre``
        itself, unchanged, when no super genre lists it.
    """
    matches = [
        super_name
        for super_name, sub_genres in _SUPER_GENRE_MAP.items()
        if genre in sub_genres
    ]
    if not matches:
        return genre
    return '|'.join(matches)

def super_genre(df):
    """Return a copy of ``df`` whose ``genre`` column holds super-genre labels.

    Every value of the ``genre`` column is passed through
    ``super_genre_helper``; the input frame is left untouched.

    The mapping is applied per value rather than by looking rows up with
    ``iloc`` using index labels, so it is also correct for frames whose index
    is not 0..n-1 (for example after ``drop_duplicates`` or a filter).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``genre`` column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the ``genre`` column replaced.
    """
    df_tmp = df.copy()
    df_tmp['genre'] = df_tmp['genre'].map(super_genre_helper)
    return df_tmp

# The twelve super-genre names, in the same order as the keys of the mapping in super_genre_helper.
genre_all = ['blues','country','easy listening','electronic','contemporary folk','hip hop','jazz','pop','r&b and soul','rock','metal','punk']

In [28]:
# Replace each sub-genre in df_all by its super genre and persist the result.
df_all_super_genre = super_genre(df_all)
df_all_super_genre.to_csv('all_super_genre.csv',index=False)
# Show which labels remain; names no super genre lists are kept unchanged.
df_all_super_genre['genre'].unique()

array(['pop', 'blues|r&b and soul', nan, 'r&b and soul', 'blues', 'jazz',
       'rock', 'contemporary folk', 'blues|rock', 'country', 'electronic',
       'hollywood', 'pop|rock', 'electronic|r&b and soul',
       'classic country pop', 'country|rock', 'native american', 'metal',
       'jazz|rock', 'hip hop', 'boy band', 'contemporary folk|rock',
       'latin', 'electronic|pop', 'country|hip hop', 'electronic|hip hop',
       'alaska indie', 'canadian latin'], dtype=object)

In [None]:
# Convert has_win_award from float to boolean in every source frame
(df_1950, df_1960, df_1970, df_1980,
 df_1990, df_2000, df_2010, df_top10) = [
    frame.astype({"has_win_award": bool})
    for frame in (df_1950, df_1960, df_1970, df_1980,
                  df_1990, df_2000, df_2010, df_top10)
]

In [None]:
# Check for duplicates: print the number of fully duplicated rows per frame
for frame in (df_1950, df_1960, df_1970, df_1980,
              df_1990, df_2000, df_2010, df_top10):
    print(sum(frame.duplicated()))

In [None]:
# Find out which columns contain null values
def cek_apakah_ada_null(data_frame):
    """Return the names of the columns of ``data_frame`` that hold at least one null.

    The names are returned as a list, in column order; the list is empty when
    no column has a null.
    """
    null_mask = data_frame.isnull()
    return [name for name in list(null_mask.columns) if sum(null_mask[name]) != 0]

In [None]:
# List the columns containing nulls, one printed list per source frame
for frame in (df_1950, df_1960, df_1970, df_1980,
              df_1990, df_2000, df_2010, df_top10):
    print(cek_apakah_ada_null(frame))

In [None]:
# Correlate numeric/boolean columns only: pandas >= 2.0 no longer skips text
# columns (title, artist, genre) in corr() and raises instead.
sns.heatmap(df_all.select_dtypes(include=["number", "bool"]).corr())

In [None]:
# Correlate numeric/boolean columns only: pandas >= 2.0 no longer skips text
# columns (title, artist, genre) in corr() and raises instead.
sns.heatmap(df_1950.select_dtypes(include=["number", "bool"]).corr())

In [None]:
# Correlate numeric/boolean columns only: pandas >= 2.0 no longer skips text
# columns (title, artist, genre) in corr() and raises instead.
sns.heatmap(df_1960.select_dtypes(include=["number", "bool"]).corr())

In [None]:
# Correlate numeric/boolean columns only: pandas >= 2.0 no longer skips text
# columns (title, artist, genre) in corr() and raises instead.
sns.heatmap(df_1970.select_dtypes(include=["number", "bool"]).corr())

In [None]:
# Correlate numeric/boolean columns only: pandas >= 2.0 no longer skips text
# columns (title, artist, genre) in corr() and raises instead.
sns.heatmap(df_1980.select_dtypes(include=["number", "bool"]).corr())

In [None]:
# Correlate numeric/boolean columns only: pandas >= 2.0 no longer skips text
# columns (title, artist, genre) in corr() and raises instead.
sns.heatmap(df_1990.select_dtypes(include=["number", "bool"]).corr())

In [None]:
# Correlate numeric/boolean columns only: pandas >= 2.0 no longer skips text
# columns (title, artist, genre) in corr() and raises instead.
sns.heatmap(df_2000.select_dtypes(include=["number", "bool"]).corr())

In [None]:
# Correlate numeric/boolean columns only: pandas >= 2.0 no longer skips text
# columns (title, artist, genre) in corr() and raises instead.
sns.heatmap(df_2010.select_dtypes(include=["number", "bool"]).corr())

In [None]:
# Correlate numeric/boolean columns only: pandas >= 2.0 no longer skips text
# columns (title, artist, genre) in corr() and raises instead.
sns.heatmap(df_top10.select_dtypes(include=["number", "bool"]).corr())

Hanya energy, dB, dan acoustic saja yang dapat mendeskripsikan sebuah lagu


In [None]:
# Feature columns (energy, loudness, acousticness) used for the clustering and the genre classifier below.
attribute_spotify_high_corr = ["nrgy","dB","acous"]

In [None]:
# build the scalers
def scaler_spotify(df, attribute_spotify):
    """Min-max scale the selected columns of ``df``.

    A new ``MinMaxScaler`` is fitted on ``df.loc[:, attribute_spotify]`` on
    every call and the transformed values are returned as an array.
    """
    selected = df.loc[:, attribute_spotify]
    return MinMaxScaler().fit_transform(selected)

def scaler_spotify_with_PCA(df, attribute_spotify):
    """Min-max scale the selected columns of ``df``, then project them with PCA.

    The PCA is created with ``n_components=0.95``, fitted on the scaled data
    and used to transform that same data; the projected array is returned.
    """
    features = df.loc[:, attribute_spotify]
    normalised = MinMaxScaler().fit_transform(features)
    reducer = PCA(n_components=0.95)
    return reducer.fit(normalised).transform(normalised)

Clustering

In [None]:
# K-means clustering: search for the best number of clusters k
def optimal_kmeans(scaled_data, random_state=None):
    """Pick the number of K-means clusters with the highest silhouette score.

    K-means is fitted for k = 2 .. 24 (capped at ``n_samples - 1``, the
    largest cluster count for which a silhouette score is defined), the
    scores are plotted against k, and the best k is returned.

    Parameters
    ----------
    scaled_data : array-like of shape (n_samples, n_features)
        Already scaled feature matrix.
    random_state : int or None, optional
        Forwarded to ``KMeans``; pass an int for reproducible results. The
        default ``None`` keeps the previous, unseeded behaviour.

    Returns
    -------
    int
        The k with the maximal silhouette score (the smallest such k on ties).

    Raises
    ------
    ValueError
        If ``scaled_data`` has fewer than 3 samples.
    """
    n_samples = len(scaled_data)
    max_k = min(24, n_samples - 1)
    if max_k < 2:
        raise ValueError(
            "optimal_kmeans needs at least 3 samples, got %d" % n_samples)
    candidates = range(2, max_k + 1)
    sil = []
    for n_clusters in candidates:
        cluster_data = KMeans(n_clusters=n_clusters, random_state=random_state)\
            .fit(scaled_data)
        sil.append(silhouette_score(scaled_data,
                                    cluster_data.labels_, metric='euclidean'))
    plt.plot(candidates, sil, "bx-")
    plt.xlabel("K")
    plt.ylabel("sil")
    plt.show()
    # Look the winner up in the candidate range instead of assuming the
    # range starts at 2.
    return candidates[sil.index(max(sil))]

In [None]:
# All numeric/boolean attribute columns of the data set.
# NOTE(review): no cell visible in this part of the notebook reads this list — confirm it is still needed.
attribute_spotify = ["bpm","nrgy","dnce","dB","live","val","dur","acous","spch","popularity","has_win_award"]

In [None]:
# Cluster the 1950s songs on the scaled (and PCA-projected) features.
# NOTE(review): this is the only frame routed through scaler_spotify_with_PCA;
# every other clustering cell uses scaler_spotify — confirm this is intended.
df_1950_std = scaler_spotify_with_PCA(df_1950, attribute_spotify_high_corr)
k = optimal_kmeans(df_1950_std)
df_1950_cd = KMeans(n_clusters=k).fit_predict(df_1950_std)
# Attach the labels to a copy so the source frame stays unchanged
df_1950_cluster = df_1950.assign(cluster=df_1950_cd)
print(k)

In [None]:
# Cluster the 1960s songs on the min-max scaled features
df_1960_std = scaler_spotify(df_1960, attribute_spotify_high_corr)
k = optimal_kmeans(df_1960_std)
df_1960_cd = KMeans(n_clusters=k).fit_predict(df_1960_std)
# Attach the labels to a copy so the source frame stays unchanged
df_1960_cluster = df_1960.assign(cluster=df_1960_cd)
print(k)

In [None]:
# Cluster the 1970s songs on the min-max scaled features
df_1970_std = scaler_spotify(df_1970, attribute_spotify_high_corr)
k = optimal_kmeans(df_1970_std)
df_1970_cd = KMeans(n_clusters=k).fit_predict(df_1970_std)
# Attach the labels to a copy so the source frame stays unchanged
df_1970_cluster = df_1970.assign(cluster=df_1970_cd)
print(k)

In [None]:
# Cluster the 1980s songs on the min-max scaled features
df_1980_std = scaler_spotify(df_1980, attribute_spotify_high_corr)
k = optimal_kmeans(df_1980_std)
df_1980_cd = KMeans(n_clusters=k).fit_predict(df_1980_std)
# Attach the labels to a copy so the source frame stays unchanged
df_1980_cluster = df_1980.assign(cluster=df_1980_cd)
print(k)

In [None]:
# Cluster the 1990s songs on the min-max scaled features
df_1990_std = scaler_spotify(df_1990, attribute_spotify_high_corr)
k = optimal_kmeans(df_1990_std)
df_1990_cd = KMeans(n_clusters=k).fit_predict(df_1990_std)
# Attach the labels to a copy so the source frame stays unchanged
df_1990_cluster = df_1990.assign(cluster=df_1990_cd)
print(k)

In [None]:
# Cluster the 2000s songs on the min-max scaled features
df_2000_std = scaler_spotify(df_2000, attribute_spotify_high_corr)
k = optimal_kmeans(df_2000_std)
df_2000_cd = KMeans(n_clusters=k).fit_predict(df_2000_std)
# Attach the labels to a copy so the source frame stays unchanged
df_2000_cluster = df_2000.assign(cluster=df_2000_cd)
print(k)

In [None]:
# Cluster the 2010s songs on the min-max scaled features
df_2010_std = scaler_spotify(df_2010, attribute_spotify_high_corr)
k = optimal_kmeans(df_2010_std)
df_2010_cd = KMeans(n_clusters=k).fit_predict(df_2010_std)
# Attach the labels to a copy so the source frame stays unchanged
df_2010_cluster = df_2010.assign(cluster=df_2010_cd)
print(k)

In [None]:
# Cluster the top-10 songs on the min-max scaled features
df_top10_std = scaler_spotify(
    df=df_top10,
    attribute_spotify=attribute_spotify_high_corr,
)
k = optimal_kmeans(df_top10_std)
df_top10_cd = KMeans(n_clusters=k).fit_predict(df_top10_std)
# Attach the labels to a copy so the source frame stays unchanged
df_top10_cluster = df_top10.assign(cluster=df_top10_cd)
print(k)

In [None]:
# Cluster the merged frame on the min-max scaled features
df_all_std = scaler_spotify(df_all, attribute_spotify_high_corr)
k = optimal_kmeans(df_all_std)
df_all_cd = KMeans(n_clusters=k).fit_predict(df_all_std)
# Attach the labels to a copy so the source frame stays unchanged
df_all_cluster = df_all.assign(cluster=df_all_cd)
print(k)

In [None]:
# Silhouette of the final clustering, measured in the same min-max scaled
# feature space the labels were fitted in; on the raw columns the wider
# numeric ranges would dominate the distances.
# (silhouette_score is already imported in the first cell.)
silhouette_score(df_all_std, df_all_cd, metric='euclidean')

Analisa

Artis yang populer

In [None]:
# Number of fully duplicated rows in the top-10 frame
sum(df_top10.duplicated())

In [None]:
# Distinct artist names in the merged frame
artist_total = df_all['artist'].unique()
print("Total Artist : %d" % len(artist_total))

In [None]:
# Distinct genre names in the merged frame. Missing genres (NaN) are dropped
# first; otherwise NaN would be counted as one more genre.
genre_total = df_all['genre'].dropna().unique()
print("Total Genre : %d" % (len(genre_total)))

In [None]:
# Number of rows (songs) per artist
df_all['artist'].value_counts()

Artis yang paling populer dapat dihitung dari jumlah nilai lagu yang populer

In [None]:
def dict_artist_popularity(df):
    """Sum the ``popularity`` values of ``df`` per ``artist``.

    Rows are visited in index order; the result maps each artist to the total
    popularity of that artist's rows.
    """
    totals = {}
    for row_label in df.index:
        name = df.at[row_label, 'artist']
        totals[name] = totals.get(name, 0) + df.at[row_label, 'popularity']
    return totals

In [None]:
# Summed popularity per artist, restricted to the rows of df_all whose year is 2010
artis_populer = dict_artist_popularity(df_all.loc[df_all['year'] == 2010])
# Artist with the largest total
max(artis_populer, key=artis_populer.get)

Mencari genre lagu yang populer dapat menggunakan hal yang sama saat mencari artis yang paling populer

In [None]:
def dict_genre_popularity(df):
    """Sum the ``popularity`` values of ``df`` per ``genre``.

    Rows are visited in index order; the result maps each genre to the total
    popularity of that genre's rows.
    """
    totals = {}
    for row_label in df.index:
        label = df.at[row_label, 'genre']
        totals[label] = totals.get(label, 0) + df.at[row_label, 'popularity']
    return totals

In [None]:
# Summed popularity per genre, restricted to the rows of df_top10 whose year is 2010
genre_populer = dict_genre_popularity(df_top10[df_top10['year'] == 2010])
# Genre with the largest total
max(genre_populer, key=genre_populer.get)

Untuk melakukan list genre musik dari artis dapat dilakukan dengan melakukan list setiap artis

In [None]:
def dict_artist_genre(df):
    """Collect the set of genres seen for every artist in ``df``.

    Every artist gets an entry; rows whose genre is NaN (per ``isNaN``) add
    nothing to the set, so an artist with only missing genres maps to an
    empty set.
    """
    genres_by_artist = {}
    for row_label in df.index:
        performer = df.at[row_label, 'artist']
        style = df.at[row_label, 'genre']
        bucket = genres_by_artist.setdefault(performer, set())
        if not isNaN(style):
            bucket.add(style)
    return genres_by_artist

In [None]:
def dict_artist_music(df):
    """Collect the set of song titles seen for every artist in ``df``.

    Every artist gets an entry; rows whose title is NaN (per ``isNaN``) add
    nothing to the set.
    """
    titles_by_artist = {}
    for row_label in df.index:
        performer = df.at[row_label, 'artist']
        song = df.at[row_label, 'title']
        bucket = titles_by_artist.setdefault(performer, set())
        if not isNaN(song):
            bucket.add(song)
    return titles_by_artist

In [None]:
def dict_genre_music(df):
    """Collect the set of song titles seen for every genre in ``df``.

    Every genre value gets an entry; rows whose title is NaN (per ``isNaN``)
    add nothing to the set. The genre itself is not checked for NaN.
    """
    titles_by_genre = {}
    for row_label in df.index:
        label = df.at[row_label, 'genre']
        song = df.at[row_label, 'title']
        bucket = titles_by_genre.setdefault(label, set())
        if not isNaN(song):
            bucket.add(song)
    return titles_by_genre

In [None]:
# Number of distinct genres recorded for each artist
artist_genre = {
    artist: len(genres)
    for artist, genres in dict_artist_genre(df_all).items()
}

# Largest number of genres any single artist has
genre_max = max(artist_genre.values())
genre_max

In [None]:
# Song titles per artist (not read again by the cells visible here)
music = dict_artist_music(df_all)
# Show every row of the merged frame whose artist is Taylor Swift
df_all.loc[df_all['artist'] == 'Taylor Swift']

Pengaruh durasi waktu dengan popularitas lagu

In [None]:
# Columns compared in the duration-versus-popularity analysis
attribute_spotify_durration_popularity = ["dur","popularity"]

In [None]:
# Keep only duration and popularity, then show their correlation matrix
df_durr_popularity = df_all.loc[:,attribute_spotify_durration_popularity]
df_durr_popularity.corr()

Klasifikasi Genre Lagu

In [None]:
# Preview the NaN-free frame that the genre classifier is trained on
df_all_clear.head()

In [None]:
# Features and target for the genre classifier
X = df_all_clear.loc[:,attribute_spotify_high_corr]
y = df_all_clear['genre']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

# Fit the scaler on the training split only and reuse it for the test split.
# Scaling each split with its own min/max would put the two sets on
# different scales and let test-set statistics influence preprocessing.
genre_scaler = MinMaxScaler()
X_train = genre_scaler.fit_transform(X_train)
X_test = genre_scaler.transform(X_test)

In [None]:
# 3-nearest-neighbour genre classifier trained on the scaled training split
knn_genre = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
y_pred = knn_genre.predict(X_test)

# Report the quality of the predictions on the held-out split
for report in (confusion_matrix(y_test, y_pred),
               classification_report(y_test, y_pred)):
    print(report)

Prediksi popularitas
<br>
Jika dilihat dari nilai korelasi yang dimiliki, maka fitur dnce, dB, dan acous memiliki nilai yang tinggi

In [None]:
# Predict popularity from danceability, loudness and acousticness
attribute_spotify_popularity = ['dnce','dB','acous']
X = df_all.loc[:, attribute_spotify_popularity].to_numpy()
y = df_all.loc[:, 'popularity'].to_numpy()

# Ordinary least-squares fit on the whole merged frame
lr_popularity = LinearRegression()
lr_popularity.fit(X, y)

In [None]:
# Score on the very data the model was fitted on, i.e. not a hold-out estimate
lr_popularity.score(X,y)

In [None]:
# Fitted weights, in feature order dnce, dB, acous
lr_popularity.coef_

In [None]:
# Fitted intercept term
lr_popularity.intercept_

In [None]:
# Rows of the clustered merged frame, split by cluster label.
# NOTE(review): only labels 0, 1 and 2 are extracted, while the number of
# clusters is chosen at run time by optimal_kmeans — confirm k is always 3.
label_0 = df_all_cluster.loc[df_all_cluster['cluster'] == 0]
label_1 = df_all_cluster.loc[df_all_cluster['cluster'] == 1]
label_2 = df_all_cluster.loc[df_all_cluster['cluster'] == 2]

In [None]:
# 3-D scatter of the clustered songs in (energy, acousticness, loudness) space
plot3d = plt.axes(projection='3d')
plot3d.set_xlabel('nrgy', fontweight ='bold')
plot3d.set_ylabel('acous', fontweight ='bold')
plot3d.set_zlabel('dB', fontweight ='bold')

# Clusters 0-2 keep their original colours; the number of clusters is chosen
# at run time, so any further cluster is drawn too, using matplotlib's
# default colour cycle (color=None).
cluster_colors = {0: 'red', 1: 'blue', 2: 'green'}
for cluster_id, members in df_all_cluster.groupby('cluster'):
    plot3d.scatter3D(members['nrgy'], members['acous'], members['dB'],
                     color=cluster_colors.get(cluster_id),
                     label='cluster %d' % cluster_id)
plot3d.legend()
# plt.show must be called; the bare attribute reference was a no-op.
plt.show()