In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as img
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from sklearn import svm
from collections import Counter

In [None]:
# Helper functions
def isNaN(value):
    """Return True when ``value`` converts to a float NaN, otherwise False.

    Values that cannot be converted with ``float()`` at all (for example
    ordinary text or ``None``) are reported as "not NaN" instead of raising.

    Parameters
    ----------
    value : object
        Any scalar; it is passed through ``float()`` before the NaN check.

    Returns
    -------
    bool
        True only if ``float(value)`` succeeds and the result is NaN.
    """
    import math

    try:
        return math.isnan(float(value))
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "this is not a NaN". Anything else
        # (e.g. KeyboardInterrupt) must propagate instead of being hidden.
        return False

<h2>Penjelasan Atribut</h2>

<table>
  <tr>
    <th>Num</th>
    <th>Attribute Name</th>
    <th>Description</th>
  </tr>
  <tr>
    <td>1</td>
    <td>title</td>
    <td>Title</td>
  </tr>
  <tr>
    <td>2</td>
    <td>artist</td>
    <td>Artist</td>
  </tr>
  <tr>
    <td>3</td>
    <td>genre</td>
    <td>Genre of the song</td>
  </tr>
  <tr>
    <td>4</td>
    <td>year</td>
    <td>Year of the song (due to re-releases, the year might not correspond to the release year of the original song)</td>
  </tr>
  <tr>
    <td>5</td>
    <td>bpm</td>
    <td>Beats per minute</td>
  </tr>
  <tr>
    <td>6</td>
    <td>nrgy</td>
    <td>Energy of a song, the higher the value the more energetic the song is</td>
  </tr>
  <tr>
    <td>7</td>
    <td>dnce</td>
    <td>The higher the value, the easier it is to dance to this song.</td>
  </tr>
  <tr>
    <td>8</td>
    <td>dB</td>
    <td>The higher the value, the louder the song</td>
  </tr>
  <tr>
    <td>9</td>
    <td>live</td>
    <td>The higher the value, the more likely the song is a live recording.</td>
  </tr>
  <tr>
    <td>10</td>
    <td>val</td>
    <td>The higher the value, the more positive the mood for the song.</td></tr>
  <tr>
    <td>11</td>
    <td>dur</td>
    <td>The duration of the song</td>
  </tr>
  <tr>
    <td>12</td>
    <td>acous</td>
    <td>The higher the value the more acoustic the song is.</td>
  </tr>
  <tr>
    <td>13</td>
    <td>spch</td>
    <td>The higher the value the more spoken words the song contains.</td>
  </tr>
  <tr>
    <td>14</td>
    <td>popularity</td>
    <td>The higher the value the more popular the song is.</td>
  </tr>
  <tr>
    <td>15</td>
    <td>has_win_award</td>
    <td>Boolean value to indicate if the song has won an award or not. Value of 1 if the song has already won one or more awards otherwise 0 if the song hasn’t won any awards.</td>
  </tr>
</table>

Memuat semua berkas CSV

In [None]:
# Read one CSV per source file; paths are relative to the working directory.
(
    df_1950, df_1960, df_1970, df_1980,
    df_1990, df_2000, df_2010, df_top10,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        "1950.csv", "1960.csv", "1970.csv", "1980.csv",
        "1990.csv", "2000.csv", "2010.csv", "top10s.csv",
    )
)

In [None]:
# Merge every source frame into one, renumbering the index from zero
df_all = pd.concat(
    [
        df_1950, df_1960, df_1970, df_1980,
        df_1990, df_2000, df_2010, df_top10,
    ],
    ignore_index=True,
    sort=False,
)

# Snapshot of the raw merge, written before any de-duplication
df_all.to_csv('all_sort_from_1950_to_top10.csv', index=False)

# Remove exact duplicate rows
df_all = df_all.drop_duplicates()
# Separate frame that additionally drops every row containing a NaN;
# it must be derived before the boolean cast below
df_all_clear = df_all.dropna()
# Cast has_win_award to boolean in both frames
df_all = df_all.astype({"has_win_award": bool})
df_all_clear = df_all_clear.astype({"has_win_award": bool})

# Persist the de-duplicated frame (rows with NaN are still included)
df_all.to_csv('all_no_duplicates.csv', index=False)

Preprocessing

In [None]:
# Cast has_win_award to boolean in every source frame
(
    df_1950, df_1960, df_1970, df_1980,
    df_1990, df_2000, df_2010, df_top10,
) = (
    source_frame.astype({"has_win_award": bool})
    for source_frame in (
        df_1950, df_1960, df_1970, df_1980,
        df_1990, df_2000, df_2010, df_top10,
    )
)

In [None]:
# Print the number of fully duplicated rows in each source frame, one per line
print(
    *(
        sum(source_frame.duplicated())
        for source_frame in (
            df_1950, df_1960, df_1970, df_1980,
            df_1990, df_2000, df_2010, df_top10,
        )
    ),
    sep="\n",
)

In [None]:
# Find which columns contain null values
def cek_apakah_ada_null(data_frame):
    """List the columns of ``data_frame`` that hold at least one null value.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    list
        Column labels, in column order, whose null count is non-zero.
    """
    null_mask = data_frame.isnull()
    return [label for label in null_mask.columns if null_mask[label].sum() != 0]

In [None]:
# Print, one line per source frame, the columns that contain nulls
print(
    *(
        cek_apakah_ada_null(source_frame)
        for source_frame in (
            df_1950, df_1960, df_1970, df_1980,
            df_1990, df_2000, df_2010, df_top10,
        )
    ),
    sep="\n",
)

Karena nilai null pada genre tidak mempengaruhi klaster maka tidak perlu dilakukan penambahan atau pengurangan. Jika dilakukan penambahan, belum tentu sesuai dengan genre musik sesungguhnya. Nilai genre hanya akan berpengaruh dalam visualisasi data.

In [None]:
# Build the scaler
def scaler_spotify(df, attribute_spotify):
    """Min-max scale the selected columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    attribute_spotify : list
        Column labels to select with ``df.loc[:, ...]`` and scale.

    Returns
    -------
    The result of ``MinMaxScaler().fit_transform`` on the selected columns.
    A fresh scaler is fitted on every call and is not returned.
    """
    selected_columns = df.loc[:, attribute_spotify]
    return MinMaxScaler().fit_transform(selected_columns)

def scaler_spotify_with_PCA(df, attribute_spotify):
    """Min-max scale the selected columns of ``df`` and project them with PCA.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    attribute_spotify : list
        Column labels to select with ``df.loc[:, ...]``.

    Returns
    -------
    The scaled data transformed by a ``PCA(n_components=0.95)`` model that was
    fitted on that same scaled data.
    """
    scaled = MinMaxScaler().fit_transform(df.loc[:, attribute_spotify])
    # A fractional n_components asks PCA to keep enough components to explain
    # that share of the variance (per the scikit-learn PCA documentation).
    reducer = PCA(n_components=0.95)
    return reducer.fit(scaled).transform(scaled)

Clustering

In [None]:
# K-means clustering: search for the best number of clusters k
def optimal_kmeans(scaled_data, k_min=2, k_max=24, random_state=None):
    """Pick the cluster count with the highest silhouette score and plot the scores.

    One KMeans model is fitted per candidate k, its labels are scored with the
    euclidean silhouette score, the scores are plotted against k, and the k
    with the highest score is returned (the smallest such k on ties).

    Parameters
    ----------
    scaled_data : array-like
        Samples to cluster; must support ``len()``.
    k_min : int, default 2
        Smallest cluster count to try (values below 2 are raised to 2).
    k_max : int, default 24
        Largest cluster count to try, inclusive. It is lowered automatically
        to ``len(scaled_data) - 1`` because the silhouette score is undefined
        beyond that.
    random_state : int or None, default None
        Forwarded to ``KMeans``; pass an integer for reproducible results.

    Returns
    -------
    int
        The candidate k with the highest silhouette score.

    Raises
    ------
    ValueError
        If the data has too few samples for any candidate k.
    """
    n_samples = len(scaled_data)
    # silhouette_score needs 2 <= number of labels <= n_samples - 1, and
    # KMeans cannot build more clusters than there are samples.
    lowest = max(k_min, 2)
    highest = min(k_max, n_samples - 1)
    if highest < lowest:
        raise ValueError(
            "optimal_kmeans needs at least %d samples to try k=%d, got %d"
            % (lowest + 1, lowest, n_samples)
        )

    candidates = list(range(lowest, highest + 1))
    scores = []
    for n_clusters in candidates:
        model = KMeans(n_clusters=n_clusters, random_state=random_state)
        model.fit(scaled_data)
        scores.append(
            silhouette_score(scaled_data, model.labels_, metric='euclidean')
        )

    plt.plot(candidates, scores, "bx-")
    plt.xlabel("K")
    plt.ylabel("sil")
    plt.show()
    # Look the winner up in the candidate list rather than assuming the
    # search started at 2.
    return candidates[scores.index(max(scores))]

In [None]:
# Columns passed to the scaler and to K-means in the cells below
attribute_spotify = [
    "bpm",
    "nrgy",
    "dnce",
    "dB",
    "live",
    "val",
    "dur",
    "acous",
    "spch",
    "popularity",
    "has_win_award",
]

In [None]:
# 1950 frame: scale the features, choose k by silhouette score, then cluster
df_1950_std = scaler_spotify(df_1950, attribute_spotify)
k = optimal_kmeans(df_1950_std)
df_1950_cd = KMeans(n_clusters=k).fit_predict(df_1950_std)
print(k)

In [None]:
# 1960 frame: scale the features, choose k by silhouette score, then cluster
df_1960_std = scaler_spotify(df_1960, attribute_spotify)
k = optimal_kmeans(df_1960_std)
df_1960_cd = KMeans(n_clusters=k).fit_predict(df_1960_std)
print(k)

In [None]:
# 1970 frame: scale the features, choose k by silhouette score, then cluster
df_1970_std = scaler_spotify(df_1970, attribute_spotify)
k = optimal_kmeans(df_1970_std)
df_1970_cd = KMeans(n_clusters=k).fit_predict(df_1970_std)
print(k)

In [None]:
# 1980 frame: scale the features, choose k by silhouette score, then cluster
df_1980_std = scaler_spotify(df_1980, attribute_spotify)
k = optimal_kmeans(df_1980_std)
df_1980_cd = KMeans(n_clusters=k).fit_predict(df_1980_std)
print(k)

In [None]:
# 1990 frame: scale the features, choose k by silhouette score, then cluster
df_1990_std = scaler_spotify(df_1990, attribute_spotify)
k = optimal_kmeans(df_1990_std)
df_1990_cd = KMeans(n_clusters=k).fit_predict(df_1990_std)
print(k)

In [None]:
# 2000 frame: scale the features, choose k by silhouette score, then cluster
df_2000_std = scaler_spotify(df_2000, attribute_spotify)
k = optimal_kmeans(df_2000_std)
df_2000_cd = KMeans(n_clusters=k).fit_predict(df_2000_std)
print(k)

In [None]:
# 2010 frame: scale the features, choose k by silhouette score, then cluster
df_2010_std = scaler_spotify(df_2010, attribute_spotify)
k = optimal_kmeans(df_2010_std)
df_2010_cd = KMeans(n_clusters=k).fit_predict(df_2010_std)
print(k)

In [None]:
# top10 frame: scale the features, choose k by silhouette score, then cluster
df_top10_std = scaler_spotify(df_top10, attribute_spotify)
k = optimal_kmeans(df_top10_std)
df_top10_cd = KMeans(n_clusters=k).fit_predict(df_top10_std)
print(k)

In [None]:
# Merged frame: scale the features, choose k by silhouette score, then cluster
df_all_std = scaler_spotify(df_all, attribute_spotify)
k = optimal_kmeans(df_all_std)
df_all_cd = KMeans(n_clusters=k).fit_predict(df_all_std)
print(k)

Analisa

Artis yang populer

In [None]:
# Number of fully duplicated rows in df_top10 (shown as the cell output)
sum(df_top10.duplicated())

In [None]:
# Distinct values of the artist column in the merged frame
artist_total = df_all['artist'].unique()
print("Total Artist : %d" % len(artist_total))

In [None]:
# Distinct genres in the merged frame. Missing genres (NaN) are dropped
# first: unique() keeps NaN as a value, which would count "no genre" as a genre.
genre_total = df_all['genre'].dropna().unique()
print("Total Genre : %d" % len(genre_total))

In [None]:
# Number of rows per artist in the merged frame, most frequent first
df_all.loc[:, 'artist'].value_counts()

Artis yang paling populer dapat ditentukan dari jumlah nilai popularitas seluruh lagunya.

In [None]:
def dict_artist_popularity(df):
    """Sum the ``popularity`` of every row per artist.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``artist`` and ``popularity`` columns.

    Returns
    -------
    dict
        Maps each artist to the total popularity of its rows; keys appear in
        order of first occurrence in ``df``.
    """
    artist_popularity = dict()
    # Walk both columns in parallel instead of looking up each cell by index
    # label: it avoids a per-row lookup and still works when index labels
    # repeat (where df['artist'][i] would return a Series, not a scalar).
    for artist, popularity in zip(df['artist'], df['popularity']):
        artist_popularity[artist] = artist_popularity.get(artist, 0) + popularity
    return artist_popularity

In [None]:
# Total popularity per artist for the 1950 frame
tmp = dict_artist_popularity(df_1950)
# Disabled: would return the key of tmp with the largest total
# max(tmp, key=tmp.get)

Genre lagu yang paling populer dapat dicari dengan cara yang sama seperti saat mencari artis yang paling populer.

In [None]:
def dict_genre_popularity(df):
    """Sum the ``popularity`` of every row per genre.

    Rows whose genre is missing are skipped, so the result never contains a
    NaN key.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``genre`` and ``popularity`` columns.

    Returns
    -------
    dict
        Maps each genre to the total popularity of its rows; keys appear in
        order of first occurrence in ``df``.
    """
    genre_popularity = dict()
    # Walk both columns in parallel instead of looking up each cell by index
    # label: it avoids a per-row lookup and still works when index labels repeat.
    for genre, popularity in zip(df['genre'], df['popularity']):
        if pd.isna(genre):
            # A missing genre is not a genre; do not create a NaN bucket.
            continue
        genre_popularity[genre] = genre_popularity.get(genre, 0) + popularity
    return genre_popularity

Daftar genre musik tiap artis dapat diperoleh dengan mengumpulkan genre dari setiap lagu milik artis tersebut.

In [None]:
def dict_artist_genre(df):
    """Collect the set of genres recorded for each artist.

    Every artist in ``df`` gets an entry; rows with a missing genre add the
    artist but contribute no genre, so such an artist may map to an empty set.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``artist`` and ``genre`` columns.

    Returns
    -------
    dict
        Maps each artist to a ``set`` of its non-missing genres.
    """
    artist_genre = dict()
    # Walk both columns in parallel instead of looking up each cell by index
    # label: it avoids a per-row lookup and still works when index labels repeat.
    for artist, genre in zip(df['artist'], df['genre']):
        genres = artist_genre.setdefault(artist, set())
        # pd.isna recognises NaN as well as None / pd.NA as missing.
        if not pd.isna(genre):
            genres.add(genre)
    return artist_genre

In [None]:
def dict_artist_music(df):
    """Collect the set of song titles recorded for each artist.

    Every artist in ``df`` gets an entry; rows with a missing title add the
    artist but contribute no title, so such an artist may map to an empty set.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``artist`` and ``title`` columns.

    Returns
    -------
    dict
        Maps each artist to a ``set`` of its non-missing titles.
    """
    artist_music = dict()
    # Walk both columns in parallel instead of looking up each cell by index
    # label: it avoids a per-row lookup and still works when index labels repeat.
    for artist, title in zip(df['artist'], df['title']):
        titles = artist_music.setdefault(artist, set())
        # pd.isna recognises NaN as well as None / pd.NA as missing.
        if not pd.isna(title):
            titles.add(title)
    return artist_music

In [None]:
# Titles per artist over the merged frame
music = dict_artist_music(df_all)
# Display every row of the merged frame for one example artist
df_all[df_all['artist'] == 'Taylor Swift']

In [None]:
# Set of genres per artist over the merged frame
tmp_2 = dict_artist_genre(df_all)

# Number of distinct genres recorded for each artist
tmp_3 = {artist: len(genres) for artist, genres in tmp_2.items()}

# Keep only the artists tied for the largest genre count
genre_max = max(tmp_3.values())
list_of_artist = {
    artist: tmp_2[artist]
    for artist, genre_count in tmp_3.items()
    if genre_count == genre_max
}

list_of_artist

Pengaruh durasi lagu terhadap popularitas lagu

In [None]:
# Column pair for the duration-vs-popularity analysis
# (the name's spelling is kept as-is so existing references keep working)
attribute_spotify_durration_popularity = [
    "dur",
    "popularity",
]

In [None]:
# Fit a support-vector model of popularity as a function of duration.
# NOTE(review): the original built svm.NuSVC, a classifier that would treat
# every distinct popularity value as its own class. Popularity is handled as
# a numeric target here, so the regressor NuSVR is used instead - switch back
# if popularity is really meant to be a class label.
svm_train = svm.NuSVR(gamma='auto')
duration = list(df_all['dur'])
popularity = list(df_all['popularity'])
# fit() takes the features X with shape (n_samples, n_features) and the target
# y as two separate arguments, so each duration becomes a one-feature row.
svm_train.fit([[seconds] for seconds in duration], popularity)