In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as img
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, classification_report, confusion_matrix
from sklearn.decomposition import PCA
from sklearn import svm
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from collections import Counter
import seaborn as sns

In [None]:
# helper functions
def isNaN(value):
    """Return True when ``value`` converts to a float NaN, otherwise False.

    Parameters
    ----------
    value : object
        Anything accepted by ``float()``; values that cannot be converted
        (e.g. ordinary strings, ``None``, lists) are reported as not-NaN.

    Returns
    -------
    bool
    """
    # Local import: ``math`` is not part of the notebook's import cell.
    import math

    try:
        return math.isnan(float(value))
    # Only conversion failures mean "not a NaN"; a bare ``except`` would also
    # swallow KeyboardInterrupt / SystemExit and hide unrelated errors.
    except (TypeError, ValueError, OverflowError):
        return False

<h2>Penjelasan Atribut</h2>

<table>
  <tr>
    <th>Num</th>
    <th>Attribute Name</th>
    <th>Description</th>
  </tr>
  <tr>
    <td>1</td>
    <td>title</td>
    <td>Title</td>
  </tr>
  <tr>
    <td>2</td>
    <td>artist</td>
    <td>Artist</td>
  </tr>
  <tr>
    <td>3</td>
    <td>genre</td>
    <td>Genre of the song</td>
  </tr>
  <tr>
    <td>4</td>
    <td>year</td>
    <td>Year of the song (due to re-releases, the year might not correspond to the release year of the original song)</td>
  </tr>
  <tr>
    <td>5</td>
    <td>bpm</td>
    <td>Beats per minute</td>
  </tr>
  <tr>
    <td>6</td>
    <td>nrgy</td>
    <td>Energy of a song, the higher the value the more energetic the song is</td>
  </tr>
  <tr>
    <td>7</td>
    <td>dnce</td>
    <td>The higher the value, the easier it is to dance to this song.</td>
  </tr>
  <tr>
    <td>8</td>
    <td>dB</td>
    <td>The higher the value, the louder the song</td>
  </tr>
  <tr>
    <td>9</td>
    <td>live</td>
    <td>The higher the value, the more likely the song is a live recording.</td>
  </tr>
  <tr>
    <td>10</td>
    <td>val</td>
    <td>The higher the value, the more positive the mood for the song.</td></tr>
  <tr>
    <td>11</td>
    <td>dur</td>
    <td>The duration of the song</td>
  </tr>
  <tr>
    <td>12</td>
    <td>acous</td>
    <td>The higher the value the more acoustic the song is.</td>
  </tr>
  <tr>
    <td>13</td>
    <td>spch</td>
    <td>The higher the value the more spoken words the song contains.</td>
  </tr>
  <tr>
    <td>14</td>
    <td>popularity</td>
    <td>The higher the value the more popular the song is.</td>
  </tr>
  <tr>
    <td>15</td>
    <td>has_win_award</td>
    <td>Boolean value to indicate if the song has won an award or not. Value of 1 if the song has already won one or more awards otherwise 0 if the song hasn’t won any awards.</td>
  </tr>
</table>

load semua csv

In [None]:
# Read the seven per-decade files and the top-10 file; the targets on the
# left are unpacked in the same order as the file names on the right.
(
    df_1950, df_1960, df_1970, df_1980,
    df_1990, df_2000, df_2010, df_top10,
) = (
    pd.read_csv(csv_name)
    for csv_name in (
        "1950.csv", "1960.csv", "1970.csv", "1980.csv",
        "1990.csv", "2000.csv", "2010.csv", "top10s.csv",
    )
)

In [None]:
# Stack every per-decade frame and the top-10 frame into a single table.
df_all = pd.concat(
    [df_1950, df_1960, df_1970, df_1980, df_1990, df_2000, df_2010, df_top10],
    ignore_index=True,
    sort=False,
)

# Snapshot of the raw concatenation (duplicates still included).
df_all.to_csv('all_sort_from_1950_to_top10.csv', index=False)

# Remove exact duplicate rows.
df_all = df_all.drop_duplicates()

# Variant without any missing values, with has_win_award cast to boolean.
df_all_clear = df_all.dropna().astype({"has_win_award": bool})

# Cast has_win_award to boolean on the de-duplicated frame as well.
# NOTE(review): astype(bool) maps NaN to True - confirm has_win_award has no
# missing values in df_all.
df_all = df_all.astype({"has_win_award": bool})

df_all.to_csv('all_no_duplicates.csv', index=False)

Preprocessing

In [None]:
# Print the column names, dtypes and non-null counts of the top-10 frame.
df_top10.info()

In [None]:
# Convert has_win_award to boolean in each source frame.
# Every cast is computed before any name is rebound.
(
    df_1950, df_1960, df_1970, df_1980,
    df_1990, df_2000, df_2010, df_top10,
) = (
    frame.astype({"has_win_award": bool})
    for frame in (
        df_1950, df_1960, df_1970, df_1980,
        df_1990, df_2000, df_2010, df_top10,
    )
)

In [None]:
# Duplicate check: one line per frame with its number of duplicated rows.
print(
    *(
        sum(frame.duplicated())
        for frame in (
            df_1950, df_1960, df_1970, df_1980,
            df_1990, df_2000, df_2010, df_top10,
        )
    ),
    sep="\n",
)

In [None]:
# Find which columns contain null values
def cek_apakah_ada_null(data_frame):
    """Return the labels of the columns that hold at least one null value.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    list
        Column labels, in column order, whose null count is non-zero.
    """
    null_counts = data_frame.isnull().sum()
    return [column for column, n_null in null_counts.items() if n_null != 0]

In [None]:
# One line per frame: the list of columns that contain null values.
print(
    *(
        cek_apakah_ada_null(frame)
        for frame in (
            df_1950, df_1960, df_1970, df_1980,
            df_1990, df_2000, df_2010, df_top10,
        )
    ),
    sep="\n",
)

In [None]:
# Correlate numeric/boolean columns only: pandas >= 2.0 raises when
# DataFrame.corr() meets the text columns (title, artist, genre).
sns.heatmap(df_all.select_dtypes(include=["number", "bool"]).corr())

In [None]:
# Correlate numeric/boolean columns only: pandas >= 2.0 raises when
# DataFrame.corr() meets the text columns (title, artist, genre).
sns.heatmap(df_1950.select_dtypes(include=["number", "bool"]).corr())

In [None]:
# Correlate numeric/boolean columns only: pandas >= 2.0 raises when
# DataFrame.corr() meets the text columns (title, artist, genre).
sns.heatmap(df_1960.select_dtypes(include=["number", "bool"]).corr())

In [None]:
# Correlate numeric/boolean columns only: pandas >= 2.0 raises when
# DataFrame.corr() meets the text columns (title, artist, genre).
sns.heatmap(df_1970.select_dtypes(include=["number", "bool"]).corr())

In [None]:
# Correlate numeric/boolean columns only: pandas >= 2.0 raises when
# DataFrame.corr() meets the text columns (title, artist, genre).
sns.heatmap(df_1980.select_dtypes(include=["number", "bool"]).corr())

In [None]:
# Correlate numeric/boolean columns only: pandas >= 2.0 raises when
# DataFrame.corr() meets the text columns (title, artist, genre).
sns.heatmap(df_1990.select_dtypes(include=["number", "bool"]).corr())

In [None]:
# Correlate numeric/boolean columns only: pandas >= 2.0 raises when
# DataFrame.corr() meets the text columns (title, artist, genre).
sns.heatmap(df_2000.select_dtypes(include=["number", "bool"]).corr())

In [None]:
# Correlate numeric/boolean columns only: pandas >= 2.0 raises when
# DataFrame.corr() meets the text columns (title, artist, genre).
sns.heatmap(df_2010.select_dtypes(include=["number", "bool"]).corr())

In [None]:
# Correlate numeric/boolean columns only: pandas >= 2.0 raises when
# DataFrame.corr() meets the text columns (title, artist, genre).
sns.heatmap(df_top10.select_dtypes(include=["number", "bool"]).corr())

Hanya energy, dB, dan acoustic saja yang dapat mendeskripsikan sebuah lagu


In [None]:
# Energy, loudness (dB) and acousticness: the feature subset used below for
# the top-10 clustering and for the genre classifier.
attribute_spotify_high_corr = ["nrgy","dB","acous"]

In [None]:
# Build the scalers
def scaler_spotify(df, attribute_spotify):
    """Min-max scale the selected columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to scale.
    attribute_spotify : list
        Column labels to select from ``df``.

    Returns
    -------
    The array produced by a freshly fitted ``MinMaxScaler.fit_transform``
    on the selected columns (the fitted scaler itself is discarded).
    """
    selected = df.loc[:, attribute_spotify]
    return MinMaxScaler().fit_transform(selected)

def scaler_spotify_with_PCA(df, attribute_spotify):
    """Min-max scale the selected columns of ``df`` and project them with PCA.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to scale.
    attribute_spotify : list
        Column labels to select from ``df``.

    Returns
    -------
    The scaled data transformed by a ``PCA(n_components=0.95)`` that was
    fitted on that same scaled data.
    """
    normalised = MinMaxScaler().fit_transform(df.loc[:, attribute_spotify])
    # Fit first, then transform, exactly as two separate steps.
    reducer = PCA(n_components=0.95).fit(normalised)
    return reducer.transform(normalised)

Clustering

In [None]:
# K-means clustering: search for the best number of clusters k
def optimal_kmeans(scaled_data, k_min=2, k_max=24, random_state=42):
    """Pick the K-means cluster count with the highest silhouette score.

    Fits one ``KMeans`` per candidate k, plots the silhouette score against
    k, and returns the best k.

    Parameters
    ----------
    scaled_data : array-like of shape (n_samples, n_features)
        Already-scaled feature matrix.
    k_min, k_max : int, default 2 and 24
        Inclusive range of candidate cluster counts (the defaults reproduce
        the previous hard-coded ``range(2, 25)``).
    random_state : int or None, default 42
        Seed passed to every ``KMeans`` so the chosen k is reproducible;
        pass ``None`` for unseeded runs.

    Returns
    -------
    int
        The candidate k with the maximum silhouette score (the smallest
        such k on ties).

    Raises
    ------
    ValueError
        If the data has too few samples for any candidate k.
    """
    n_samples = len(scaled_data)
    # silhouette_score needs 2 <= n_clusters <= n_samples - 1, so cap the
    # search instead of failing on small data sets.
    lowest = max(k_min, 2)
    highest = min(k_max, n_samples - 1)
    if highest < lowest:
        raise ValueError(
            "need at least %d samples to evaluate k=%d, got %d"
            % (lowest + 1, lowest, n_samples)
        )
    candidates = list(range(lowest, highest + 1))

    sil = []
    for n_clusters in candidates:
        labels = KMeans(n_clusters=n_clusters, random_state=random_state)\
            .fit_predict(scaled_data)
        sil.append(silhouette_score(scaled_data, labels, metric='euclidean'))

    plt.plot(candidates, sil, "bx-")
    plt.xlabel("K")
    plt.ylabel("sil")
    plt.show()

    # Index into the candidate list rather than adding a hard-coded offset.
    return candidates[sil.index(max(sil))]

In [None]:
# Feature columns fed to the scaler for the per-decade and all-songs clustering.
attribute_spotify = ["bpm","nrgy","dnce","dB","live","val","dur","acous","spch","popularity","has_win_award"]

In [None]:
# Scale the 1950s features, choose k by silhouette score, then label each song.
df_1950_std = scaler_spotify(df_1950, attribute_spotify)
k = optimal_kmeans(df_1950_std)
# Fixed seed so the cluster labels are the same on every Restart & Run All.
df_1950_cd = KMeans(n_clusters=k, random_state=42).fit_predict(df_1950_std)
df_1950_cluster = df_1950.assign(cluster=df_1950_cd)
print(k)

In [None]:
# Scale the 1960s features, choose k by silhouette score, then label each song.
df_1960_std = scaler_spotify(df_1960, attribute_spotify)
k = optimal_kmeans(df_1960_std)
# Fixed seed so the cluster labels are the same on every Restart & Run All.
df_1960_cd = KMeans(n_clusters=k, random_state=42).fit_predict(df_1960_std)
df_1960_cluster = df_1960.assign(cluster=df_1960_cd)
print(k)

In [None]:
# Scale the 1970s features, choose k by silhouette score, then label each song.
df_1970_std = scaler_spotify(df_1970, attribute_spotify)
k = optimal_kmeans(df_1970_std)
# Fixed seed so the cluster labels are the same on every Restart & Run All.
df_1970_cd = KMeans(n_clusters=k, random_state=42).fit_predict(df_1970_std)
df_1970_cluster = df_1970.assign(cluster=df_1970_cd)
print(k)

In [None]:
# Scale the 1980s features, choose k by silhouette score, then label each song.
df_1980_std = scaler_spotify(df_1980, attribute_spotify)
k = optimal_kmeans(df_1980_std)
# Fixed seed so the cluster labels are the same on every Restart & Run All
# (a later cell selects clusters 0, 1 and 2 of this frame by label).
df_1980_cd = KMeans(n_clusters=k, random_state=42).fit_predict(df_1980_std)
df_1980_cluster = df_1980.assign(cluster=df_1980_cd)
print(k)

In [None]:
# Scale the 1990s features, choose k by silhouette score, then label each song.
df_1990_std = scaler_spotify(df_1990, attribute_spotify)
k = optimal_kmeans(df_1990_std)
# Fixed seed so the cluster labels are the same on every Restart & Run All.
df_1990_cd = KMeans(n_clusters=k, random_state=42).fit_predict(df_1990_std)
df_1990_cluster = df_1990.assign(cluster=df_1990_cd)
print(k)

In [None]:
# Scale the 2000s features, choose k by silhouette score, then label each song.
df_2000_std = scaler_spotify(df_2000, attribute_spotify)
k = optimal_kmeans(df_2000_std)
# Fixed seed so the cluster labels are the same on every Restart & Run All.
df_2000_cd = KMeans(n_clusters=k, random_state=42).fit_predict(df_2000_std)
df_2000_cluster = df_2000.assign(cluster=df_2000_cd)
print(k)

In [None]:
# Scale the 2010s features, choose k by silhouette score, then label each song.
df_2010_std = scaler_spotify(df_2010, attribute_spotify)
k = optimal_kmeans(df_2010_std)
# Fixed seed so the cluster labels are the same on every Restart & Run All.
df_2010_cd = KMeans(n_clusters=k, random_state=42).fit_predict(df_2010_std)
df_2010_cluster = df_2010.assign(cluster=df_2010_cd)
print(k)

In [None]:
# Top-10 frame: cluster on the reduced feature set (nrgy, dB, acous) only.
df_top10_std = scaler_spotify(
    df=df_top10,
    attribute_spotify=attribute_spotify_high_corr,
)
k = optimal_kmeans(df_top10_std)
# Fixed seed so the cluster labels are the same on every Restart & Run All.
df_top10_cd = KMeans(n_clusters=k, random_state=42).fit_predict(df_top10_std)
df_top10_cluster = df_top10.assign(cluster=df_top10_cd)
print(k)

In [None]:
# Scale the combined frame, choose k by silhouette score, then label each song.
df_all_std = scaler_spotify(df_all, attribute_spotify)
k = optimal_kmeans(df_all_std)
# Fixed seed so the cluster labels are the same on every Restart & Run All.
df_all_cd = KMeans(n_clusters=k, random_state=42).fit_predict(df_all_std)
df_all_cluster = df_all.assign(cluster=df_all_cd)
print(k)

Analisa

Artis yang populer

In [None]:
# Number of fully duplicated rows in the top-10 frame.
sum(df_top10.duplicated())

In [None]:
# Distinct values of the artist column across all songs.
artist_total = df_all['artist'].unique()
print("Total Artist : %d" % len(artist_total))

In [None]:
# Distinct values of the genre column across all songs.
genre_total = df_all['genre'].unique()
print("Total Genre : %d" % len(genre_total))

In [None]:
# Number of rows (songs) per artist in the combined frame.
df_all['artist'].value_counts()

Artis yang paling populer dapat ditentukan dengan menjumlahkan nilai popularitas seluruh lagu milik tiap artis

In [None]:
def dict_artist_popularity(df):
    """Sum the ``popularity`` of every row per ``artist``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``artist`` and ``popularity`` columns.

    Returns
    -------
    dict
        Maps each artist, in order of first appearance, to the total of its
        rows' popularity values.
    """
    artist_popularity = dict()
    # Walk the two columns positionally: looking rows up by index label
    # (``df['artist'][i]``) returns a Series, not a scalar, when labels repeat,
    # and costs one lookup per cell.
    pairs = zip(df['artist'].to_numpy(), df['popularity'].to_numpy())
    for artist, popularity in pairs:
        artist_popularity[artist] = artist_popularity.get(artist, 0) + popularity
    return artist_popularity

In [None]:
# Total popularity per artist over the top-10 rows whose year is 2010.
artis_populer = dict_artist_popularity(df_top10.loc[df_top10['year'] == 2010])
# Artist with the largest total.
max(artis_populer, key=artis_populer.get)

Mencari genre lagu yang populer dapat menggunakan hal yang sama saat mencari artis yang paling populer

In [None]:
def dict_genre_popularity(df):
    """Sum the ``popularity`` of every row per ``genre``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``genre`` and ``popularity`` columns.

    Returns
    -------
    dict
        Maps each genre, in order of first appearance, to the total of its
        rows' popularity values.
    """
    genre_popularity = dict()
    # Walk the two columns positionally: looking rows up by index label
    # (``df['genre'][i]``) returns a Series, not a scalar, when labels repeat,
    # and costs one lookup per cell.
    pairs = zip(df['genre'].to_numpy(), df['popularity'].to_numpy())
    for genre, popularity in pairs:
        genre_popularity[genre] = genre_popularity.get(genre, 0) + popularity
    return genre_popularity

In [None]:
# Total popularity per genre over the top-10 rows whose year is 2010.
genre_populer = dict_genre_popularity(df_top10[df_top10['year'] == 2010])
# Genre with the largest total.
max(genre_populer, key=genre_populer.get)

Daftar genre musik untuk setiap artis dapat diperoleh dengan mengumpulkan genre dari semua lagu milik artis tersebut

In [None]:
def dict_artist_genre(df):
    """Collect the set of genres that each artist appears under.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``artist`` and ``genre`` columns.

    Returns
    -------
    dict
        Maps each artist to a set of its genres. An artist whose genres are
        all missing is still present, with an empty set.
    """
    artist_genre = dict()
    # Positional iteration: ``df['artist'][i]`` yields a Series, not a scalar,
    # when index labels repeat.
    for artist, genre in zip(df['artist'].to_numpy(), df['genre'].to_numpy()):
        genres = artist_genre.setdefault(artist, set())
        # pd.isna covers NaN, None and pd.NA without trying to parse text.
        if not pd.isna(genre):
            genres.add(genre)
    return artist_genre

In [None]:
def dict_artist_music(df):
    """Collect the set of song titles recorded for each artist.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``artist`` and ``title`` columns.

    Returns
    -------
    dict
        Maps each artist to a set of its titles. An artist whose titles are
        all missing is still present, with an empty set.
    """
    artist_music = dict()
    # Positional iteration: ``df['artist'][i]`` yields a Series, not a scalar,
    # when index labels repeat.
    for artist, music in zip(df['artist'].to_numpy(), df['title'].to_numpy()):
        titles = artist_music.setdefault(artist, set())
        # pd.isna covers NaN, None and pd.NA without trying to parse text.
        if not pd.isna(music):
            titles.add(music)
    return artist_music

In [None]:
def dict_genre_music(df):
    """Collect the set of song titles recorded for each genre.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``genre`` and ``title`` columns.

    Returns
    -------
    dict
        Maps each genre to a set of its titles. A genre whose titles are
        all missing is still present, with an empty set.
    """
    genre_music = dict()
    # Positional iteration: ``df['genre'][i]`` yields a Series, not a scalar,
    # when index labels repeat.
    for genre, music in zip(df['genre'].to_numpy(), df['title'].to_numpy()):
        titles = genre_music.setdefault(genre, set())
        # pd.isna covers NaN, None and pd.NA without trying to parse text.
        if not pd.isna(music):
            titles.add(music)
    return genre_music

In [None]:
# Number of distinct genres recorded for each artist.
artist_genre = {
    artist: len(genres)
    for artist, genres in dict_artist_genre(df_all).items()
}

# Largest genre count held by any single artist.
genre_max = max(artist_genre.values())
genre_max

In [None]:
# Map every artist in the combined frame to the set of their song titles.
music = dict_artist_music(df_all)
# Show all rows whose artist is exactly 'Taylor Swift'.
df_all.loc[df_all['artist'] == 'Taylor Swift']

Pengaruh durasi waktu dengan popularitas lagu

In [None]:
# Columns compared in the duration-vs-popularity analysis.
attribute_spotify_durration_popularity = ["dur","popularity"]

In [None]:
# Keep only the duration and popularity columns of the combined frame.
df_durr_popularity = df_all.loc[:,attribute_spotify_durration_popularity]
# Correlation matrix between the two columns.
df_durr_popularity.corr()

Klasifikasi Genre Lagu

In [None]:
# Preview the first rows of the frame that has no missing values.
df_all_clear.head()

In [None]:
# Features (nrgy, dB, acous) and target (genre) from the frame without NaN.
X = df_all_clear.loc[:, attribute_spotify_high_corr]
y = df_all_clear['genre']
# Seeded split so the reported metrics are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

# Fit the scaler on the training split only and reuse it for the test split.
# Scaling each split with its own independently fitted MinMaxScaler maps the
# same raw value to different scaled values in train and test.
genre_scaler = MinMaxScaler().fit(X_train)
X_train = genre_scaler.transform(X_train)
X_test = genre_scaler.transform(X_test)

In [None]:
# Fit a 3-nearest-neighbour genre classifier on the training split
# (fit() returns the estimator itself).
knn_genre = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# Predict genres for the held-out split.
y_pred = knn_genre.predict(X_test)

# Confusion matrix first, then the per-class report, one after the other.
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep="\n",
)

Prediksi popularitas
<br>
Jika dilihat dari nilai korelasinya, fitur dnce, dB, dan acous memiliki nilai korelasi yang tinggi terhadap popularitas

In [None]:
# Predictors used for the popularity regression.
attribute_spotify_popularity = ['dnce','dB','acous']

# Plain NumPy inputs: predictor matrix and popularity target.
X = np.array(df_all[attribute_spotify_popularity])
y = np.array(df_all['popularity'])

# Ordinary least-squares fit on the full data set.
lr_popularity = LinearRegression()
lr_popularity.fit(X, y)

In [None]:
# Score of the fitted model on the same data it was trained on.
lr_popularity.score(X,y)

In [None]:
# Fitted coefficients, one per predictor (dnce, dB, acous).
lr_popularity.coef_

In [None]:
# Fitted intercept of the popularity regression.
lr_popularity.intercept_

In [None]:
# Split the clustered 1980s frame into the rows of clusters 0, 1 and 2.
label_0, label_1, label_2 = (
    df_1980_cluster.loc[df_1980_cluster['cluster'] == cluster_id]
    for cluster_id in (0, 1, 2)
)

In [None]:
# Energy against acousticness for the 1980s songs, one colour per cluster.
plt.scatter(label_0['nrgy'], label_0['acous'], color='red')
plt.scatter(label_1['nrgy'], label_1['acous'], color='blue')
plt.scatter(label_2['nrgy'], label_2['acous'], color='green')
plt.xlabel('nrgy')
plt.ylabel('acous')
# Call show(): the bare attribute `plt.show` only echoed the function object.
plt.show()