# Classificando músicas utilizando KNN

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler # pre-processing, uniform data
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_excel('musicas_features.xlsx', usecols = "B:T")
df = df[df['id'].notnull()]
# df.dropna(subset=['id'])
# df.dropna(subset=['duration_ms'])
df = df.drop_duplicates(subset=['id'], keep='last')
df['Ano'] = df['Ano'].astype(int)
df.head()

Unnamed: 0,Posicao,Musica,Artista,Ano,id,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,1.0,Balancê,Gal Costa,1980,2m1G38M0gJuE4aVPh0tmIP,1979-01-08,0.638,0.736,9.0,-10.536,1.0,0.0811,0.0849,0.0,0.333,0.881,144.823,188667.0,4.0
1,4.0,Momentos,Joanna,1980,5ncQCIFnjpHWRXbuKFG8RP,2002-11-08,0.501,0.296,7.0,-12.018,0.0,0.0292,0.802,0.00637,0.117,0.305,132.462,235147.0,4.0
2,5.0,Menino do Rio,Baby Consuelo,1980,6tisytwaOaBA4CAZPPdlFP,1978-09-14,0.38,0.548,0.0,-9.26,1.0,0.0333,0.73,0.000142,0.31,0.395,152.498,263733.0,4.0
3,6.0,Toada (Na Direção do Dia),Boca Livre,1980,2LO6c0rfmBm49Qs9KtQR1W,2007-01-01,0.402,0.506,6.0,-13.094,0.0,0.0409,0.928,0.000946,0.728,0.446,97.082,233707.0,4.0
4,9.0,Meu Bem Querer,Djavan,1980,5ogHB4oYt1C7kaWJYm5MDG,1999-03-25,0.519,0.457,9.0,-8.45,1.0,0.0326,0.763,0.000666,0.796,0.264,81.561,254800.0,4.0


## Determinando a década de cada música

In [3]:
def obter_decada(linha):
    ano = str(linha['Ano'])
    if "198" in ano:
        return "1980"
    if "199" in ano:
        return "1999"
    if "200" in ano:
        return "2000"
    if "201" in ano:
        return "2010"
    return "-"

df['Decada'] = df.apply (lambda x: obter_decada(x), axis=1)

##  Dividindo o dataset em base de teste e treino

In [4]:
X = df[['danceability', 'energy', 'key', 'loudness', 'mode', 
        'speechiness', 'acousticness', 'instrumentalness', 'liveness', 
        'valence', 'tempo', 'duration_ms', 'time_signature']]
y = df['Decada']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)

In [5]:
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.fit_transform(X_test)

## Classificador

In [6]:
classifier = KNeighborsClassifier(
    n_neighbors=21, # raiz quadrada do tamanho do y_test
    p=4, # quantidade de décadas a serem testadas
    metric='euclidean')

In [7]:
# df[['danceability', 'energy', 'key', 'loudness', 'mode', 
#         'speechiness', 'acousticness', 'instrumentalness', 'liveness', 
#         'valence', 'tempo', 'duration_ms', 'time_signature']].isnull().sum()

# print(np.isnan(df['danceability']).sum())
# print(np.isnan(closingPriceTrain).any())
# print(np.isnan(openingPriceTest).any())

In [8]:
classifier.fit(X_train, y_train)

KNeighborsClassifier(metric='euclidean', n_neighbors=21, p=4)

In [9]:
y_pred = classifier.predict(X_test)
y_pred

array(['2010', '2010', '1999', '2000', '2010', '1999', '2010', '2010',
       '1980', '1980', '2010', '2000', '2010', '2010', '2010', '1980',
       '2010', '2010', '2010', '2010', '1999', '2010', '1999', '2000',
       '2000', '1980', '1999', '1999', '2010', '2010', '1999', '1999',
       '1980', '2010', '2010', '1999', '2010', '2010', '1999', '1999',
       '2010', '2000', '1999', '2010', '2000', '2000', '2000', '1999',
       '1980', '2000', '1999', '2010', '1999', '2010', '1999', '2010',
       '1999', '2000', '2010', '2000', '1999', '2000', '2010', '1999',
       '1980', '1999', '2000', '2000', '2000', '1999', '1999', '2010',
       '2000', '1999', '2010', '1999', '2010', '2010', '2010', '2010',
       '1999', '2000', '1999', '1980', '1980', '1980', '2010', '2010',
       '2010', '2010', '1980', '1999', '2010', '2010', '2000', '1999',
       '1980', '2010', '2000', '1980', '2010', '1980', '2010', '1999',
       '1980', '2000', '2010', '1980', '2000', '2000', '2010', '1999',
      

In [10]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[ 0  0  0  1 15]
 [ 0 24 37 14 11]
 [ 0 18 40 17 12]
 [ 0  7 17 56 25]
 [ 0  3  8 18 92]]


In [18]:
print(f1_score(y_test, y_pred, average='micro'))

0.5108433734939759


In [12]:
print(accuracy_score(y_test, y_pred))

0.5108433734939759
