In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import os
# Collect the full path of every file found under the competition input
# directory, then print them one per line.
caminhos_entrada = [
    os.path.join(pasta, nome)
    for pasta, _, nomes in os.walk('/kaggle/input/proximo-hit-spotify')
    for nome in nomes
]
for caminho in caminhos_entrada:
    print(caminho)

KeyboardInterrupt: 

# Hipóteses

## Primeira hipótese: a duração da música afeta o target

# Carregamento dos dados

In [2]:
# Preview the sample submission file to see the expected output layout.
sample_submission = pd.read_csv('/kaggle/input/proximo-hit-spotify/sample_submission.csv')
sample_submission

Unnamed: 0,track_unique_id,popularity_target
0,113186,0
1,42819,0
2,59311,0
3,91368,1
4,61000,1
...,...,...
34195,39480,1
34196,74692,0
34197,53782,0
34198,41890,1


In [3]:
# Read the training CSV (treino) and the test CSV (teste) in one pass.
treino, teste = (
    pd.read_csv(caminho_csv)
    for caminho_csv in (
        '/kaggle/input/proximo-hit-spotify/train.csv',
        '/kaggle/input/proximo-hit-spotify/test.csv',
    )
)

# Tratamento de valores nulos

In [4]:
# Count the missing values in each column of the test set
teste.isna().sum()

track_unique_id     0
track_id            0
artists             1
album_name          1
track_name          1
duration_ms         0
explicit            0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
time_signature      0
track_genre         0
dtype: int64

In [5]:
# The null check on the test set shows a single missing value in each of
# artists, album_name and track_name. Those are free-text columns that are
# removed before modelling, so the affected row is still usable.
# Dropping it would leave the submission without a prediction for that track,
# so the missing text is filled with an empty string and every row is kept.
teste = teste.fillna({'artists': '', 'album_name': '', 'track_name': ''})

# Guard: fail loudly if some other column unexpectedly contains nulls.
assert not teste.isnull().values.any(), "test set still contains null values"

In [6]:
# Count the missing values in each column of the training set
# (this inspects treino, not the test frame handled above)
treino.isna().sum()

track_unique_id      0
track_id             0
artists              0
album_name           0
track_name           0
duration_ms          0
explicit             0
danceability         0
energy               0
key                  0
loudness             0
mode                 0
speechiness          0
acousticness         0
instrumentalness     0
liveness             0
valence              0
tempo                0
time_signature       0
track_genre          0
popularity_target    0
dtype: int64

# Processamento dos dados

In [7]:
# Inspect the dtype of every column in the training set
treino.dtypes

track_unique_id        int64
track_id              object
artists               object
album_name            object
track_name            object
duration_ms            int64
explicit                bool
danceability         float64
energy               float64
key                    int64
loudness             float64
mode                   int64
speechiness          float64
acousticness         float64
instrumentalness     float64
liveness             float64
valence              float64
tempo                float64
time_signature         int64
track_genre           object
popularity_target      int64
dtype: object

## Excluindo variáveis categóricas que não serão úteis para a predição

In [8]:
# Columns that will be excluded from the model.
# They are either irrelevant or categorical with too many unique values,
# which makes them hard to encode.
colunas_para_excluir = [
    'track_id',
    'artists',
    'album_name',
    'track_name',
]

In [9]:
# Remove the excluded columns from both the training and the test frame,
# producing new frames and leaving the originals untouched.
treino_v2, teste_v2 = (
    quadro.drop(columns=colunas_para_excluir) for quadro in (treino, teste)
)

## Transformar valores 'bool' em 0 ou 1, ao invés de 'True' ou 'False'

In [10]:
# Cast the boolean "explicit" flag to integers (0/1) in both frames
# so it is easier to read in the displayed tables.
for quadro in (treino_v2, teste_v2):
    quadro["explicit"] = quadro["explicit"].astype(int)

In [11]:
# Target-encode the music genre.
# Target encoding was chosen because the genre has many unique values yet is
# considered an important feature for the prediction.
# NOTE(review): the per-genre means are computed from the same training rows
# they are mapped back onto, so each training row's encoding includes its own
# label — consider an out-of-fold encoding if validation scores look optimistic.

# Overall hit rate, used as the fallback for genres missing from the training set.
mean_target_general = treino_v2['popularity_target'].mean()

# Hit rate of each genre, learned from the training set only.
mean_target_per_genre = treino_v2.groupby('track_genre')['popularity_target'].mean()

treino_v2['genre_encoded'] = treino_v2['track_genre'].map(mean_target_per_genre)
teste_v2['genre_encoded'] = (
    teste_v2['track_genre']
    .map(mean_target_per_genre)
    .fillna(mean_target_general)
)

treino_v2

Unnamed: 0,track_unique_id,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre,popularity_target,genre_encoded
0,41996,211533,0,0.305,0.84900,9,-10.795,1,0.0549,0.000058,0.056700,0.4640,0.3200,141.793,4,goth,0,0.197917
1,76471,196000,0,0.287,0.19000,7,-12.030,0,0.0370,0.930000,0.000356,0.0834,0.1330,83.685,4,opera,0,0.177465
2,54809,216506,0,0.583,0.50900,1,-9.661,1,0.0362,0.777000,0.202000,0.1150,0.5440,90.459,3,idm,1,0.095745
3,16326,218346,0,0.163,0.03680,8,-23.149,1,0.0472,0.991000,0.899000,0.1070,0.0387,69.442,3,classical,0,0.108883
4,109799,173160,0,0.647,0.92100,2,-7.294,1,0.1850,0.000939,0.371000,0.1310,0.1710,137.981,4,techno,0,0.641834
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79795,76820,162613,0,0.554,0.00763,4,-30.566,1,0.0502,0.915000,0.000970,0.2210,0.1560,119.502,4,opera,1,0.177465
79796,110268,240062,0,0.689,0.55400,9,-8.200,1,0.0759,0.091000,0.914000,0.0867,0.1630,148.002,4,trance,0,0.615811
79797,103694,136306,0,0.629,0.56000,0,-11.455,0,0.0523,0.595000,0.000000,0.1820,0.8800,118.895,3,soul,0,0.290859
79798,860,216841,0,0.421,0.10700,6,-15.191,1,0.0335,0.948000,0.000000,0.0881,0.1180,104.218,4,acoustic,1,0.645864


In [12]:
# The raw genre label is no longer needed once genre_encoded exists
treino_v3 = treino_v2.drop('track_genre', axis=1)
teste_v3 = teste_v2.drop('track_genre', axis=1)

In [13]:
# Display the test frame after preprocessing
teste_v3

Unnamed: 0,track_unique_id,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,genre_encoded
0,113186,440247,0,0.369,0.598,7,-6.984,1,0.0304,0.005110,0.000000,0.176,0.0466,148.014,4,0.780521
1,42819,93933,0,0.171,0.997,7,-3.586,1,0.1180,0.005210,0.801000,0.420,0.0294,122.223,4,0.015015
2,59311,213578,0,0.173,0.803,9,-10.071,0,0.1440,0.613000,0.001910,0.195,0.0887,75.564,3,0.000000
3,91368,151387,0,0.683,0.511,6,-5.598,1,0.0279,0.406000,0.000197,0.111,0.5980,109.991,3,0.274566
4,61000,236293,0,0.555,0.941,9,-3.294,0,0.0481,0.484000,0.000000,0.266,0.8130,92.487,4,0.098571
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34195,39480,216650,0,0.597,0.980,7,-2.075,1,0.0636,0.004270,0.000035,0.293,0.6330,139.989,4,0.473988
34196,74692,219493,0,0.454,0.367,0,-12.251,1,0.0285,0.137000,0.000029,0.171,0.1420,113.250,4,0.955556
34197,53782,165333,1,0.687,0.915,0,-1.342,1,0.0411,0.021000,0.000006,0.151,0.2720,89.962,4,0.503737
34198,41890,293040,0,0.362,0.668,0,-9.578,1,0.0432,0.000406,0.009400,0.131,0.1580,122.234,4,0.197917


In [14]:
# Display the training frame after preprocessing
treino_v3

Unnamed: 0,track_unique_id,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,popularity_target,genre_encoded
0,41996,211533,0,0.305,0.84900,9,-10.795,1,0.0549,0.000058,0.056700,0.4640,0.3200,141.793,4,0,0.197917
1,76471,196000,0,0.287,0.19000,7,-12.030,0,0.0370,0.930000,0.000356,0.0834,0.1330,83.685,4,0,0.177465
2,54809,216506,0,0.583,0.50900,1,-9.661,1,0.0362,0.777000,0.202000,0.1150,0.5440,90.459,3,1,0.095745
3,16326,218346,0,0.163,0.03680,8,-23.149,1,0.0472,0.991000,0.899000,0.1070,0.0387,69.442,3,0,0.108883
4,109799,173160,0,0.647,0.92100,2,-7.294,1,0.1850,0.000939,0.371000,0.1310,0.1710,137.981,4,0,0.641834
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79795,76820,162613,0,0.554,0.00763,4,-30.566,1,0.0502,0.915000,0.000970,0.2210,0.1560,119.502,4,1,0.177465
79796,110268,240062,0,0.689,0.55400,9,-8.200,1,0.0759,0.091000,0.914000,0.0867,0.1630,148.002,4,0,0.615811
79797,103694,136306,0,0.629,0.56000,0,-11.455,0,0.0523,0.595000,0.000000,0.1820,0.8800,118.895,3,0,0.290859
79798,860,216841,0,0.421,0.10700,6,-15.191,1,0.0335,0.948000,0.000000,0.0881,0.1180,104.218,4,1,0.645864


# Treinamento do modelo

## Validação cruzada

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Split the training frame into features (X) and label (y).
# track_unique_id is an identifier column that the final model cell also
# drops; it is excluded here so the cross-validation scores describe the same
# feature set that the submitted model is trained on.
X = treino_v3.drop(columns=['popularity_target', 'track_unique_id'])
y = treino_v3['popularity_target']

# Random forest baseline. The seed is fixed so that re-running the notebook
# reproduces the same scores.
rf_classifier = RandomForestClassifier(random_state=111)

# 5-fold cross-validation (only 5 folds to save time and compute)
cv_scores = cross_val_score(rf_classifier, X, y, cv=5)

# Report the per-fold scores and their mean
print("Cross-validation scores: ", cv_scores)
print("Mean cross-validation score: ", cv_scores.mean())


## Utilizando RandomizedSearchCV para achar hiperparâmetros melhores, possivelmente melhorando a performance do modelo. A busca aleatória foi escolhida porque um resultado satisfatório já foi encontrado, portanto não é necessário gastar poder computacional e tempo excessivos aplicando um grid search.

In [None]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score

# The final model is trained without track_unique_id, so the hyperparameter
# search must see the same feature set. errors='ignore' keeps this cell
# working whether or not X still carries that column.
X_busca = X.drop(columns=['track_unique_id'], errors='ignore')

# Hold out 20% of the training data for validation (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(X_busca, y, test_size=0.2, random_state=42)

# Search space for the hyperparameters
param_distributions = {
    'n_estimators': np.arange(10, 201, 10),
    'max_depth': np.arange(5, 51, 5),
    'min_samples_split': np.arange(2, 11),
    'min_samples_leaf': np.arange(1, 11)
}

# Base estimator whose hyperparameters are being tuned
rf_classifier = RandomForestClassifier(random_state=111)

# Randomized search: 50 sampled configurations, 3-fold CV each, all cores
random_search = RandomizedSearchCV(
    estimator=rf_classifier,
    param_distributions=param_distributions,
    n_iter=50,
    cv=3,
    verbose=1,
    n_jobs=-1,
    random_state=111
)

# Run the search on the 80% training split
random_search.fit(X_train, y_train)

# Predict on the held-out validation split with the best estimator found
y_pred = random_search.best_estimator_.predict(X_val)

# Accuracy on the validation split
accuracy = accuracy_score(y_val, y_pred)

# Show the best hyperparameters and the corresponding validation accuracy
print("Melhores hiperparâmetros: ", random_search.best_params_)
print("Acurácia no conjunto de validação: ", accuracy)

## O finetuning de hiperparâmetros não surtiu tanto efeito nos resultados.

# Teste do modelo

In [None]:
# Label and features for the final fit: every column except the label and
# the row identifier is used as a feature.
y_train = treino_v3['popularity_target']
X_train = treino_v3.drop(['popularity_target', 'track_unique_id'], axis=1)

# The test frame has no label column, so only the identifier is removed.
X_test = teste_v3.drop('track_unique_id', axis=1)

# Rebuild the forest from the four hyperparameters selected by the random search.
best_params = random_search.best_params_
chaves_hiperparametros = (
    'n_estimators',
    'max_depth',
    'min_samples_split',
    'min_samples_leaf',
)
rf_classifier = RandomForestClassifier(
    random_state=42,
    **{chave: best_params[chave] for chave in chaves_hiperparametros},
)

# Fit on the whole training frame, then predict the test frame.
rf_classifier.fit(X_train, y_train)
predictions = rf_classifier.predict(X_test)

# Submission frame: the test identifiers next to the predicted label.
submission = teste_v3[['track_unique_id']].assign(popularity_target=predictions)

# Write the CSV expected by Kaggle (no index column).
submission.to_csv('kaggle_submission.csv', index=False)

print("Arquivo de submissão 'kaggle_submission.csv' criado com sucesso!")
