In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw song table from a CSV file in the current working directory.
df = pd.read_csv("song_data.csv")

## Data Preprocessing

In [None]:
# Remove the song_name column; every remaining column is used below,
# either as a feature or as the target.
df1 = df.drop(columns=["song_name"])

In [None]:
# Target: the song_popularity column. Features: every other column of df1.
y = df1["song_popularity"]
X = df1.drop(columns=["song_popularity"])

### MinMaxScaler

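We scale the data with MinMaxScaler. This keeps the shape of each feature's distribution while mapping every feature into the range 0 to 1, so that no feature dominates simply because of its units. Note that min-max scaling does not remove outliers; it only rescales them along with the rest of the values.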

In [None]:
from sklearn.preprocessing import MinMaxScaler 

In [None]:
# Fit the scaler on the feature matrix, then rescale it.
# X is rebound from a DataFrame to a plain NumPy array here, so the column
# names are no longer attached to it from this point on.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# further down, so the min/max of the test rows influence the scaling.
# Consider fitting on the training rows only.
min_max_scaler = MinMaxScaler()
X = min_max_scaler.fit(X).transform(X)
X

array([[0.1400585 , 0.00554115, 0.50253293, ..., 0.68942464, 0.8       ,
        0.48170732],
       [0.11465771, 0.01034035, 0.5491388 , ..., 0.43437136, 0.8       ,
        0.37601626],
       [0.12293814, 0.0082018 , 0.74670719, ..., 0.51123317, 0.8       ,
        0.32926829],
       ...,
       [0.09523114, 0.85040145, 0.72847011, ..., 0.53868883, 0.8       ,
        0.29065041],
       [0.19038284, 0.94879513, 0.49442756, ..., 0.4377017 , 0.8       ,
        0.32825203],
       [0.10156567, 0.91465855, 0.64842958, ..., 0.37756172, 0.8       ,
        0.59044715]])

### Train_test_split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=45,
)

## Machine Learning

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor


In [None]:
# Baseline forest: depth capped at 30, "sqrt" feature sub-sampling per split,
# and a fixed seed so the fitted trees are reproducible.
randomregressor = RandomForestRegressor(
    max_depth=30,
    max_features="sqrt",
    random_state=45,
)
randomregressor.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=30, max_features='sqrt', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=45, verbose=0, warm_start=False)

In [None]:
# Score the baseline forest on the rows it was trained on and on the held-out
# rows; a large gap between the two indicates overfitting.
score_train = randomregressor.score(X_train, y_train)
score_test = randomregressor.score(X_test, y_test)
for label, value in (
    ("Score de Train: ", score_train),
    ("Score de Test: ", score_test),
):
    print(label, value)

Score de Train:  0.9048962153082213
Score de Test:  0.40292395766242056


In [None]:
# Show the raw importance array of the baseline forest: one value per input
# column, in the same order as the columns of the feature matrix.
randomregressor.feature_importances_

array([0.09296227, 0.09661337, 0.09648027, 0.09330077, 0.08624705,
       0.04896014, 0.08725967, 0.10465064, 0.01261894, 0.0883523 ,
       0.08966502, 0.00722548, 0.09566407])

In [None]:
# Pair each importance with its feature name and list them from least to most
# important.
# The names come from df1 (the labelled frame the features were taken from
# before scaling turned X into an unlabelled array) instead of being retyped by
# hand, so the labels cannot drift out of sync with the columns the model was
# actually trained on.
feature_names = list(df1.drop(columns=["song_popularity"]).columns)
feature_importance = pd.DataFrame(
    {
        "features": feature_names,
        "values": randomregressor.feature_importances_,
    }
)
feature_importance.sort_values(["values"])

Unnamed: 0,features,values
11,time_signature,0.007225
8,audio_mode,0.012619
5,key,0.04896
4,instrumentalness,0.086247
6,liveness,0.08726
9,speechiness,0.088352
10,tempo,0.089665
0,song_duration_ms,0.092962
3,energy,0.093301
12,audio_valence,0.095664


Mean Squared Error

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# Predict the target for the held-out rows with the baseline forest.
ypREG = randomregressor.predict(X_test)

In [None]:
# Mean squared error of the baseline forest on the held-out rows.
y_true, y_pred = y_test, ypREG
mean_squared_error(y_true=y_true, y_pred=y_pred)

286.3104679280087

We have improved our model by changing some hyperparameters. Let's try to improve it further by searching over other parameters.



### Hyperparameters Research

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Search space for the randomized hyperparameter search.
# The random seed is deliberately NOT part of the space: it is not a model
# hyperparameter, and searching over it only spends search budget on seed noise
# and picks whichever seed happened to score best on the validation folds. The
# seed stays fixed on the base estimator instead.
hyperparams = {
    "max_depth": [10, 20, 30, 50, 100, 150, 200],
    "n_estimators": [20, 50, 100, 200, 300, 500],
    "bootstrap": [True, False],
    "min_samples_split": [2, 5, 10],
}

In [None]:
# Randomized search over `hyperparams`, starting from the baseline forest.
# The search is seeded so that the same candidate combinations are sampled on
# every run; without a seed, best_params_ and best_score_ can change each time
# the notebook is re-executed.
random_search = RandomizedSearchCV(randomregressor, hyperparams, random_state=45)

In [None]:
# Run the search on the training split only; the held-out rows are not used
# to choose hyperparameters.
random_search.fit(X_train, y_train)

RandomizedSearchCV(cv=None, error_score=nan,
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=30,
                                                   max_features='sqrt',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100,
                             

In [None]:
# Show the parameter combination the search selected as best.
random_search.best_params_

{'random_state': 500,
 'n_estimators': 200,
 'min_samples_split': 2,
 'max_depth': 200,
 'bootstrap': False}

In [None]:
# Show the score the search recorded for that best combination.
random_search.best_score_

0.3761742961599632

In [None]:
# Build the tuned forest from the search result instead of copying the values
# by hand from a previous cell's output, which would go stale whenever the
# search is re-run. The base estimator's settings (e.g. max_features="sqrt")
# are kept, and every parameter the search selected overrides them.
Randomregressor2 = RandomForestRegressor(
    **{**randomregressor.get_params(), **random_search.best_params_}
)

In [None]:
# Train the second forest on the training split.
Randomregressor2.fit(X_train, y_train)

RandomForestRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                      max_depth=200, max_features='sqrt', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=200, n_jobs=None, oob_score=False,
                      random_state=500, verbose=0, warm_start=False)

In [None]:
# Score of the second forest on the rows it was trained on.
Randomregressor2.score(X_train, y_train)

0.9869007116361476

In [None]:
# Score of the second forest on the held-out rows; compare with the training
# score above to judge overfitting.
Randomregressor2.score(X_test, y_test)

0.4257389562898637

In [None]:
# Show the raw importance array of the second forest: one value per input
# column, in the same order as the columns of the feature matrix.
Randomregressor2.feature_importances_

array([0.09334335, 0.09625774, 0.09709585, 0.09328384, 0.08756133,
       0.04791418, 0.08734133, 0.10503712, 0.01262452, 0.08777948,
       0.08938391, 0.0068299 , 0.09554746])

In [None]:
# Pair each importance of the second forest with its feature name and list
# them from least to most important.
# The names come from df1 (the labelled frame the features were taken from
# before scaling turned X into an unlabelled array) instead of being retyped by
# hand, so the labels cannot drift out of sync with the columns the model was
# actually trained on.
feature_names = list(df1.drop(columns=["song_popularity"]).columns)
feature_importance = pd.DataFrame(
    {
        "features": feature_names,
        "values": Randomregressor2.feature_importances_,
    }
)
feature_importance.sort_values(["values"])

Unnamed: 0,features,values
11,time_signature,0.00683
8,audio_mode,0.012625
5,key,0.047914
6,liveness,0.087341
4,instrumentalness,0.087561
9,speechiness,0.087779
10,tempo,0.089384
3,energy,0.093284
0,song_duration_ms,0.093343
12,audio_valence,0.095547


In [None]:
# Predict the target for the held-out rows with the second forest.
ypREG2 = Randomregressor2.predict(X_test)

In [None]:
# Mean squared error of the second forest on the held-out rows.
y_true, y_pred = y_test, ypREG2
mean_squared_error(y_true=y_true, y_pred=y_pred)

275.37019823099257

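The performance of the model has improved slightly: the test score rose from about 0.40 to about 0.43, and the mean squared error fell from about 286.3 to about 275.4. The training score (about 0.99) is still far above the test score, so the model continues to overfit.
However, in order to get a better model, we should consider a different dataset, with variables that are more linearly related to the target, as well as additional features by which to segment the analysis: genre, country, and so on.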