In [1]:
# One-time environment setup, left disabled: installs the package that the
# imports cell pulls `ProfileReport` from. Uncomment on a fresh environment;
# `%pip` targets the running kernel's environment.
# %pip install ydata_profiling



In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn import set_config
from sklearn.model_selection import GridSearchCV
from tensorflow import keras
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from ydata_profiling import ProfileReport as report

In [3]:
# Read the labelled song features and discard the "Unnamed: 0" column
# (presumably a saved row index -- confirm against the CSV).
# A different input, "278k_labelled_uri.csv", was tried here and left disabled.
df = pd.read_csv("278k_song_labelled.csv").drop(columns=["Unnamed: 0"])

# Separate the target column from the feature columns.
y = df["labels"]
X = df.drop(columns=["labels"])

# Hold out a quarter of the rows for evaluation; the fixed seed keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1234,
    test_size=0.25,
)

In [4]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,duration (ms),danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,spec_rate,labels
0,195000.0,0.611,0.614,-8.815,0.0672,0.0169,0.000794,0.753,0.52,128.05,3.446154e-07,2
1,194641.0,0.638,0.781,-6.848,0.0285,0.0118,0.00953,0.349,0.25,122.985,1.464234e-07,1
2,217573.0,0.56,0.81,-8.029,0.0872,0.0071,8e-06,0.241,0.247,170.044,4.00785e-07,1
3,443478.0,0.525,0.699,-4.571,0.0353,0.0178,8.8e-05,0.0888,0.199,92.011,7.959809e-08,0
4,225862.0,0.367,0.771,-5.863,0.106,0.365,1e-06,0.0965,0.163,115.917,4.693131e-07,1


In [5]:
# Show the target column (bracket access, as used when building `y`).
df["labels"]

0         2
1         1
2         1
3         0
4         1
         ..
277933    1
277934    1
277935    2
277936    1
277937    1
Name: labels, Length: 277938, dtype: int64

In [6]:
# Optional exploratory step, left disabled: builds a profiling report for `df`
# (`report` is the alias given to `ProfileReport` in the imports cell) and
# renders it via `to_notebook_iframe()`.
# profile = report(df, title="Profiling Report")
# profile.to_notebook_iframe()

In [7]:
# Count missing values per column, then keep only the columns that have any.
null_counts = df.isna().sum()
null_counts = null_counts.loc[null_counts.ne(0)]

SGD Pipeline

In [35]:
# Positional indices of every feature column. Derived from the training frame
# rather than hard-coding 11, so both transformers keep covering all features
# if the column set ever changes.
feature_idx = list(range(X_train.shape[1]))

# `imputer` and `scaler` are kept as module-level names because the Random
# Forest and XGB cells further down reuse them as pipeline steps.
# NOTE(review): Pipeline does not copy its steps, so these same transformer
# objects are refit whenever any pipeline that contains them is fitted.
imputer = ColumnTransformer([('Imputer', SimpleImputer(), feature_idx)])
scaler = ColumnTransformer([('StandardScaler', StandardScaler(), feature_idx)])

# SGD training is stochastic; pin the seed (same value as the train/test
# split) so the reported accuracy can be reproduced on a re-run.
sgdpipe = Pipeline([
    ('Imputer', imputer),
    ('StandardScaler', scaler),
    ('SGD', SGDClassifier(random_state=1234)),
])
sgdpipe.fit(X_train, y_train)

In [36]:
# Score the fitted SGD pipeline on the held-out rows.
yhat = sgdpipe.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=yhat)
print(f'Accuracy: {acc}')

Accuracy: 0.8062171691732029


In [38]:
# List the default hyperparameters of a freshly constructed random forest
# (reference for choosing the search grid).
RandomForestClassifier().get_params(deep=True)

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [51]:
# Hyperparameter search spaces, keyed by a short model tag:
#   'rfc' -> RandomForestClassifier, 'xgb' -> XGBClassifier.
# Each inner dict maps an estimator parameter name to the candidate values to
# try, in the shape expected by a `param_grid` argument.
grid = {
    'rfc': {
        'ccp_alpha': [0, 0.1, 0.4, 0.9],
        'max_depth': [3, 5, 7, 9],
        'n_estimators': [1, 10, 50, 100, 300, 700, 1000],
        # `n_jobs` is intentionally not searched: it only sets how many
        # workers are used, so listing it here would double the number of
        # fits without changing any fitted model.
    },

    'xgb': {
        'n_estimators': [1, 10, 100, 500, 1000],
        'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.2, 0.3],
        'gamma': [0, 0.1, 0.5, 1, 5, 10],
        'max_depth': [3, 5, 7, 9, 10],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0],
    },
}

Random Forest

In [26]:
# `rf` and `param_grid` were never defined, so this cell raised NameError on a
# fresh kernel. Bind them explicitly: the base estimator, and the random-forest
# search space from the shared `grid` dict.
rf = RandomForestClassifier(random_state=1234)
param_grid = grid['rfc']

# The search object is only constructed here; nothing in this cell fits it.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)

# Baseline forest with default hyperparameters, behind the same imputing and
# scaling steps as the SGD pipeline. The seed matches the train/test split so
# the reported accuracy can be reproduced.
rfcpipe = Pipeline([
    ('Imputer', imputer),
    ('StandardScaler', scaler),
    ('RandomForest', RandomForestClassifier(random_state=1234)),
])
rfcpipe.fit(X_train, y_train)

In [27]:
# Score the fitted random-forest pipeline on the held-out rows.
yhat = rfcpipe.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=yhat)
print(f'Accuracy: {acc}')

0.9423472691947903

XGB

In [None]:
# Use a dedicated name for the estimator: binding it to `xgb` would overwrite
# the `xgboost` module imported as `xgb` in the imports cell.
xgb_clf = XGBClassifier()

# Exhaustive search over grid['xgb']. As written, that grid has
# 5 * 6 * 6 * 5 * 2 * 2 = 3,600 combinations, each fitted once per CV fold,
# so this cell is very expensive to run.
xgbgrid = GridSearchCV(xgb_clf, grid['xgb'], n_jobs=-1)

# Scale the features, then run the grid search as the final pipeline step.
xgbpipe = Pipeline([
    ('StandardScaler', scaler),
    ('XGBClassifier', xgbgrid),
])
xgbpipe.fit(X_train, y_train)

In [32]:
# Score the fitted XGB pipeline on the held-out rows.
# (The original referenced `xgcpipe`, a typo for `xgbpipe`, which raised
# NameError.)
yhat = xgbpipe.predict(X_test)
acc = accuracy_score(y_test, yhat)
print(f'Accuracy: {acc}')

Accuracy: 0.9666834568611931
