# Decision Tree

In [1]:
import numpy as np
import pandas as pd
from sklearn import pipeline
from sklearn import tree
from sklearn import preprocessing
from sklearn import metrics
from sklearn import impute
from sklearn import model_selection
from matplotlib import pyplot as plt

# Pre-processing

## Reading csv


In [55]:
# Load the raw table and discard the row-index column plus the free-text
# identifier columns in a single chained expression.
dataset = pd.read_csv("./dataset.csv").drop(
    columns=["Unnamed: 0", "track_id", "track_name", "album_name"]
)

## Show data

In [56]:
# Shape first, followed by one blank line, then the per-column summary.
print(dataset.shape, end="\n\n")
dataset.info()
# Last expression of the cell: rendered as the cell's rich output.
dataset.head()

(114000, 17)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114000 entries, 0 to 113999
Data columns (total 17 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   artists           113999 non-null  object 
 1   popularity        114000 non-null  int64  
 2   duration_ms       114000 non-null  int64  
 3   explicit          114000 non-null  bool   
 4   danceability      114000 non-null  float64
 5   energy            114000 non-null  float64
 6   key               114000 non-null  int64  
 7   loudness          114000 non-null  float64
 8   mode              114000 non-null  int64  
 9   speechiness       114000 non-null  float64
 10  acousticness      114000 non-null  float64
 11  instrumentalness  114000 non-null  float64
 12  liveness          114000 non-null  float64
 13  valence           114000 non-null  float64
 14  tempo             114000 non-null  float64
 15  time_signature    114000 non-null  int64  
 16  track_

Unnamed: 0,artists,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,Gen Hoshino,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,Ben Woodward,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,Ingrid Michaelson;ZAYN,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,Kina Grannis,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,Chord Overstreet,82,198853,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


## Clean data

In [57]:
def train_validation_test_split(
    df, target_column, validation_size=0.1, test_size=0.1, random_state=42
):
    """Split ``df`` into stratified train, validation and test partitions.

    Both splits are stratified on ``target_column`` and use the same
    ``random_state``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to partition; must contain ``target_column``.
    target_column : str
        Column whose values drive the stratification.
    validation_size : float, default 0.1
        Share of the *full* frame reserved for validation.
    test_size : float, default 0.1
        Share of the *full* frame reserved for testing.
    random_state : int, default 42
        Seed forwarded to both ``train_test_split`` calls.

    Returns
    -------
    tuple
        ``(df_train, df_validation, df_test)``.
    """
    # Step 1: carve the test partition off the full frame.
    remainder, df_test = model_selection.train_test_split(
        df,
        test_size=test_size,
        random_state=random_state,
        stratify=df[target_column],
    )

    # Step 2: ``validation_size`` is relative to the full frame, so rescale it
    # to the fraction of what is left after the test rows were removed.
    relative_validation_size = validation_size / (1 - test_size)
    df_train, df_validation = model_selection.train_test_split(
        remainder,
        test_size=relative_validation_size,
        random_state=random_state,
        stratify=remainder[target_column],
    )

    return df_train, df_validation, df_test


# Single seed reused by the split and by the estimators further down.
random_state = 42

# 60 % train / 20 % validation / 20 % test, stratified on the genre label.
df_train, df_validation, df_test = train_validation_test_split(
    dataset,
    "track_genre",
    validation_size=0.2,
    test_size=0.2,
    random_state=random_state,
)

# Sanity check: the three partitions together account for every row.
assert len(df_train) + len(df_validation) + len(df_test) == len(dataset)

## Imputing missing numeric values with the median

In [58]:
# Columns with a numeric dtype in the training partition.
numeric_columns = df_train.select_dtypes(include=["number"]).columns

# The medians are learned from the training partition only and then reused
# unchanged for the validation and test partitions.
numeric_imputer = impute.SimpleImputer(strategy="median")
df_train[numeric_columns] = numeric_imputer.fit_transform(df_train[numeric_columns])

df_validation[numeric_columns] = numeric_imputer.transform(
    df_validation[numeric_columns]
)
df_test[numeric_columns] = numeric_imputer.transform(df_test[numeric_columns])

In [59]:
# Remove every row that still contains at least one missing value in each
# partition (rebinding the names instead of mutating in place).
df_train = df_train.dropna(how="any")
df_test = df_test.dropna(how="any")
df_validation = df_validation.dropna(how="any")

## Normalizing data

In [60]:
# Min-max scaling: the per-column ranges come from the training partition and
# are then applied as-is to the validation and test partitions.
normalizer = preprocessing.MinMaxScaler()
df_train[numeric_columns] = normalizer.fit_transform(df_train[numeric_columns])

df_validation[numeric_columns] = normalizer.transform(
    df_validation[numeric_columns]
)
df_test[numeric_columns] = normalizer.transform(df_test[numeric_columns])

## Encoding categorical features

In [61]:
# Object-typed feature columns; the last column of ``dataset`` is excluded
# from the selection.
categorical_columns = dataset.iloc[:, :-1].select_dtypes(include=["object"]).columns

# LabelEncoder works on one-dimensional input only. Feeding it a single-column
# DataFrame triggers a DataConversionWarning on every fit/transform call, and
# more than one column cannot be encoded at all, so require exactly one column
# up front and pass it as a 1-D Series.
if len(categorical_columns) != 1:
    raise ValueError(
        "LabelEncoder handles a single column, got: "
        f"{list(categorical_columns)}"
    )
categorical_column = categorical_columns[0]

target_encoder = preprocessing.LabelEncoder()

# The encoder is fitted on the full dataset, so every value occurring in the
# validation and test partitions has a code.
# NOTE(review): LabelEncoder is documented for targets; an encoder fitted on
# the training partition only that tolerates unseen categories may be a
# better fit here — confirm before changing, as it alters the encoded values.
target_encoder.fit(dataset[categorical_column])

# Replace the string values with their integer codes in each partition.
df_train[categorical_column] = target_encoder.transform(
    df_train[categorical_column]
)
df_validation[categorical_column] = target_encoder.transform(
    df_validation[categorical_column]
)
df_test[categorical_column] = target_encoder.transform(
    df_test[categorical_column]
)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


## Training

In [91]:
# Candidate values sampled by the randomized search.
parameters = {
    "criterion": ["gini", "entropy", "log_loss"],
    "splitter": ["best", "random"],
    "max_depth": np.arange(5, 10000, 50),
    "min_samples_leaf": np.arange(1, 10000, 50),
    "max_features": ["sqrt", "log2"],
    "max_leaf_nodes": np.arange(2, 10000, 50),
}

# Base estimator; the search clones it for every sampled configuration.
decision_tree = tree.DecisionTreeClassifier(random_state=random_state)

# Despite the variable name this is a *randomized* search: 500 sampled
# configurations, each evaluated with 5-fold cross-validation on all cores.
grid_search = model_selection.RandomizedSearchCV(
    estimator=decision_tree,
    param_distributions=parameters,
    n_iter=500,
    cv=5,
    n_jobs=-1,
    random_state=random_state,
)

# Every column but the last is a feature; the last column is the target.
train_features = df_train.iloc[:, :-1]
train_target = df_train.iloc[:, -1]

grid_search.fit(train_features, train_target)

In [93]:
# Best sampled configuration, then the refitted model's score on the
# validation partition followed by the training partition.
print(grid_search.best_params_)
for partition in (df_validation, df_train):
    print(grid_search.score(partition.iloc[:, :-1], partition.iloc[:, -1]))


{'splitter': 'best', 'min_samples_leaf': 51, 'max_leaf_nodes': 5752, 'max_features': 'log2', 'max_depth': 6305, 'criterion': 'entropy'}
0.17767543859649124
0.22497404932820655
