In [4]:
# Helper packages
import missingno as msno
import numpy as np
import pandas as pd
from plotnine import ggplot, aes, geom_density, geom_line, geom_point, ggtitle

# Modeling pre-processing with scikit-learn functionality
from sklearn.model_selection import train_test_split
from sklearn.compose import TransformedTargetRegressor
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA

# Modeling pre-processing with non-scikit-learn packages
from category_encoders.ordinal import OrdinalEncoder
from feature_engine.encoding import RareLabelEncoder

# Modeling
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline

In [8]:
%%capture
data = pd.read_csv('../data/trainClean.csv')

# redefine 'class' and 'key' as a categorical variables
data.drop(columns=['Unnamed: 0'], inplace=True)
data['class'] = data['class'].astype('category')
data['key'] = data['key'].astype('category')
data['class'].replace([0,1,2,3,4,5,6,7,8,9,10],['Acoustic', 'AltMusic', 'Blues', 'Bollywood', 'Country', 'HipHop', 'Indie', 'Instrumental', 'Metal', 'Pop', 'Rock'])

In [9]:
# Summarise the frame: per-column dtype and non-null count, which shows
# which columns contain missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17996 entries, 0 to 17995
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   artist_name         17996 non-null  object  
 1   track_name          17996 non-null  object  
 2   popularity          17568 non-null  float64 
 3   danceability        17996 non-null  float64 
 4   energy              17996 non-null  float64 
 5   key                 15982 non-null  category
 6   loudness            17996 non-null  float64 
 7   mode                17996 non-null  int64   
 8   speechiness         17996 non-null  float64 
 9   acousticness        17996 non-null  float64 
 10  instrumentalness    13619 non-null  float64 
 11  liveness            17996 non-null  float64 
 12  valence             17996 non-null  float64 
 13  tempo               17996 non-null  float64 
 14  duration_in_min_ms  17996 non-null  float64 
 15  time_signature      17996 non-null  

In [6]:
# Split the rows 70/30 into training and test partitions; the fixed
# random_state makes the split reproducible.
train, test = train_test_split(data, train_size=0.7, random_state=123)

# Training partition: target kept as a one-column frame, predictors are
# every other column.
y_train = train[["class"]]
x_train = train.drop(columns="class")

# Test partition, split the same way.
y_test = test[["class"]]
X_test = test.drop(columns="class")

In [None]:
# Selector for removing near-zero variance features: scikit-learn's
# VarianceThreshold drops features whose training-set variance does not
# exceed `threshold`. It is only constructed here, not yet fitted or applied.
nzv = VarianceThreshold(threshold=0.1)

In [None]:
# Scaler used for normalisation (StandardScaler: zero mean, unit variance).
scaler = StandardScaler()

# Route every numeric-dtype column through the scaler. ColumnTransformer's
# default remainder="drop" means non-numeric columns are not passed through.
std = ColumnTransformer(
    transformers=[
        ("norm", scaler, selector(dtype_include="number")),
    ]
)

In [None]:
# One-hot encoder. handle_unknown="ignore" encodes categories that were not
# seen during fit as all zeros at transform time; the default ("error")
# raises as soon as the test set contains a value absent from training.
encoder = OneHotEncoder(handle_unknown="ignore")

# Apply to all categorical features: string (object) columns and columns
# cast to the pandas 'category' dtype (such as 'key'). An object-only
# selector would silently skip the latter.
ohe = ColumnTransformer(
    [("one-hot", encoder, selector(dtype_include=["object", "category"]))]
)

In [None]:
# Create rare label encoder: labels occurring in fewer than 1% of rows
# (tol=0.01) are collapsed into "other".
# It is restricted to 'artist_name', the only column inspected below. Without
# an explicit `variables` list the encoder selects every object/category
# column, including 'key', whose missing values make fit() raise under the
# default missing_values="raise".
rare_encoder = RareLabelEncoder(
    tol=0.01,
    replace_with="other",
    variables=["artist_name"],
)

# Demonstrate how infrequent artists are now represented by "other".
rare_encoder.fit_transform(x_train)['artist_name'].unique()

In [None]:
# Number of distinct artist names in the training predictors
# (dropna=False counts a missing value as its own entry, like unique()).
x_train["artist_name"].nunique(dropna=False)

In [None]:
# Predicted probabilities for the test predictors (presumably one column per
# class -- depends on what `clf` is).
# NOTE(review): `clf` is not defined in any cell visible here; confirm it is
# created and fitted earlier so the notebook survives Restart & Run All.
y_pred2 = clf.predict_proba(X_test)

In [None]:
import catboost

In [None]:
# Score the model on the held-out set. score() is an instance method that
# takes the features and the true labels. The previous call referenced
# `CatBoostClassifier` (never imported by name), invoked score() on the class
# itself, and passed predicted probabilities where the features belong.
# NOTE(review): assumes `clf` is the fitted classifier used for `y_pred2`.
LR_score = clf.score(X_test, y_test)