# Model Building

Putting it all together.

## Setting up

In [1]:
import os, sys
import random
import warnings

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

#For the plots
%matplotlib inline
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
warnings.filterwarnings('ignore')
plt.style.use('ggplot')

Set the random seed

In [2]:
# Seed value used for reproducibility
random_state = 42
# Dedicated NumPy Generator built from that seed; the global NumPy RNG is not seeded here
rng = np.random.default_rng(seed=random_state)

In [3]:
# Load the processed train/test splits and discard the 'id' column from each
train = pd.read_csv('../data/processed/train.csv').drop(columns=['id'])
test = pd.read_csv('../data/processed/test.csv').drop(columns=['id'])

## Importing Data

In [4]:
# Name of the target column
col_y = 'song_popularity'

# X holds every column except the target; y is the target as its own Series
X = train.drop(columns=[col_y])
y = train[col_y].copy()

In [5]:
# Columns with fewer than 15 distinct values are treated as categorical
mask = X.nunique() < 15
categorical_cols = X.columns[mask]
# Every remaining column, kept in its original order
numerical_cols = X.columns[~mask].tolist()

In [6]:
# Cast the audio_mode, key & time_signature columns to pandas' category dtype
as_category = train[categorical_cols].astype('category')
X[categorical_cols] = as_category
# Display the levels of each categorical column
X[categorical_cols].apply(lambda series: series.cat.categories)

key               Float64Index([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6....
audio_mode                        Int64Index([0, 1], dtype='int64')
time_signature              Int64Index([2, 3, 4, 5], dtype='int64')
dtype: object

## Importing modeling & pre/post-processing libraries

In [7]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.experimental import enable_iterative_imputer, enable_halving_search_cv
from sklearn.impute import SimpleImputer, KNNImputer, MissingIndicator, IterativeImputer, MissingIndicator
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, FunctionTransformer, MinMaxScaler, PowerTransformer, RobustScaler, power_transform, minmax_scale
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, RandomizedSearchCV, HalvingGridSearchCV, HalvingRandomSearchCV
from sklearn.metrics import roc_auc_score, make_scorer, accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, roc_curve, auc, classification_report

# import classifier models
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier, VotingClassifier, BaggingClassifier, StackingClassifier
from sklearn.ensemble import IsolationForest
from sklearn.linear_model import LogisticRegression, RidgeClassifier, RidgeClassifierCV, Lasso, LassoCV, ElasticNet, ElasticNetCV, LogisticRegressionCV, SGDClassifier, SGDRegressor, Ridge, LassoLarsCV, LassoLars, BayesianRidge
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier, ExtraTreeRegressor
from sklearn.neighbors import KNeighborsClassifier, LocalOutlierFactor
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis


sys.path.append("kuma_utils/")
from kuma_utils.preprocessing.imputer import LGBMImputer

## Building Preprocessors

In [12]:
from sklearn.impute import KNNImputer

# Impute missing values from the 5 nearest neighbours (distance-weighted);
# add_indicator=True makes the imputer append missing-value indicator columns.
knn_imptr = KNNImputer(add_indicator=True, weights="distance", n_neighbors=5)
train_knnimp = knn_imptr.fit_transform(X.copy())

# One indicator column name per feature that has at least one missing value
mask = X.isna().any(axis=0)
imputed_col_names = [f"_{col}_imputed_" for col in X.columns[mask]]

# Rebuild a labelled DataFrame: original features followed by the indicators
knn_columns = X.columns.tolist() + imputed_col_names
train_knnimp = pd.DataFrame(train_knnimp, columns=knn_columns)
train_knnimp.head()

Unnamed: 0,song_duration_ms,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,...,time_signature,audio_valence,_song_duration_ms_imputed_,_acousticness_imputed_,_danceability_imputed_,_energy_imputed_,_instrumentalness_imputed_,_key_imputed_,_liveness_imputed_,_loudness_imputed_
0,212990.0,0.642286,0.85652,0.707073,0.002001,10.0,0.177389,-5.619088,0.0,0.08257,...,4.0,0.734642,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,233847.746686,0.054866,0.733289,0.835545,0.000996,8.0,0.436428,-5.236965,1.0,0.127358,...,3.0,0.711531,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,193213.0,0.094609,0.188387,0.783524,-0.002694,5.0,0.170499,-4.951759,0.0,0.052282,...,3.0,0.425536,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,249893.0,0.48866,0.585234,0.552685,0.000608,0.0,0.094805,-7.893694,0.0,0.035618,...,3.0,0.453597,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,165969.0,0.493017,0.532451,0.740982,0.002033,10.0,0.094891,-2.684095,0.0,0.050746,...,4.0,0.741311,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# sys.path.append("kuma_utils/")
# from kuma_utils.preprocessing.imputer import LGBMImputer

# lgbm_imtr = LGBMImputer(n_iter=100, verbose=True)

# train_lgbmimp = lgbm_imtr.fit_transform(X[X.columns].copy())
# train_lgbmimp = pd.DataFrame(train_lgbmimp, columns=X.columns)

### Custom transformations for each column

In [None]:
# Columns that receive the inverse-sigmoid (logit) transformation
col_sig = ['acousticness', 'danceability', 'energy', 'instrumentalness',
           'liveness', 'speechiness', 'audio_valence']

# Columns that go through the power transformation only
col_pow = ['song_duration_ms', 'tempo']

def _logit(x):
    """Inverse sigmoid (log-odds) of *x*: ``log(x / (1 - x))``."""
    return np.log(x / (1 - x))


def _log_one_minus(x):
    """Return ``log(1 - x)``, computed as ``log1p(-x)``."""
    return np.log1p(-x)


# make a custom transformer to transform the data
class PreProcessorTransformer(BaseEstimator, TransformerMixin):
    """Column-wise feature transformer for DataFrame input.

    Steps applied by :meth:`transform`:

    1. the ``col_sig`` columns are min-max scaled into ``(1e-6, 1 - 1e-6)``
       and passed through the inverse sigmoid (logit);
    2. the ``col_sig`` and ``col_pow`` columns are passed together through a
       fitted ``PowerTransformer`` (Yeo-Johnson, the default method);
    3. the ``loudness`` column is replaced by ``log1p(-loudness)``.

    Parameters
    ----------
    cols : sequence of length 3
        ``(col_sig, col_pow, loudness)``: two lists of column names and the
        name of the loudness column. Stored unchanged as ``self.cols`` so that
        ``get_params()`` / ``sklearn.base.clone`` can rebuild the estimator.
    show_impute : bool, default=False
        Stored only; not read anywhere in this class.
    inplace : bool, default=False
        When true, ``transform`` modifies the DataFrame it is given instead of
        working on a copy.
    """

    def __init__(self, cols, show_impute=False, inplace=False):
        # Keep the constructor argument as passed: overwriting it with a
        # derived value makes clone() call __init__ with the wrong shape.
        self.cols = cols
        self.show_impute = show_impute
        self.inplace = inplace
        self.col_sig, self.col_pow, self.loudness = cols
        self.inv_sigmoid = FunctionTransformer(_logit)
        self.loudness_transformer = FunctionTransformer(_log_one_minus)

    def _scaled_logit(self, X_sig):
        """Scale the ``col_sig`` columns with the fitted scaler, then apply the logit."""
        scaled = self.scaler.transform(X_sig)
        return self.inv_sigmoid.transform(scaled)

    def fit(self, X, y=None):
        """Fit the min-max scaler and the power transformer on *X*.

        Parameters
        ----------
        X : DataFrame containing the ``col_sig`` and ``col_pow`` columns.
        y : ignored.

        Returns
        -------
        self
        """
        # clip=True keeps values outside the fitted min/max inside the open
        # interval at transform time, so the logit below stays finite.
        self.scaler = MinMaxScaler(
            feature_range=(1e-6, 1 - 1e-6), clip=True
        ).fit(X[self.col_sig])

        # The power transformer must be fitted on the logit-transformed values,
        # i.e. on exactly what transform() will feed it.
        X_tmp = X[self.col_sig + self.col_pow].copy()
        X_tmp[self.col_sig] = self._scaled_logit(X_tmp[self.col_sig])

        self.transformer = PowerTransformer().fit(X_tmp)

        return self

    def transform(self, X):
        """Return *X* with the configured columns transformed.

        Works on a copy unless ``inplace`` is true, in which case the caller's
        DataFrame is modified and returned.
        """
        if not self.inplace:
            X = X.copy()

        power_cols = self.col_sig + self.col_pow
        X[self.col_sig] = self._scaled_logit(X[self.col_sig])
        # Yeo-Johnson power transformation (PowerTransformer's default method)
        X[power_cols] = self.transformer.transform(X[power_cols])
        # NOTE(review): log1p(-x) is NaN for loudness > 1 and -inf at exactly 1;
        # confirm the loudness column never reaches that range.
        X[self.loudness] = self.loudness_transformer.transform(X[self.loudness])

        return X

class OutlierRemover(BaseEstimator, TransformerMixin):
    """Filter out the rows that a ``LocalOutlierFactor`` labels as outliers.

    All keyword arguments are forwarded unchanged to ``LocalOutlierFactor``.
    """

    def __init__(self, **kwrgs):
        self.outlier_remover = LocalOutlierFactor(**kwrgs)

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the detector is fitted inside ``transform``."""
        return self

    def transform(self, X, y=None):
        """Fit the detector on *X* and return the rows not labelled ``-1``.

        Parameters
        ----------
        X : object supporting ``.iloc`` row selection and ``.shape``.
        y : optional target aligned with the rows of *X*; when omitted, an
            array of zeros of the same length is used in its place.

        Returns
        -------
        tuple
            ``(X_kept, y_kept)`` filtered with the same boolean row mask.
        """
        predictions = self.outlier_remover.fit_predict(X)
        keep = predictions != -1
        if y is None:
            y = np.zeros(X.shape[0])
        return X.iloc[keep, :], y[keep]


# Instantiate the two custom pre-processing steps
transformer = PreProcessorTransformer(cols=[col_sig, col_pow, 'loudness'])
outlier_remover = OutlierRemover(contamination=0.02, n_neighbors=20)

#### Discretizing `instrumentalness`

In [None]:
from sklearn.cluster import KMeans

# Standardise the imputed `instrumentalness` column before clustering
scaler = StandardScaler()
X_subset = scaler.fit_transform(train_knnimp[['instrumentalness']])

# Group the standardised values into 5 clusters; `labels` holds the cluster id of each row
kmeans = KMeans(random_state=0, n_clusters=5)
kmeans.fit(X_subset)
labels = kmeans.labels_