# Model Building

Putting it all together.

## Setting up

In [74]:
import os, sys
import random
import warnings

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

#For the plots
%matplotlib inline
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
warnings.filterwarnings('ignore')
plt.style.use('ggplot')

Set the random seed.

In [75]:
random_state = 42
# Seed the global generators too: code that does not receive an explicit
# random_state/Generator draws from these, so runs would otherwise differ.
# (np.random.seed must be *called*; assigning to it would replace the function.)
random.seed(random_state)
np.random.seed(random_state)
# Dedicated Generator for any sampling done directly in this notebook.
rng = np.random.default_rng(random_state)

In [76]:
# Read both processed splits and discard the `id` column right away.
train = pd.read_csv('../data/processed/train.csv').drop(columns=['id'])

test = pd.read_csv('../data/processed/test.csv').drop(columns=['id'])

## Importing Data

In [77]:
# Stack train and test into one frame, tagging every row with its origin.
# `assign` works on copies, so `train` and `test` keep their original columns.
test_train = pd.concat(
    [train.assign(isTrain=True), test.assign(isTrain=False)]
).reset_index(drop=True)
test_train.tail()

Unnamed: 0,song_duration_ms,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence,song_popularity,isTrain
49995,196204.0,0.083936,0.823252,0.442246,0.00299,1.0,0.123921,-8.682195,1,0.041171,98.264618,4,0.784104,,False
49996,218343.0,0.105279,0.683743,0.848627,0.001911,6.0,0.106522,-4.94625,0,0.033605,103.949646,4,0.850754,,False
49997,188434.0,0.753472,0.783947,0.498261,0.134027,6.0,0.088147,-11.812606,1,0.029526,119.090558,4,0.482961,,False
49998,173044.0,0.863579,0.373283,0.476142,4.9e-05,10.0,0.122122,-12.371232,0,0.190628,103.048349,3,0.518742,,False
49999,157896.0,0.019852,0.72038,,0.037727,2.0,0.139422,-4.965311,1,0.053257,109.873611,4,0.971368,,False


In [78]:
col_y = 'song_popularity'

# Target as its own Series; features are every remaining column (a new frame,
# so `train` itself is left untouched).
y = train[col_y].copy()
X = train.drop(columns=[col_y])

In [79]:
# Columns with fewer than 15 distinct values are treated as categorical;
# everything else is numerical.
distinct_counts = X.nunique()
mask = distinct_counts < 15
categorical_cols = X.columns[mask]
numerical_cols = X.columns[~mask].tolist()

In [80]:
# Cast the low-cardinality columns (audio_mode, key, time_signature) to the
# pandas `category` dtype, taking the values from the untouched `train` frame.
as_category = train[categorical_cols].astype('category')
X[categorical_cols] = as_category
# Show the levels of each categorical column.
X[categorical_cols].apply(lambda column: column.cat.categories)

key               Float64Index([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6....
audio_mode                        Int64Index([0, 1], dtype='int64')
time_signature              Int64Index([2, 3, 4, 5], dtype='int64')
dtype: object

## Importing modeling & pre/post-processing libraries

In [81]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.experimental import enable_iterative_imputer, enable_halving_search_cv
from sklearn.impute import SimpleImputer, KNNImputer, MissingIndicator, IterativeImputer, MissingIndicator
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, FunctionTransformer, MinMaxScaler, PowerTransformer, RobustScaler, power_transform, minmax_scale
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, RandomizedSearchCV, HalvingGridSearchCV, HalvingRandomSearchCV
from sklearn.metrics import roc_auc_score, make_scorer, accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, roc_curve, auc, classification_report

# import classifier models
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier, VotingClassifier, BaggingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, RidgeClassifierCV, Lasso, LassoCV, ElasticNet, ElasticNetCV, LogisticRegressionCV, SGDClassifier, SGDRegressor, Ridge, LassoLarsCV, LassoLars, BayesianRidge
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier, ExtraTreeRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis


sys.path.append("kuma_utils/")
from kuma_utils.preprocessing.imputer import LGBMImputer

## Building preprocessors

In [69]:
# from sklearn.impute import SimpleImputer

# simple_mean_imptr = SimpleImputer(strategy="mean", add_indicator=False)
# tr_simple_imp = simple_mean_imptr.fit_transform(X.copy())
# tr_simple_imp = pd.DataFrame(tr_simple_imp, columns=X.columns)

In [None]:
from sklearn.impute import KNNImputer

# NOTE(review): `X_train` and `FEATURES` are not defined in any cell shown
# above this one -- confirm they exist on a fresh kernel before running.
knn_imptr = KNNImputer(n_neighbors=5, weights="distance", add_indicator=True)
train_knnimp = knn_imptr.fit_transform(X_train[FEATURES])

# With add_indicator=True the imputer appends one indicator column per fitted
# feature that contained missing values. The mask must therefore be computed on
# exactly the columns that were fitted (X_train[FEATURES]), in the same order;
# using all of X_train could yield a different number or order of names.
mask = X_train[FEATURES].isna().any(axis=0)
imputed_col_names = ["_"+col+"_imputed_" for col in mask[mask].index]

# list(...) so FEATURES may be a list, tuple or pandas Index.
train_knnimp = pd.DataFrame(train_knnimp, columns=list(FEATURES)+imputed_col_names)
train_knnimp.head()

In [None]:
# sys.path.append("kuma_utils/")
# from kuma_utils.preprocessing.imputer import LGBMImputer

# lgbm_imtr = LGBMImputer(n_iter=100, verbose=True)

# train_lgbmimp = lgbm_imtr.fit_transform(X_train[FEATURES])
# train_lgbmimp = pd.DataFrame(train_lgbmimp, columns=FEATURES)

### Custom transformations for each column

In [None]:
def scale_N_transform(X, col_operations):
    """Apply an ordered list of column-wise scalers/transforms to a DataFrame.

    Parameters
    ----------
    X : pandas.DataFrame
        Input data. It is left untouched; all steps are applied to a copy.
    col_operations : iterable of (columns, operation)
        ``columns`` is a list of column labels of ``X``. ``operation`` is
        either a callable, applied to each selected column through
        ``DataFrame.apply(..., axis=0)``, or one of the strings:

        * ``'scale'``  - ``StandardScaler``
        * ``'minmax'`` - ``MinMaxScaler`` into ``(1e-6, 1 - 1e-6)``
        * ``'robust'`` - ``RobustScaler``
        * ``'power'``  - ``PowerTransformer(method='yeo-johnson')``
        * ``'log'``    - natural logarithm
        * ``'sqrt'``   - square root
        * ``'none'``   - leave the columns unchanged

        Steps run in the given order, each one seeing the output of the
        previous steps.

    Returns
    -------
    pandas.DataFrame
        A transformed copy of ``X``.

    Raises
    ------
    ValueError
        If an operation is an unknown string, or neither a string nor callable.

    Notes
    -----
    Every scaler is freshly fitted on the data passed in (``fit_transform``),
    so the statistics always come from the ``X`` of the current call.
    """
    # Work on a copy so the caller's frame is never modified as a side effect.
    X = X.copy()

    for cols, operation in col_operations:
        # Named operations are given as strings.
        if isinstance(operation, str):
            if operation == 'scale':
                X[cols] = StandardScaler().fit_transform(X[cols])
            elif operation == 'minmax':
                # Stay strictly inside (0, 1) so a following logit stays finite.
                X[cols] = MinMaxScaler(feature_range=(0+1e-6, 1-1e-6)).fit_transform(X[cols])
            elif operation == 'robust':
                X[cols] = RobustScaler().fit_transform(X[cols])
            elif operation == 'power':
                X[cols] = PowerTransformer(method='yeo-johnson').fit_transform(X[cols])
            elif operation == 'log':
                X[cols] = X[cols].apply(np.log, axis=0)
            elif operation == 'sqrt':
                # PowerTransformer has no 'quantile' method (only 'yeo-johnson'
                # and 'box-cox'); apply the square root directly instead.
                X[cols] = X[cols].apply(np.sqrt, axis=0)
            elif operation == 'none':
                pass
            else:
                raise ValueError('Operation not supported')
        # Arbitrary user-supplied function (lambda or def), applied per column.
        elif callable(operation):
            X[cols] = X[cols].apply(operation, axis=0)
        else:
            raise ValueError('Operation not supported')

    return X

# Columns that receive the min-max -> logit -> power treatment in `col_ops`.
col_sig = [
    'acousticness',
    'danceability',
    'energy',
    'instrumentalness',
    'liveness',
    'speechiness',
    'audio_valence',
]

# Columns that only go through the power transform.
col_pow = [
    'song_duration_ms',
    'tempo',
]


# Plain functions instead of lambdas bound to names: they carry a real
# __name__ and can be pickled by reference when handed to a transformer.
def inv_sigmoid(x):
    """Logit (inverse sigmoid): log(x / (1 - x))."""
    return np.log(x / (1-x))


def neg_log(x):
    """Return log(1 - x), computed as log1p(-x)."""
    return np.log1p(-x)


# Ordered (columns, operation) steps; each step sees the previous step's output.
col_ops = [
    (col_sig, 'minmax'),            # squeeze into (0, 1) so the logit is finite
    (col_sig, inv_sigmoid),
    ((col_pow+col_sig), 'power'),
    (['loudness'], neg_log),
]

In [None]:
# Wrap the column-wise recipe as a scikit-learn transformer.
custom_scaler_N_transformer = FunctionTransformer(
    func=scale_N_transform,
    validate=False,
    kw_args=dict(col_operations=col_ops),
)

# Run it on a copy of the features to inspect the transformed values.
custom_scaler_N_transformer.fit_transform(X.copy())

['song_duration_ms', 'tempo', 'acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'speechiness', 'audio_valence'] power


Unnamed: 0,song_duration_ms,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence
0,0.479596,1.127241,1.820578,-0.066134,-0.155927,10.0,,1.889958,0,0.356267,1.467947,4,0.481835
1,,-0.622294,0.752487,0.550228,-0.456444,8.0,1.599676,1.830494,1,0.872890,-0.431703,3,0.394697
2,0.057212,,-1.911715,0.274584,-3.025891,5.0,0.179421,1.783687,0,-0.273780,1.926658,3,-0.557785
3,1.220914,0.848052,-0.033537,-0.642118,-0.593586,0.0,-0.976646,2.185343,0,-0.937525,0.609499,3,-0.464100
4,-0.559714,0.856266,,0.077900,-0.147512,10.0,-0.974578,1.304025,0,-0.319642,0.369913,4,0.507706
...,...,...,...,...,...,...,...,...,...,...,...,...,...
39995,0.984045,,0.855150,-0.785582,0.065129,0.0,0.212530,2.287844,0,0.284705,1.701591,3,-1.612459
39996,0.011317,-0.509010,0.399785,0.904447,-0.381440,5.0,-0.425178,2.174637,0,1.306044,-0.074762,3,0.711902
39997,-0.680151,1.618109,-0.792375,-1.574610,,10.0,-0.639012,2.795975,0,-1.212402,-0.587558,3,-1.613758
39998,0.072611,,-0.978460,,-0.697574,1.0,1.321679,1.733773,1,-0.222195,0.964031,3,0.635899


#### Discretizing `instrumentalness`

In [None]:
from sklearn.cluster import KMeans

# Standardise the imputed `instrumentalness` column on its own before clustering.
scaler = StandardScaler()
X_subset = scaler.fit_transform(train_knnimp[['instrumentalness']])

# Five clusters; `labels` holds the cluster index assigned to each row.
kmeans = KMeans(n_clusters=5, random_state=0)
labels = kmeans.fit(X_subset).labels_