# Pre-process Source Data & Create Derived Features

This notebook is a lot messier than boost-ensemble-notebook. In this notebook, we'll clip and normalize all of our features (any value more than 2 standard deviations away from the feature's mean is clipped). Next, we'll identify the top N features most correlated with the target value. These features will be used to fit a KMeans clustering model. Each sample then receives new derived features: its Euclidean distance to each of the generated cluster centers.
https://www.kaggle.com/motchan/tps-oct-2021-kmeans

In [1]:
import pandas as pd
import numpy as np
import gc
import os
import joblib

from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn import model_selection

from hdf_utils import HDFBuilder

#### Load data into memory and convert to np arrays

In [2]:
# Read the raw train/test tables from disk.
train_data = pd.read_csv('data/train.csv', index_col=None)
test_data = pd.read_csv('data/test.csv', index_col=None)

# The id column is not a feature: discard it for train, keep it for test so
# it can be re-attached when the processed test set is written out.
test_id = test_data.pop('id')
del train_data['id']

# Column names are captured while 'target' is still part of the frame.
column_names = list(train_data.columns)
y_train = train_data.pop('target').to_numpy().reshape(-1, 1)

# From here on the features are handled as plain numpy arrays.
X_train, X_test = train_data.to_numpy(), test_data.to_numpy()

# Drop the DataFrames and collect so only the arrays stay in memory.
del train_data, test_data
out = gc.collect()

#### Clip outliers and normalize features

In [3]:
def preprocess(X_in,sigma=2,scaler_filepath='preproc_scaler1.save'):
    """Clip outliers column-wise, then min-max scale every feature.

    If no file exists at ``scaler_filepath``, each column of ``X_in`` is
    clipped to ``mean +/- sigma * std``, a ``MinMaxScaler`` is fit on the
    clipped data, and the fitted scaler is saved to ``scaler_filepath`` with
    joblib. If the file already exists, the saved scaler is loaded and reused
    instead of being refit.

    Parameters
    ----------
    X_in : numpy.ndarray
        2-D array of samples (rows) by features (columns).
    sigma : int or float, default 2
        Number of standard deviations from the column mean beyond which
        values are clipped. Only used when a new scaler is fit.
    scaler_filepath : str, default 'preproc_scaler1.save'
        Where the fitted scaler is cached. The same path is used for both
        loading and saving.

    Returns
    -------
    numpy.ndarray
        The clipped and scaled data.
    """
    if os.path.exists(scaler_filepath):
        # NOTE: joblib.load unpickles the file; only point this at scaler
        # files produced by this notebook.
        scaler = joblib.load(scaler_filepath)
        # The scaler was fit on clipped data, so its per-feature
        # data_min_/data_max_ are the effective clip bounds. Re-apply them so
        # the cached path yields clipped output just like the first run.
        return scaler.transform(X_in.clip(scaler.data_min_, scaler.data_max_))

    data_mean = X_in.mean(axis=0)
    data_std = X_in.std(axis=0)
    lower_bound = data_mean - sigma * data_std
    upper_bound = data_mean + sigma * data_std

    X_in = X_in.clip(lower_bound, upper_bound)

    from sklearn import preprocessing
    scaler = preprocessing.MinMaxScaler()
    scaler.fit(X_in)

    # Save to the path the caller asked for (and that the existence check
    # above looks at), not a hard-coded filename.
    joblib.dump(scaler, scaler_filepath)

    return scaler.transform(X_in)
    

In [4]:
# Remember how many rows belong to the training set so the stacked array
# can be cut back apart after scaling.
test_train_split = len(X_train)

# Train and test rows are stacked and pre-processed in a single call, so
# both go through the same clipping/scaling.
X_combined = preprocess(np.vstack((X_train, X_test)), sigma=2)
X_train, X_test = np.split(X_combined, [test_train_split])

del X_combined
out = gc.collect()

### Build new features

#### Identify top N features most correlated to target value

In [5]:
def get_top_feature_indices(X,y,thresh=0.8):
    """Return indices of the features most correlated with the target.

    Features are ranked by the absolute Pearson correlation between each
    column of ``X`` and ``y``, strongest first.

    Parameters
    ----------
    X : numpy.ndarray
        2-D array of samples (rows) by features (columns).
    y : numpy.ndarray
        Target values, one per sample (a single column or a 1-D array).
    thresh : int or float, default 0.8
        * ``int``: return the ``thresh`` top-ranked features (at most all of
          them).
        * ``float``: return the shortest top-ranked prefix whose summed
          absolute correlation reaches the fraction ``thresh`` (capped at
          1.0) of the total over all features.

    Returns
    -------
    numpy.ndarray
        Column indices into ``X``, ordered from most to least correlated.
    """
    # The last column of the correlation matrix is feature-vs-target; the
    # final row (target vs. itself) is dropped. Use the arguments, not
    # notebook globals.
    corr_coef = np.abs(np.corrcoef(X, y, rowvar=False)[:-1, -1])
    # Constant columns give NaN correlations; count them as uncorrelated so
    # they neither rank first nor turn the total into NaN.
    corr_coef = np.nan_to_num(corr_coef)

    sorted_indices = np.argsort(corr_coef)[::-1]
    feature_cnt = sorted_indices.shape[0]

    if isinstance(thresh, int):
        return sorted_indices[:min(feature_cnt, thresh)]

    thresh = min(thresh, 1.0)
    weight_total = corr_coef.sum()
    if weight_total == 0:
        # No feature carries any correlation; nothing to select.
        return sorted_indices[:0]

    score = 0.0
    selected_cnt = 0
    # Bounded by feature_cnt: floating-point rounding can leave the running
    # ratio a hair below 1.0 even after every feature has been added.
    while selected_cnt < feature_cnt and score / weight_total < thresh:
        score += corr_coef[sorted_indices[selected_cnt]]
        selected_cnt += 1

    return sorted_indices[:selected_cnt]

In [6]:
# A float threshold selects the strongest features until their summed
# absolute correlation reaches that fraction (here 0.5) of the total.
top_features = get_top_feature_indices(X_train, y_train, thresh=0.5)

#### Determine Ideal Number of Clusters from Top Features

In [7]:
def analyze_clusters(X_in,selected_features,max_cluster_cnt=15):
    """Print and return silhouette scores for a range of KMeans cluster counts.

    For every cluster count from 2 up to, but not including,
    ``max_cluster_cnt`` a KMeans model is fit on the selected feature columns
    and scored with the silhouette coefficient on those same columns.

    Parameters
    ----------
    X_in : numpy.ndarray
        2-D array of samples (rows) by features (columns).
    selected_features : array-like of int
        Column indices of ``X_in`` to cluster on.
    max_cluster_cnt : int, default 15
        Exclusive upper bound on the number of clusters tried.

    Returns
    -------
    list of tuple
        ``(cluster_cnt, silhouette_score)`` pairs in increasing cluster count.
    """
    # Select the columns once; fancy indexing copies the data on every use.
    X_selected = X_in[:, selected_features]

    silhouette_scores = []
    for cluster_cnt in range(2, max_cluster_cnt):
        kmeans_model = KMeans(
            n_clusters=cluster_cnt,
            init='k-means++',
            max_iter=500,
            random_state=1
        ).fit(X_selected)
        labels = kmeans_model.labels_
        # Score in the same feature space the clusters were fit in; distances
        # over unrelated columns would not reflect the clustering's quality.
        # random_state pins the 50000-row subsample so runs are comparable.
        silhouette = metrics.silhouette_score(
            X_selected,
            labels,
            metric='euclidean',
            sample_size=50000,
            random_state=1,
            n_jobs=-1
        )
        silhouette_scores.append((cluster_cnt, silhouette))
        print(f"cluster_cnt: {cluster_cnt}, silhouette_score: {silhouette}")
    return silhouette_scores

In [8]:
# Switch: set to True to (re-)run the silhouette analysis over train + test.
compute_clusters = False

# Number of clusters used for the derived distance features
# (presumably picked from an earlier analysis run - TODO confirm).
desired_cluster_cnt = 6

if compute_clusters:
    X_combined = np.vstack((X_train, X_test))
    analyze_clusters(X_combined, top_features, max_cluster_cnt=15)
    # Free the temporary stacked copy right away.
    del X_combined
    out = gc.collect()

### Split train data into train & validation dataset

In [9]:
# Rebuild a DataFrame with the target appended as the last column.
# NOTE(review): this relies on 'target' being the last entry of column_names,
# i.e. the last column of the original train CSV - confirm.
train_data = pd.DataFrame(
    np.concatenate([X_train,y_train],axis=1),
    columns=column_names
)

# Create the train/validation split (15% validation, stratified on target).
# random_state is fixed so that re-running the notebook reproduces the same
# split, in line with the seeded KMeans models used elsewhere.
train_data,val_data = model_selection.train_test_split(
    train_data,
    test_size=0.15,
    shuffle=True,
    stratify=train_data['target'],
    random_state=1,
)

# Shuffling leaves the original row labels behind; reset them so later
# column-wise concatenation lines up row by row.
train_data.reset_index(inplace=True,drop=True)
val_data.reset_index(inplace=True,drop=True)
test_data = pd.DataFrame(X_test,columns=column_names[:-1])

train_target = train_data.pop('target')
val_target = val_data.pop('target')

#### Use KMeans Clustering to generate Derived Features

In [10]:
kmeans_model = KMeans(
    n_clusters=desired_cluster_cnt, 
    init="k-means++", 
    max_iter=500, 
    random_state=1,
)

# One derived column per cluster: the sample's distance to that centroid.
cluster_names = [f'cluster{i}' for i in range(desired_cluster_cnt)]

# Fit the centroids on the training rows only, then express train,
# validation and test rows as distances to those SAME centroids.
train_cluster_cols = kmeans_model.fit_transform(train_data.iloc[:,top_features])
train_cluster_cols = pd.DataFrame(train_cluster_cols,columns=cluster_names)

val_cluster_cols = kmeans_model.transform(val_data.iloc[:,top_features])
val_cluster_cols = pd.DataFrame(val_cluster_cols,columns=cluster_names)

# The model is deliberately NOT refit on train + validation before
# transforming the test set: a refit moves (and may reorder) the centroids,
# so the test 'clusterN' columns would no longer mean the same thing as the
# train/validation columns computed above.
test_cluster_cols = kmeans_model.transform(test_data.iloc[:,top_features])
test_cluster_cols = pd.DataFrame(test_cluster_cols,columns=cluster_names)

train_data = pd.concat([train_data,train_cluster_cols],axis=1)
val_data = pd.concat([val_data,val_cluster_cols],axis=1)
del train_cluster_cols, val_cluster_cols

test_data = pd.concat([test_data,test_cluster_cols],axis=1)
del test_cluster_cols

#### Create HDF5 record for processed train/val data

In [11]:
# Hand the processed train/validation features and targets to the project's
# HDFBuilder helper (from hdf_utils; how and where it stores them is not
# visible in this notebook).
HDFBuilder().from_df(**{
    'X_train': train_data,
    'X_val': val_data,
    'y_train': train_target,
    'y_val': val_target,
})

#### Create CSV for processed test dataframe

In [12]:
# Put the saved ids back in front of the processed test features and write
# them out. The DataFrame index is written as well (to_csv default), so the
# file starts with an unnamed index column.
pd.concat((test_id, test_data), axis='columns').to_csv(
    path_or_buf='data/processed_test.csv',
)