# Clustering

## The goal is that the objects in a group will be similar to one another and different from objects in the other groups.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Loading and defining the methodology

In [None]:
from sklearn.metrics import roc_auc_score,f1_score
import os
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans

In [None]:
Train_df = pd.read_csv('/content/drive/MyDrive/dmt/train.csv')
Train_df.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


In [None]:
Test_df = pd.read_csv('/content/drive/MyDrive/dmt/test.csv')
Test_df.head()

Unnamed: 0,ID_code,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,test_0,11.0656,7.7798,12.9536,9.4292,11.4327,-2.3805,5.8493,18.2675,2.1337,...,-2.1556,11.8495,-1.43,2.4508,13.7112,2.4669,4.3654,10.72,15.4722,-8.7197
1,test_1,8.5304,1.2543,11.3047,5.1858,9.1974,-4.0117,6.0196,18.6316,-4.4131,...,10.6165,8.8349,0.9403,10.1282,15.5765,0.4773,-1.4852,9.8714,19.1293,-20.976
2,test_2,5.4827,-10.3581,10.1407,7.0479,10.2628,9.8052,4.895,20.2537,1.5233,...,-0.7484,10.9935,1.9803,2.18,12.9813,2.1281,-7.1086,7.0618,19.8956,-23.1794
3,test_3,8.5374,-1.3222,12.022,6.5749,8.8458,3.1744,4.9397,20.566,3.3755,...,9.5702,9.0766,1.658,3.5813,15.1874,3.1656,3.9567,9.2295,13.0168,-4.2108
4,test_4,11.7058,-0.1327,14.1295,7.7506,9.1035,-8.5848,6.8595,10.6048,2.989,...,4.2259,9.1723,1.2835,3.3778,19.5542,-0.286,-5.1612,7.2882,13.926,-9.1846


## Defining the clusters and test size from the dataset

In [None]:
n_clusters = 5
test_size = 0.3

Splitting train_df into train and test sets (X_train, X_test, Y_train, Y_test)

In [None]:
y_df = Train_df['target']
Train_df.drop(columns=['target'], inplace=True)



In [None]:
X_train, X_test, y_train, y_test = train_test_split(Train_df, y_df, test_size=test_size, random_state=40)

In [None]:
len(X_train), len(y_train)


(140000, 140000)

In [None]:
columns = [i for i in X_train.columns if i not in ['ID_code']]

## **We augment the dataset to oversample the positive examples to deal with class imbalance**

In [None]:
def augment(x,y,t=2):
    xs,xn = [],[]
    for i in range(t):
        mask = y>0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        for c in range(x1.shape[1]):
            np.random.shuffle(ids)
            x1[:,c] = x1[ids][:,c]
        xs.append(x1)

    for i in range(t//2):
        mask = y==0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        for c in range(x1.shape[1]):
            np.random.shuffle(ids)
            x1[:,c] = x1[ids][:,c]
        xn.append(x1)

    xs = np.vstack(xs)
    xn = np.vstack(xn)
    ys = np.ones(xs.shape[0])
    yn = np.zeros(xn.shape[0])
    x = np.vstack([x,xs,xn])
    y = np.concatenate([y,ys,yn])
    return x,y

## Kmeans clustering of the data into n_clusters. Data is augmented with a column called 'clusters'.

In [None]:
kmeans = KMeans(n_clusters=n_clusters, max_iter=1000).fit(X_train[columns])
X_train["clusters"] = kmeans.labels_

cluster_idxs is a list containing indices of data belonging to a cluster.

In [None]:
cluster_idxs = [X_train["clusters"] == i for i in range(5)]

In [None]:
import lightgbm as lgb
from sklearn.model_selection import KFold, StratifiedKFold

In [None]:
param = {
    'bagging_freq': 5,
    'bagging_fraction': 0.1,
    'boost_from_average':'false',
    'boost': 'gbdt',
    'feature_fraction': 0.05,
    'learning_rate': 0.01,
    'max_depth': -1,  
    'metric':'auc',
    'min_data_in_leaf': 80,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 4,
    'num_threads': 8,
    'tree_learner': 'serial',
    'objective': 'binary', 
    'verbosity': 1,
    'max_bin': 50,
}

In [None]:
predictors = []

for i in range(n_clusters):
  print("TRAINING MODEL FOR CLUSTER: {}".format(i))
  x_i = X_train[cluster_idxs[i]]
  y_i = y_train[cluster_idxs[i]]
  num_folds = 3
  features = [c for c in x_i.columns if c not in ['ID_code', 'clusters', 'target']]
  folds = KFold(n_splits=num_folds)

  x_i = x_i[features]

  for fold_, (trn_idx, val_idx) in enumerate(folds.split(x_i.values, y_i.values)):
    X_trai, y_trai = x_i.iloc[trn_idx][features], y_i.iloc[trn_idx]
    X_val, y_val = x_i.iloc[val_idx][features], y_i.iloc[val_idx]

    X_trai, y_trai = augment(X_trai.values, y_trai.values)
    X_trai = pd.DataFrame(X_trai)

    print("Fold idx:{}".format(fold_ + 1))
    trn_data = lgb.Dataset(X_trai, label=y_trai)
    val_data = lgb.Dataset(X_val, label=y_val)

    clf = lgb.train(param, trn_data, 1000000, valid_sets = [trn_data, val_data], verbose_eval=5000, early_stopping_rounds = 3000)
    predictors.append(clf)

TRAINING MODEL FOR CLUSTER: 0
Fold idx:1
Training until validation scores don't improve for 3000 rounds.
[5000]	training's auc: 0.934163	valid_1's auc: 0.882707
Early stopping, best iteration is:
[6968]	training's auc: 0.940696	valid_1's auc: 0.884708
Fold idx:2
Training until validation scores don't improve for 3000 rounds.
[5000]	training's auc: 0.930242	valid_1's auc: 0.886937
[10000]	training's auc: 0.944928	valid_1's auc: 0.887534
Early stopping, best iteration is:
[7885]	training's auc: 0.939362	valid_1's auc: 0.88838
Fold idx:3
Training until validation scores don't improve for 3000 rounds.
[5000]	training's auc: 0.932368	valid_1's auc: 0.881914
Early stopping, best iteration is:
[5798]	training's auc: 0.935346	valid_1's auc: 0.882333
TRAINING MODEL FOR CLUSTER: 1
Fold idx:1
Training until validation scores don't improve for 3000 rounds.
[5000]	training's auc: 0.933167	valid_1's auc: 0.885566
[10000]	training's auc: 0.947201	valid_1's auc: 0.886884
Early stopping, best iteration

Get the predictions on the Test sets:¶



In [None]:
def get_predictions(kmeans, X_test, n_clusters, get_score=True):
    features = [c for c in X_test.columns if c not in ['ID_code', 'target']]
    X_test['cluster'] = kmeans.predict(X_test[features])
    test_idxs = [X_test['cluster'] == i for i in range(n_clusters)]
    X_test.drop(columns=['cluster'], inplace=True)
    preds = []
    true = []

    for i in range(n_clusters):
        x_te = X_test[test_idxs[i]]
        pred = predictors[i].predict(x_te[features])
        preds.append(pred)
        if get_score:
            y_te = y_test[test_idxs[i]]
            true.append(y_te.values)
    x = []
    y = []
    for i in preds:
        x = x + list(i)
    if get_score:
        for i in true:
            y = y + list(i)
        print(roc_auc_score( np.array(y), np.array(x)))
    
    return np.array(x),x,y

## **AUC score on our sampled test set**

In [None]:
a,pred,true=get_predictions(kmeans, X_test, n_clusters = 5, get_score=True)


0.8808116267896128


In [None]:
f=[]
for i in pred:
  if i>0.6:
    f.append(1)
  else:
    f.append(0)  

In [None]:
f1_score(true,f)

0.3628604681236261