**KNN Feature Generation**

In [1]:
import numpy as np
import pandas as pd 
import sklearn
import scipy.sparse 

import xgboost as xgb

from annoy import AnnoyIndex

# from pynndescent import NNDescent
from sklearn.calibration import CalibratedClassifierCV

from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score,precision_recall_curve,roc_curve, recall_score,precision_score
from sklearn.metrics import confusion_matrix
from scipy.stats import pearsonr
from scipy.stats import rankdata, skew, kurtosis

pd.options.display.max_rows = 2000
pd.options.display.max_columns  = 999

np.set_printoptions(threshold=5000) 

import lightgbm as lgb
from sklearn.model_selection import train_test_split

from sklearn import metrics
from sklearn.metrics import f1_score, roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder


pd.set_option('display.max_rows', 800)
pd.set_option('display.max_columns', 100)

import gc
import time

# Record the versions of the core numeric / ML libraries used by this run.
for lib in (np, pd, sklearn, scipy):
    print(lib.__name__, lib.__version__)

numpy 1.16.2
pandas 0.23.4
sklearn 0.20.3
scipy 1.1.0


Process Data

In [2]:
%%time
# Directory (relative to the notebook) that holds the preprocessed input CSVs.
Path = '../input/ltfs-fin-model/'

# Load the preprocessed train / test tables; the first CSV column becomes the index.
train = pd.read_csv(Path+'train_preproc.csv',index_col=0)
test = pd.read_csv(Path +'test_preproc.csv',index_col=0)
# Name of the label column, and the train labels cast to int16.
targetcol = 'loan_default'
target = train[targetcol].astype('int16')

CPU times: user 2.64 s, sys: 260 ms, total: 2.9 s
Wall time: 2.92 s


In [3]:
#scale data
def scaledata(train,test,selcols,independent=False):
    """Min-max scale the `selcols` columns of `train` and `test`.

    The scaler is fitted on `train`. `test` is transformed with that same
    fit unless `independent` is true, in which case the scaler is re-fitted
    on `test` before transforming it.

    Returns a `(train_scaled, test_scaled)` pair of new DataFrames whose
    columns are named `<col>_scale_tranx`. Both frames get a fresh default
    integer index (the input index is not carried over).
    """
    out_names = [name + '_scale_tranx' for name in selcols]
    mm = MinMaxScaler()

    train_values = mm.fit_transform(train[selcols])
    if independent:
        test_values = mm.fit_transform(test[selcols])
    else:
        test_values = mm.transform(test[selcols])

    return (pd.DataFrame(train_values, columns=out_names),
            pd.DataFrame(test_values, columns=out_names))

#scale data
def scaledata_single(data,selcols):
    """Min-max scale the `selcols` columns of a single frame.

    The scaler is fitted on `data` itself. Returns a new DataFrame with a
    default integer index and columns named `<col>_scale_tranx`.
    """
    values = MinMaxScaler().fit_transform(data[selcols])
    return pd.DataFrame(values,
                        columns=[name + '_scale_tranx' for name in selcols])


In [4]:
#Replace NANs
def replaceNaN(train,test,sel_feats):
    """Mean-impute missing values of the selected features into helper columns.

    For every feature in `sel_feats` that contains a NaN in either frame, a
    `<feature>_na_replaced` copy is added to BOTH frames (the frames are
    modified in place) and the NaNs of that copy are filled with the mean of
    the feature in `train`. The original columns are left untouched.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that both contain every column named in `sel_feats`.
    sel_feats : list of str
        Feature names to inspect; this list itself is not modified.

    Returns
    -------
    (train, test, sel_feats_NN)
        The same two frame objects, plus a new feature list in which each
        imputed feature is removed and its `_na_replaced` column is appended
        at the end (in the column order of `train`).
    """
    suffix = '_na_replaced'

    for df in [train,test]:
        for col in sel_feats:
            if df[col].isnull().any():
                newcol = col + suffix
                # Create the helper column in both frames so they keep the same schema.
                for df_l2 in [train,test]:
                    if newcol not in df_l2.columns:
                        df_l2[newcol] = df_l2[col]
                # Assign the result back: `df[newcol].fillna(..., inplace=True)` acts on a
                # temporary selection and is a silent no-op under pandas Copy-on-Write.
                df[newcol] = df[newcol].fillna(train[col].mean())

    # Only consider helper columns whose base feature was actually requested, so a
    # pre-existing, unrelated `*_na_replaced` column cannot make `remove` raise.
    requested = set(sel_feats)
    na_replaced_cols = [col for col in list(train.columns)
                        if suffix in col and col.replace(suffix, '') in requested]

    sel_feats_NN = list(sel_feats)
    for newcol in na_replaced_cols:
        sel_feats_NN.remove(newcol.replace(suffix, ''))
    sel_feats_NN += na_replaced_cols

    return train,test,sel_feats_NN

In [5]:
def getfiletemplate(istest,val_fold):
    """Return the file-name prefix for a split.

    'test_' for the test set, otherwise 'val_' followed by the fold number.
    """
    return 'test_' if istest else 'val_' + str(val_fold)

In [6]:

   
def processtraintest(df_train,df_test,features):
    """Impute and min-max scale `features` for the train and test frames.

    `replaceNaN` adds temporary `*_na_replaced` columns to both input frames;
    after scaling, those helper columns are dropped again from `df_train`
    and `df_test` (in place).

    Returns the `(train_scaled, test_scaled)` pair produced by `scaledata`.
    """
    imputed_train, imputed_test, nn_feats = replaceNaN(df_train, df_test, features)

    scaled_train, scaled_test = scaledata(imputed_train, imputed_test, nn_feats)
    del imputed_train, imputed_test
    gc.collect()

    # Remove the helper columns that replaceNaN attached to the caller's frames.
    helper_cols = [name for name in list(df_train.columns) if '_na_replaced' in name]
    for frame in (df_train, df_test):
        frame.drop(helper_cols, inplace=True, axis=1)

    return scaled_train, scaled_test


In [7]:
def getenc():
    """Read the target-encoding feature tables for folds 0-4.

    Returns three lists `(tr_encs, val_encs, test_encs)`; element `i` of each
    list is the DataFrame read from the fold-`i` train / val / test CSV, with
    the first CSV column used as the index.
    """
    enc_dir = '../input/ltfs-fin-target-encoding/'

    def _read(stem, fold):
        # One CSV per (split, fold): <dir><stem><fold>.csv
        return pd.read_csv(enc_dir + stem + str(fold) + '.csv', index_col=0)

    tr_encs, val_encs, test_encs = [], [], []
    for fold in range(5):
        tr_encs.append(_read('train_targetenc_feats', fold))
        val_encs.append(_read('val_targetenc_feats', fold))
        test_encs.append(_read('test_targetenc_feats', fold))
        print('read complete for:',fold)

    return tr_encs,val_encs,test_encs

In [8]:
# Load the per-fold encodings and print the fold-0 shapes as a sanity check.
tr_encs, val_encs, test_encs = getenc()
for fold_frames in (tr_encs, val_encs, test_encs):
    print(fold_frames[0].shape)

read complete for: 0
read complete for: 1
read complete for: 2
read complete for: 3
read complete for: 4
(186522, 4)
(46632, 4)
(112392, 4)


In [9]:
# Columns of `train` that are left out of the KNN feature set (includes the
# row id and the target column).
exclude_cols = [
    'Date.of.Birth', 'Employment.Type', 'DisbursalDate',
    'PERFORM_CNS.SCORE.DESCRIPTION', 'AVERAGE.ACCT.AGE', 'CREDIT.HISTORY.LENGTH',
    'MobileNo_Avl_Flag', 'disbursal_year', 'disbursal_day', 'disbursal_dayofweek',
    'date_of_birth', 'disbursal_date',
    'Current_pincode_ID', 'Employee_code_ID', 'supplier_id', 'branch_id',
    'PERFORM_CNS.SCORE.CATEGORY', 'State_ID',
    'UniqueID', targetcol,
]
# Target-encoded columns available in the fold-0 test encodings.
sel_enc_cols = [col for col in test_encs[0].columns if 'targetenc_' in col]
# Final feature list: the remaining train columns followed by the encoded columns.
features = [c for c in train.columns if c not in exclude_cols] + sel_enc_cols
print(features)


['Aadhar_flag', 'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS', 'Driving_flag', 'NEW.ACCTS.IN.LAST.SIX.MONTHS', 'NO.OF_INQUIRIES', 'PAN_flag', 'PERFORM_CNS.SCORE', 'PRI.ACTIVE.ACCTS', 'PRI.CURRENT.BALANCE', 'PRI.DISBURSED.AMOUNT', 'PRI.NO.OF.ACCTS', 'PRI.OVERDUE.ACCTS', 'PRI.SANCTIONED.AMOUNT', 'PRIMARY.INSTAL.AMT', 'Passport_flag', 'SEC.ACTIVE.ACCTS', 'SEC.CURRENT.BALANCE', 'SEC.DISBURSED.AMOUNT', 'SEC.INSTAL.AMT', 'SEC.NO.OF.ACCTS', 'SEC.OVERDUE.ACCTS', 'SEC.SANCTIONED.AMOUNT', 'VoterID_flag', 'asset_cost', 'disbursed_amount', 'ltv', 'manufacturer_id', 'age', 'disbursal_month', 'Employment.Type.Category', 'AVERAGE.ACCT.AGE_MONTHS', 'CREDIT.HISTORY.LENGTH_MONTHS', 'targetenc_branch_id', 'targetenc_Current_pincode_ID', 'targetenc_Employee_code_ID', 'targetenc_supplier_id']


In [10]:
# Combine the fold-0 train and validation encodings into one frame, ordered by index.
cur_tr_encs = tr_encs[0]
cur_val_encs = val_encs[0]
train_encs = pd.concat([cur_tr_encs, cur_val_encs]).sort_index()

# Append the encoded columns to the raw tables.
# NOTE(review): `train` / `test` are reassigned from themselves, so re-running
# this cell appends the encoded columns a second time.
train = pd.concat([train, train_encs], axis=1)
test = pd.concat([test, test_encs[0]], axis=1)
for frame in (train, test):
    print(frame.shape)

(233154, 56)
(112392, 56)


In [11]:
# Impute and min-max scale the selected features, then preview the first
# ten rows of the scaled train frame.
train_scaled,test_scaled = processtraintest(train,test,features)
train_scaled.head(10)

  return self.partial_fit(X, y)


Unnamed: 0,Aadhar_flag_scale_tranx,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS_scale_tranx,Driving_flag_scale_tranx,NEW.ACCTS.IN.LAST.SIX.MONTHS_scale_tranx,NO.OF_INQUIRIES_scale_tranx,PAN_flag_scale_tranx,PERFORM_CNS.SCORE_scale_tranx,PRI.ACTIVE.ACCTS_scale_tranx,PRI.CURRENT.BALANCE_scale_tranx,PRI.DISBURSED.AMOUNT_scale_tranx,PRI.NO.OF.ACCTS_scale_tranx,PRI.OVERDUE.ACCTS_scale_tranx,PRI.SANCTIONED.AMOUNT_scale_tranx,PRIMARY.INSTAL.AMT_scale_tranx,Passport_flag_scale_tranx,SEC.ACTIVE.ACCTS_scale_tranx,SEC.CURRENT.BALANCE_scale_tranx,SEC.DISBURSED.AMOUNT_scale_tranx,SEC.INSTAL.AMT_scale_tranx,SEC.NO.OF.ACCTS_scale_tranx,SEC.OVERDUE.ACCTS_scale_tranx,SEC.SANCTIONED.AMOUNT_scale_tranx,VoterID_flag_scale_tranx,asset_cost_scale_tranx,disbursed_amount_scale_tranx,ltv_scale_tranx,manufacturer_id_scale_tranx,age_scale_tranx,disbursal_month_scale_tranx,Employment.Type.Category_scale_tranx,AVERAGE.ACCT.AGE_MONTHS_scale_tranx,CREDIT.HISTORY.LENGTH_MONTHS_scale_tranx,targetenc_branch_id_scale_tranx,targetenc_Current_pincode_ID_scale_tranx,targetenc_Employee_code_ID_scale_tranx,targetenc_supplier_id_scale_tranx
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06471,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015698,0.0,0.0,0.0,0.0,0.0,0.0,0.013442,0.038125,0.93586,0.0,0.850016,0.0,0.5,0.0,0.0,0.280956,0.08399,0.349605,0.349832
1,1.0,0.05,0.0,0.0,0.0,0.0,0.67191,0.006944,0.064978,5e-05,0.002208,0.04,5e-05,7.8e-05,0.0,0.0,0.015698,0.0,0.0,0.0,0.0,0.0,0.0,0.017934,0.034612,0.743792,0.0,0.834219,0.5,1.0,0.062331,0.049145,0.280956,0.195,0.349605,0.349832
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06471,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015698,0.0,0.0,0.0,0.0,0.0,0.0,0.015302,0.040888,0.936801,0.0,0.833561,0.0,1.0,0.0,0.0,0.295681,0.215736,0.380212,0.37146
3,1.0,0.0,0.0,0.0,0.027778,0.0,0.342697,0.0,0.06471,0.0,0.006623,0.0,0.0,1e-06,0.0,0.0,0.015698,0.0,0.0,0.0,0.0,0.0,0.0,0.018287,0.045222,0.923267,0.0,0.750055,1.0,1.0,0.02168,0.032051,0.280956,0.099444,0.349605,0.349832
4,1.0,0.0,0.0,0.0,0.027778,0.0,0.0,0.0,0.06471,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015698,0.0,0.0,0.0,0.0,0.0,0.0,0.014636,0.039967,0.922208,0.0,0.910634,0.5,1.0,0.0,0.0,0.278839,0.668384,0.351221,0.333023
5,1.0,0.0,0.0,0.0,0.0,0.0,0.926966,0.0,0.06471,0.0,0.004415,0.0,0.0,5.3e-05,0.0,0.0,0.015698,0.0,0.0,0.0,0.0,0.0,0.0,0.015641,0.042152,0.937154,0.0,0.783156,0.5,1.0,0.056911,0.051282,0.280956,0.099444,0.349605,0.349832
6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06471,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015698,0.0,0.0,0.0,0.0,0.0,0.0,0.01539,0.033798,0.781335,0.0,0.805854,0.5,0.5,0.0,0.0,0.287914,0.396667,0.394697,0.385854
7,1.0,0.0,0.0,0.0,0.0,0.0,0.019101,0.006944,0.065416,7.5e-05,0.002208,0.0,7.5e-05,0.0,0.0,0.0,0.015698,0.0,0.0,0.0,0.0,0.0,0.0,0.015641,0.031286,0.728022,0.0,0.792438,0.5,0.5,0.00542,0.004274,0.278839,0.109722,0.351221,0.333023
8,1.0,0.0,0.0,0.0,0.027778,0.0,0.806742,0.006944,0.06471,0.000365,0.002208,0.0,0.000365,0.0,0.0,0.0,0.015698,0.0,0.0,0.0,0.0,0.0,0.0,0.015687,0.041333,0.935977,0.0,0.771301,0.5,1.0,0.151762,0.119658,0.291837,0.209473,0.380212,0.363082
9,0.0,0.0,0.0,0.0,0.0,0.0,0.919101,0.0,0.06471,0.0,0.002208,0.0,0.0,0.000102,0.0,0.0,0.015698,0.0,0.0,0.0,0.0,0.0,1.0,0.015264,0.040197,0.905261,0.0,0.005832,0.5,0.5,0.051491,0.040598,0.286614,0.202341,0.394697,0.385699


In [12]:
# #weights


#                'Current_pincode_ID','Employee_code_ID','supplier_id','branch_id',
#                'PERFORM_CNS.SCORE.CATEGORY','State_ID',

# Per-feature weights, paired BY POSITION with `sel_features` (the feature names
# sorted case-insensitively, printed below). The commented-out values belong to
# columns that are in `exclude_cols` and therefore have no entry in the list.
weights = [0.002192,0.050711,0.034286,0.020046,
# 0.035524,branch_id
0.030277,
# 0.059572, Current_pincode_ID
0.010461,0.018567,0.044102,0.000185,
# 0.03996,Employee_code_ID
0.012528,
0.061277,0.017356,0.005715,0.015055,0.00312,0.000502,0.046694,
# 0.008329,PERFORM_CNS.SCORE.CATEGORY
0.009246,0.021913,0.020133,0.019752,0.014508,0.020782,0.030673,0.000287,
0.000815,0.000487,0.00034,0.000494,0.000155,0.001279,
# 0.017089,State_ID
# 0.044709,supplier_id
0.06282,0.064212,0.07677,0.073356,0.003723 ]

# Sort a copy of the feature names case-insensitively so that position i of
# `sel_features` lines up with position i of `weights`.
sel_features = features.copy()
sel_features.sort(key=lambda x: x.lower())
print(sel_features)
# The two lengths printed here should be equal: zip() below stops silently
# at the shorter sequence.
print(len(sel_features))
print(len(weights))
# NOTE(review): these two empty frames are not used by any code visible here.
train_scaled_w=pd.DataFrame()
test_scaled_w=pd.DataFrame()
# Multiply every scaled column by 100 * its weight, overwriting the column.
# NOTE(review): train_scaled / test_scaled are modified in place, so running
# this cell twice applies the weights twice.
for weight,col in zip(weights,sel_features):
    train_scaled[col + '_scale_tranx'] = 100*train_scaled[col + '_scale_tranx'] * weight
    test_scaled[col + '_scale_tranx'] = 100*test_scaled[col+'_scale_tranx'] * weight
    
train_scaled.head(10)

['Aadhar_flag', 'age', 'asset_cost', 'AVERAGE.ACCT.AGE_MONTHS', 'CREDIT.HISTORY.LENGTH_MONTHS', 'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS', 'disbursal_month', 'disbursed_amount', 'Driving_flag', 'Employment.Type.Category', 'ltv', 'manufacturer_id', 'NEW.ACCTS.IN.LAST.SIX.MONTHS', 'NO.OF_INQUIRIES', 'PAN_flag', 'Passport_flag', 'PERFORM_CNS.SCORE', 'PRI.ACTIVE.ACCTS', 'PRI.CURRENT.BALANCE', 'PRI.DISBURSED.AMOUNT', 'PRI.NO.OF.ACCTS', 'PRI.OVERDUE.ACCTS', 'PRI.SANCTIONED.AMOUNT', 'PRIMARY.INSTAL.AMT', 'SEC.ACTIVE.ACCTS', 'SEC.CURRENT.BALANCE', 'SEC.DISBURSED.AMOUNT', 'SEC.INSTAL.AMT', 'SEC.NO.OF.ACCTS', 'SEC.OVERDUE.ACCTS', 'SEC.SANCTIONED.AMOUNT', 'targetenc_branch_id', 'targetenc_Current_pincode_ID', 'targetenc_Employee_code_ID', 'targetenc_supplier_id', 'VoterID_flag']
36
36


Unnamed: 0,Aadhar_flag_scale_tranx,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS_scale_tranx,Driving_flag_scale_tranx,NEW.ACCTS.IN.LAST.SIX.MONTHS_scale_tranx,NO.OF_INQUIRIES_scale_tranx,PAN_flag_scale_tranx,PERFORM_CNS.SCORE_scale_tranx,PRI.ACTIVE.ACCTS_scale_tranx,PRI.CURRENT.BALANCE_scale_tranx,PRI.DISBURSED.AMOUNT_scale_tranx,PRI.NO.OF.ACCTS_scale_tranx,PRI.OVERDUE.ACCTS_scale_tranx,PRI.SANCTIONED.AMOUNT_scale_tranx,PRIMARY.INSTAL.AMT_scale_tranx,Passport_flag_scale_tranx,SEC.ACTIVE.ACCTS_scale_tranx,SEC.CURRENT.BALANCE_scale_tranx,SEC.DISBURSED.AMOUNT_scale_tranx,SEC.INSTAL.AMT_scale_tranx,SEC.NO.OF.ACCTS_scale_tranx,SEC.OVERDUE.ACCTS_scale_tranx,SEC.SANCTIONED.AMOUNT_scale_tranx,VoterID_flag_scale_tranx,asset_cost_scale_tranx,disbursed_amount_scale_tranx,ltv_scale_tranx,manufacturer_id_scale_tranx,age_scale_tranx,disbursal_month_scale_tranx,Employment.Type.Category_scale_tranx,AVERAGE.ACCT.AGE_MONTHS_scale_tranx,CREDIT.HISTORY.LENGTH_MONTHS_scale_tranx,targetenc_branch_id_scale_tranx,targetenc_Current_pincode_ID_scale_tranx,targetenc_Employee_code_ID_scale_tranx,targetenc_supplier_id_scale_tranx
0,0.2192,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.141799,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001279,0.0,0.0,0.0,0.0,0.0,0.0,0.046088,0.16814,5.734668,0.0,4.310518,0.0,0.6264,0.0,0.0,1.764966,0.539316,2.68392,2.566226
1,0.2192,0.052305,0.0,0.0,0.0,0.0,3.137417,0.006421,0.142385,0.000101,0.00436,0.058032,0.000104,0.000238,0.0,0.0,0.001279,0.0,0.0,0.0,0.0,0.0,0.0,0.061487,0.152647,4.557734,0.0,4.230406,0.92835,1.2528,0.124948,0.148797,1.764966,1.252134,2.68392,2.566226
2,0.2192,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.141799,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001279,0.0,0.0,0.0,0.0,0.0,0.0,0.052463,0.180325,5.740437,0.0,4.227074,0.0,1.2528,0.0,0.0,1.857467,1.385283,2.918886,2.724884
3,0.2192,0.0,0.0,0.0,0.041819,0.0,1.600188,0.0,0.141799,0.0,0.013081,0.0,0.0,4e-06,0.0,0.0,0.001279,0.0,0.0,0.0,0.0,0.0,0.0,0.062699,0.199437,5.657503,0.0,3.803603,1.8567,1.2528,0.04346,0.097042,1.764966,0.638553,2.68392,2.566226
4,0.2192,0.0,0.0,0.0,0.041819,0.0,0.0,0.0,0.141799,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001279,0.0,0.0,0.0,0.0,0.0,0.0,0.05018,0.176263,5.651013,0.0,4.617917,0.92835,1.2528,0.0,0.0,1.751666,4.291826,2.696327,2.44292
5,0.2192,0.0,0.0,0.0,0.0,0.0,4.328376,0.0,0.141799,0.0,0.008721,0.0,0.0,0.000161,0.0,0.0,0.001279,0.0,0.0,0.0,0.0,0.0,0.0,0.053626,0.185898,5.7426,0.0,3.971464,0.92835,1.2528,0.114083,0.155267,1.764966,0.638553,2.68392,2.566226
6,0.2192,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.141799,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001279,0.0,0.0,0.0,0.0,0.0,0.0,0.052765,0.149055,4.787784,0.0,4.086565,0.92835,0.6264,0.0,0.0,1.808674,2.547076,3.030085,2.830471
7,0.2192,0.0,0.0,0.0,0.0,0.0,0.089191,0.006421,0.143347,0.00015,0.00436,0.0,0.000155,0.0,0.0,0.0,0.001279,0.0,0.0,0.0,0.0,0.0,0.0,0.053626,0.137976,4.461098,0.0,4.018532,0.92835,0.6264,0.010865,0.012939,1.751666,0.704548,2.696327,2.44292
8,0.2192,0.0,0.0,0.0,0.041819,0.0,3.766999,0.006421,0.141798,0.000736,0.00436,0.0,0.000759,0.0,0.0,0.0,0.001279,0.0,0.0,0.0,0.0,0.0,0.0,0.053783,0.182288,5.735389,0.0,3.911345,0.92835,1.2528,0.304221,0.362289,1.833323,1.345068,2.918886,2.663427
9,0.0,0.0,0.0,0.0,0.0,0.0,4.291651,0.0,0.141799,0.0,0.00436,0.0,0.0,0.000312,0.0,0.0,0.001279,0.0,0.0,0.0,0.0,0.0,0.3723,0.052334,0.177279,5.547166,0.0,0.029574,0.92835,0.6264,0.103218,0.122919,1.800509,1.299274,3.030085,2.829333


# Load data

KNN Feature Class and Functions

In [13]:
#annoy functions
def BuildANN(X,metric,save,index_filename):
    """Build an Annoy index over the rows of `X`.

    Parameters
    ----------
    X : pandas.DataFrame
        One item per row; the row position (0..n-1) is used as the item id.
    metric : str
        Metric name passed to `AnnoyIndex`.
    save : bool
        When true, write the built index to `index_filename`.
    index_filename : str or None
        Target path; if None while `save` is true, a message is printed and
        nothing is written.

    Returns
    -------
    The built `AnnoyIndex`.
    """
    col_size = X.shape[1]
    a = AnnoyIndex(col_size,metric=metric)
    # Convert to an array once: `X.iloc[i].values` materialises a Series for
    # every single row, which dominates the run time on large frames.
    for i, row in enumerate(X.values):
        a.add_item(i, row)
    # -1 is passed as the tree count. NOTE(review): presumably this lets Annoy
    # pick the number of trees itself -- confirm against the Annoy docs.
    a.build(-1)
    if save:
        if index_filename is None:
            print('Save Index File name not specified')
        else:
            a.save(index_filename)
    return a
def LoadANN(indexfilename,col_size,metric):
    """Create an `AnnoyIndex` of width `col_size` for `metric` and load the
    saved index file `indexfilename` into it; returns the loaded index."""
    index = AnnoyIndex(col_size, metric=metric)
    index.load(indexfilename)
    return index

def LoadANNQuery(neigh_filename):
    """Read a comma-separated neighbour table from `neigh_filename` and
    return it as a NumPy array."""
    return np.loadtxt(neigh_filename, delimiter=',')
def ANNQuery(X,NN,neighbors,save,neigh_filename,isdistance=True):
    """Query the index `NN` for the nearest neighbours of every row of `X`.

    Parameters
    ----------
    X : pandas.DataFrame
        One query vector per row.
    NN : object with `get_nns_by_vector(vector, n, include_distances=...)`
        The index to query (e.g. a loaded Annoy index).
    neighbors : int
        Number of neighbours requested per row.
    save : bool
        When true, write the result as CSV to `neigh_filename`.
    neigh_filename : str or None
        Target path; if None while `save` is true, a message is printed and
        nothing is written.
    isdistance : bool
        Also request distances. Each result row is then flattened to
        `[ids..., distances...]`.

    Returns
    -------
    numpy.ndarray with one row per query.
    """
    # Convert once instead of building a Series per row with `X.iloc[i]`.
    rows = X.values
    neighs = np.array([
        NN.get_nns_by_vector(row, neighbors, include_distances=isdistance)
        for row in rows
    ])
    if isdistance:
        # (n, 2, k) -> (n, 2k): ids first, then the matching distances.
        neighs = neighs.reshape(neighs.shape[0],-1)

    if save:
        if neigh_filename is None:
            # This branch concerns the neighbour file, not the index file.
            print('Save Neigh File name not specified')
        else:
            print('Save Neigh File')
            np.savetxt(neigh_filename,neighs,delimiter=',')

    return neighs

In [14]:
# NOTE(review): scratch cell -- this `x` is not referenced by any other code
# visible in this section.
x = np.array([[1,2,3],[5,6,7]])
x

array([[1, 2, 3],
       [5, 6, 7]])

In [15]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.neighbors import NearestNeighbors
from multiprocessing import Pool
# from scipy.spatial import cKDTree
# from pynndescent import NNDescent
import multiprocessing

import numpy as np
from itertools import groupby
# from tqdm import tqdm_notebook as tqdm
from tqdm import tqdm

def findfirststreak(a):
    """Length of the run of equal values at the start of `a`.

    e.g. [5,5,5,8,8,8,8] -> 3. When no two consecutive elements differ
    (all equal, a single element, or empty) the result is len(a).
    """
    # Positions i where a[i+1] != a[i]; the first one ends the leading run.
    change_points = np.flatnonzero(np.ediff1d(a))
    if change_points.size == 0:
        return len(a)
    return change_points[0] + 1
def findmaxstreak(a):
    """Length of the longest run of consecutive equal values in `a`.

    Returns 0 for an empty sequence (previously this raised ValueError from
    `max()` on an empty list), matching `findfirststreak`, which also yields
    0 for empty input.
    """
    # groupby yields one group per run; count each run lazily and keep the max.
    return max((sum(1 for _ in run) for _, run in groupby(a)), default=0)

def delzerovals(a):
    """Return a flat copy of `a` with every entry equal to zero removed.

    Accepts any array-like. The previous `np.delete(a, np.where(a==0))` was
    only correct for 1-D NumPy arrays: for N-D input `np.where` returns
    per-axis coordinates that `np.delete` then treats as flat indices, and
    for a plain list `a==0` is a single False so nothing was removed.
    """
    flat = np.asarray(a).ravel()
    # Boolean-mask indexing returns a new array, like np.delete did.
    return flat[flat != 0]

# Module-level progress-bar placeholder; the tqdm calls that would populate it
# are commented out in the code visible here.
pbar=None

class NearestNeighborsFeats(BaseEstimator, ClassifierMixin):
    '''
        This class should implement KNN features extraction 
    '''
    def __init__(self, n_jobs, k_list, metric, 
                 n_classes=None, NN_index=None, n_neighbors=None, eps=1e-6,
                 saveindex=False,
                 saveneighs=False,
                index_filename=None,
                neigh_filename=None,
                loadneighs=False,loadindex=False):
        """Store the configuration of the KNN feature extractor.

        n_jobs         -- 1 runs feature extraction serially in `predict`; any
                          other value uses a process pool (-1 = all CPUs).
        k_list         -- neighbourhood sizes for which features are computed.
        metric         -- metric name handed to the Annoy index.
        n_classes      -- number of classes; when None, `fit` infers it from y.
        NN_index       -- optional pre-existing index object, stored as `self.NN`.
        n_neighbors    -- stored as-is; defaults to max(k_list).
        eps            -- small constant added to denominators.
        saveindex / index_filename / loadindex
                       -- control where the index file is written / read.
        saveneighs / neigh_filename / loadneighs
                       -- control saving / loading of the neighbour table.
        """
        # Core KNN configuration.
        self.n_jobs, self.k_list, self.metric = n_jobs, k_list, metric
        self.eps = eps
        self.n_classes_ = n_classes
        self.NN = NN_index

        # The largest k decides how many neighbours are fetched per query.
        self.max_neigh_count = max(self.k_list)
        self.n_neighbors = max(k_list) if n_neighbors is None else n_neighbors

        # Persistence options for the index and for the neighbour table.
        self.saveindex, self.loadindex = saveindex, loadindex
        self.saveneighs, self.loadneighs = saveneighs, loadneighs
        self.index_filename = index_filename
        self.neigh_filename = neigh_filename
    def setmetric(self,metric):
        """Replace the metric name stored on this estimator."""
        self.metric = metric
    def getNNindex(self):
        """Return the index object stored in `self.NN` (may be None)."""
        return self.NN
    def fit(self, X, y):
        '''
            Stores the training labels and, when required, builds the Annoy
            index for X and saves it to disk.

            When no index object was supplied to the constructor:
              * with `loadneighs` set, nothing is built (neighbours will be
                read from `neigh_filename` in `predict`); if that file name
                is missing a message is printed and an empty list returned;
              * otherwise, unless `loadindex` is set, the index is built and
                saved to `index_filename` (when `saveindex` is set) or to
                'index.ann'.

            X : training features (DataFrame), y : training labels.
        '''
        if self.NN is None:
            # Was the bare name `loadneighs`, which is undefined inside this
            # method and raised NameError; the flag lives on the instance.
            if self.loadneighs:
                if self.neigh_filename is None:
                    print('Load Neigh File name not specified')
                    return []
            elif not self.loadindex:
                start = time.time()
                print('NN Fit Start..')
                if self.saveindex:
                    cur_index_filename = self.index_filename
                else:
                    cur_index_filename = 'index.ann'
                # The index is always written to disk; `predict` re-loads it from there.
                BuildANN(X,self.metric,True,cur_index_filename)
                print('NN Fit End..')
                end= time.time()
                print('Fit Exec Time:',end-start)

        self.neighbors = None
        # Store labels
        self.y_train = y
        # Width of the training matrix, needed to re-open the index in `predict`.
        self.fit_col_size=X.shape[1]
        # Save how many classes we have
        self.n_classes = np.unique(y).shape[0] if self.n_classes_ is None else self.n_classes_
        
    def predict(self, X):       
        '''
            Produces KNN features for every object of a dataset X.

            The neighbour table (per row: neighbour ids followed by their
            distances) is either loaded from `neigh_filename` when
            `loadneighs` is set, or obtained by loading the saved Annoy index
            and querying it with X. One feature vector per row is then
            computed by `get_features_for_one`, serially when `n_jobs == 1`
            and in a process pool otherwise.

            Returns the row-wise stacked feature matrix, or an empty list when
            `loadneighs` is set but `neigh_filename` is missing.
        '''
        start= time.time()
        
        if self.loadneighs:
            if self.neigh_filename is None:
                print('Load Neigh File name not specified')
                return []
            print('NN query start')
            self.neighbors = LoadANNQuery(self.neigh_filename)
            print('NN query end')
            end= time.time()
            print('Query Exec Time:',end-start)
        
        else:     
            
            # Same file-name rule as used when the index was written.
            if self.loadindex or self.saveindex:
                cur_index_filename = self.index_filename
            else:
                cur_index_filename = 'index.ann'
            print('NN Load start')
            NN = LoadANN(cur_index_filename,self.fit_col_size,self.metric)
            end= time.time()
            print('Load Exec Time:',end-start)
            start= time.time()

            print('NN query start')
            self.neighbors = ANNQuery(X,NN,self.max_neigh_count,
                                      self.saveneighs,
                                      self.neigh_filename,
                                      isdistance=True)
            
            # NaN never compares equal to anything (`arr == np.nan` is all
            # False), so the NaN entries have to be selected with np.isnan.
            print('neighs nan shape:',self.neighbors[np.isnan(self.neighbors)].shape)
            print('NN query end')
            end= time.time()
            print('Query Exec Time:',end-start)
            
        no_elem= X.shape[0]
        # Remembered so debug_index can recognise the last row.
        self.no_elem = no_elem
        global pbar 
        start= time.time()
        print('Feature Start')
    
        if self.n_jobs == 1:
            test_feats = []
            for i in range(no_elem):
                test_feats.append(self.get_features_for_one(i))
        else:
            processes = self.n_jobs if self.n_jobs!=-1 else multiprocessing.cpu_count()
            print('no of cpus:',multiprocessing.cpu_count())
           
            with Pool(processes=processes) as pool:
                test_feats = pool.map(self.get_features_for_one, range(no_elem))

        end= time.time()
        print('Feature End')
        print('Feature Exec Time:',end-start)
        return np.vstack(test_feats)
    
    def dummy(self, index): 
        """Placeholder: ignores `index` and returns None."""
        return None
    
    def debug_index(self,index, template):
        """Print a progress line for every 1000th row and for the last row,
        but only when the module-level DEBUG flag is truthy."""
        if not DEBUG:
            return
        if index % 1000 == 0 or index == (self.no_elem - 1):
            print('index {0} : {1}'.format(template,index))
        
        
    def get_features_for_one(self, index):
        '''
            Computes KNN features for a single object `x`
        '''
        
        self.debug_index(index,'start')     
        
        
#         neighs = self.neighbors[0][index,:]
        neighs = self.neighbors[index,:self.max_neigh_count]
        
#         print('neigh indices:',neighs)
        
        # Vector of size `n_neighbors`
        # Stores distances to corresponding neighbors
#         neighs_dist = self.neighbors[1][index,:] 
        neighs_dist = self.neighbors[index,self.max_neigh_count:] 
       

        # Vector of size `n_neighbors`
        # Stores labels of corresponding neighbors
#         start = time.time()
        neighs_y = self.y_train.iloc[neighs] 
        self.debug_index(index,'neighs y iloc')     
            
#         end = time.time()
#         print('neighs Y time:',end-start)
#         print('neigh y:',neighs_y)
#         print('type of neigh y:',type(neighs_y[0:1]))
        
        ## ========================================== ##
        ##              YOUR CODE BELOW
        ## ========================================== ##
        
        # We will accumulate the computed features here
        # Eventually it will be a list of lists or np.arrays
        # and we will use np.hstack to concatenate those
#         start = time.time()
        return_list = [] 
        
        
        ''' 
            1. Fraction of objects of every class.
               It is basically a KNNСlassifiers predictions.

               Take a look at `np.bincount` function, it can be very helpful
               Note that the values should sum up to one
        '''
        for k in self.k_list:
            # YOUR CODE GOES HERE
            feats_raw= np.bincount(neighs_y[:k],minlength=self.n_classes)  
#             print('feats raw for k={} : {}'.format(k,feats_raw))
            feats= feats_raw / k
#             print('feats for k={} : {}'.format(k,feats))
#             assert len(feats) == self.n_classes
            return_list += [feats]
#         end = time.time()
#         print('section 1 time:',end-start)
        
       
        
        '''
            2. Same label streak: the largest number N, 
               such that N nearest neighbors have the same label.
               
               What can help you: `np.where`
        '''
#         start= time.time()
        feats = np.array([findfirststreak(neighs_y)])
#         print('streak:',feats)
        
#         assert len(feats) == 1
        return_list += [feats]
#         end = time.time()
#         print('section 2 time:',end-start)
        
        '''
            3. Minimum distance to objects of each class
               Find the first instance of a class and take its distance as features.
               
               If there are no neighboring objects of some classes, 
               Then set distance to that class to be 999.

               `np.where` might be helpful
        '''
#         start = time.time()
        feats = []
        for c in range(self.n_classes):
            # YOUR CODE GOES HERE
            curclass_matches = np.where(neighs_y==c)
            if len(curclass_matches[0])==0:
                min_dist = 999
            else:
                min_dist = np.min(neighs_dist[curclass_matches])
            feats+=[min_dist]
#         print('min dist feat:',feats)
#         assert len(feats) == self.n_classes
        return_list += [feats]
#         end = time.time()
#         print('section 3 time:',end-start)
       
        '''
            4. Minimum *normalized* distance to objects of each class
               As 3. but we normalize (divide) the distances
               by the distance to the closest neighbor.
               
               If there are no neighboring objects of some classes, 
               Then set distance to that class to be 999.
               
               Do not forget to add self.eps to denominator.
        '''
#         remove zero distance from distance array and then get the closest distance
#         close_dist = min(delzerovals(neighs_dist))
        close_dist = min(neighs_dist)
        neighs_norm_dist = neighs_dist / (self.eps + close_dist)
#         print('neighs norm dist',neighs_norm_dist)
        
        #Find the minimum distance from normalized distances
#         start = time.time()
        feats = []
        for c in range(self.n_classes):
            curclass_matches = np.where(neighs_y==c)
            if len(curclass_matches[0])==0:
                min_norm_dist = 999
            else:
                min_norm_dist = np.min(neighs_norm_dist[curclass_matches])
            feats+=[min_norm_dist]
#         print('min norm dist feat:',feats)
        
#         assert len(feats) == self.n_classes
        return_list += [feats]
#         end = time.time()
#         print('section 4 time:',end-start)
        
        '''
            5. 
               5.1 Distance to Kth neighbor
                   Think of this as of quantiles of a distribution
               5.2 Distance to Kth neighbor normalized by 
                   distance to the first neighbor
               
               feat_51, feat_52 are answers to 5.1. and 5.2.
               should be scalars
               
               Do not forget to add self.eps to denominator.
        '''
#         start = time.time()
        for k in self.k_list:
            
            feat_51 = neighs_dist[k-1]
            #normalize by the first non-zero neighbor
#             feat_52 = feat_51 / (self.eps * delzerovals(neighs_dist)[0])
            feat_52 = feat_51 / (self.eps + neighs_dist[0])
            
#             print('feat 51:',feat_51)
#             print('feat 52:',feat_52)
          
            return_list += [[feat_51, feat_52]]
#         end = time.time()
#         print('section 5 time:',end-start)
        
#         print('return list upto point 5:', return_list)
        '''
            6. Mean distance to neighbors of each class for each K from `k_list` 
                   For each class select the neighbors of that class among K nearest neighbors 
                   and compute the average distance to those objects
                   
                   If there are no objects of a certain class among K neighbors, set mean distance to 999
                   
               You can use `np.bincount` with appropriate weights
               Don't forget, that if you divide by something, 
               You need to add `self.eps` to denominator.
        '''
#         start = time.time()
        for k in self.k_list:
            
            # YOUR CODE GOES HERE
            denom= np.bincount(neighs_y[:k],minlength=self.n_classes) + self.eps  
            numer= np.bincount(neighs_y[:k],minlength=self.n_classes,weights=neighs_dist[:k])  
            feats = numer / denom
            feats[feats==0]=999
#             print('6. denom for k={} : {}'.format(k,denom))
#             print('6. numer for k={} : {}'.format(k,numer))
#             print('6. feats for k={} : {}'.format(k,feats))
            
#             assert len(feats) == self.n_classes
            return_list += [feats]
#         end = time.time()
#         print('section 6 time:',end-start)

    
        '''
            7. Maximum of the distance to objects of each class
               If there are no neighboring objects of some classes, 
               Then set distance to that class to be 999.
        '''
        feats = []
        for c in range(self.n_classes):
            curclass_matches = np.where(neighs_y==c)
            if len(curclass_matches[0])==0:
                max_dist = 999
            else:
                max_dist = np.max(neighs_dist[curclass_matches])
            feats+=[max_dist]
        return_list += [feats] 
        
        self.debug_index(index,'knn 7')     
        '''
            8. Standard deviation of the distance to objects of each class
               If there are no neighboring objects of some classes, 
               Then set distance to that class to be 999.
        '''
        feats = []
        for c in range(self.n_classes):
            curclass_matches = np.where(neighs_y==c)
            if len(curclass_matches[0])==0:
                std_dist = 999
            else:
                std_dist = np.std(neighs_dist[curclass_matches])
            feats+=[std_dist]
        return_list += [feats]
        self.debug_index(index,'knn 8')     
        
#         '''
#             9. Skew of the distance to objects of each class
#                If there are no neighboring objects of some classes, 
#                Then set distance to that class to be 999.
#         '''
#         feats = []
#         for c in range(self.n_classes):
#             curclass_matches = np.where(neighs_y==c)
#             if len(curclass_matches[0])==0:
#                 skew_dist = 999
#             else:
#                 skew_dist = skew(neighs_dist[curclass_matches])
#             feats+=[skew_dist]
#         return_list += [feats]
#         '''
#             10. Kurtosis of the distance to objects of each class
#                If there are no neighboring objects of some classes, 
#                Then set distance to that class to be 999.
#         '''
#         feats = []
#         for c in range(self.n_classes):
#             curclass_matches = np.where(neighs_y==c)
#             if len(curclass_matches[0])==0:
#                 kurtosis_dist = 999
#             else:
#                 kurtosis_dist = kurtosis(neighs_dist[curclass_matches])
#             feats+=[kurtosis_dist]
#         return_list += [feats]   
        
#         '''
#             11. IQR of the distance to objects of each class
#                If there are no neighboring objects of some classes, 
#                Then set distance to that class to be 999.
#         '''
#         start = time.time()
#         feats = []
#         for c in range(self.n_classes):
#             curclass_matches = np.where(neighs_y==c)
#             if len(curclass_matches[0])==0:
#                 q11 =999;q12=999;q13=999;q14=999
#                 q31=999;q32=999;q33=999;q34=999
#                 iqr_mean=999;iqr_range_dist=999;range_dist=999;
#             else:
#                 cur_neighs_dist = neighs_dist[curclass_matches]
#                 q11 = np.quantile(cur_neighs_dist,0.05)
#                 q12 = np.quantile(cur_neighs_dist,0.1)
#                 q13 = np.quantile(cur_neighs_dist,0.15)
#                 q14 = np.quantile(cur_neighs_dist,0.25)
#                 q31 = np.quantile(cur_neighs_dist,0.95)
#                 q32 = np.quantile(cur_neighs_dist,0.9)
#                 q33 = np.quantile(cur_neighs_dist,0.85)
#                 q34 = np.quantile(cur_neighs_dist,0.75)
                
#                 mask = (cur_neighs_dist>=q11) & (cur_neighs_dist<=q31)
#                 iqr_mean = np.mean(cur_neighs_dist[mask])
#                 range_dist = max_dist - min_dist
#                 iqr_range_dist = q34 - q14
                
#             feats+=[q11,q12,q13,q14,q31,q32,q33,q34,iqr_mean,iqr_range_dist,range_dist]
#         return_list += [feats]     
#         end = time.time()
        
        
#         if index % 1000 ==0:
#             print('index:',index)
#             print('quantile time:',end-start)
    

#         return_list+=[np.array([0,1,2,3,4,5])]
        # merge
#         start = time.time()
        knn_feats = np.hstack(return_list)
#         end = time.time()
#         print('hstack time:',end-start)
#         print('total feats shape',knn_feats.shape)

#         pbar.update(self.n_jobs)
        return knn_feats

## Get features for train

Compute features for train, using out-of-fold strategy.

In [16]:
# Demo of NumPy index-array assignment: rows 5, 7 and 10 of `b` receive the three
# rows of `a`, every other row stays zero. `runknnmodel` uses the same pattern
# (`oof_knn_feats[val_idx] = cur_knn_feats`) to scatter fold results.
a= np.array([[32,12,23,45],[75,18,5,45],[6,7,93,26]])
b= np.zeros((12,4))
# index array has shape (1, 3); the (3, 4) right-hand side broadcasts against it
indices =np.array([[5,7,10]])
print(a)
b[indices]=a
print(b)

[[32 12 23 45]
 [75 18  5 45]
 [ 6  7 93 26]]
[[ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [32. 12. 23. 45.]
 [ 0.  0.  0.  0.]
 [75. 18.  5. 45.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 6.  7. 93. 26.]
 [ 0.  0.  0.  0.]]


In [17]:

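# Out-of-fold KNN features are computed with the manual fold loop in `runknnmodel`.
# sklearn's `cross_val_predict` is still imported but no longer used: it creates a new
# estimator object internally, so a prebuilt NN index could not be passed through.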
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold

def getcombinedknnfeats(metriclist,k_list,train_X,train_Y,indices=None,test=None,
                        col_start=0,istrain=1,NN_index=None):
    """Generate KNN features and append them as new columns to the source frame.

    Parameters
    ----------
    metriclist : list
        Metrics to generate features for; one feature block is produced per metric.
    k_list : list
        Neighbourhood sizes, forwarded unchanged to the feature generators.
    train_X, train_Y :
        Training features and labels.
    indices : optional
        Fold indices forwarded to `getknnfeatsfortrain` (train mode only).
    test : optional
        Frame to generate features for when `istrain` is falsy.
    col_start : int
        Number used for the first generated column name ('knn<col_start>').
    istrain : truthy/falsy
        Truthy -> out-of-fold features for `train_X`; falsy -> features for `test`.
    NN_index : optional
        Forwarded to the feature generators.

    Returns
    -------
    tuple
        (number of KNN columns, source frame with the KNN columns concatenated,
        NN index returned by the feature generator).
    """
    if istrain:
        per_metric_feats, ret_NN_index = getknnfeatsfortrain(metriclist, k_list, train_X, train_Y,
                                                            NN_index=NN_index, indices=indices)
        source = train_X
    else:
        per_metric_feats, ret_NN_index = getknnfeatsfortest(metriclist, k_list, train_X, train_Y,
                                                           test, NN_index)
        source = test
    print('knn feats generation complete')

    # The generators return one 2-D block per metric. Stack them side by side so that
    # no metric is dropped; for a single metric this is the same array as before.
    knn_feats = pd.DataFrame(np.hstack(per_metric_feats), index=source.index)
    knn_feats_count = knn_feats.shape[1]
    knn_feats.columns = ['knn' + str(i) for i in range(col_start, knn_feats_count + col_start)]

    print('knn feats shape:',knn_feats.shape)
    # column-wise concat; rows are matched on the index of the source frame
    combined_data = pd.concat([source, knn_feats], axis=1)
    print('knn combine process complete')
    return knn_feats_count,combined_data,ret_NN_index

def getknnfeatsfortest(metriclist,k_list,train_X,train_Y,test,NN_index=None):
    """Fit the KNN feature extractor on the training data and featurise `test`.

    One extractor instance is reused for every metric in `metriclist`; caching
    options (`neigh_filename`, `loadneighs`, `saveneighs`, `loadindex`,
    `index_filename`, `saveindex`) are read from module-level variables.

    Returns
    -------
    tuple
        (list with one feature array per metric, the extractor's NN index).
    """
    # 'dummy' is only a placeholder; the real metric is set per iteration below
    extractor = NearestNeighborsFeats(n_jobs=4, NN_index=NN_index,
                                      k_list=k_list, metric='dummy',
                                      neigh_filename=neigh_filename,
                                      loadneighs=loadneighs,
                                      saveneighs=saveneighs,
                                      loadindex = loadindex,
                                      index_filename= index_filename,
                                      saveindex=saveindex)
    feats_per_metric = []
    for cur_metric in metriclist:
        print (cur_metric)
        extractor.setmetric(cur_metric)
        # fit on the training set, then query features for the test rows
        extractor.fit(train_X , train_Y)
        feats_per_metric.append(extractor.predict(test))

    return feats_per_metric,extractor.getNNindex()

def runknnmodel(NNF,train_X,train_Y,indices=None):
    """Compute out-of-fold KNN features for `train_X`.

    For every fold the extractor is fitted on the training part and queried on the
    validation part; the validation rows of the result matrix are then filled in.

    Parameters
    ----------
    NNF :
        Object exposing `fit(X, y)` and `predict(X)`; `predict` must return a 2-D array
        with one row per row of `X`.
    train_X, train_Y :
        pandas objects indexed positionally via `.iloc`.
    indices : iterable of (train_idx, val_idx), optional
        Fold definition. When None, a shuffled StratifiedKFold with the module-level
        `n_splits` and random_state 4590 is built on `train_X` / `train_Y`.

    Returns
    -------
    numpy.ndarray or None
        Array of shape (len(train_X), n_features); rows never used as validation stay
        zero. None when `indices` yields no folds.
    """
    if indices is None:
        print('indices is None')
        folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=4590)
        # Split the data that is actually being featurised. Splitting module-level
        # frames here would yield wrong / out-of-range row positions whenever the
        # caller passes a different (e.g. sampled) frame.
        indices = folds.split(train_X.values, train_Y.values)

    oof_knn_feats = None
    for fold_, (train_idx, val_idx) in enumerate(indices):

        tr = train_X.iloc[train_idx]
        y_tr = train_Y.iloc[train_idx]
        val = train_X.iloc[val_idx]
        y_val = train_Y.iloc[val_idx]

        print('fold:{0} train shape:{1} val shape: {2}'.format(fold_,y_tr.shape,y_val.shape))

        NNF.fit(tr,y_tr)
        cur_knn_feats = NNF.predict(val)

        # the feature count is only known after the first predict call
        if oof_knn_feats is None:
            knn_feats_count = cur_knn_feats.shape[1]
            oof_knn_feats = np.zeros((train_X.shape[0],knn_feats_count))

        oof_knn_feats[val_idx] = cur_knn_feats

    return oof_knn_feats
    
def getknnfeatsfortrain(metriclist,k_list,train_X,train_Y,indices=None,NN_index=None):
    """Build out-of-fold KNN features for the training set, one array per metric.

    A single extractor is shared by all metrics; its caching options
    (`neigh_filename`, `loadneighs`, `saveneighs`, `saveindex`) are read from
    module-level variables.

    Returns
    -------
    tuple
        (list with one out-of-fold feature array per metric, the extractor's NN index).
    """
    # 'dummy' is a placeholder metric, replaced through setmetric() in the loop.
    # n_jobs can be larger than the number of cores.
    extractor = NearestNeighborsFeats(n_jobs=4, k_list=k_list, metric='dummy',
                                      NN_index=NN_index,
                                      neigh_filename=neigh_filename,
                                      loadneighs=loadneighs,
                                      saveneighs=saveneighs,
                                      saveindex=saveindex)
    feats_per_metric = []
    for cur_metric in metriclist:
        print (cur_metric)
        extractor.setmetric(cur_metric)
        # Manual fold loop instead of cross_val_predict: the latter creates a new
        # extractor object internally, so NN_index could not be passed through.
        oof_feats = runknnmodel(extractor,train_X,train_Y,indices=indices)

        print(oof_feats.shape)
        feats_per_metric.append(oof_feats)
    return feats_per_metric,extractor.getNNindex()

In [18]:
def knncolnames(k_list):
    """Map positional KNN column names ('knn0', 'knn1', ...) to grouped names.

    Names are handed out in this order:
      * 'knn_1.<i>'  : 2 * len(k_list) entries
      * 'knn_2'      : 1 entry
      * 'knn_3.<i>', 'knn_4.<i>' : 2 entries each
      * 'knn_5.<i>', 'knn_6.<i>' : 2 * len(k_list) entries each
      * 'knn_7.<i>' ... 'knn_10.<i>' : 2 entries each
      * 'knn_11.<j>_<i>' : 2 x 11 entries

    The mapping may contain more keys than the frame has columns; it is meant to be
    passed to `DataFrame.rename(columns=...)`.

    Returns
    -------
    dict
        {'knn<position>': grouped name}
    """
    # factor 2 is hard-coded; presumably the number of target classes -- confirm
    per_k_width = len(k_list) * 2

    def numbered(prefix, count):
        # prefix + '1', prefix + '2', ..., prefix + str(count)
        return [prefix + str(pos + 1) for pos in range(count)]

    labels = []
    labels += numbered('knn_1.', per_k_width)
    labels.append('knn_2')
    labels += numbered('knn_3.', 2)
    labels += numbered('knn_4.', 2)
    labels += numbered('knn_5.', per_k_width)
    labels += numbered('knn_6.', per_k_width)

    group_no = 7
    for _ in range(4):
        labels += numbered('knn_' + str(group_no) + '.', 2)
        group_no += 1

    # iqr: the group number stays at the value reached above
    for outer in range(2):
        labels += ['knn_' + str(group_no) + '.' + str(outer + 1) + '_' + str(inner + 1)
                   for inner in range(11)]

    return {'knn' + str(pos): label for pos, label in enumerate(labels)}


def exec_knn_fulltrainortest(istest,metric,k_list,train_scaled,test_scaled,target,indices):
    """Run KNN feature generation for either the train set or a test frame.

    Parameters
    ----------
    istest : truthy/falsy
        Truthy -> features for `test_scaled`; falsy -> out-of-fold features for
        `train_scaled`, which are additionally written to disk via `saveknnfeats`.
    metric :
        Single metric; wrapped in a one-element list for `getcombinedknnfeats`.
    k_list, train_scaled, test_scaled, target, indices :
        Forwarded to `getcombinedknnfeats`.

    Returns
    -------
    DataFrame
        Source frame plus KNN columns, renamed according to `knncolnames(k_list)`.
    """
    istrain = 0 if istest else 1

    print()
    print('************************KNN Execution ************************')
    print()
    _count, data_with_knn, _nn_index = getcombinedknnfeats(
        [metric], k_list, train_scaled, target,
        indices=indices, istrain=istrain, test=test_scaled, NN_index=None)
    print('data_with_knn shape:',data_with_knn.shape)

    # positional 'knn<i>' names -> grouped 'knn_<group>.<i>' names
    data_with_knn.rename(columns=knncolnames(k_list), inplace=True)

    # only the train run is persisted here
    if not istest:
        saveknnfeats(data_with_knn,'fulltrain_',metric)

    return data_with_knn

def saveknnfeats(data_with_knn,file_template,metric):
    """Pickle every column whose name contains 'knn_' to
    '<file_template>_knn_feats_<metric>.zip'.
    """
    print()
    knnfilename = '{0}_{1}_{2}.zip'.format(file_template,'knn_feats',metric)
    selected = [name for name in data_with_knn.columns if 'knn_' in name]
    print('Save KNN Feats File Name: ',knnfilename)
    data_with_knn[selected].to_pickle(knnfilename)
    print('Save KNN Feats Complete')
    

In [19]:
# Neighbourhood sizes handed to the KNN feature extractor as `k_list`
k_list=[101,1001]
# Metric name; passed on to the extractor through `setmetric`
metric ='euclidean'
# Fold count for the StratifiedKFold splits; also sets the row-block size in
# `execknn_testblocks` (train rows / n_splits)
n_splits = 5

# Module-level aliases of the full training frame and target
train_to_split = train
target_to_split = target

# Caching options read as module-level names when the NearestNeighborsFeats
# extractor is constructed
neigh_filename='test_neighs.csv'
loadneighs=False
saveneighs=False
saveindex=False
loadindex = False
index_filename = 'train_index.ann'

In [20]:
#Test
# test_frac=0.1
# test_sample = test_scaled.sample(frac=test_frac)


# Draw one fold out of `n_splits_frac` as a small sample of the train set (stratified
# on the target) and of the test set, for quick experiments.
n_splits_frac = 10

# Take the first validation fold directly with next(). Looping with `break` would
# leave a half-consumed split generator and stale fold variables at module level.
sss = StratifiedKFold(n_splits=n_splits_frac, shuffle=True, random_state=3000)
sample_idx = next(sss.split(train_scaled.values, target.values))[1]
train_sample = train_scaled.iloc[sample_idx]
target_sample = target.iloc[sample_idx]

sss = KFold(n_splits=n_splits_frac, shuffle=True, random_state=3000)
sample_idx = next(sss.split(test_scaled.values))[1]
test_sample = test_scaled.iloc[sample_idx]


print(train_sample.shape)
print(target_sample[target_sample==1].shape)
print(target_sample[target_sample==0].shape)
print(test_sample.shape)

(23317, 36)
(5062,)
(18255,)
(11240, 36)


In [21]:
%%time
#Train KNN
# Re-assign the module-level index flags (same values as in the config cell).
saveindex=False
loadindex = False
# NOTE(review): DEBUG presumably gates the extractor's `debug_index` output -- confirm
DEBUG = False
# Commented-out variant: run on the one-in-ten sample instead of the full train set
# train_to_split = train_sample
# target_to_split = target_sample
# train_with_knn = exec_knn_fulltrainortest(False,metric,k_list,train_sample,None,
#                                          target_sample,None)

# Out-of-fold KNN features for the full scaled train set. `indices` is None, so the
# folds are generated inside `runknnmodel`.
train_with_knn = exec_knn_fulltrainortest(False,metric,k_list,train_scaled,None,
                                         target,None)
# module-level list of the generated KNN column names (reused by later cells)
knncols =[col for col in train_with_knn if 'knn_' in col]
print(train_with_knn[knncols].shape)
train_with_knn[knncols].head()


************************KNN Execution ************************

euclidean
indices is None
fold:0 train shape:(186522,) val shape: (46632,)
NN Fit Start..
NN Fit End..
Fit Exec Time: 25.92730450630188
NN Load start
Load Exec Time: 0.00020742416381835938
NN query start
neighs nan shape: (0,)
NN query end
Query Exec Time: 180.052494764328
Feature Start
no of cpus: 4
Feature End
Feature Exec Time: 86.4379460811615
fold:1 train shape:(186523,) val shape: (46631,)
NN Fit Start..
NN Fit End..
Fit Exec Time: 25.67326045036316
NN Load start
Load Exec Time: 0.00020575523376464844
NN query start
neighs nan shape: (0,)
NN query end
Query Exec Time: 177.84406852722168
Feature Start
no of cpus: 4
Feature End
Feature Exec Time: 84.97143483161926
fold:2 train shape:(186523,) val shape: (46631,)
NN Fit Start..
NN Fit End..
Fit Exec Time: 25.833146810531616
NN Load start
Load Exec Time: 0.0002353191375732422
NN query start
neighs nan shape: (0,)
NN query end
Query Exec Time: 186.69614124298096
Feature 

In [22]:
# Distribution of one KNN feature: overall first, then separately for target == 1
# and target == 0, each followed by its upper quantiles.
col = 'knn_3.2'
print(train_with_knn[col].describe())

for label in (1, 0):
    mask = target == label
    by_class = train_with_knn.loc[mask, col]
    print(by_class.describe())
    for q in (0.85, 0.9, 0.95, 0.99):
        print(by_class.quantile(q))

# test_with_knn[test_with_knn['knn_1.4']].describe()

count    233154.000000
mean          0.625782
std           0.269894
min           0.000921
25%           0.435840
50%           0.587583
75%           0.770709
max           5.480172
Name: knn_3.2, dtype: float64
count    50611.000000
mean         0.600339
std          0.273335
min          0.023883
25%          0.407948
50%          0.560071
75%          0.745995
max          3.879927
Name: knn_3.2, dtype: float64
0.8609572947025299
0.9451549053192139
1.093899428844452
1.4559476375579834
count    182543.000000
mean          0.632836
std           0.268506
min           0.000921
25%           0.443318
50%           0.594562
75%           0.777244
max           5.480172
Name: knn_3.2, dtype: float64
0.8920786976814269
0.9780151128768921
1.1194373726844788
1.4448824238777158


In [23]:
# DEBUG = False
# loadneighs=False
# saveindex = False
# loadindex = False

# # # Test KNN
# # test_with_knn = exec_knn_fulltrainortest(True,metric,k_list,train_sample,test_sample,
# #                                          target_sample,None)

# test_with_knn = execknn_testblocks(test_sample,train_sample,target_sample)

In [24]:
%%time

def execknn_testblocks(test_scaled,train_scaled,target):
    """Compute KNN features for the test set in consecutive row blocks.

    The test frame is cut into blocks of `len(train_scaled) / n_splits` rows (the size
    of one training validation fold). Each block is featurised with
    `exec_knn_fulltrainortest`; the per-block frames are concatenated and the KNN
    columns are saved with `saveknnfeats(..., 'test_', metric)`.

    Uses the module-level `n_splits`, `metric` and `k_list`.

    Returns
    -------
    DataFrame
        All rows of `test_scaled` with the KNN columns appended.

    Raises
    ------
    ValueError
        If `test_scaled` has no rows.
    """
    # NOTE(review): this function used to assign local `loadneighs` / `saveindex` /
    # `loadindex` variables. They were never read here, and the helpers it calls look
    # those names up at module level, so the assignments had no effect and were removed.
    # If the index is meant to be saved on the first block and reloaded afterwards,
    # the module-level flags have to be set instead.

    # at least one row per block, otherwise the block loop could never advance
    block_size = max(1, int(train_scaled.shape[0] / n_splits))
    test_size = test_scaled.shape[0]

    blocks = []
    for start in range(0, test_size, block_size):
        end = start + block_size
        print()
        print('START:{0} END:{1}'.format(start,end))
        print()
        test_cur = test_scaled[start:end]
        test_with_knn_cur = exec_knn_fulltrainortest(True,metric,k_list,train_scaled,test_cur,
                                                 target,None)
        knncols =[col for col in test_with_knn_cur if 'knn_' in col]
        print(test_with_knn_cur[knncols].shape)
        blocks.append(test_with_knn_cur)

    if not blocks:
        raise ValueError('test_scaled has no rows; nothing to compute KNN features for')

    # a single concat at the end instead of re-copying the growing frame per block
    test_with_knn = pd.concat(blocks)

    saveknnfeats(test_with_knn,'test_',metric)

    return test_with_knn

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 11.9 µs


In [25]:
# Number of columns in the scaled train frame, i.e. before any KNN columns are appended
len(train_scaled.columns)

36

In [26]:
%%time

# NOTE(review): DEBUG presumably gates the extractor's `debug_index` output -- confirm
DEBUG = False

# KNN features for the full test set, computed block by block against the full train set
test_with_knn = execknn_testblocks(test_scaled,train_scaled,target)


START:0 END:46630


************************KNN Execution ************************

euclidean
NN Fit Start..
NN Fit End..
Fit Exec Time: 33.95022702217102
NN Load start
Load Exec Time: 0.00019311904907226562
NN query start
neighs nan shape: (0,)
NN query end
Query Exec Time: 185.26862406730652
Feature Start
no of cpus: 4
Feature End
Feature Exec Time: 88.48967337608337
knn feats generation complete
knn feats shape: (46630, 21)
knn combine process complete
data_with_knn shape: (46630, 57)
(46630, 21)

START:46630 END:93260


************************KNN Execution ************************

euclidean
NN Fit Start..
NN Fit End..
Fit Exec Time: 33.72976303100586
NN Load start
Load Exec Time: 0.00019669532775878906
NN query start
neighs nan shape: (0,)
NN query end
Query Exec Time: 194.35801577568054
Feature Start
no of cpus: 4
Feature End
Feature Exec Time: 88.80717301368713
knn feats generation complete
knn feats shape: (46630, 21)
knn combine process complete
data_with_knn shape: (46630, 

In [27]:
# `knncols` is the module-level list built from train_with_knn in the training cell
test_with_knn[knncols].head()

Unnamed: 0,knn_1.1,knn_1.2,knn_1.3,knn_1.4,knn_2,knn_3.1,knn_3.2,knn_4.1,knn_4.2,knn_5.1,knn_5.2,knn_5.3,knn_5.4,knn_6.1,knn_6.2,knn_6.3,knn_6.4,knn_7.1,knn_7.2,knn_8.1,knn_8.2
0,0.613861,0.386139,0.663337,0.336663,4.0,1.142914,1.043801,1.094953,0.999999,1.480194,1.418079,1.804892,1.729151,1.385841,1.331457,1.66077,1.6532,1.804855,1.804892,0.123213,0.147106
1,0.693069,0.306931,0.737263,0.262737,2.0,1.049196,1.112197,0.999999,1.060047,1.504586,1.434036,1.884655,1.796284,1.371904,1.348036,1.71296,1.709664,1.884655,1.881553,0.148867,0.163834
2,0.653465,0.346535,0.659341,0.340659,2.0,1.047225,1.019126,1.02757,0.999999,1.158269,1.136531,1.386832,1.360804,1.116565,1.117602,1.279094,1.279872,1.386707,1.386832,0.081503,0.081419
3,0.663366,0.336634,0.674326,0.325674,2.0,1.047083,1.118811,0.999999,1.068501,1.358685,1.297589,1.639807,1.566069,1.289244,1.267974,1.514259,1.51067,1.639807,1.639701,0.103345,0.110724
4,0.643564,0.356436,0.646354,0.353646,2.0,1.036227,0.991366,1.04525,0.999999,1.29857,1.309878,1.597495,1.611407,1.225279,1.226909,1.457794,1.462667,1.597495,1.597012,0.11086,0.110418


In [28]:
# Raw (not yet rank-transformed) distribution of knn_3.2 on the test set
test_with_knn['knn_3.2'].describe()

count    112392.000000
mean          1.128437
std           0.173780
min           0.928357
25%           1.018636
50%           1.084447
75%           1.186315
max          10.249741
Name: knn_3.2, dtype: float64

In [29]:
# train_with_knn_copy = train_with_knn.copy()
# test_with_knn_copy = test_with_knn.copy()

In [30]:
# train_with_knn = train_with_knn_copy.copy()
# test_with_knn = test_with_knn_copy.copy()


In [31]:
# Replace the selected KNN columns by their percentile rank, overwriting
# train_with_knn in place. Ranks are computed within the train set only.
selcols = ['knn_3.1','knn_3.2','knn_5.3','knn_5.4','knn_6.3','knn_6.4','knn_7.1','knn_7.2']
for col in selcols:
    train_with_knn[col] = train_with_knn[col].rank(pct=True)

# Ranked feature: overall, then for target == 1 and target == 0
print(train_with_knn['knn_3.2'].describe())
print(train_with_knn.loc[target==1,'knn_3.2'].describe())
print(train_with_knn.loc[target==0,'knn_3.2'].describe())

# selcols = ['knn_3.2']
# scaler = StandardScaler()
# scaled = scaler.fit_transform(train_with_knn[selcols])
# train_scaled_knn = pd.DataFrame(scaled)
# train_scaled_knn.columns = [selcols]
# print(train_scaled_knn['knn_3.2'].describe())
# print(train_scaled_knn.loc[target==1,'knn_3.2'].describe())
# print(train_scaled_knn.loc[target==0,'knn_3.2'].describe())

count    233154.000000
mean          0.500002
std           0.288676
min           0.000004
25%           0.250004
50%           0.500002
75%           0.750001
max           1.000000
Name: knn_3.2, dtype: float64
count    50611.000000
mean         0.469078
std          0.293883
min          0.000024
25%          0.207680
50%          0.456089
75%          0.722340
max          0.999983
Name: knn_3.2, dtype: float64
count    182543.000000
mean          0.508576
std           0.286626
min           0.000004
25%           0.261962
50%           0.511465
75%           0.757085
max           1.000000
Name: knn_3.2, dtype: float64


In [32]:
# scaler = StandardScaler()
# scaled = scaler.fit_transform(test_with_knn[selcols])
# test_scaled_knn = pd.DataFrame(scaled)
# test_scaled_knn.columns = [selcols]
# test_scaled_knn['knn_3.2'].describe()

# Same percentile-rank transform for the test set, overwriting test_with_knn in place.
# Ranks are computed within the test set only, independently of the train ranks.
for col in selcols:
    test_with_knn[col] = test_with_knn[col].rank(pct=True)

test_with_knn['knn_3.2'].describe()

count    112392.000000
mean          0.500004
std           0.288676
min           0.000009
25%           0.250007
50%           0.500004
75%           0.750002
max           1.000000
Name: knn_3.2, dtype: float64

In [33]:
# Repeats the display that ends the previous cell (knn_3.2 after the rank transform)
test_with_knn['knn_3.2'].describe()

count    112392.000000
mean          0.500004
std           0.288676
min           0.000009
25%           0.250007
50%           0.500004
75%           0.750002
max           1.000000
Name: knn_3.2, dtype: float64

In [34]:
import time
def runlgb(ispermutefeats,train,test,param,cur_features,score_function=None,isparamFolds=False):
    """Train LightGBM with shuffled stratified K-fold CV.

    Depending on `ispermutefeats` each fold either produces out-of-fold and test
    predictions, or runs permutation feature selection on its validation split.

    Parameters
    ----------
    ispermutefeats : bool
        True  -> call `permutation_feature_selection` per fold; no predictions are made.
        False -> fill `oof`, average test predictions and gain feature importances.
    train, test : DataFrame
        Frames containing at least the `cur_features` columns. Rows of `train` must be
        positionally aligned with the module-level `target`.
    param : dict, or sequence of dict when `isparamFolds` is True
        LightGBM parameters; in the prediction path `param['metric']` is used to look
        up each fold's best validation score.
    cur_features : list of str
        Feature columns used for training and prediction.
    score_function : optional
        Forwarded to `permutation_feature_selection`.
    isparamFolds : bool
        When True, `param[fold_]` is used for fold number `fold_`.

    Returns
    -------
    tuple
        (fold_importance_df, predictions, oof, overall_imp_df, overall_sel_feats)

    Notes
    -----
    Reads the module-level `target`, `n_splits` and `num_round`, plus
    `max_delta_score` when `ispermutefeats` is True.
    """
    overall_sel_feats =[]
    overall_imp_df = pd.DataFrame()
    overall_imp_df['feature']= np.array(cur_features)
    overall_imp_df['overall_score_mean'] =0
    overall_imp_df['overall_score_max'] =-9999
    overall_imp_df['overall_score_min'] =9999

    oof = np.zeros(len(train))
    predictions = np.zeros(len(test))
    valid_scores =[]
    fold_importance_df = pd.DataFrame()

    folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=4590)
    # Stratified splits depend only on the labels and the number of rows, so a
    # placeholder X sized from the `train` argument is enough. This keeps the function
    # independent of any other module-level frame.
    indices = folds.split(np.zeros(len(train)), target.values)

    for fold_, (trn_idx, val_idx) in enumerate(indices):
        print()
        print("fold n°{}".format(fold_))

        tr = train.iloc[trn_idx]
        val = train.iloc[val_idx]
        y_val = target.iloc[val_idx]
        y_tr = target.iloc[trn_idx]

        trn_data = lgb.Dataset(tr[cur_features], label=y_tr)
        val_data = lgb.Dataset(val[cur_features], label=y_val)

        if isparamFolds:
            cur_param = param[fold_]
            print('cur param:',cur_param)
        else:
            cur_param = param

        clf = lgb.train(cur_param, trn_data, num_round, valid_sets = [val_data], verbose_eval=500,
                        early_stopping_rounds = 300)

        if ispermutefeats:
            # feature selection on this fold's validation split
            selected_features, importance_df = permutation_feature_selection(clf, val[cur_features],
                                                                             y_val,score_function,
                                                                             rep=4,max_delta_score=max_delta_score)
            overall_sel_feats += [selected_features]
            print(selected_features)

            overall_imp_df['fold_'+str(fold_)+'score_mean'] = importance_df['delta_score_mean']
            overall_imp_df['fold_'+str(fold_)+'score_max'] = importance_df['delta_score_max']
            overall_imp_df['fold_'+str(fold_)+'score_min'] = importance_df['delta_score_min']
        else:
            oof[val_idx] = clf.predict(val[cur_features], num_iteration=clf.best_iteration)

            fold_importance_df["feature"] = cur_features
            if fold_==0:
                fold_importance_df["importance"] =0
            # running mean of the per-fold importances
            fold_importance_df["importance"] += clf.feature_importance() / n_splits
            valid_scores+=[clf.best_score['valid_0'][cur_param['metric']]]
            # test prediction is the mean over the fold models
            predictions += clf.predict(test[cur_features], num_iteration=clf.best_iteration) / folds.n_splits

    if ispermutefeats:
        # aggregate the per-fold permutation scores across folds
        fold_mean_cols = [col for col in overall_imp_df.columns if ('score_mean' in col) and ('fold_' in col) ]
        fold_max_cols = [col for col in overall_imp_df.columns if ('score_max' in col) and ('fold_' in col) ]
        fold_min_cols = [col for col in overall_imp_df.columns if ('score_min' in col) and ('fold_' in col) ]
        overall_imp_df['overall_score_mean'] = overall_imp_df[fold_mean_cols].mean(axis=1)
        overall_imp_df['overall_score_max'] = overall_imp_df[fold_max_cols].max(axis=1)
        overall_imp_df['overall_score_min'] = overall_imp_df[fold_min_cols].min(axis=1)
    else:
        print('valid scores:',valid_scores)
        print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))

    return fold_importance_df,predictions,oof,overall_imp_df,overall_sel_feats

In [35]:
# LightGBM parameter set passed to `runlgb`.
# 'n_estimators' is read back as `num_round` in a later cell; 'metric' is the key
# `runlgb` uses to look up each fold's best validation score.
param = {'colsample_bytree': 0.6070263484848221,
   'min_child_samples': 120,
   'num_leaves': 65,
   'reg_alpha': 0.4086395992151187,
   'reg_lambda': 0.8723592642214507,
   'subsample': 0.9764323431114542,
   'subsample_for_bin': 160000,
   'learning_rate': 0.01,
   'boosting': 'gbdt',
   'bagging_seed': 2018,
   'bagging_freq': 2,
   'min_data_in_bin': 100,
   'n_estimators': 10000,
   'objective': 'binary',
   'metric': 'auc',
   'random_state': 2333,
   'max_depth': 15,
   'scale_pos_weight': 1}

In [36]:
%%time
# Re-reads the same preprocessed CSVs that were loaded into `train` / `test` at the top
# of the notebook. `train_orig` / `test_orig` are the frames the KNN columns are
# joined onto in the next cell.
Path = '../input/ltfs-fin-model/'

train_orig = pd.read_csv(Path+'train_preproc.csv',index_col=0)
test_orig = pd.read_csv(Path +'test_preproc.csv',index_col=0)

CPU times: user 2.4 s, sys: 56 ms, total: 2.46 s
Wall time: 2.5 s


In [37]:
# Attach the KNN columns to the original (unscaled) frames; rows are matched on the index
knncols = [col for col in train_with_knn.columns if 'knn_' in col]
train_knn = pd.concat([train_orig,train_with_knn[knncols]],axis=1)
test_knn = pd.concat([test_orig,test_with_knn[knncols]],axis=1)

In [38]:
# Columns of train_orig that are left out when the model feature lists are built
exclude_cols =['Date.of.Birth','Employment.Type','DisbursalDate',
               'PERFORM_CNS.SCORE.DESCRIPTION','AVERAGE.ACCT.AGE','CREDIT.HISTORY.LENGTH',
               'MobileNo_Avl_Flag','disbursal_year','disbursal_day','disbursal_dayofweek',
               'date_of_birth', 'disbursal_date',
#                'Current_pincode_ID','Employee_code_ID','supplier_id','branch_id',
#                'PERFORM_CNS.SCORE.CATEGORY','State_ID',
               'UniqueID',targetcol]
# Named groups of KNN columns
# NOTE(review): the _1 / _2 groups look like "first k" vs "second k" of k_list -- confirm
knn_dist_1 = ['knn_1.1','knn_1.2','knn_5.1','knn_5.2','knn_6.1','knn_6.2']
knn_dist_2 = ['knn_1.3','knn_1.4','knn_5.3','knn_5.4','knn_6.3','knn_6.4']
# knn_dist_2 plus every column whose name contains 'knn_7'
knn_set1567 = [col for col in knncols if 
                   (col in knn_dist_2) or ('knn_7' in col)]
# every column whose name contains 'knn_2' or 'knn_3' (substring match)
knn_set23 = [col for col in knncols if 
                   ('knn_2' in col) or ('knn_3' in col)]
# sel_enc_cols = [col for col in test_encs[0].columns if 'targetenc_' in col]
# features +=['knn_1.4']
# features = [c for c in train_orig.columns if c not in exclude_cols]
# features +=include_knn_cols
# print(features)

In [39]:
# Maximum number of boosting rounds; `runlgb` reads this as a module-level name
num_round = param['n_estimators']

In [40]:
# features = [c for c in train_orig.columns if c not in exclude_cols]
# print(features)

# fold_importance_df,predictions1,oof1,overall_imp_df,overall_sel_feats = \
#         runlgb(False,train_knn,test_knn,param,features,score_function=None,
#               isparamFolds=False)

In [41]:
# Model 2: base features plus the single KNN column 'knn_1.4'
features = [c for c in train_orig.columns if c not in exclude_cols]
features +=['knn_1.4']
print(features)
# ispermutefeats=False -> out-of-fold predictions (oof2) and averaged test predictions
fold_importance_df,predictions2,oof2,overall_imp_df,overall_sel_feats = \
        runlgb(False,train_knn,test_knn,param,features,score_function=None,
              isparamFolds=False)

['Aadhar_flag', 'Current_pincode_ID', 'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS', 'Driving_flag', 'Employee_code_ID', 'NEW.ACCTS.IN.LAST.SIX.MONTHS', 'NO.OF_INQUIRIES', 'PAN_flag', 'PERFORM_CNS.SCORE', 'PRI.ACTIVE.ACCTS', 'PRI.CURRENT.BALANCE', 'PRI.DISBURSED.AMOUNT', 'PRI.NO.OF.ACCTS', 'PRI.OVERDUE.ACCTS', 'PRI.SANCTIONED.AMOUNT', 'PRIMARY.INSTAL.AMT', 'Passport_flag', 'SEC.ACTIVE.ACCTS', 'SEC.CURRENT.BALANCE', 'SEC.DISBURSED.AMOUNT', 'SEC.INSTAL.AMT', 'SEC.NO.OF.ACCTS', 'SEC.OVERDUE.ACCTS', 'SEC.SANCTIONED.AMOUNT', 'State_ID', 'VoterID_flag', 'asset_cost', 'branch_id', 'disbursed_amount', 'ltv', 'manufacturer_id', 'supplier_id', 'age', 'disbursal_month', 'PERFORM_CNS.SCORE.CATEGORY', 'Employment.Type.Category', 'AVERAGE.ACCT.AGE_MONTHS', 'CREDIT.HISTORY.LENGTH_MONTHS', 'knn_1.4']

fold n°0




Training until validation scores don't improve for 300 rounds.
[500]	valid_0's auc: 0.677043
[1000]	valid_0's auc: 0.679131
[1500]	valid_0's auc: 0.679583
Early stopping, best iteration is:
[1538]	valid_0's auc: 0.679637

fold n°1
Training until validation scores don't improve for 300 rounds.
[500]	valid_0's auc: 0.673341
[1000]	valid_0's auc: 0.674431
[1500]	valid_0's auc: 0.674539
Early stopping, best iteration is:
[1334]	valid_0's auc: 0.674664

fold n°2
Training until validation scores don't improve for 300 rounds.
[500]	valid_0's auc: 0.672022
[1000]	valid_0's auc: 0.674287
[1500]	valid_0's auc: 0.674525
Early stopping, best iteration is:
[1641]	valid_0's auc: 0.674707

fold n°3
Training until validation scores don't improve for 300 rounds.
[500]	valid_0's auc: 0.671443
[1000]	valid_0's auc: 0.67346
[1500]	valid_0's auc: 0.673828
[2000]	valid_0's auc: 0.673709
Early stopping, best iteration is:
[1723]	valid_0's auc: 0.673879

fold n°4
Training until validation scores don't improve

In [42]:
# Model 3: base features plus the knn_2 / knn_3 column group (`knn_set23`)
features = [c for c in train_orig.columns if c not in exclude_cols]
features +=knn_set23
print(features)
# ispermutefeats=False -> out-of-fold predictions (oof3) and averaged test predictions
fold_importance_df,predictions3,oof3,overall_imp_df,overall_sel_feats = \
        runlgb(False,train_knn,test_knn,param,features,score_function=None,
              isparamFolds=False)

['Aadhar_flag', 'Current_pincode_ID', 'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS', 'Driving_flag', 'Employee_code_ID', 'NEW.ACCTS.IN.LAST.SIX.MONTHS', 'NO.OF_INQUIRIES', 'PAN_flag', 'PERFORM_CNS.SCORE', 'PRI.ACTIVE.ACCTS', 'PRI.CURRENT.BALANCE', 'PRI.DISBURSED.AMOUNT', 'PRI.NO.OF.ACCTS', 'PRI.OVERDUE.ACCTS', 'PRI.SANCTIONED.AMOUNT', 'PRIMARY.INSTAL.AMT', 'Passport_flag', 'SEC.ACTIVE.ACCTS', 'SEC.CURRENT.BALANCE', 'SEC.DISBURSED.AMOUNT', 'SEC.INSTAL.AMT', 'SEC.NO.OF.ACCTS', 'SEC.OVERDUE.ACCTS', 'SEC.SANCTIONED.AMOUNT', 'State_ID', 'VoterID_flag', 'asset_cost', 'branch_id', 'disbursed_amount', 'ltv', 'manufacturer_id', 'supplier_id', 'age', 'disbursal_month', 'PERFORM_CNS.SCORE.CATEGORY', 'Employment.Type.Category', 'AVERAGE.ACCT.AGE_MONTHS', 'CREDIT.HISTORY.LENGTH_MONTHS', 'knn_2', 'knn_3.1', 'knn_3.2']

fold n°0




Training until validation scores don't improve for 300 rounds.
[500]	valid_0's auc: 0.669673
[1000]	valid_0's auc: 0.673469
[1500]	valid_0's auc: 0.67482
[2000]	valid_0's auc: 0.675445
[2500]	valid_0's auc: 0.675974
[3000]	valid_0's auc: 0.676215
[3500]	valid_0's auc: 0.676171
Early stopping, best iteration is:
[3326]	valid_0's auc: 0.676294

fold n°1
Training until validation scores don't improve for 300 rounds.
[500]	valid_0's auc: 0.666426
[1000]	valid_0's auc: 0.669571
[1500]	valid_0's auc: 0.670414
[2000]	valid_0's auc: 0.670933
[2500]	valid_0's auc: 0.671207
Early stopping, best iteration is:
[2401]	valid_0's auc: 0.671265

fold n°2
Training until validation scores don't improve for 300 rounds.
[500]	valid_0's auc: 0.667627
[1000]	valid_0's auc: 0.671936
[1500]	valid_0's auc: 0.673221
[2000]	valid_0's auc: 0.67358
[2500]	valid_0's auc: 0.673823
Early stopping, best iteration is:
[2676]	valid_0's auc: 0.673944

fold n°3
Training until validation scores don't improve for 300 rounds

In [43]:
# features = [c for c in train_orig.columns if c not in exclude_cols]
# features +=knn_set1567
# print(features)
# fold_importance_df,predictions4,oof4,overall_imp_df,overall_sel_feats = \
#         runlgb(False,train_knn,test_knn,param,features,score_function=None,
#               isparamFolds=False)

In [44]:
# features = [c for c in train_orig.columns if c not in exclude_cols]
# features +=['knn_8.2']
# print(features)
# fold_importance_df,predictions5,oof5,overall_imp_df,overall_sel_feats = \
#         runlgb(False,train_knn,test_knn,param,features,score_function=None,
#               isparamFolds=False)

In [45]:
# oof1 = oof.copy() # without knn
# oof2 = oof.copy() # knn_1.4
# oof4 = oof.copy()  # knn 2,3
# oof4 = oof.copy() # knn 1,5,6,7

In [46]:
# Blend models 2 and 3 with equal weights. Averaging (not summing) keeps the blended
# scores inside [0, 1], so the submission holds valid probabilities; AUC depends only
# on the ranking and is therefore unchanged.
oof = (oof2 + oof3) / 2
# oof_blend = 0.1*oof1 + oof2 +  oof3 + 0.4*oof4
# oof_blend =  oof2 * oof3 * oof1
print('Ens AUC:',roc_auc_score(target,oof))
#Ens AUC: 0.6748153848733317 of baseline and knn_1.4
# predictions = predictions1 + predictions2 + predictions3 + predictions4
predictions = (predictions2 + predictions3) / 2

Ens AUC: 0.6779505961990675


In [47]:
# Submission file: UniqueID from the test frame loaded at the top of the notebook,
# paired positionally with the blended test predictions
sub_df = pd.DataFrame({"UniqueID":test["UniqueID"].values})
sub_df[targetcol] = predictions
sub_df.to_csv("submission_knnfeats_euclidean.csv", index=False)
# Out-of-fold predictions: blend and the two individual models
np.save('oof_knnfeats_euclidean.npy',oof)
np.save('oof2_knnfeats_euclidean.npy',oof2)
np.save('oof3_knnfeats_euclidean.npy',oof3)

# Test predictions: blend and the two individual models
np.save('pred_knnfeats_euclidean.npy',predictions)
np.save('pred2_knnfeats_euclidean.npy',predictions2)
np.save('pred3_knnfeats_euclidean.npy',predictions3)