**KNN Feature Generation**

In [2]:
%qtconsole

In [3]:
%connect_info

{
  "shell_port": 60359,
  "iopub_port": 40269,
  "stdin_port": 52931,
  "control_port": 59375,
  "hb_port": 53109,
  "ip": "127.0.0.1",
  "key": "2a265a90-d9c69109ec1b2b542eaf86e4",
  "transport": "tcp",
  "signature_scheme": "hmac-sha256",
  "kernel_name": ""
}

Paste the above JSON into a file, and connect with:
    $> jupyter <app> --existing <file>
or, if you are local, you can connect with just:
    $> jupyter <app> --existing kernel-e8aad4ae-341f-46a9-a019-520d38be3ed6.json
or even just:
    $> jupyter <app> --existing
if this is the most recent Jupyter kernel you have started.


In [2]:
from numba import jit, cuda

In [3]:
import numpy as np
import pandas as pd 
import sklearn
import scipy.sparse 

import xgboost as xgb

from annoy import AnnoyIndex
from category_encoders import TargetEncoder

# from pynndescent import NNDescent
from sklearn.calibration import CalibratedClassifierCV

from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score,precision_recall_curve,roc_curve, recall_score,precision_score
from sklearn.metrics import confusion_matrix
from scipy.stats import pearsonr
from scipy.stats import rankdata, skew, kurtosis

pd.options.display.max_rows = 2000
pd.options.display.max_columns  = 999

np.set_printoptions(threshold=5000) 

import lightgbm as lgb
from sklearn.model_selection import train_test_split

from sklearn import metrics
from sklearn.metrics import f1_score, roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder


pd.set_option('display.max_rows', 800)
pd.set_option('display.max_columns', 100)

import gc
import time

for p in [np, pd, sklearn, scipy]:
    print (p.__name__, p.__version__)

numpy 1.20.3
pandas 1.3.5
sklearn 0.23.2
scipy 1.7.3


Process Data

In [4]:
import pickle

In [5]:
KAGGLE= False

In [6]:
%%time

# Root directory of the pre-processed CSV files (Kaggle input dir or local dir).
if KAGGLE:
    Path = '/kaggle/input/'
else:
    Path = './'


train = pd.read_csv(Path+'train_preproc_knn.csv')
test = pd.read_csv(Path +'test_preproc_knn.csv')

targetcol = 'engagement_score_bin'
targetcol2 = 'engagement_score'
target = train[targetcol].astype('int16')

# Read the pickled fold generator and materialise its (train, validation)
# index pairs so every later stage can reuse exactly the same folds.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# 'knn_kfold_gen' when it comes from a trusted source.
# The context manager closes the file even if unpickling fails.
with open('knn_kfold_gen', 'rb') as foldgenfile:
    result = pickle.load(foldgenfile)

indices=[]
for (train_index,test_index) in result.split(train):
    indices += [(train_index,test_index)]
    print('train index:',train_index)
    print('val index:',test_index)

# Names of the columns used as KNN input features.
features = list(np.load('features_for_knn.npy'))
#exclude target col in features if present
# excl_cols=[targetcol,targetcol2,'user_id_orig', 'video_id_orig', 'category_id_orig']
# features = [col for col in features if col not in excl_cols]
print(len(features))
print(sorted(features))

print(train.shape)
print(test.shape)

train index: [ 7587  7588  7589 ... 75863 75864 75865]
val index: [   0    1    2 ... 7584 7585 7586]
train index: [    0     1     2 ... 75863 75864 75865]
val index: [ 7587  7588  7589 ... 15171 15172 15173]
train index: [    0     1     2 ... 75863 75864 75865]
val index: [15174 15175 15176 ... 22758 22759 22760]
train index: [    0     1     2 ... 75863 75864 75865]
val index: [22761 22762 22763 ... 30345 30346 30347]
train index: [    0     1     2 ... 75863 75864 75865]
val index: [30348 30349 30350 ... 37932 37933 37934]
train index: [    0     1     2 ... 75863 75864 75865]
val index: [37935 37936 37937 ... 45519 45520 45521]
train index: [    0     1     2 ... 75863 75864 75865]
val index: [45522 45523 45524 ... 53105 53106 53107]
train index: [    0     1     2 ... 75863 75864 75865]
val index: [53108 53109 53110 ... 60691 60692 60693]
train index: [    0     1     2 ... 75863 75864 75865]
val index: [60694 60695 60696 ... 68277 68278 68279]
train index: [    0     1     2 ..

In [7]:
def ordinal_dummy_coding(data,cols):
    """Replace integer-coded ordinal columns by cumulative dummy columns.

    Each column in ``cols`` is one-hot encoded with ``pd.get_dummies`` and the
    dummies are then made cumulative: a row whose level is ``v`` gets a 1 in
    the dummy of ``v`` and in the dummies of every lower level above the
    smallest one. The dummy of the smallest level is dropped because it would
    be 1 for every row.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is copied, never modified.
    cols : iterable of str
        Columns to encode. Their levels must be integers, because the lower
        levels are enumerated with ``range``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with each column of ``cols`` replaced by its
        ``<col>_<level>`` dummy columns.
    """
    data_updated = data.copy()
    for col in cols:
        enc_unique = np.sort(data[col].unique())
        start_enc = enc_unique[0]
        # Pin the dtype: newer pandas versions default to bool dummies, into
        # which the integer 1 written below does not fit cleanly.
        data_updated = pd.get_dummies(data_updated,columns = [col], prefix=[col],
                                      dtype=np.uint8)
        for enc_val in enc_unique:
            dummy_col = col+'_'+str(enc_val)
            if enc_val==start_enc:
                # The lowest level carries no information once the coding is
                # cumulative, so its dummy is removed.
                del data_updated[dummy_col]
                continue

            # Rows at this level. Only dummies of lower levels are written
            # below, so masks of higher levels are still the original one-hot
            # values when they are read.
            mask = data_updated[dummy_col]==1
            for prior_enc in range(start_enc+1,enc_val):
                prior_dummy_col = col+'_'+str(prior_enc)
                # A level between two observed levels may have no dummy column
                # of its own; skip it instead of raising KeyError.
                if prior_dummy_col not in data_updated.columns:
                    continue
                # .loc writes into the frame itself; chained indexing
                # (frame[col][mask] = 1) may write to a temporary copy.
                data_updated.loc[mask, prior_dummy_col] = 1

    return data_updated

In [8]:
#scale data
def scaledata(train,test,selcols,independent=False):
    """Standardise ``selcols`` of the train and test frames.

    The scaler is fitted on ``train``. ``test`` is transformed with that same
    scaler, unless ``independent`` is true, in which case the scaler is
    re-fitted on ``test`` before transforming it.

    Returns a ``(train_scaled, test_scaled)`` pair of new DataFrames whose
    columns are named ``<col>_scale_tranx``. Both frames get a fresh default
    index; the index of the inputs is not carried over.
    """
    scaled_names = [name+'_scale_tranx' for name in selcols]

    scaler = StandardScaler()
    scaler.fit(train[selcols])
    train_scaled = pd.DataFrame(scaler.transform(train[selcols]))
    train_scaled.columns = scaled_names

    if independent:
        test_values = scaler.fit_transform(test[selcols])
    else:
        test_values = scaler.transform(test[selcols])
    test_scaled = pd.DataFrame(test_values)
    test_scaled.columns = scaled_names

    return train_scaled,test_scaled

#scale data
def scaledata_single(data,selcols):
    """Standardise ``selcols`` of a single frame with a scaler fitted on it.

    Returns a new DataFrame with a default index whose columns are named
    ``<col>_scale_tranx``.
    """
    standardised = StandardScaler().fit_transform(data[selcols])
    data_scaled = pd.DataFrame(standardised)
    data_scaled.columns = [name+'_scale_tranx' for name in selcols]
    return data_scaled


In [9]:
#Replace NANs
def replaceNaN(train,test,sel_feats):
    """Add mean-imputed copies of the features that contain missing values.

    For every feature of ``sel_feats`` that has a null in ``train`` or in
    ``test``, a ``<feature>_na_replaced`` column is added to BOTH frames (in
    place) and the nulls of the affected frame are filled with the mean of the
    feature in ``train``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames to impute; both are modified in place.
    sel_feats : list of str
        Feature columns to inspect.

    Returns
    -------
    (train, test, sel_feats_NN)
        The two (mutated) input frames and a new feature list in which every
        imputed feature is replaced by its ``_na_replaced`` column. The
        replaced names are appended at the end of the list.
    """
    for df in [train,test]:
        for col in sel_feats:
            if df[col].isnull().any():
                newcol =  col+'_na_replaced'
                # Create the helper column in both frames so that they keep
                # identical columns even when only one of them has nulls.
                for df_l2 in [train,test]:
                    if newcol not in df_l2.columns:
                        df_l2[newcol] = df_l2[col]
                # Assign the result back: an in-place fillna on df[newcol] is
                # chained indexing and may only update a temporary object.
                df[newcol] = df[newcol].fillna(train[col].mean())

    # Only swap features that belong to sel_feats; an unrelated
    # '_na_replaced' column already present in train is left alone instead of
    # making list.remove raise ValueError.
    na_replaced_cols = [col for col in train.columns
                        if '_na_replaced' in col
                        and col.replace('_na_replaced','') in sel_feats]

    sel_feats_NN = list(sel_feats)
    for na_col in na_replaced_cols:
        sel_feats_NN.remove(na_col.replace('_na_replaced',''))
    sel_feats_NN += na_replaced_cols

    return train,test,sel_feats_NN

In [10]:
def getfiletemplate(istest,val_fold):
    """Return the file-name prefix for a run.

    ``'test_'`` when ``istest`` is truthy, otherwise ``'val_'`` followed by
    the string form of ``val_fold``.
    """
    return 'test_' if istest else 'val_' + str(val_fold)

In [11]:
  
def processtraintest(df_train,df_test,features):
    """Impute missing values, standardise the features and clean up.

    ``replaceNaN`` adds '_na_replaced' helper columns to both frames,
    ``scaledata`` standardises the resulting feature set, and the helper
    columns are finally dropped from ``df_train`` and ``df_test`` again
    (in place).

    Returns the ``(train_scaled, test_scaled)`` pair produced by
    ``scaledata``.
    """
    train_filled, test_filled, nn_feats = replaceNaN(df_train, df_test, features)
    train_raw_scaled, test_raw_scaled = scaledata(train_filled, test_filled, nn_feats)

    del train_filled, test_filled
    gc.collect()

    # Remove the imputation helper columns that replaceNaN added to the inputs.
    helper_cols = [name for name in df_train.columns if '_na_replaced' in name]
    for frame in (df_train, df_test):
        frame.drop(helper_cols, inplace=True, axis=1)

    return train_raw_scaled,test_raw_scaled


In [12]:
# # train_scaled,test_scaled = processtraintest(train,test,features)
# train_scaled,test_scaled = scaledata(train,test,features)
# train_scaled.head(10)

# Load data

KNN Feature Class and Functions

In [13]:
#annoy functions
def BuildANN(X,metric,save,index_filename):
    """Build an Annoy index over the rows of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame or 2-D array-like
        One item per row; the row position is used as the Annoy item id.
    metric : str
        Distance metric handed to ``AnnoyIndex``.
    save : bool
        When true, write the index to ``index_filename``.
    index_filename : str or None
        Target file. If it is None while ``save`` is true, a message is
        printed and nothing is written.

    Returns
    -------
    AnnoyIndex
        The built index.
    """
    # Convert once: building a pandas Series per row (X.iloc[i]) is far more
    # expensive than iterating over the rows of a single array.
    rows = np.asarray(X)
    a = AnnoyIndex(rows.shape[1],metric=metric)
    for item_id, vector in enumerate(rows):
        a.add_item(item_id,vector)
    # -1 leaves the number of trees to Annoy (see the Annoy docs for the rule).
    a.build(-1)
    if save:
        if index_filename is None:
            print('Save Index File name not specified')
        else:
            a.save(index_filename)
    return a
def LoadANN(indexfilename,col_size,metric):
    """Load a saved Annoy index.

    ``col_size`` and ``metric`` are passed to the ``AnnoyIndex`` constructor
    before ``indexfilename`` is loaded into it. Returns the loaded index.
    """
    index = AnnoyIndex(col_size, metric=metric)
    index.load(indexfilename)
    return index

def LoadANNQuery(neigh_filename):
    """Read a comma-separated neighbour table (as written by ANNQuery)."""
    return np.loadtxt(neigh_filename, delimiter=',')
def ANNQuery(X,NN,neighbors,save,neigh_filename,isdistance=True):
    """Query the nearest neighbours of every row of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame or 2-D array-like
        Query vectors, one per row.
    NN : object
        Index exposing ``get_nns_by_vector(vector, n, include_distances=...)``.
    neighbors : int
        Number of neighbours requested per row.
    save : bool
        When true, write the result to ``neigh_filename`` as CSV.
    neigh_filename : str or None
        Target file. If it is None while ``save`` is true, a message is
        printed and nothing is written.
    isdistance : bool
        When true, distances are requested as well.

    Returns
    -------
    numpy.ndarray
        With ``isdistance`` the per-row (ids, distances) pair is flattened, so
        each row holds the neighbour ids followed by the matching distances.
        Without it each row holds the neighbour ids only.
    """
    # Hand plain ndarray rows to the index. A pandas Series (X.iloc[i]) is
    # slow to build and is labelled by column name, so positional access to
    # its items is not reliable.
    rows = np.asarray(X)
    neighs = np.array([
        NN.get_nns_by_vector(vector, neighbors, include_distances=isdistance)
        for vector in rows
    ])
    if isdistance:
        # (n_rows, 2, k) -> (n_rows, 2*k): ids first, then distances.
        neighs = neighs.reshape(neighs.shape[0],-1)

    if save:
        if neigh_filename is None:
            # The message used to name the index file by mistake.
            print('Save Neigh File name not specified')
        else:
            print('Save Neigh File')
            np.savetxt(neigh_filename,neighs,delimiter=',')

    return neighs

In [14]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.neighbors import NearestNeighbors
from multiprocessing import Pool
# from scipy.spatial import cKDTree
# from pynndescent import NNDescent
import multiprocessing

import numpy as np
from itertools import groupby
# from tqdm import tqdm_notebook as tqdm
from tqdm import tqdm

def findfirststreak(a):
    """Length of the run of equal values at the start of ``a``.

    e.g. [5,5,5,8,8,8,8] -> 3. When all elements are equal the result is
    ``len(a)``.
    """
    # Positions where an element differs from its predecessor,
    # e.g. differences [0 0 3 0 0 0] -> [2].
    change_points = np.flatnonzero(np.ediff1d(a))
    if change_points.size == 0:
        # No change anywhere: the whole sequence is a single streak.
        return len(a)
    # The first change happens right after the last element of the streak.
    return change_points[0] + 1
def findmaxstreak(a):
    """Length of the longest run of consecutive equal values in ``a``."""
    run_lengths = [len(list(run)) for _value, run in groupby(a)]
    return max(run_lengths)

def delzerovals(a):
    """Return a copy of ``a`` with the entries equal to 0 removed."""
    zero_positions = np.where(a==0)
    return np.delete(a, zero_positions)

pbar=None

class NearestNeighborsFeats(BaseEstimator, ClassifierMixin):
    '''
        KNN feature extractor backed by an Annoy index stored on disk.

        `fit` builds the index file for the training rows (unless an index or
        a neighbour table is to be loaded instead) and remembers the training
        labels. `predict` obtains the nearest training neighbours of every row
        of a dataset and turns their labels and distances into features.
    '''
    def __init__(self, n_jobs, k_list, metric, 
                 n_classes=None, NN_index=None, n_neighbors=None, eps=1e-6,
                 saveindex=False,
                 saveneighs=False,
                index_filename=None,
                neigh_filename=None,
                loadneighs=False,loadindex=False):
        '''
            n_jobs         : number of worker processes for feature
                             computation; 1 runs in-process, -1 uses all CPUs.
            k_list         : neighbourhood sizes to compute features for; the
                             largest one is the number of neighbours queried.
            metric         : distance metric passed to the Annoy helpers.
            n_classes      : number of classes; inferred from `y` in `fit`
                             when None.
            NN_index       : optional pre-existing index; when given, `fit`
                             does not build one.
            n_neighbors    : stored on the instance (defaults to max(k_list)).
            eps            : small constant added to denominators.
            saveindex      : store the index under `index_filename` instead
                             of the default 'index.ann'.
            saveneighs     : write the queried neighbour table to
                             `neigh_filename`.
            loadneighs     : read the neighbour table from `neigh_filename`
                             instead of querying an index.
            loadindex      : reuse the index stored in `index_filename`
                             instead of building one in `fit`.
        '''
        self.n_jobs = n_jobs
        self.k_list = k_list
        self.metric = metric
        
        self.max_neigh_count = max(self.k_list)
        self.NN = NN_index
        
        if n_neighbors is None:
            self.n_neighbors = max(k_list) 
        else:
            self.n_neighbors = n_neighbors
            
        self.eps = eps        
        self.n_classes_ = n_classes
        self.saveindex=saveindex
        self.saveneighs=saveneighs
        self.loadneighs=loadneighs
        self.loadindex=loadindex
        self.index_filename = index_filename
        self.neigh_filename =neigh_filename

    def setmetric(self,metric):
        '''Change the distance metric used by later `fit` / `predict` calls.'''
        self.metric=metric

    def getNNindex(self):
        '''Return the index object passed to the constructor (may be None).'''
        return self.NN

    def _index_path(self):
        '''File that holds the Annoy index for the current configuration.'''
        if self.loadindex or self.saveindex:
            return self.index_filename
        return 'index.ann'

    def fit(self, X, y):
        '''
            Sets up the train set: builds the Annoy index file when needed and
            stores the labels, the number of columns and the number of classes.

            Returns [] when neighbours are to be loaded but no file name was
            given; returns None otherwise.
        '''

        if self.NN is None:
            # Use the instance flag: the bare name `loadneighs` referred to an
            # unrelated module-level variable.
            if self.loadneighs:
                if self.neigh_filename is None:
                    print('Load Neigh File name not specified')
                    return []
            elif not self.loadindex:
                start = time.time()
                print('NN Fit Start..')
                # The index is always written to disk; `predict` loads it
                # back from the same path.
                BuildANN(X,self.metric,True,self._index_path())
                print('NN Fit End..')
                end= time.time()
                print('Fit Exec Time:',end-start)
        
        self.neighbors = None
        # Store labels 
        self.y_train = y
        self.fit_col_size=X.shape[1]
        # Save how many classes we have
        self.n_classes = np.unique(y).shape[0] if self.n_classes_ is None else self.n_classes_
                
    def predict(self, X):       
        '''
            Produces KNN features for every object of a dataset X.

            Returns a 2-D array with one row of features per row of X, or []
            when neighbours are to be loaded but no file name was given.
        '''
        start= time.time()
        
        if self.loadneighs:
            if self.neigh_filename is None:
                print('Load Neigh File name not specified')
                return []
            print('NN query start')
            # atleast_2d keeps the row/column indexing below valid when the
            # file holds a single row.
            self.neighbors = np.atleast_2d(LoadANNQuery(self.neigh_filename))
            print('NN query end')
            end= time.time()
            print('Query Exec Time:',end-start)
        
        else:     
            print('NN Load start')
            NN = LoadANN(self._index_path(),self.fit_col_size,self.metric)
            end= time.time()
            print('Load Exec Time:',end-start)
            start= time.time()

            print('NN query start')
            self.neighbors = ANNQuery(X,NN,self.max_neigh_count,
                                      self.saveneighs,
                                      self.neigh_filename,
                                      isdistance=True)
            
            # `== np.nan` is always False, so NaNs have to be detected with
            # np.isnan.
            print('neighs nan shape:',self.neighbors[np.isnan(self.neighbors)].shape)
            print('NN query end')
            end= time.time()
            print('Query Exec Time:',end-start)
            
        no_elem= X.shape[0]
        self.no_elem = no_elem
        start= time.time()
        print('Feature Start')
    
        if self.n_jobs == 1:
            test_feats = []
            for i in range(no_elem):
                test_feats.append(self.get_features_for_one(i))
        else:
            processes = self.n_jobs if self.n_jobs!=-1 else multiprocessing.cpu_count()
            print('no of cpus:',multiprocessing.cpu_count())
           
            with Pool(processes=processes) as pool:
                test_feats = pool.map(self.get_features_for_one, range(no_elem))

        end= time.time()
        print('Feature End')
        print('Feature Exec Time:',end-start)
        return np.vstack(test_feats)
    
    def dummy(self, index): 
        '''Placeholder that ignores its argument and returns None.'''
        return None
    
    def debug_index(self,index, template):
        '''
            Print a progress line for every 1000th and for the last index when
            the module-level flag DEBUG is set. A missing DEBUG counts as off.
        '''
        if globals().get('DEBUG', False):
            if (index % 1000 ==0) or (index==(self.no_elem-1)):
                print('index {0} : {1}'.format(template,index))

    def _per_class_stat(self, values, class_positions, reducer):
        '''
            Apply `reducer` to the `values` of each class' neighbours;
            classes without neighbours get 999.
        '''
        return [reducer(values[pos]) if len(pos) else 999
                for pos in class_positions]
        
    def get_features_for_one(self, index):
        '''
            Computes KNN features for the object in row `index` of the
            neighbour table produced by `predict`.

            The returned 1-D array is the concatenation, in this order, of:
              1. per k: fraction of the k nearest neighbours in each class
              2. length of the leading same-label streak
              3. per class: minimum distance (999 if the class is absent)
              4. per class: minimum distance normalised by the closest one
              5. per k: distance to the k-th neighbour, raw and normalised
              6. per k: mean distance per class (999 if the class is absent)
              7. per class: maximum distance (999 if the class is absent)
              8. per class: standard deviation of the distances (999 if absent)
        '''
        
        self.debug_index(index,'start')     
        
        # Each row of the table holds the neighbour ids followed by the
        # matching distances. The table is a float array, so the ids are cast
        # back to integers before they are used as positions.
        neighs = self.neighbors[index,:self.max_neigh_count].astype(int)
        neighs_dist = self.neighbors[index,self.max_neigh_count:] 

        # Labels of the neighbours, nearest first.
        neighs_y = np.asarray(self.y_train.iloc[neighs])
        self.debug_index(index,'neighs y iloc')     

        # Positions (within the neighbour list) of the members of each class;
        # shared by features 3, 4, 7 and 8.
        class_positions = [np.where(neighs_y==c)[0] for c in range(self.n_classes)]
        
        # The computed features are accumulated here and concatenated with
        # np.hstack at the end.
        return_list = [] 
        
        # 1. Fraction of objects of every class among the k nearest
        #    neighbours (the values of one k sum up to one).
        for k in self.k_list:
            feats_raw= np.bincount(neighs_y[:k],minlength=self.n_classes)  
            return_list += [feats_raw / k]
        
        # 2. Same label streak: the largest N such that the N nearest
        #    neighbours have the same label.
        return_list += [np.array([findfirststreak(neighs_y)])]
        
        # 3. Minimum distance to objects of each class.
        return_list += [self._per_class_stat(neighs_dist, class_positions, np.min)]
       
        # 4. Minimum *normalized* distance to objects of each class: as 3.,
        #    with the distances divided by the distance to the closest
        #    neighbour (plus eps).
        close_dist = np.min(neighs_dist)
        neighs_norm_dist = neighs_dist / (self.eps + close_dist)
        return_list += [self._per_class_stat(neighs_norm_dist, class_positions, np.min)]
        
        # 5. Distance to the k-th neighbour (5.1) and the same distance
        #    normalised by the distance to the first neighbour (5.2).
        for k in self.k_list:
            feat_51 = neighs_dist[k-1]
            feat_52 = feat_51 / (self.eps + neighs_dist[0])
            return_list += [[feat_51, feat_52]]
        
        # 6. Mean distance to the neighbours of each class among the k
        #    nearest neighbours; 999 when the class has no neighbour there.
        for k in self.k_list:
            counts = np.bincount(neighs_y[:k],minlength=self.n_classes)
            numer= np.bincount(neighs_y[:k],minlength=self.n_classes,weights=neighs_dist[:k])  
            feats = numer / (counts + self.eps)
            # Decide on the neighbour count, not on the mean: a class whose
            # neighbours all lie at distance 0 is present, not absent.
            feats[counts==0]=999
            return_list += [feats]
    
        # 7. Maximum of the distance to objects of each class.
        return_list += [self._per_class_stat(neighs_dist, class_positions, np.max)]
        self.debug_index(index,'knn 7')     

        # 8. Standard deviation of the distance to objects of each class.
        return_list += [self._per_class_stat(neighs_dist, class_positions, np.std)]
        self.debug_index(index,'knn 8')     
        
        # merge
        knn_feats = np.hstack(return_list)
        return knn_feats

## Get features for train

Compute features for train, using out-of-fold strategy.

In [22]:

# Differently from other homework we will not implement OOF predictions ourselves
# but use sklearn's `cross_val_predict`
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold

def getcombinedknnfeats(metriclist,k_list,train_X,train_Y,indices=None,test=None,
                        col_start=0,istrain=1,NN_index=None):
    """Generate KNN features and append them to the frame they belong to.

    With ``istrain`` truthy the features come from ``getknnfeatsfortrain`` and
    are attached to ``train_X``; otherwise they come from
    ``getknnfeatsfortest`` and are attached to ``test``. Only the features of
    the first metric of the result are used. The new columns are named
    ``knn<i>``, numbered from ``col_start``.

    Returns ``(number of KNN columns, combined frame, index object returned by
    the feature generator)``.
    """
    if istrain:
        feats_per_metric, ret_NN_index = getknnfeatsfortrain(
            metriclist, k_list, train_X, train_Y,
            NN_index=NN_index, indices=indices)
        source = train_X
    else:
        feats_per_metric, ret_NN_index = getknnfeatsfortest(
            metriclist, k_list, train_X, train_Y, test, NN_index)
        source = test
    print('knn feats generation complete')

    # Align the feature rows with the frame they will be concatenated to.
    knn_feats = pd.DataFrame(feats_per_metric[0], index=source.index)
    knn_feats_count = knn_feats.shape[1]
    knn_feats.columns = [
        'knn'+str(number)
        for number in range(col_start, knn_feats_count+col_start)
    ]

    print('knn feats shape:',knn_feats.shape)
    combined_data = pd.concat([source, knn_feats], axis=1)
    print('knn combine process complete')
    return knn_feats_count,combined_data,ret_NN_index

def getknnfeatsfortest(metriclist,k_list,train_X,train_Y,test,NN_index=None):
    """Compute KNN features for the test set, one array per metric.

    For every metric the id columns are target encoded (fitted on the full
    train set against ``targetcol2``), binned into equal-width intervals
    derived from the encoded train values, and ordinal dummy coded. The
    extractor is then fitted on train and applied to test.

    Reads the module-level names ``features``, ``targetcol``, ``targetcol2``,
    ``neigh_filename``, ``index_filename``, ``loadneighs``, ``saveneighs``,
    ``loadindex`` and ``saveindex``.

    Parameters
    ----------
    metriclist : iterable of str
        Distance metrics to generate features for.
    k_list : list of int
        Neighbourhood sizes passed to ``NearestNeighborsFeats``.
    train_X : pandas.DataFrame
        Training rows; must contain the id columns and ``targetcol2``.
    train_Y : pandas.Series
        Training labels used by the extractor.
    test : pandas.DataFrame
        Test rows. The caller's frame is not rebound or re-encoded.
    NN_index : object, optional
        Pre-existing index handed to the extractor.

    Returns
    -------
    (list of numpy.ndarray, object)
        One feature matrix per metric, and the extractor's index object.
    """
    id_cols = ['user_id', 'video_id', 'category_id']
    # (target-encoded column, number of equal-width bins)
    bin_specs = [('user_id', 20), ('video_id', 7), ('category_id', 4)]
    eps=1e-10

    test_knn_feats=[]
    NNF = NearestNeighborsFeats(n_jobs=8, NN_index=NN_index,
                                k_list=k_list, metric='dummy',
                                neigh_filename=neigh_filename,
                                loadneighs=loadneighs,
                                saveneighs=saveneighs,
                                loadindex = loadindex,
                                index_filename= index_filename,
                                saveindex=saveindex)
    for metric in metriclist:
        print (metric)
        NNF.setmetric(metric)

        #target encoding on user_id, category_id and video_id
        enc = TargetEncoder(cols=id_cols)
        y_tr_raw = train_X[targetcol2]
        tr = enc.fit_transform(train_X, y_tr_raw)
        print(len(tr.columns))
        print(tr.columns)
        # Keep the encoded test set in its own variable: rebinding `test`
        # would feed already encoded (and dummy coded) data into the encoder
        # on the next metric.
        test_enc = enc.transform(test)

        tr['istrain']=1
        test_enc['istrain']=0

        combined = pd.concat([tr,test_enc],axis=0)
        for col, n_bins in bin_specs:
            bin_col = col + '_enc_bin'
            # Bin edges come from the train values only; eps moves the lowest
            # edge just below the train minimum so that it falls into bin 1.
            edges = np.linspace(tr[col].min()-eps, tr[col].max(), n_bins+1)
            combined[bin_col] = pd.cut(combined[col], edges,
                                       labels=list(range(1, n_bins+1)))
            print(bin_col + ':',combined[bin_col].unique())

        combined= ordinal_dummy_coding(combined,[col + '_enc_bin' for col, _ in bin_specs])

        # drop() returns new frames, which avoids deleting a column from a
        # filtered slice of `combined`.
        tr = combined[combined['istrain']==1].drop(columns=['istrain'])
        test_enc = combined[combined['istrain']==0].drop(columns=['istrain'])
        del combined

        #exclude target encoding fields since these are already binned and ordinal encoded
        exclude_cols = id_cols + [targetcol,targetcol2]
        enc_bin_cols = [col for col in tr.columns if '_enc_bin' in col]
        print(enc_bin_cols)
        features_new = [col for col in features if col not in exclude_cols]
        features_new += enc_bin_cols

        #note: user_id_orig, video_id_orig and category_id_orig will be used assuming only 
        #categorical metric

        print('features used for KNN Feats generation:',sorted(features_new))

        NNF.fit(tr[features_new],train_Y)
        test_knn_feats += [NNF.predict(test_enc[features_new])]

    return test_knn_feats,NNF.getNNindex()

    
def runknnmodel(NNF,train_X,train_Y,indices=None):
    """Compute out-of-fold KNN features for the training rows.

    For every ``(train_idx, val_idx)`` fold the identifier columns
    ``user_id``, ``video_id`` and ``category_id`` are target-encoded on the
    fold's training part only, cut into equal-width bins whose edges come from
    the training part, passed through ``ordinal_dummy_coding``, and ``NNF`` is
    fitted on the training part and asked to predict the validation part.

    Relies on notebook-level names: ``features``, ``targetcol``,
    ``targetcol2``, ``ordinal_dummy_coding`` and ``TargetEncoder``.

    Parameters
    ----------
    NNF : object
        Feature extractor exposing ``fit(X, y)`` and ``predict(X)``;
        ``predict`` must return a 2-D array (one row per validation row).
    train_X : pandas.DataFrame
        Training frame. Must contain the three identifier columns and the
        ``targetcol2`` column (used as the target-encoding target).
    train_Y : pandas.Series
        Labels handed to ``NNF.fit``, positionally aligned with ``train_X``.
    indices : iterable of (train_idx, val_idx), optional
        Positional fold indices. When ``None`` a 2-fold, unshuffled
        ``StratifiedKFold`` over ``train_X`` / ``train_Y`` is used.

    Returns
    -------
    numpy.ndarray or None
        Array of shape ``(len(train_X), n_knn_features)``; rows that never
        appear in a validation fold stay zero. ``None`` when ``indices``
        yields no fold at all.
    """
    oof_knn_feats = None
    if indices is None:
        print('indices is None')
        # random_state must not be combined with shuffle=False: it has no
        # effect there and scikit-learn >= 0.24 raises ValueError for it.
        folds = StratifiedKFold(n_splits=2, shuffle=False)
        # Split the data that was actually passed in, not notebook globals
        # that may point at a different (e.g. sampled) frame.
        indices = folds.split(train_X.values, train_Y.values)

    # (target-encoded column, number of equal-width bins)
    bin_specs = [('user_id', 20), ('video_id', 7), ('category_id', 4)]
    # Raw encodings and targets are excluded: the encodings are replaced by
    # their binned, ordinal-coded versions.
    exclude_cols = ['user_id','video_id','category_id',targetcol,targetcol2]
    eps = 1e-10

    for fold_, (train_idx, val_idx) in enumerate(indices):

        tr = train_X.iloc[train_idx]
        y_tr = train_Y.iloc[train_idx]
        val = train_X.iloc[val_idx]
        y_val = train_Y.iloc[val_idx]

        print('fold:{0} train shape:{1} val shape: {2}'.format(fold_,y_tr.shape,y_val.shape))

        # Target encoding on user_id, video_id and category_id, fitted on the
        # training part of the fold only.
        enc = TargetEncoder(cols=['user_id', 'video_id','category_id'])
        y_tr_raw = train_X[targetcol2].iloc[train_idx]
        print('debug shape:',tr.shape,y_tr_raw.shape)
        tr = enc.fit_transform(tr, y_tr_raw)
        print(len(tr.columns))
        print(tr.columns)
        val = enc.transform(val)

        # assign() returns new frames, so nothing is written into a slice.
        combined = pd.concat([tr.assign(istrain=1), val.assign(istrain=0)], axis=0)

        for col, n_bins in bin_specs:
            # Lower edge is nudged down by eps so the training minimum falls
            # inside the first (left-open) interval.
            edges = np.linspace(tr[col].min() - eps, tr[col].max(), n_bins + 1)
            bin_col = col + '_enc_bin'
            combined[bin_col] = pd.cut(combined[col], edges,
                                       labels=range(1, n_bins + 1))
            print(bin_col + ':', combined[bin_col].unique())

        combined = ordinal_dummy_coding(combined,['user_id_enc_bin','video_id_enc_bin','category_id_enc_bin'])

        tr = combined.loc[combined['istrain'] == 1].drop(columns='istrain')
        val = combined.loc[combined['istrain'] == 0].drop(columns='istrain')
        del combined

        enc_bin_cols = [col for col in tr.columns if '_enc_bin' in col]
        print(enc_bin_cols)
        features_new = [col for col in features if col not in exclude_cols]
        features_new += enc_bin_cols

        # note: user_id_orig, video_id_orig and category_id_orig will be used
        # assuming only categorical metric

        if fold_ == 0:
            print('features used for KNN Feats generation:',sorted(features_new))

        NNF.fit(tr[features_new],y_tr)
        cur_knn_feats = NNF.predict(val[features_new])
        if oof_knn_feats is None:
            # Output width is only known after the first prediction.
            knn_feats_count = cur_knn_feats.shape[1]
            oof_knn_feats = np.zeros((train_X.shape[0],knn_feats_count))

        oof_knn_feats[val_idx] = cur_knn_feats

    return oof_knn_feats
    
def getknnfeatsfortrain(metriclist,k_list,train_X,train_Y,indices=None,NN_index=None):
    """Generate out-of-fold KNN features for the training set, once per metric.

    A single ``NearestNeighborsFeats`` extractor is created and re-pointed at
    each metric in ``metriclist`` via ``setmetric``; ``runknnmodel`` then
    produces the out-of-fold feature matrix for that metric.

    Reads the notebook-level settings ``neigh_filename``, ``loadneighs``,
    ``saveneighs`` and ``saveindex``.

    Parameters
    ----------
    metriclist : iterable of str
        Distance metrics to generate features for.
    k_list : list
        Forwarded to ``NearestNeighborsFeats`` as ``k_list``.
    train_X, train_Y
        Training frame and labels, forwarded to ``runknnmodel``.
    indices : iterable of (train_idx, val_idx), optional
        Fold indices; ``None`` lets ``runknnmodel`` build its own folds.
    NN_index : optional
        Forwarded to ``NearestNeighborsFeats`` as ``NN_index``.

    Returns
    -------
    (list, object)
        One out-of-fold feature matrix per metric, and ``NNF.getNNindex()``.
    """
    # Materialise the folds once. A one-shot iterator (such as the generator
    # returned by a splitter's ``split``) would be exhausted after the first
    # metric, leaving later metrics with no folds and a ``None`` result.
    if indices is not None:
        indices = list(indices)

    # n_jobs can be larger than the number of cores.
    # The metric is a placeholder here; the real one is set per iteration.
    NNF = NearestNeighborsFeats(n_jobs=8, k_list=k_list, metric='dummy',
                                NN_index=NN_index,
                                neigh_filename=neigh_filename,
                                loadneighs=loadneighs,
                                saveneighs=saveneighs,
                                saveindex=saveindex)

    train_knn_feats = []
    for metric in metriclist:
        print (metric)
        NNF.setmetric(metric)
        # Original author's note: cross_val_predict is not used because it
        # creates a new NNF object internally, so NN_index could not be
        # passed through; the manual fold loop in runknnmodel is used instead.
        train_knn_feats_cur = runknnmodel(NNF,train_X,train_Y,indices=indices)

        print(train_knn_feats_cur.shape)
        train_knn_feats.append(train_knn_feats_cur)
    return train_knn_feats,NNF.getNNindex()

In [16]:
def knncolnames(k_list, n_classes=None):
    """Map positional KNN column names (``knn0``, ``knn1``, ...) to readable ones.

    The generated names follow this fixed group order, where ``x`` is
    ``len(k_list)`` and ``c`` is the number of classes:

    ======================  =======
    name                    count
    ======================  =======
    ``knn_prop.<i>``        x * c
    ``knn_streak``          1
    ``knn_min.<i>``         c
    ``knn_min_norm.<i>``    c
    ``knn_dist_kth.<i>``    x * 2
    ``knn_mean.<i>``        x * c
    ``knn_max.<i>``         c
    ``knn_std.<i>``         c
    ======================  =======

    Parameters
    ----------
    k_list : sequence
        Neighbourhood sizes; only its length is used.
    n_classes : int, optional
        Number of target classes. Defaults to ``target.nunique()`` on the
        notebook-level ``target`` series, which is what the function always
        used before this parameter existed.

    Returns
    -------
    dict
        ``{'knn<position>': '<readable name>'}``, in positional order,
        suitable for ``DataFrame.rename(columns=...)``.
    """
    n_k = len(k_list)
    if n_classes is None:
        # Number of classes = number of unique values in the target.
        n_classes = target.nunique()

    # (name prefix, number of columns); None marks the single unnumbered column.
    groups = [
        ('knn_prop.', n_k * n_classes),
        ('knn_streak', None),
        ('knn_min.', n_classes),
        ('knn_min_norm.', n_classes),
        ('knn_dist_kth.', n_k * 2),
        ('knn_mean.', n_k * n_classes),
        ('knn_max.', n_classes),
        ('knn_std.', n_classes),
    ]

    colname_dict = {}
    for prefix, count in groups:
        # Keys are unique, so the dict size is the next positional index.
        if count is None:
            colname_dict['knn' + str(len(colname_dict))] = prefix
            continue
        for i in range(count):
            colname_dict['knn' + str(len(colname_dict))] = prefix + str(i + 1)

    return colname_dict


def exec_knn_fulltrainortest(istest,metric,k_list,train_scaled,test_scaled,target,indices):
    """Generate, rename and save KNN features for the full train set or the test set.

    Calls ``getcombinedknnfeats`` for the single ``metric``, renames the
    positional ``knn<N>`` columns using ``knncolnames(k_list)``, writes the
    KNN columns to disk through ``saveknnfeats`` and returns the frame.

    Parameters
    ----------
    istest : bool
        Truthy -> test run (``istrain=0``, file prefix ``'test_'``);
        falsy -> train run (``istrain=1``, file prefix ``'fulltrain_'``).
    metric : str
        Distance metric; also used in the output file name.
    k_list : list
        Neighbourhood sizes.
    train_scaled, test_scaled
        Frames forwarded to ``getcombinedknnfeats`` (``test_scaled`` as ``test``).
    target
        Labels forwarded to ``getcombinedknnfeats``.
        NOTE(review): ``knncolnames`` is called without the labels, so the
        column naming does not depend on this argument - confirm intended.
    indices
        Fold indices forwarded to ``getcombinedknnfeats``.

    Returns
    -------
    The frame returned by ``getcombinedknnfeats``, with KNN columns renamed.
    """
    istrain = 0 if istest else 1
    file_template = 'test_' if istest else 'fulltrain_'

    for banner_line in ('', '************************KNN Execution ************************', ''):
        if banner_line:
            print(banner_line)
        else:
            print()

    # Only the frame is needed; the count and the NN index are discarded.
    _, data_with_knn, _ = getcombinedknnfeats(
        [metric],
        k_list,
        train_scaled,
        target,
        indices=indices,
        istrain=istrain,
        test=test_scaled,
        NN_index=None,
    )
    print('data_with_knn shape:',data_with_knn.shape)

    # Rename in place so the returned object is the one produced above.
    data_with_knn.rename(columns=knncolnames(k_list), inplace=True)

    saveknnfeats(data_with_knn,file_template,metric)

    return data_with_knn

def saveknnfeats(data_with_knn,file_template,metric):
    """Pickle the KNN feature columns of ``data_with_knn`` to disk.

    Every column whose name starts with ``'knn'`` is written to
    ``<file_template>_knn_feats_<metric>.zip`` in the current working
    directory via ``DataFrame.to_pickle``.

    Parameters
    ----------
    data_with_knn : pandas.DataFrame
        Frame containing the KNN feature columns.
    file_template : str
        File-name prefix.
    metric : str
        Metric label appended to the file name.
    """
    name_pattern = '{0}_{1}_{2}.zip'
    knnfilename = name_pattern.format(file_template,'knn_feats',metric)

    selected = []
    for column in data_with_knn.columns:
        if column.startswith('knn'):
            selected.append(column)

    print()
    print('Save KNN Feats File Name: ',knnfilename)
    data_with_knn[selected].to_pickle(knnfilename)
    print('Save KNN Feats Complete')
    

In [17]:
# Print the distinct values of the targetcol2 column in ascending order.
print(np.sort(train[targetcol2].unique()))

[0.   0.02 0.4  0.42 0.45 0.69 0.86 1.09 1.1  1.15 1.37 1.39 1.5  1.55
 1.61 1.71 1.79 1.84 1.95 1.96 2.01 2.06 2.08 2.16 2.19 2.2  2.24 2.3
 2.32 2.35 2.4  2.4  2.47 2.48 2.53 2.56 2.59 2.6  2.64 2.65 2.7  2.71
 2.76 2.77 2.8  2.83 2.85 2.88 2.89 2.9  2.94 2.94 2.96 2.98 3.   3.02
 3.04 3.06 3.09 3.11 3.13 3.14 3.16 3.17 3.18 3.19 3.22 3.23 3.23 3.26
 3.29 3.3  3.32 3.33 3.34 3.37 3.4  3.4  3.42 3.43 3.44 3.45 3.47 3.49
 3.5  3.52 3.53 3.54 3.56 3.57 3.58 3.59 3.61 3.62 3.63 3.64 3.65 3.66
 3.67 3.69 3.7  3.71 3.73 3.74 3.75 3.76 3.77 3.78 3.79 3.8  3.81 3.82
 3.83 3.84 3.85 3.87 3.89 3.9  3.91 3.92 3.93 3.95 3.96 3.97 3.98 3.99
 4.01 4.02 4.03 4.04 4.05 4.06 4.06 4.08 4.09 4.1  4.11 4.12 4.13 4.14
 4.14 4.15 4.16 4.17 4.18 4.19 4.2  4.21 4.22 4.23 4.24 4.25 4.26 4.27
 4.27 4.28 4.28 4.29 4.3  4.31 4.31 4.32 4.32 4.33 4.34 4.35 4.36 4.36
 4.37 4.38 4.39 4.39 4.41 4.42 4.43 4.43 4.44 4.45 4.46 4.47 4.48 4.49
 4.5  4.51 4.51 4.52 4.53 4.54 4.55 4.56 4.56 4.57 4.58 4.59 4.6  4.6
 4.61 4.

In [20]:
# ---- KNN feature-generation settings ----
k_list = [25]
metric = 'hamming'
n_splits = 10

# Data to split into folds.
train_to_split, target_to_split = train, target

# Neighbour / index persistence: file names and load/save switches.
neigh_filename = 'test_neighs.csv'
index_filename = 'train_index.ann'
loadneighs = saveneighs = False
saveindex = loadindex = False

DEBUG = False

# Train KNN on a sample instead of the full data:
# train_to_split = train_sample
# target_to_split = target_sample
# train_with_knn = exec_knn_fulltrainortest(False,metric,k_list,train_sample,None,
#                                          target_sample,None)

In [95]:
# Out-of-fold KNN features for the training set (istest=False, no test frame).
train_with_knn = exec_knn_fulltrainortest(
    istest=False,
    metric=metric,
    k_list=k_list,
    train_scaled=train[features],
    test_scaled=None,
    target=target,
    indices=indices,
)

# Report the size of the renamed KNN block and preview its first rows.
knncols = [name for name in train_with_knn if 'knn_' in name]
print(train_with_knn[knncols].shape)
train_with_knn[knncols].head()


************************KNN Execution ************************

hamming
fold:0 train shape:(68279,) val shape: (7587,)
debug shape: (68279, 45) (68279,)
45
Index(['age_bin_2', 'age_bin_3', 'age_bin_4', 'age_bin_5', 'age_bin_6',
       'age_bin_7', 'age_bin_8', 'followers_bin_2', 'followers_bin_3',
       'followers_bin_4', 'followers_bin_5', 'views_bin_2', 'views_bin_3',
       'views_bin_4', 'views_bin_5', 'views_bin_6', 'views_bin_7',
       'category_user_count_bin_2', 'category_user_count_bin_3',
       'category_user_count_bin_4', 'category_user_count_bin_5',
       'category_user_count_bin_6', 'category_user_count_bin_7',
       'user_video_count_bin_2', 'user_video_count_bin_3',
       'user_video_count_bin_4', 'user_video_count_bin_5',
       'category_video_count_bin_2', 'category_video_count_bin_3',
       'category_video_count_bin_4', 'category_video_count_bin_5',
       'category_video_count_bin_6', 'category_video_count_bin_7', 'user_id',
       'video_id', 'gender', 'cat

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


['user_id_enc_bin_2', 'user_id_enc_bin_3', 'user_id_enc_bin_4', 'user_id_enc_bin_5', 'user_id_enc_bin_6', 'user_id_enc_bin_7', 'user_id_enc_bin_8', 'user_id_enc_bin_9', 'user_id_enc_bin_10', 'user_id_enc_bin_11', 'user_id_enc_bin_12', 'user_id_enc_bin_13', 'user_id_enc_bin_14', 'user_id_enc_bin_15', 'user_id_enc_bin_16', 'user_id_enc_bin_17', 'user_id_enc_bin_18', 'user_id_enc_bin_19', 'user_id_enc_bin_20', 'video_id_enc_bin_2', 'video_id_enc_bin_3', 'video_id_enc_bin_4', 'video_id_enc_bin_5', 'video_id_enc_bin_6', 'video_id_enc_bin_7', 'category_id_enc_bin_2', 'category_id_enc_bin_3', 'category_id_enc_bin_4']
features used for KNN Feats generation: ['age_bin_2', 'age_bin_3', 'age_bin_4', 'age_bin_5', 'age_bin_6', 'age_bin_7', 'age_bin_8', 'category_id_enc_bin_2', 'category_id_enc_bin_3', 'category_id_enc_bin_4', 'category_id_orig', 'category_user_count_bin_2', 'category_user_count_bin_3', 'category_user_count_bin_4', 'category_user_count_bin_5', 'category_user_count_bin_6', 'category_

['user_id_enc_bin_2', 'user_id_enc_bin_3', 'user_id_enc_bin_4', 'user_id_enc_bin_5', 'user_id_enc_bin_6', 'user_id_enc_bin_7', 'user_id_enc_bin_8', 'user_id_enc_bin_9', 'user_id_enc_bin_10', 'user_id_enc_bin_11', 'user_id_enc_bin_12', 'user_id_enc_bin_13', 'user_id_enc_bin_14', 'user_id_enc_bin_15', 'user_id_enc_bin_16', 'user_id_enc_bin_17', 'user_id_enc_bin_18', 'user_id_enc_bin_19', 'user_id_enc_bin_20', 'video_id_enc_bin_2', 'video_id_enc_bin_3', 'video_id_enc_bin_4', 'video_id_enc_bin_5', 'video_id_enc_bin_6', 'video_id_enc_bin_7', 'category_id_enc_bin_2', 'category_id_enc_bin_3', 'category_id_enc_bin_4']
NN Fit Start..
NN Fit End..
Fit Exec Time: 12.131147861480713
NN Load start
Load Exec Time: 0.0001800060272216797
NN query start
neighs nan shape: (0,)
NN query end
Query Exec Time: 3.255586862564087
Feature Start
no of cpus: 8
Feature End
Feature Exec Time: 160.37733840942383
fold:4 train shape:(68279,) val shape: (7587,)
debug shape: (68279, 45) (68279,)
45
Index(['age_bin_2', 

['user_id_enc_bin_2', 'user_id_enc_bin_3', 'user_id_enc_bin_4', 'user_id_enc_bin_5', 'user_id_enc_bin_6', 'user_id_enc_bin_7', 'user_id_enc_bin_8', 'user_id_enc_bin_9', 'user_id_enc_bin_10', 'user_id_enc_bin_11', 'user_id_enc_bin_12', 'user_id_enc_bin_13', 'user_id_enc_bin_14', 'user_id_enc_bin_15', 'user_id_enc_bin_16', 'user_id_enc_bin_17', 'user_id_enc_bin_18', 'user_id_enc_bin_19', 'user_id_enc_bin_20', 'video_id_enc_bin_2', 'video_id_enc_bin_3', 'video_id_enc_bin_4', 'video_id_enc_bin_5', 'video_id_enc_bin_6', 'video_id_enc_bin_7', 'category_id_enc_bin_2', 'category_id_enc_bin_3', 'category_id_enc_bin_4']
NN Fit Start..
NN Fit End..
Fit Exec Time: 12.409228563308716
NN Load start
Load Exec Time: 0.0001125335693359375
NN query start
neighs nan shape: (0,)
NN query end
Query Exec Time: 3.2719411849975586
Feature Start
no of cpus: 8
Feature End
Feature Exec Time: 165.23633432388306
fold:8 train shape:(68280,) val shape: (7586,)
debug shape: (68280, 45) (68280,)
45
Index(['age_bin_2',

Unnamed: 0,knn_prop.1,knn_prop.2,knn_prop.3,knn_prop.4,knn_prop.5,knn_prop.6,knn_prop.7,knn_prop.8,knn_prop.9,knn_prop.10,knn_prop.11,knn_prop.12,knn_prop.13,knn_prop.14,knn_prop.15,knn_prop.16,knn_prop.17,knn_prop.18,knn_prop.19,knn_prop.20,knn_prop.21,knn_prop.22,knn_prop.23,knn_prop.24,knn_prop.25,knn_prop.26,knn_prop.27,knn_prop.28,knn_prop.29,knn_prop.30,knn_prop.31,knn_prop.32,knn_prop.33,knn_prop.34,knn_prop.35,knn_prop.36,knn_prop.37,knn_prop.38,knn_prop.39,knn_prop.40,knn_prop.41,knn_prop.42,knn_prop.43,knn_prop.44,knn_prop.45,knn_prop.46,knn_prop.47,knn_prop.48,knn_prop.49,knn_prop.50,...,knn_min_norm.205,knn_min_norm.205.1,knn_min_norm.206,knn_min_norm.206.1,knn_min_norm.207,knn_min_norm.207.1,knn_min_norm.208,knn_min_norm.208.1,knn_min_norm.209,knn_min_norm.209.1,knn_min_norm.210,knn_min_norm.210.1,knn_min_norm.211,knn_min_norm.211.1,knn_min_norm.212,knn_min_norm.212.1,knn_min_norm.213,knn_min_norm.213.1,knn_min_norm.214,knn_min_norm.214.1,knn_min_norm.215,knn_min_norm.215.1,knn_min_norm.216,knn_min_norm.216.1,knn_min_norm.217,knn_min_norm.217.1,knn_min_norm.218,knn_min_norm.218.1,knn_min_norm.219,knn_min_norm.219.1,knn_min_norm.220,knn_min_norm.220.1,knn_min_norm.221,knn_min_norm.221.1,knn_min_norm.222,knn_min_norm.222.1,knn_min_norm.223,knn_min_norm.223.1,knn_min_norm.224,knn_min_norm.224.1,knn_min_norm.225,knn_min_norm.225.1,knn_min_norm.226,knn_min_norm.226.1,knn_min_norm.227,knn_min_norm.227.1,knn_min_norm.228,knn_min_norm.228.1,knn_min_norm.229,knn_min_norm.229.1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,...,999.0,999.0,999.0,999.0,999.0,999.0,0.0,0.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,1000000.0,1.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,1000000.0,1.0,999.0,999.0,1000000.0,1.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.04,0.0,0.0,...,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,1000000.0,1.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.04,0.0,0.0,0.04,0.08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.12,0.0,0.0,0.0,0.04,0.0,0.0,0.16,0.0,0.0,0.0,0.08,0.0,0.0,0.16,0.0,0.0,...,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.08,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,...,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,1000000.0,1.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,1000000.0,1.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0


In [None]:
# Widen the pandas display limits so wide KNN frames can be inspected.
for option_name in ('display.max_rows', 'display.max_columns'):
    pd.set_option(option_name, 500)
del option_name

In [None]:
# train_with_knn.loc[(train_with_knn['knn1156']!=0) & (train_with_knn['knn1156']!=999),'knn1156'].shape[0]

382

In [19]:
# # print(list(train_with_knn.columns))
# # train_with_knn[['knn1367','knn1368']].describe()
# ambig_cols=[]
# knncols =[col for col in train_with_knn.columns if col.startswith('knn')]
# for col in knncols:
#     count = train_with_knn[(train_with_knn[col]!=0) & (train_with_knn[col]!=999)].shape[0]
#     print(col,count)
#     if train_with_knn[(train_with_knn[col]!=0) & (train_with_knn[col]!=999)].shape[0]==0:
#         ambig_cols+=[col]
# print(ambig_cols)

In [18]:
# Show the column labels of the train KNN frame. The recorded output is a
# NameError: this cell was run before train_with_knn had been created in that
# kernel session, so run the train-KNN cell first.
train_with_knn.columns

NameError: name 'train_with_knn' is not defined

In [23]:
# Reset the run switches before generating the test-set features.
DEBUG = False
loadneighs = False
saveindex = loadindex = False

# Keep the raw identifiers under *_orig names: the KNN helpers target-encode
# user_id / video_id / category_id, while the *_orig columns are used as-is.
test['user_id_orig'] = test['user_id'].copy()
test['video_id_orig'] = test['video_id'].copy()
test['category_id_orig'] = test['category_id'].copy()

# Test KNN: fit on the full training set, predict for the test set.
test_with_knn = exec_knn_fulltrainortest(
    istest=True,
    metric=metric,
    k_list=k_list,
    train_scaled=train[features],
    test_scaled=test[features],
    target=target,
    indices=None,
)

# test_with_knn = execknn_testblocks(test_sample,train_sample,target_sample)


************************KNN Execution ************************

hamming
45
Index(['age_bin_2', 'age_bin_3', 'age_bin_4', 'age_bin_5', 'age_bin_6',
       'age_bin_7', 'age_bin_8', 'followers_bin_2', 'followers_bin_3',
       'followers_bin_4', 'followers_bin_5', 'views_bin_2', 'views_bin_3',
       'views_bin_4', 'views_bin_5', 'views_bin_6', 'views_bin_7',
       'category_user_count_bin_2', 'category_user_count_bin_3',
       'category_user_count_bin_4', 'category_user_count_bin_5',
       'category_user_count_bin_6', 'category_user_count_bin_7',
       'user_video_count_bin_2', 'user_video_count_bin_3',
       'user_video_count_bin_4', 'user_video_count_bin_5',
       'category_video_count_bin_2', 'category_video_count_bin_3',
       'category_video_count_bin_4', 'category_video_count_bin_5',
       'category_video_count_bin_6', 'category_video_count_bin_7', 'user_id',
       'video_id', 'gender', 'category_id', 'profession_1', 'profession_2',
       'user_category_count_2', 'user_

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


['user_id_enc_bin_2', 'user_id_enc_bin_3', 'user_id_enc_bin_4', 'user_id_enc_bin_5', 'user_id_enc_bin_6', 'user_id_enc_bin_7', 'user_id_enc_bin_8', 'user_id_enc_bin_9', 'user_id_enc_bin_10', 'user_id_enc_bin_11', 'user_id_enc_bin_12', 'user_id_enc_bin_13', 'user_id_enc_bin_14', 'user_id_enc_bin_15', 'user_id_enc_bin_16', 'user_id_enc_bin_17', 'user_id_enc_bin_18', 'user_id_enc_bin_19', 'user_id_enc_bin_20', 'video_id_enc_bin_2', 'video_id_enc_bin_3', 'video_id_enc_bin_4', 'video_id_enc_bin_5', 'video_id_enc_bin_6', 'video_id_enc_bin_7', 'category_id_enc_bin_2', 'category_id_enc_bin_3', 'category_id_enc_bin_4']
features used for KNN Feats generation: ['age_bin_2', 'age_bin_3', 'age_bin_4', 'age_bin_5', 'age_bin_6', 'age_bin_7', 'age_bin_8', 'category_id_enc_bin_2', 'category_id_enc_bin_3', 'category_id_enc_bin_4', 'category_id_orig', 'category_user_count_bin_2', 'category_user_count_bin_3', 'category_user_count_bin_4', 'category_user_count_bin_5', 'category_user_count_bin_6', 'category_

In [30]:
# Column count of the test KNN frame, followed by its full (rows, columns) shape.
print(len(test_with_knn.columns))
test_with_knn.shape

1422


(8121, 1422)

In [43]:
# Peek at a single KNN column; the recorded output shows 999.0 in all five rows.
# NOTE(review): 999 looks like a placeholder value - confirm its meaning.
test_with_knn['knn_min.228'].head()

0    999.0
1    999.0
2    999.0
3    999.0
4    999.0
Name: knn_min.228, dtype: float64

In [44]:
# test_with_knn.shape
knncols =[col for col in test_with_knn if col.startswith('knn')]
print(len(knncols))
cols = knncols
print(test_with_knn[cols].shape)
test_with_knn[cols].head()

1377
(8121, 1835)


Unnamed: 0,knn_prop.1,knn_prop.2,knn_prop.3,knn_prop.4,knn_prop.5,knn_prop.6,knn_prop.7,knn_prop.8,knn_prop.9,knn_prop.10,knn_prop.11,knn_prop.12,knn_prop.13,knn_prop.14,knn_prop.15,knn_prop.16,knn_prop.17,knn_prop.18,knn_prop.19,knn_prop.20,knn_prop.21,knn_prop.22,knn_prop.23,knn_prop.24,knn_prop.25,knn_prop.26,knn_prop.27,knn_prop.28,knn_prop.29,knn_prop.30,knn_prop.31,knn_prop.32,knn_prop.33,knn_prop.34,knn_prop.35,knn_prop.36,knn_prop.37,knn_prop.38,knn_prop.39,knn_prop.40,knn_prop.41,knn_prop.42,knn_prop.43,knn_prop.44,knn_prop.45,knn_prop.46,knn_prop.47,knn_prop.48,knn_prop.49,knn_prop.50,...,knn1327,knn1328,knn1329,knn1330,knn1331,knn1332,knn1333,knn1334,knn1335,knn1336,knn1337,knn1338,knn1339,knn1340,knn1341,knn1342,knn1343,knn1344,knn1345,knn1346,knn1347,knn1348,knn1349,knn1350,knn1351,knn1352,knn1353,knn1354,knn1355,knn1356,knn1357,knn1358,knn1359,knn1360,knn1361,knn1362,knn1363,knn1364,knn1365,knn1366,knn1367,knn1368,knn1369,knn1370,knn1371,knn1372,knn1373,knn1374,knn1375,knn1376
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.04,0.0,0.0,0.0,0.08,0.0,0.0,0.12,0.0,0.0,...,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.08,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.04,0.0,0.0,...,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0


In [27]:
# Display the feature list that the KNN helpers filter and extend.
features

['age_bin_2',
 'age_bin_3',
 'age_bin_4',
 'age_bin_5',
 'age_bin_6',
 'age_bin_7',
 'age_bin_8',
 'followers_bin_2',
 'followers_bin_3',
 'followers_bin_4',
 'followers_bin_5',
 'views_bin_2',
 'views_bin_3',
 'views_bin_4',
 'views_bin_5',
 'views_bin_6',
 'views_bin_7',
 'category_user_count_bin_2',
 'category_user_count_bin_3',
 'category_user_count_bin_4',
 'category_user_count_bin_5',
 'category_user_count_bin_6',
 'category_user_count_bin_7',
 'user_video_count_bin_2',
 'user_video_count_bin_3',
 'user_video_count_bin_4',
 'user_video_count_bin_5',
 'category_video_count_bin_2',
 'category_video_count_bin_3',
 'category_video_count_bin_4',
 'category_video_count_bin_5',
 'category_video_count_bin_6',
 'category_video_count_bin_7',
 'user_id',
 'video_id',
 'gender',
 'category_id',
 'profession_1',
 'profession_2',
 'user_category_count_2',
 'user_category_count_3',
 'user_id_orig',
 'video_id_orig',
 'category_id_orig',
 'engagement_score']

In [26]:
# Re-save the test KNN columns under an explicit metric label.
# NOTE(review): the label 'euclidean' is hard-coded here, while the config
# cell sets metric = 'hamming' and the test-KNN run logs "hamming" - confirm
# that the file name matches the metric the features were built with.
saveknnfeats(test_with_knn,'test_','euclidean')


Save KNN Feats File Name:  test__knn_feats_euclidean.zip
Save KNN Feats Complete


In [None]:
# Number of columns in train_scaled.
# NOTE(review): train_scaled is not assigned at notebook level in the cells
# visible here (it only appears as a function parameter) - confirm it exists.
len(train_scaled.columns)

In [None]:
# Preview the columns listed in knncols for the test frame.
test_with_knn[knncols].head()

In [None]:
# Summary statistics for one KNN column.
# NOTE(review): knncolnames produces labels such as knn_prop.<i> / knn_min.<i>;
# 'knn_6.4' looks like a label from a different naming scheme - confirm the
# column exists.
test_with_knn['knn_6.4'].describe()

In [None]:
# train_with_knn_copy = train_with_knn.copy()
# test_with_knn_copy = test_with_knn.copy()

In [None]:
# train_with_knn = train_with_knn_copy.copy()
# test_with_knn = test_with_knn_copy.copy()


In [None]:
# selcols = ['knn_3.1','knn_3.2','knn_5.3','knn_5.4','knn_6.3','knn_6.4','knn_7.1','knn_7.2']
# Compare the distribution of one KNN column overall and per target value
# (rows where target == 1 versus rows where target == 0).
# NOTE(review): knncolnames produces labels such as knn_prop.<i> / knn_min.<i>;
# the 'knn_<n>.<m>' labels used here look like a different naming scheme -
# confirm the columns exist before running.
# selcols = ['knn_3.1','knn_3.2','knn_5.3','knn_5.4','knn_6.3','knn_6.4','knn_7.1','knn_7.2']
selcols = ['knn_3.1','knn_3.2','knn_5.1','knn_5.2','knn_6.1','knn_6.2','knn_7.1','knn_7.2']
# for col in selcols:
#     train_with_knn[col] = train_with_knn[col].rank(pct=True)

col = 'knn_4.1'

print(train_with_knn[col].describe())
print(train_with_knn.loc[target==1,col].describe())
# 5th percentile of the column among rows where target == 0
print(train_with_knn.loc[target==0,col].quantile(0.05))
print(train_with_knn.loc[target==0,col].describe())

# selcols = ['knn_3.2']
# scaler = StandardScaler()
# scaled = scaler.fit_transform(train_with_knn[selcols])
# train_scaled_knn = pd.DataFrame(scaled)
# train_scaled_knn.columns = [selcols]
# print(train_scaled_knn['knn_3.2'].describe())
# print(train_scaled_knn.loc[target==1,'knn_3.2'].describe())
# print(train_scaled_knn.loc[target==0,'knn_3.2'].describe())

In [None]:
# Disabled experiments (standard scaling / percentile ranking of the selected
# KNN columns); only the final describe() is live.
# NOTE(review): 'knn_3.2' does not match the labels produced by knncolnames
# (knn_prop.<i>, knn_min.<i>, ...) - confirm the column exists.
# scaler = StandardScaler()
# scaled = scaler.fit_transform(test_with_knn[selcols])
# test_scaled_knn = pd.DataFrame(scaled)
# test_scaled_knn.columns = [selcols]
# test_scaled_knn['knn_3.2'].describe()

# for col in selcols:
#     test_with_knn[col] = test_with_knn[col].rank(pct=True)

test_with_knn['knn_3.2'].describe()

In [None]:
# Summary statistics for 'knn_3.2' on the test frame (same expression as the
# last line of the preceding cell).
test_with_knn['knn_3.2'].describe()

In [None]:
import time
def runlgb(ispermutefeats,train,test,param,cur_features,score_function=None,isparamFolds=False):
    """Run 5-fold stratified LightGBM CV, either scoring/predicting or selecting features.

    Besides its arguments, the function reads these notebook-level names:
    ``target`` (labels, indexed positionally alongside ``train``), ``num_round``
    (boosting rounds), ``lgb_logle`` (custom eval passed as ``feval``),
    ``computef1scoreandconfmatrix`` and - only when ``ispermutefeats`` is true -
    ``permutation_feature_selection`` and ``max_delta_score``.

    Parameters
    ----------
    ispermutefeats : bool
        True: for every fold run permutation feature selection on the validation
        part; OOF and test predictions are not filled (they stay all-zero).
        False: for every fold fill the OOF predictions, accumulate feature
        importance and add that fold's share of the test prediction.
    train, test : pandas.DataFrame
        Frames containing at least the columns in ``cur_features``.
    param : dict or sequence of dict
        LightGBM parameters; indexed by fold number when ``isparamFolds`` is true.
    cur_features : list of str
        Columns used for training and prediction.
    score_function : callable, optional
        Forwarded to ``permutation_feature_selection``.
    isparamFolds : bool
        Whether ``param`` holds one parameter set per fold.

    Returns
    -------
    tuple
        ``(fold_importance_df, predictions, oof, overall_imp_df, overall_sel_feats)``.
        ``fold_importance_df`` is an empty frame in permutation mode.
    """
    overall_sel_feats = []
    overall_imp_df = pd.DataFrame()
    overall_imp_df['feature'] = np.array(cur_features)
    overall_imp_df['overall_score_mean'] = 0
    overall_imp_df['overall_score_max'] = -9999
    overall_imp_df['overall_score_min'] = 9999

    oof = np.zeros(len(train))
    predictions = np.zeros(len(test))
    valid_scores = []
    fold_importance_df = pd.DataFrame()

    # shuffle=False gives deterministic contiguous folds; passing random_state
    # together with shuffle=False raises ValueError on scikit-learn >= 0.24, so
    # it is omitted (it never influenced the split).
    folds = StratifiedKFold(n_splits=5, shuffle=False)
    # The splitter only needs the row count and the labels, so split on the
    # frame that was actually passed in rather than on a notebook global.
    indices = folds.split(np.zeros(len(train)), target.values)

    for fold_, (trn_idx, val_idx) in enumerate(indices):
        print()
        print("fold n°{}".format(fold_))

        tr = train.iloc[trn_idx]
        val = train.iloc[val_idx]
        y_val = target.iloc[val_idx]
        y_tr = target.iloc[trn_idx]

        trn_data = lgb.Dataset(tr[cur_features], label=y_tr)
        val_data = lgb.Dataset(val[cur_features], label=y_val)

        if isparamFolds:
            cur_param = param[fold_]
            print('cur param:',cur_param)
        else:
            cur_param = param

        clf = lgb.train(cur_param, trn_data, num_round, valid_sets = [val_data], verbose_eval=500,
                        early_stopping_rounds = 300,
                        feval=lgb_logle,
                        )

        if ispermutefeats:
            # Feature selection on this fold's validation part.
            selected_features, importance_df = permutation_feature_selection(clf, val[cur_features],
                                                                             y_val,score_function,
                                                                             rep=4,max_delta_score=max_delta_score)
            overall_sel_feats += [selected_features]
            print(selected_features)

            # NOTE(review): column assignment aligns on index - assumes importance_df
            # is indexed like overall_imp_df (one row per cur_features entry); confirm.
            overall_imp_df['fold_'+str(fold_)+'score_mean'] = importance_df['delta_score_mean']
            overall_imp_df['fold_'+str(fold_)+'score_max'] = importance_df['delta_score_max']
            overall_imp_df['fold_'+str(fold_)+'score_min'] = importance_df['delta_score_min']
        else:
            oof[val_idx] = clf.predict(val[cur_features], num_iteration=clf.best_iteration)

            fold_importance_df["feature"] = cur_features
            if fold_==0:
                fold_importance_df["importance"] = 0
            # Average over the folds actually used (was an undefined global n_splits).
            fold_importance_df["importance"] += clf.feature_importance() / folds.n_splits
            computef1scoreandconfmatrix(y_val,oof[val_idx])
            aucscore = roc_auc_score(y_val,oof[val_idx])
            print("AUC score: {:<8.5f}".format(aucscore))
            valid_scores += [aucscore]
            predictions += clf.predict(test[cur_features], num_iteration=clf.best_iteration) / folds.n_splits

    if ispermutefeats:
        # Collapse the per-fold permutation scores into overall mean / max / min.
        fold_mean_cols = [col for col in overall_imp_df.columns if ('score_mean' in col) and ('fold_' in col) ]
        fold_max_cols = [col for col in overall_imp_df.columns if ('score_max' in col) and ('fold_' in col) ]
        fold_min_cols = [col for col in overall_imp_df.columns if ('score_min' in col) and ('fold_' in col) ]
        overall_imp_df['overall_score_mean'] = overall_imp_df[fold_mean_cols].mean(axis=1)
        overall_imp_df['overall_score_max'] = overall_imp_df[fold_max_cols].max(axis=1)
        overall_imp_df['overall_score_min'] = overall_imp_df[fold_min_cols].min(axis=1)
    else:
        print('valid scores:',valid_scores)
        print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))
        computef1scoreandconfmatrix(target,oof)

    return fold_importance_df,predictions,oof,overall_imp_df,overall_sel_feats

In [None]:
# LightGBM settings shared by the runlgb calls in this section.
param = dict(
    # --- tree shape / objective ---
    num_leaves=31,
    min_data_in_leaf=30,
    objective='binary',        # alternative tried: 'None'
    max_depth=-1,
    learning_rate=0.01,
    boosting="gbdt",
    # --- row / column subsampling ---
    feature_fraction=0.9,
    bagging_freq=1,
    bagging_fraction=0.9,
    bagging_seed=11,
    # --- metric / class balance ---
    metric='None',             # runlgb passes its own eval function through feval
    scale_pos_weight=2,        # alternative tried: is_unbalance=True
    # --- regularisation ---
    lambda_l1=0.1,             # alternatives tried: lambda_l1=0.7, lambda_l2=0.7
    # --- runtime ---
    verbosity=-1,
    nthread=4,
    n_estimators=10000,
    random_state=4590,
)


# NOTE(review): runlgb does not read this variable; it passes
# early_stopping_rounds=300 to lgb.train as a literal.
early_stopping_rounds = 300

In [None]:
# %%time

# train_orig = pd.read_csv('train_preproc.csv')
# test_orig = pd.read_csv('test_preproc.csv')

In [None]:
# knncols = [col for col in train_with_knn.columns if 'knn_' in col]
# train_knn = pd.concat([train_orig,train_with_knn[knncols]],axis=1)
# test_knn = pd.concat([test_orig,test_with_knn[knncols]],axis=1)

In [None]:

# knn_dist_1 = ['knn_1.1','knn_1.2','knn_5.1','knn_5.2','knn_6.1','knn_6.2']
# knn_dist_2 = ['knn_1.3','knn_1.4','knn_5.3','knn_5.4','knn_6.3','knn_6.4']
# knn_set1567 = [col for col in knncols if 
#                    (col in knn_dist_2) or ('knn_7' in col) or (col in knn_dist_1) ]
# print(knn_set1567)
# knn_set1567_1 = [col for col in knncols if 
#                    (col in knn_dist_1) or ('knn_7' in col)]
# knn_set1567_2 = [col for col in knncols if 
#                    (col in knn_dist_2) or ('knn_7' in col) ]
# print(knn_set1567_2)
# knn_set23 = [col for col in knncols if 
#                    ('knn_2' in col) or ('knn_3' in col)]
# # sel_enc_cols = [col for col in test_encs[0].columns if 'targetenc_' in col]
# # features +=['knn_1.4']
# # features = [c for c in train_orig.columns if c not in exclude_cols]
# # features +=include_knn_cols
# # print(features)

In [None]:
# Boosting-round count that runlgb reads as a global and hands to lgb.train;
# it reuses the 'n_estimators' entry of param.
num_round = param['n_estimators']

In [None]:
# features = [c for c in train_orig.columns if c not in exclude_cols]
# print(features)

# fold_importance_df,predictions1,oof1,overall_imp_df,overall_sel_feats = \
#         runlgb(False,train_knn,test_knn,param,features,score_function=None,
#               isparamFolds=False)

In [None]:
# features = [c for c in train_orig.columns if c not in exclude_cols]
# features +=['knn_1.2']
# print(len(features))
# print(features)
# fold_importance_df,predictions2,oof2,overall_imp_df,overall_sel_feats = \
#         runlgb(False,train_knn,test_knn,param,features,score_function=None,
#               isparamFolds=False)

In [None]:
# Run 3: base columns (everything not in exclude_cols) plus the knn_set23 features.
features = [name for name in train_orig.columns if name not in exclude_cols]
features.extend(knn_set23)
print(features)
(fold_importance_df, predictions3, oof3,
 overall_imp_df, overall_sel_feats) = runlgb(
    ispermutefeats=False,
    train=train_knn,
    test=test_knn,
    param=param,
    cur_features=features,
    score_function=None,
    isparamFolds=False,
)

In [None]:
# Run 4: base columns (everything not in exclude_cols) plus the knn_set1567_1 features.
features = [name for name in train_orig.columns if name not in exclude_cols]
features.extend(knn_set1567_1)
print(features)
(fold_importance_df, predictions4, oof4,
 overall_imp_df, overall_sel_feats) = runlgb(
    ispermutefeats=False,
    train=train_knn,
    test=test_knn,
    param=param,
    cur_features=features,
    score_function=None,
    isparamFolds=False,
)

In [None]:
# Run 4b: base columns (everything not in exclude_cols) plus the knn_set1567_2 features.
features = [name for name in train_orig.columns if name not in exclude_cols]
features.extend(knn_set1567_2)
print(features)
(fold_importance_df, predictions4b, oof4b,
 overall_imp_df, overall_sel_feats) = runlgb(
    ispermutefeats=False,
    train=train_knn,
    test=test_knn,
    param=param,
    cur_features=features,
    score_function=None,
    isparamFolds=False,
)

In [None]:
# Run 5: base columns plus knn_set23 and knn_set1567_1 together.
features = [name for name in train_orig.columns if name not in exclude_cols]
features.extend(knn_set23 + knn_set1567_1)
print(features)
(fold_importance_df, predictions5, oof5,
 overall_imp_df, overall_sel_feats) = runlgb(
    ispermutefeats=False,
    train=train_knn,
    test=test_knn,
    param=param,
    cur_features=features,
    score_function=None,
    isparamFolds=False,
)

In [None]:
# Run 5b: base columns plus knn_set23 and knn_set1567_2 together.
features = [name for name in train_orig.columns if name not in exclude_cols]
features.extend(knn_set23 + knn_set1567_2)
print(features)
(fold_importance_df, predictions5b, oof5b,
 overall_imp_df, overall_sel_feats) = runlgb(
    ispermutefeats=False,
    train=train_knn,
    test=test_knn,
    param=param,
    cur_features=features,
    score_function=None,
    isparamFolds=False,
)

In [None]:
# Run 5c: base columns plus knn_set23 and the full knn_set1567 list.
features = [name for name in train_orig.columns if name not in exclude_cols]
features.extend(knn_set23 + knn_set1567)
print(features)
(fold_importance_df, predictions5c, oof5c,
 overall_imp_df, overall_sel_feats) = runlgb(
    ispermutefeats=False,
    train=train_knn,
    test=test_knn,
    param=param,
    cur_features=features,
    score_function=None,
    isparamFolds=False,
)

In [None]:
# Run 6: base columns plus the two knn_8.* features.
features = [name for name in train_orig.columns if name not in exclude_cols]
features.extend(['knn_8.1', 'knn_8.2'])
print(features)
(fold_importance_df, predictions6, oof6,
 overall_imp_df, overall_sel_feats) = runlgb(
    ispermutefeats=False,
    train=train_knn,
    test=test_knn,
    param=param,
    cur_features=features,
    score_function=None,
    isparamFolds=False,
)

In [None]:
# features = [c for c in train_orig.columns if c not in exclude_cols]
# features += knn_set23 + knn_set1567_1 + ['knn_8.1','knn_8.2']
# print(features)
# fold_importance_df,predictions7,oof7,overall_imp_df,overall_sel_feats = \
#         runlgb(False,train_knn,test_knn,param,features,score_function=None,
#               isparamFolds=False)

In [None]:
# features = [c for c in train_orig.columns if c not in exclude_cols]
# features +=['knn_8.2']
# print(features)
# fold_importance_df,predictions5,oof5,overall_imp_df,overall_sel_feats = \
#         runlgb(False,train_knn,test_knn,param,features,score_function=None,
#               isparamFolds=False)

In [None]:
# oof1 = oof.copy() # without knn
# oof2 = oof.copy() # knn_1.4
# oof3 = oof.copy()  # knn 2,3
# oof4 = oof.copy() # knn 1,5,6,7

In [None]:
# Blend weights for the two down-weighted members; every other member has weight 1.
w3=0.8
w5b = 0.1

# One (weight, OOF prediction, test prediction) entry per model. Building both
# blends from this single list keeps them on the same scale, which matters
# because the cutoff tuned on `oof` is later applied to `predictions`.
# The previous test blend left out predictions4b although oof4b was in the OOF blend.
# NOTE(review): oof1 / predictions1 must already exist in the kernel; the cell in
# this section that would produce them (baseline run) is commented out.
blend_members = [
    (1.0, oof1,  predictions1),
    (w3,  oof3,  predictions3),
    (1.0, oof4,  predictions4),
    (1.0, oof4b, predictions4b),
    (1.0, oof5,  predictions5),
    (w5b, oof5b, predictions5b),
    (1.0, oof5c, predictions5c),
    (1.0, oof6,  predictions6),
]

oof = sum(weight * member_oof for weight, member_oof, _ in blend_members)
# Earlier blends that were tried:
# oof =  oof1 +  w3*oof3 + oof4 + oof4b + oof5 + oof5b + oof5c + oof6
# oof =  oof1 +  w3*oof3 + oof4 + oof5 + oof6
# oof =  oof1 + 0.1*oof2+ 0.8*oof3 + oof4 + oof5 
# oof =  oof1 + 0.1*oof2+ 0.8*oof3 + oof4 + oof5 + oof6
# oof =  oof1 + 0.8*oof3 + oof4 + oof5 + oof6
print('Ens AUC:',roc_auc_score(target,oof))
opt_cutoff, f1score = get_opt_cutoff_prec(target,oof)
print('opt_cutoff:',opt_cutoff)
print('f1 score:',f1score)

predictions = sum(weight * member_pred for weight, _, member_pred in blend_members)

In [None]:
# Convert the blended scores to hard labels.
# The cutoff is derived from the OOF blend via get_opt_cutoff_prec (its second return
# value is kept as f1score) and the same cutoff is then applied to the test blend,
# so `oof` and `predictions` need to be on the same scale.
opt_cutoff, f1score = get_opt_cutoff_prec(target,oof)
oof_labels = convert_probtolabels(oof,cutoff=opt_cutoff)

predictions_labels = convert_probtolabels(predictions,cutoff=opt_cutoff)

In [None]:
# Submission file: one row per test id with the predicted label.
# NOTE(review): the id column is called "loan_id" although other frames in this
# notebook carry user_id / video_id columns - confirm test_orig really has it.
sub_df = pd.DataFrame({"loan_id": test_orig["loan_id"].values})
sub_df[targetcol] = predictions_labels
sub_df.to_csv("submission_knnfeats_euclidean.csv", index=False)

# (file name, array) pairs, saved in the listed order: OOF outputs first,
# then test-set outputs.
_oof_artifacts = [
    ('oof_labels_knnfeats_euclidean.npy', oof_labels),
    ('oof_knnfeats_euclidean.npy', oof),
    ('oof1_knnfeats_euclidean.npy', oof1),
    # ('oof2_knnfeats_euclidean.npy', oof2),
    ('oof3_knnfeats_euclidean.npy', oof3),
    ('oof4_knnfeats_euclidean.npy', oof4),
    ('oof4b_knnfeats_euclidean.npy', oof4b),
    ('oof5_knnfeats_euclidean.npy', oof5),
    ('oof5b_knnfeats_euclidean.npy', oof5b),
    ('oof5c_knnfeats_euclidean.npy', oof5c),
    ('oof6_knnfeats_euclidean.npy', oof6),
]

_pred_artifacts = [
    ('pred_labels_knnfeats_euclidean.npy', predictions_labels),
    ('pred_knnfeats_euclidean.npy', predictions),
    ('pred1_knnfeats_euclidean.npy', predictions1),
    # ('pred2_knnfeats_euclidean.npy', predictions2),
    ('pred3_knnfeats_euclidean.npy', predictions3),
    ('pred4_knnfeats_euclidean.npy', predictions4),
    ('pred4b_knnfeats_euclidean.npy', predictions4b),
    ('pred5_knnfeats_euclidean.npy', predictions5),
    ('pred5b_knnfeats_euclidean.npy', predictions5b),
    ('pred5c_knnfeats_euclidean.npy', predictions5c),
    ('pred6_knnfeats_euclidean.npy', predictions6),
]

for _npy_path, _npy_array in _oof_artifacts + _pred_artifacts:
    np.save(_npy_path, _npy_array)