[data](https://www.kaggle.com/c/santander-customer-satisfaction/data)

In [3]:
import joblib #for loading pickle files
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import sklearn
from sklearn import metrics,ensemble,model_selection,linear_model,tree,calibration,cluster
import xgboost as xgb
import lightgbm as lgb 
import scipy
import random
from tqdm import tqdm
import os
import itertools
np.random.seed(13154)

In [4]:
def remove_null_variance(train,test):
    """
    Drop every feature whose variance over the train rows is exactly zero.

    The decision is taken on `train` only; each column found is deleted from
    both `train` and `test` in place. The number of removed features is
    printed and nothing is returned.
    """
    constant_cols = [name for name in train.columns if train[name].var()==0]
    for name in constant_cols:
        del train[name]
        del test[name]
    print("%i features were found to have zero variance and these were all removed."%(len(constant_cols)))

In [5]:
def remove_sparse(train,test):
    """
    Drop sparse features, i.e. columns whose 99th percentile over the
    train rows equals 0.

    The percentile is computed on `train` only; matching columns are deleted
    from both frames in place. Prints how many were removed, returns nothing.
    """
    sparse_cols = [name for name in train.columns if np.percentile(train[name],99)==0]
    for name in sparse_cols:
        del train[name]
        del test[name]
    print("%i features were found to be sparse and these were all removed."%(len(sparse_cols)))

In [6]:
def remove_identical_columns(train,test):
    """
    Drop columns of `train` that hold exactly the same values as another
    column (only their names differ).

    Every pair of train columns is compared with `Series.equals`; when a pair
    matches, the first column of the pair is marked for removal and is not
    compared again. The marked columns are dropped in place from both frames.
    """
    duplicated = []
    for left,right in itertools.combinations(train.columns,2):
        # a column already marked for removal takes no part in later comparisons
        if left in duplicated or right in duplicated:
            continue
        if train[left].equals(train[right]):
            duplicated.append(left)
    train.drop(duplicated,axis=1,inplace=True)
    test.drop(duplicated,axis=1,inplace=True)
    print("%i were found to be duplicated columns and %i of these were removed."%(len(duplicated)*2,len(duplicated)))

In [7]:
def preprocessing(X_test):
    """
    Prune the raw train data and `X_test` down to the informative columns.

    Reads './train.csv', then removes zero-variance, sparse and duplicated
    columns (all decided on the train rows) from both frames. The results are
    pickled to './preprocessed_train.pkl' and './preprocessed_test.pkl'.

    Parameters
    ----------
    X_test : pandas.DataFrame
        Raw test rows with the same feature columns as './train.csv'.
        The caller's frame is left untouched.

    Returns
    -------
    (X_train, X_test) : tuple of pandas.DataFrame
        The pruned train frame and the pruned copy of the test frame.
    """
    X_train = pd.read_csv('./train.csv')
    # The helpers below delete columns in place. Work on a private copy so the
    # caller's frame (possibly a slice of a larger frame) is never modified.
    X_test = X_test.copy()
    #preprocessing
    remove_null_variance(train=X_train,test=X_test)
    remove_sparse(train=X_train,test=X_test)
    remove_identical_columns(train=X_train,test=X_test)

    X_train.to_pickle("./preprocessed_train.pkl")
    X_test.to_pickle("./preprocessed_test.pkl")
    print("Preprocessing completed")
    return X_train,X_test

In [8]:
def create_var15_below_23(train,test):
    """
    Add the integer flag column 'var15_below_23' to both frames in place:
    1 where 'var15' is strictly below 23, 0 otherwise.
    """
    print("Creating 'var15_below_23' feature")
    for frame in (train,test):
        frame['var15_below_23'] = (frame['var15'].values<23).astype(int)

In [9]:
def bin_var15(train,test):
    """
    Replace 'var15' in both frames (in place) by its bin index, using 5
    equal-width bins whose edges are derived from the train values.

    The outermost edges are widened to -inf/+inf so that a test value lying
    outside the range seen in train falls into the first/last bin instead of
    becoming NaN. Train values get exactly the same bin index as with the
    finite edges, because they all lie inside the original range.
    """
    print("binning 'var15' feature")
    _,bins = pd.cut(train['var15'].values,5,retbins=True) #getting the bins
    bins = np.array(bins,dtype=float)
    bins[0],bins[-1] = -np.inf,np.inf #open-ended outer bins for unseen test values
    train['var15'] = pd.cut(train['var15'].values,bins,labels=False)
    test['var15'] = pd.cut(test['var15'].values,bins,labels=False)

In [10]:
def add_feature_no_zeros(train,test):
    """
    Add two per-row count features to both frames in place:
    'no_zeros' (how many values equal 0) and 'no_nonzeros' (how many do not).

    The counted columns are all columns of `train` except 'TARGET', fixed
    before any new column is added.
    """
    print("Creating 'no_zeros' and 'no_nonzeroes' feature")
    counted = [name for name in train.columns if name!='TARGET']
    for frame in (train,test):
        zero_mask = frame[counted]==0
        frame['no_zeros'] = zero_mask.sum(axis=1).values
        frame['no_nonzeros'] = (~zero_mask).sum(axis=1).values

In [11]:
def add_feature_no_zeros_keyword(keyword,train,test):
    """
    Same per-row zero / non-zero counts as `add_feature_no_zeros`, but
    restricted to the train columns whose name contains `keyword`.

    Adds 'no_zeros_<keyword>' and 'no_nonzeros_<keyword>' to both frames
    in place.
    """
    print("Creating 'no_zeros' and 'no_nonzeroes' feature for %s keyword"%(keyword))
    matching = [name for name in train.columns if keyword in name]
    for frame in (train,test):
        zero_mask = frame[matching]==0
        frame['no_zeros_'+keyword] = zero_mask.sum(axis=1).values
        frame['no_nonzeros_'+keyword] = (~zero_mask).sum(axis=1).values

In [12]:
def _add_group_means(df,col,features):
  """
  For every feature append the column 'avg_<col>_<feature>' to `df` (in place),
  holding the mean of that feature over the rows sharing the same `col` value.
  """
  # one boolean mask per category, built once and reused for every feature
  category_masks = [(df[col]==value).values for value in df[col].unique()]
  for feature in features:
    averaged = np.zeros(df.shape[0])
    for mask in category_masks:
      averaged[mask] = df.loc[mask,feature].mean() #taking average for each category for feature col
    df['avg_'+col+'_'+feature] = averaged


def average_col(col,features,train,test):
  """
  Gets the average numerical values of features for each category in 'col' feature
  and add it as feature

  The averages are computed separately within each frame. `test` is always
  processed. `train` is processed only while './train_average.pkl' does not
  exist; when that cache file is present the caller is expected to load the
  train frame from it instead.

  This function never writes the cache itself. It is called once per
  categorical column, so writing after the first call would make every later
  call skip `train` (and the previous code pickled the *test* frame by
  mistake). The caller persists the train frame once all columns are done.
  """
  print("Creating average numerical values feature")
  if "train_average.pkl" not in os.listdir('./'): #less time
    frames = [train,test]
  else:
    frames = [test]
  for df in frames:
    _add_group_means(df,col,features)

In [51]:
def stdzation(train,test):
  """
  Standardize, in place, every column of both frames except 'TARGET' and
  'ID', using the already fitted scaler stored in './scaler.pkl'.

  The scaled values are assigned as whole columns (`df[col] = ...`) rather
  than through `df.loc[:, col] = ...`: the latter tries to write the float
  result into the existing (possibly integer) columns, which recent pandas
  versions deprecate as an incompatible-dtype assignment.
  """
  print("Applying standardization")
  filepath = './scaler.pkl'
  col=[i for i in train.columns if i not in ('TARGET','ID')]
  # NOTE: the scaler is unpickled from a local artifact; only load trusted files.
  with open(filepath,'rb') as f:
    scaler = joblib.load(f)
  train[col] = scaler.transform(train[col])
  test[col] = scaler.transform(test[col])

In [85]:
def add_kmeans_cluster(train,test,ncluster=(2,4,6,8,10)):
    """
    Function add Kmeans cluster value as features for n in ncluster

    For each n the fitted model is loaded from 'kmeans_<n>.pkl' and its
    prediction is stored in the column 'Kmeans_<n>'. Predictions are made on
    the feature matrix as it was on entry (without 'ID'/'TARGET' and without
    the cluster columns added here).

    `test` is always labelled in place. `train` is labelled in place and then
    cached to './train_cluster.pkl' only when that file does not exist yet;
    otherwise the cached frame is loaded and returned instead of `train`.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
    """
    print('adding kmeans cluster features')
    train_is_cached = 'train_cluster.pkl' in os.listdir('./')
    X_te = test.drop(['ID'],axis=1)
    # the train matrix is only needed when train has to be labelled here
    X_tr = None if train_is_cached else train.drop(['TARGET','ID'],axis=1)
    for n in ncluster:
        print("for n=%i:"%(n))
        filepath = 'kmeans_'+str(n)+'.pkl' #model pkl
        feat_name = 'Kmeans_'+str(n)
        with open(filepath,'rb') as f:
            kmeans = joblib.load(f)
        if not train_is_cached:
            train[feat_name] = kmeans.predict(X_tr)
        test[feat_name] = kmeans.predict(X_te)
    if train_is_cached:
        train = pd.read_pickle('./train_cluster.pkl')
    elif len(ncluster)>0:
        # written once, after all cluster columns exist (was rewritten per n)
        train.to_pickle('./train_cluster.pkl')
    return train,test

In [86]:
def remove_corr_var(train,test,target_threshold = 10**-3,within_threshold=0.95):
  """
  Remove correlated features that have low correlation with target
  and have high correlation with each other (keeping one)

  All correlations are absolute Pearson correlations computed on `train`
  (without 'ID'). Two passes are made, both dropping in place from `train`
  and `test`:

  1. features whose correlation with 'TARGET' is <= `target_threshold`;
  2. among the remaining features, every feature whose correlation with a
     feature appearing earlier in the column order is > `within_threshold`.

  Nothing is returned; a summary is printed.
  """
  print("Removing features based on correlation and variance")
  #removing all low correlated variables with target
  initial_feature = train.shape[1]
  corr = train.drop("ID",axis=1).corr().abs()
  corr_target = pd.DataFrame(corr['TARGET']).sort_values(by='TARGET')
  threshold=target_threshold
  feat_df =corr_target[(corr_target['TARGET'])<=threshold]
  print("There are %i features that have a correlation values less than %.3f with 'TARGET'. We will remove all of this."\
        %(feat_df.shape[0],threshold))
  print("Removing.........")

  for df in [train,test]:
    df.drop(feat_df.index,axis=1,inplace=True)


  #reomving highly correlated features(keeping one)
  #https://www.dezyre.com/recipes/drop-out-highly-correlated-features-in-python
  # restrict the matrix to feature-vs-feature entries of the surviving features
  corr.drop('TARGET',axis=1,inplace=True)
  corr.drop('TARGET',axis=0,inplace=True)
  corr.drop(feat_df.index,axis=1,inplace=True)
  corr.drop(feat_df.index,inplace=True)
  threshold = within_threshold
  # builtin bool: the np.bool alias was removed from NumPy (AttributeError since 1.24)
  upper = corr.where(np.triu(np.ones(corr.shape),k=1).astype(bool)) #getting upper traingle of correlation matrix
  column = [col for col in upper.columns if any(upper[col]>threshold)] #getting all columns that have high correlation with one of the features
  print("There are %i features that have high correlation with another feature with threshold being kept as %.3f and above. We will remove all of this."\
        %(len(column),threshold))
  print("Removing.........")
  for df in [train,test]:
    df.drop(column,axis=1,inplace=True)

  print("The features were changed from %i to %i. %i features were removed."%(initial_feature,train.shape[1],initial_feature-train.shape[1]))


In [87]:
def apply_log(train,test,column):
  """
  Return log-transformed copies of `train` and `test`.

  For each name in `column`, the natural log replaces every value that is
  >= 0; negative values are kept as they are. The input frames are not
  modified.

  NOTE: a value of exactly 0 satisfies the >= 0 test, so it is mapped to
  log(0) = -inf.
  """
  transformed = []
  for source in (train,test):
    frame = source.copy()
    for name in column:
      non_negative = frame[name]>=0
      frame.loc[non_negative,name] = np.log(frame.loc[non_negative,name].values)
    transformed.append(frame)
  tr,te = transformed
  return tr,te

In [88]:
def response_encoding(df,test_df,column,target='TARGET',alpha=5000):
    """
    Here we are using response encoding with laplace smoothing to a categorical column
    and transform the respective column in the train and test datasets.

    For each category value v seen in `df[column]` two scores are stored,
    both rounded to 2 decimals:

        <column>_1 : (count(v, target==1) + alpha) / (count(v) + alpha*n_categories)
        <column>_0 : (count(v, target!=1) + alpha) / (count(v) + alpha*n_categories)

    Categories that appear in `test_df` but not in `df` get 0.5 for both.
    Both frames are modified in place: the two score columns are added and
    `column` itself is dropped. Nothing is returned.
    """
    feature = column+'_1'
    feature_ = column + '_0'
    unique_values = set(df[column].values) #all unique values in that categorical column
    n_categories = len(unique_values)
    is_positive = df[target]==1
    dict_values = {} #storing the response encoding values for target=1
    dict_values_ = {} #storing the response encoding values for target=0
    for value in unique_values:
        in_category = df[column]==value
        total = int(in_category.sum()) #the total no. of datapoints with 'value' catgeory
        # Parenthesised on purpose: `a & b==1` parses as `(a & b)==1`, which only
        # happened to give the right count because the target is 0/1.
        sum_promoted = int((in_category & is_positive).sum()) #no. of all datapoints with category being 'value' and target==1
        sum_unpromoted = total-sum_promoted #no. of all datapoints with category being 'value' and target==0
        dict_values[value] = np.round((sum_promoted+alpha)/(total+alpha*n_categories),2) #storing the obtained result in a dictionary
        dict_values_[value] = np.round((sum_unpromoted+alpha)/(total+alpha*n_categories),2)
    dict_values['unknown']=0.5 #unknown categories that are not seen in train will be assigned a score of 0.5
    dict_values_['unknown'] = 0.5
    df[feature]=(df[column].map(dict_values)).values
    df[feature_] = (df[column].map(dict_values_)).values
    df.drop(column,axis=1,inplace=True)

    # categories only present in test; computed once instead of once per row
    unseen = set(test_df[column].values)-unique_values
    test_df[column]=test_df[column].apply(lambda x: 'unknown' if x in unseen else x )
    test_df[feature] = (test_df[column].map(dict_values)).values
    test_df[feature_] = (test_df[column].map(dict_values_)).values
    test_df.drop(column,axis=1,inplace=True)
        
        
        


In [89]:
def add_pca_features(train,test,n=2):
  """
  Add pca values as features with n resulting components

  The already fitted model is loaded from './pca.pkl' and applied to the
  feature columns of each frame (all columns except 'ID', and 'TARGET' for
  train). The first `n` columns of the projection are stored in place as
  'pca_0' ... 'pca_<n-1>', so `n` must not exceed the number of components
  the loaded model produces.

  The previous body also constructed a fresh `sklearn.decomposition.PCA`
  that was overwritten by the loaded model straight away; it has been
  removed because it did nothing and depended on `sklearn.decomposition`
  having been imported elsewhere.
  """
  feat_names = ['pca_'+str(i) for i in range(n)]
  with open('./pca.pkl','rb') as f:
    pca = joblib.load(f)

  for df,non_features in ((train,["ID","TARGET"]),(test,["ID"])):
    X_embedded = pca.transform(df.drop(non_features,axis=1))
    for i,feat in enumerate(feat_names):
      df[feat] = X_embedded[:,i]

In [90]:
def feature_engineering(X_test):
    """
    Computes feature engineering

    Loads the pruned train frame from './preprocessed_train.pkl' and applies
    the same steps to it and to `X_test`, in this order: var15 flag and
    binning, zero/non-zero counts (overall and per keyword), per-category
    averages, standardization, KMeans cluster labels, correlation-based
    pruning, log transform, response encoding and PCA components.

    `X_test` is expected to be the frame produced by `preprocessing`; several
    of the steps add or drop its columns in place.

    Returns
    -------
    (X_train, X_test) : tuple of pandas.DataFrame
    """
    #creating features
    X_train = pd.read_pickle('./preprocessed_train.pkl')

    create_var15_below_23(train=X_train,test=X_test)

    bin_var15(train=X_train,test=X_test)

    add_feature_no_zeros(train=X_train,test=X_test)

    keywords = ['saldo' , 'ind', 'num', 'imp']
    for k in keywords:
        add_feature_no_zeros_keyword(k,X_train,X_test)

    #we will be taking average columns for saldo and imp with categorical columns being the ones that have between 50 and 210 unqiue values
    features = [i for i in X_train.columns if ('saldo' in i) and ('no_zeros' not in i)]
    features.extend([i for i in X_train.columns if ('imp' in i) and ('no_zeros' not in i)])
    n_unique = X_train.nunique() #computed once for all columns
    columns = [i for i in X_train.columns if 50 < n_unique[i] <= 210] #categorical features
    for col in tqdm(columns):
        average_col(col,features,X_train,X_test)
    # first run: cache the train frame with its average columns;
    # later runs: reuse the cached frame instead of the one built above
    if 'train_average.pkl' not in os.listdir('./'):
        X_train.to_pickle('./train_average.pkl')
    else:
        X_train = pd.read_pickle("train_average.pkl")
    stdzation(X_train,X_test)
    X_train,X_test = add_kmeans_cluster(train=X_train,test=X_test)
    remove_corr_var(train=X_train,test=X_test)

    #log transformation to saldo and imp features whose train values are all non-negative
    features = [i for i in X_train.columns if (('saldo' in i) or ('imp' in i)) and ((X_train[i].values>=0).all())]
    X_train,X_test = apply_log(X_train,X_test,features)

    #response encoding columns which have 3-10 unique values (cluster labels and TARGET excluded)
    cat_col = []
    for col in X_train.columns:
        n_values = X_train[col].nunique()
        if (2 < n_values <= 10) and (col!='TARGET') and ('Kmeans' not in col):
            cat_col.append(col)

    alpha=100
    for col in tqdm(cat_col):
        response_encoding(X_train,X_test,col,alpha=alpha)


    print("adding pca features to the datasets")
    add_pca_features(X_train,X_test) #adding pca feature to log response encoded data
    print("Feature Engineering done.....")
    return X_train,X_test


In [91]:
def get_top_features(df,feat_imp,top=50):
  """
  Select the most important columns of `df` given one importance per column.

  If `top` lies in (0, 1] it is used as a threshold: a list of the column
  names whose importance is >= `top` is returned (in column order).
  Otherwise `top` is used as a count: the `top` columns with the largest
  importances are returned as a pandas Index, most important first.
  """
  if 0. < top <= 1.:
    # threshold mode
    return [name for name,importance in zip(df.columns,feat_imp) if importance>=top]
  # count mode: indices sorted by decreasing importance, truncated to `top`
  ranked = np.argsort(feat_imp)[::-1][:top]
  return df.columns[ranked]

In [92]:
def top_features_log(X_train,X_test,y_train):
    """
    Gets the top 250 features for log transformed data

    On the first run the model stored in 'top_feature_log.pkl' supplies the
    feature importances, the 250 most important columns are selected and the
    reduced train frame is cached to './log_train_re_250.pkl'. On later runs
    the column selection is read back from that cache.

    `y_train` is accepted for interface compatibility but not used: the
    previous body made a stratified train/validation split whose result was
    never read, so it has been removed.

    Returns
    -------
    (X_train_250, X_test_250) : tuple of pandas.DataFrame
    """
    if 'log_train_re_250.pkl' not in os.listdir('./'):
        filepath = 'top_feature_log.pkl'
        model = joblib.load(filepath)
        features = get_top_features(X_train,model.feature_importances_,top=250)
        X_train_250 = X_train.loc[:,features]
        X_test_250 = X_test.loc[:,features]
        X_train_250.to_pickle('./log_train_re_250.pkl')
    else:
        X_train_250 = X_train.loc[:,pd.read_pickle('./log_train_re_250.pkl').columns]
        X_test_250 = X_test.loc[:,X_train_250.columns]

    return X_train_250,X_test_250

In [93]:
def modelling_log(X_train,X_test,y_train):
    """
    Score `X_test` with the model pickled in './model_log.pkl'.

    Returns the second column of the model's `predict_proba` output, one
    value per test row. `X_train` and `y_train` are not used here.
    """
    with open('./model_log.pkl','rb') as model_file:
        classifier = joblib.load(model_file)
    probabilities = classifier.predict_proba(X_test)
    return probabilities[:,1]

In [94]:
def top_features_normal(X_train,X_test,y_train):
    """
    Gets the top 250 features for normal transformed data

    On the first run a pickled model supplies the feature importances, the
    250 most important columns are selected and the reduced train frame is
    cached to './normal_train_re_250.pkl'. On later runs the column selection
    is read back from that cache.

    `y_train` is accepted for interface compatibility but not used: the
    previous body made a stratified train/validation split whose result was
    never read, so it has been removed.

    Returns
    -------
    (X_train_250, X_test_250) : tuple of pandas.DataFrame
    """
    if 'normal_train_re_250.pkl' not in os.listdir('./'):
        # `filepath` was never defined in this function (NameError on a cold cache).
        # NOTE(review): file name assumed by analogy with 'top_feature_log.pkl'
        # used by top_features_log - confirm against the saved artifacts.
        filepath = 'top_feature_normal.pkl'
        model = joblib.load(filepath)
        features = get_top_features(X_train,model.feature_importances_,top=250)
        X_train_250 = X_train.loc[:,features]
        X_test_250 = X_test.loc[:,features]
        X_train_250.to_pickle('./normal_train_re_250.pkl')
    else:
        X_train_250 = X_train.loc[:,pd.read_pickle('./normal_train_re_250.pkl').columns]
        X_test_250 = X_test.loc[:,X_train_250.columns]

    return X_train_250,X_test_250

In [95]:
def modelling_normal(X_train,X_test,y_train):
    """
    Score `X_test` with the model pickled in './model_normal.pkl'.

    Returns the second column of the model's `predict_proba` output, one
    value per test row. `X_train` and `y_train` are not used here.
    """
    with open('./model_normal.pkl','rb') as model_file:
        classifier = joblib.load(model_file)
    probabilities = classifier.predict_proba(X_test)
    return probabilities[:,1]

In [96]:
def final1(test):
    """
    End-to-end prediction for a raw input frame.

    Runs preprocessing and feature engineering, selects the two 250-feature
    subsets, scores each with its pickled model and returns the plain average
    of the two score arrays (one value per input row).
    """
    # stage 1 + 2: prune columns, then build the engineered features
    pruned_test = preprocessing(test)[1]
    train_fe,test_fe = feature_engineering(pruned_test)

    # split the engineered train frame into features and labels
    y_train = train_fe['TARGET'].values
    X_train = train_fe.drop(['ID','TARGET'],axis=1)
    X_test = test_fe.drop('ID',axis=1)
    del train_fe,test_fe,pruned_test

    # stage 3: the two feature subsets
    log_train,log_test = top_features_log(X_train,X_test,y_train) #get top 250 features for log
    normal_train,normal_test = top_features_normal(X_train,X_test,y_train) #get top 250 features for normal
    del X_train,X_test

    # stage 4: score with both models and average them (simple ensembling)
    proba_log = modelling_log(log_train,log_test,y_train)
    proba_normal = modelling_normal(normal_train,normal_test,y_train)
    return (proba_log+proba_normal)/2

In [97]:
def final2(test,y_test):
    """
    returns auc score when test and y_test are given

    `test` is a raw input frame in the same format `final1` accepts (feature
    columns and 'ID'); the true labels are passed separately as `y_test`.
    The predictions come from `final1`, so both entry points always run the
    exact same pipeline; the ROC AUC of those predictions against `y_test`
    is returned.
    """
    y_pred = final1(test)

    #score
    auc_score = metrics.roc_auc_score(y_test,y_pred)
    return auc_score

In [98]:
def create_testy(y_test,test=None):
  """
  creates submission test file csv

  Writes './submission.csv' with two columns: 'ID' taken from `test` and
  'TARGET' taken from `y_test`.

  Parameters
  ----------
  y_test : array-like
      One prediction per row of `test`.
  test : pandas.DataFrame, optional
      Frame providing the 'ID' column. When omitted, the notebook-level
      variable `X_test_l` is used, as before; that name must then exist in
      the running kernel.

  The source frame is no longer modified (the previous body dropped all of
  its non-ID columns in place).
  """
  if test is None:
    test = X_test_l  # legacy fallback on the notebook global
  submission = test[['ID']].copy()
  submission['TARGET'] = y_test
  submission.to_csv('./submission.csv',index=False)

In [99]:
# Load the raw csv files; y_train holds the 'TARGET' column of the train rows.
test = pd.read_csv('./test.csv')
train = pd.read_csv('./train.csv')
y_train = train['TARGET'].values

In [100]:
%time y_test = final1(test)

34 features were found to have zero variance and these were all removed.
188 features were found to be sparse and these were all removed.
12 were found to be duplicated columns and 6 of these were removed.
Preprocessing completed
Creating 'var15_below_23' feature
binning 'var15' feature
Creating 'no_zeros' and 'no_nonzeroes' feature
Creating 'no_zeros' and 'no_nonzeroes' feature for saldo keyword
Creating 'no_zeros' and 'no_nonzeroes' feature for ind keyword
Creating 'no_zeros' and 'no_nonzeroes' feature for num keyword
Creating 'no_zeros' and 'no_nonzeroes' feature for imp keyword


  0%|                                                                                           | 0/19 [00:00<?, ?it/s]

Creating average numerical values feature


  5%|████▎                                                                              | 1/19 [00:21<06:34, 21.92s/it]

Creating average numerical values feature


 11%|████████▋                                                                          | 2/19 [00:33<05:17, 18.68s/it]

Creating average numerical values feature


 16%|█████████████                                                                      | 3/19 [00:45<04:29, 16.83s/it]

Creating average numerical values feature


 21%|█████████████████▍                                                                 | 4/19 [01:02<04:14, 16.99s/it]

Creating average numerical values feature


 26%|█████████████████████▊                                                             | 5/19 [01:18<03:51, 16.55s/it]

Creating average numerical values feature


 32%|██████████████████████████▏                                                        | 6/19 [01:39<03:52, 17.85s/it]

Creating average numerical values feature


 37%|██████████████████████████████▌                                                    | 7/19 [01:57<03:35, 17.92s/it]

Creating average numerical values feature


 42%|██████████████████████████████████▉                                                | 8/19 [02:16<03:21, 18.33s/it]

Creating average numerical values feature


 47%|███████████████████████████████████████▎                                           | 9/19 [02:41<03:21, 20.16s/it]

Creating average numerical values feature


 53%|███████████████████████████████████████████▏                                      | 10/19 [03:03<03:08, 20.89s/it]

Creating average numerical values feature


 58%|███████████████████████████████████████████████▍                                  | 11/19 [03:31<03:02, 22.86s/it]

Creating average numerical values feature


 63%|███████████████████████████████████████████████████▊                              | 12/19 [03:59<02:50, 24.43s/it]

Creating average numerical values feature


 68%|████████████████████████████████████████████████████████                          | 13/19 [04:26<02:32, 25.35s/it]

Creating average numerical values feature


 74%|████████████████████████████████████████████████████████████▍                     | 14/19 [04:58<02:15, 27.13s/it]

Creating average numerical values feature


 79%|████████████████████████████████████████████████████████████████▋                 | 15/19 [05:37<02:03, 30.96s/it]

Creating average numerical values feature


 84%|█████████████████████████████████████████████████████████████████████             | 16/19 [06:13<01:37, 32.45s/it]

Creating average numerical values feature


 89%|█████████████████████████████████████████████████████████████████████████▎        | 17/19 [06:51<01:08, 34.07s/it]

Creating average numerical values feature


 95%|█████████████████████████████████████████████████████████████████████████████▋    | 18/19 [07:25<00:33, 33.94s/it]

Creating average numerical values feature


100%|██████████████████████████████████████████████████████████████████████████████████| 19/19 [08:01<00:00, 25.36s/it]


Applying standardization
adding kmeans cluster features
for n=2:
for n=4:
for n=6:
for n=8:
for n=10:
Removing features based on correlation and variance
There are 46 features that have a correlation values less than 0.001 with 'TARGET'. We will remove all of this.
Removing.........
There are 553 features that have high correlation with another feature with threshold being kept as 0.950 and above. We will remove all of this.
Removing.........
The features were changed from 957 to 358. 599 features were removed.


100%|██████████████████████████████████████████████████████████████████████████████████| 16/16 [00:24<00:00,  1.51s/it]


adding pca features to the datasets
Feature Engineering done.....
Wall time: 14min 31s


In [101]:
y_test

array([0.05192093, 0.04664471, 0.002612  , ..., 0.00242902, 0.03230181,
       0.00225586], dtype=float32)

In [102]:
# The "test" input here is the train frame itself (labels removed), so the AUC
# computed from it is an in-sample score, not a held-out estimate.
X_test = train.drop(['TARGET'],axis=1)
y_test = y_train
%time auc_score = final2(X_test,y_test)

34 features were found to have zero variance and these were all removed.
188 features were found to be sparse and these were all removed.
12 were found to be duplicated columns and 6 of these were removed.
Preprocessing completed
Creating 'var15_below_23' feature
binning 'var15' feature
Creating 'no_zeros' and 'no_nonzeroes' feature
Creating 'no_zeros' and 'no_nonzeroes' feature for saldo keyword
Creating 'no_zeros' and 'no_nonzeroes' feature for ind keyword
Creating 'no_zeros' and 'no_nonzeroes' feature for num keyword
Creating 'no_zeros' and 'no_nonzeroes' feature for imp keyword


  0%|                                                                                           | 0/19 [00:00<?, ?it/s]

Creating average numerical values feature


  5%|████▎                                                                              | 1/19 [00:22<06:50, 22.81s/it]

Creating average numerical values feature


 11%|████████▋                                                                          | 2/19 [00:31<05:16, 18.62s/it]

Creating average numerical values feature


 16%|█████████████                                                                      | 3/19 [00:43<04:26, 16.68s/it]

Creating average numerical values feature


 21%|█████████████████▍                                                                 | 4/19 [01:00<04:10, 16.71s/it]

Creating average numerical values feature


 26%|█████████████████████▊                                                             | 5/19 [01:16<03:49, 16.41s/it]

Creating average numerical values feature


 32%|██████████████████████████▏                                                        | 6/19 [01:36<03:46, 17.44s/it]

Creating average numerical values feature


 37%|██████████████████████████████▌                                                    | 7/19 [01:55<03:34, 17.89s/it]

Creating average numerical values feature


 42%|██████████████████████████████████▉                                                | 8/19 [02:14<03:21, 18.34s/it]

Creating average numerical values feature


 47%|███████████████████████████████████████▎                                           | 9/19 [02:37<03:18, 19.86s/it]

Creating average numerical values feature


 53%|███████████████████████████████████████████▏                                      | 10/19 [02:59<03:04, 20.47s/it]

Creating average numerical values feature


 58%|███████████████████████████████████████████████▍                                  | 11/19 [03:25<02:57, 22.16s/it]

Creating average numerical values feature


 63%|███████████████████████████████████████████████████▊                              | 12/19 [03:53<02:46, 23.78s/it]

Creating average numerical values feature


 68%|████████████████████████████████████████████████████████                          | 13/19 [04:20<02:28, 24.83s/it]

Creating average numerical values feature


 74%|████████████████████████████████████████████████████████████▍                     | 14/19 [04:52<02:14, 26.87s/it]

Creating average numerical values feature


 79%|████████████████████████████████████████████████████████████████▋                 | 15/19 [05:32<02:03, 30.90s/it]

Creating average numerical values feature


 84%|█████████████████████████████████████████████████████████████████████             | 16/19 [06:10<01:38, 32.93s/it]

Creating average numerical values feature


 89%|█████████████████████████████████████████████████████████████████████████▎        | 17/19 [06:56<01:13, 36.88s/it]

Creating average numerical values feature


 95%|█████████████████████████████████████████████████████████████████████████████▋    | 18/19 [07:38<00:38, 38.33s/it]

Creating average numerical values feature


100%|██████████████████████████████████████████████████████████████████████████████████| 19/19 [08:19<00:00, 26.28s/it]


Applying standardization
adding kmeans cluster features
for n=2:
for n=4:
for n=6:
for n=8:
for n=10:
Removing features based on correlation and variance
There are 46 features that have a correlation values less than 0.001 with 'TARGET'. We will remove all of this.
Removing.........
There are 553 features that have high correlation with another feature with threshold being kept as 0.950 and above. We will remove all of this.
Removing.........
The features were changed from 957 to 358. 599 features were removed.


100%|██████████████████████████████████████████████████████████████████████████████████| 16/16 [00:23<00:00,  1.49s/it]


adding pca features to the datasets
Feature Engineering done.....
Wall time: 17min 42s


In [103]:
auc_score

0.8879187176022363

In [109]:
# Re-read the raw csv files so the following cells start from unmodified frames.
test = pd.read_csv('./test.csv')
train = pd.read_csv('./train.csv')
y_train = train['TARGET'].values

In [110]:
%time y_test = final1(test.iloc[:25,:])

34 features were found to have zero variance and these were all removed.
188 features were found to be sparse and these were all removed.
12 were found to be duplicated columns and 6 of these were removed.
Preprocessing completed
Creating 'var15_below_23' feature
binning 'var15' feature
Creating 'no_zeros' and 'no_nonzeroes' feature
Creating 'no_zeros' and 'no_nonzeroes' feature for saldo keyword
Creating 'no_zeros' and 'no_nonzeroes' feature for ind keyword
Creating 'no_zeros' and 'no_nonzeroes' feature for num keyword
Creating 'no_zeros' and 'no_nonzeroes' feature for imp keyword


  5%|████▎                                                                              | 1/19 [00:00<00:03,  5.35it/s]

Creating average numerical values feature
Creating average numerical values feature


 11%|████████▋                                                                          | 2/19 [00:00<00:04,  4.17it/s]

Creating average numerical values feature


 16%|█████████████                                                                      | 3/19 [00:01<00:05,  3.19it/s]

Creating average numerical values feature


 21%|█████████████████▍                                                                 | 4/19 [00:01<00:05,  2.59it/s]

Creating average numerical values feature


 26%|█████████████████████▊                                                             | 5/19 [00:02<00:05,  2.35it/s]

Creating average numerical values feature


 32%|██████████████████████████▏                                                        | 6/19 [00:02<00:06,  2.09it/s]

Creating average numerical values feature


 37%|██████████████████████████████▌                                                    | 7/19 [00:03<00:06,  1.87it/s]

Creating average numerical values feature


 42%|██████████████████████████████████▉                                                | 8/19 [00:03<00:05,  2.13it/s]

Creating average numerical values feature


 47%|███████████████████████████████████████▎                                           | 9/19 [00:04<00:04,  2.22it/s]

Creating average numerical values feature


 53%|███████████████████████████████████████████▏                                      | 10/19 [00:04<00:03,  2.47it/s]

Creating average numerical values feature


 58%|███████████████████████████████████████████████▍                                  | 11/19 [00:04<00:03,  2.51it/s]

Creating average numerical values feature


 63%|███████████████████████████████████████████████████▊                              | 12/19 [00:05<00:03,  2.11it/s]

Creating average numerical values feature


 68%|████████████████████████████████████████████████████████                          | 13/19 [00:05<00:02,  2.14it/s]

Creating average numerical values feature


 74%|████████████████████████████████████████████████████████████▍                     | 14/19 [00:06<00:02,  2.07it/s]

Creating average numerical values feature


 79%|████████████████████████████████████████████████████████████████▋                 | 15/19 [00:07<00:02,  1.66it/s]

Creating average numerical values feature


 84%|█████████████████████████████████████████████████████████████████████             | 16/19 [00:08<00:02,  1.24it/s]

Creating average numerical values feature


 89%|█████████████████████████████████████████████████████████████████████████▎        | 17/19 [00:09<00:01,  1.06it/s]

Creating average numerical values feature


 95%|█████████████████████████████████████████████████████████████████████████████▋    | 18/19 [00:11<00:01,  1.07s/it]

Creating average numerical values feature


100%|██████████████████████████████████████████████████████████████████████████████████| 19/19 [00:12<00:00,  1.50it/s]


Applying standardization
adding kmeans cluster features
for n=2:
for n=4:
for n=6:
for n=8:
for n=10:
Removing features based on correlation and variance
There are 46 features that have a correlation values less than 0.001 with 'TARGET'. We will remove all of this.
Removing.........
There are 553 features that have high correlation with another feature with threshold being kept as 0.950 and above. We will remove all of this.
Removing.........
The features were changed from 957 to 358. 599 features were removed.


100%|██████████████████████████████████████████████████████████████████████████████████| 16/16 [00:12<00:00,  1.28it/s]


adding pca features to the datasets
Feature Engineering done.....
Wall time: 5min 15s


In [111]:
len(y_test)  # sanity check: number of labels currently held in y_test

25

In [106]:
y_train[:100] # classes are imbalanced: check that both classes (0 and 1) appear in the first 100 labels

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [107]:
# Run final2 on a small sample: the first 100 rows of `train` with the
# 'TARGET' column dropped, plus the first 100 entries of `y_train`
# (assumed to align row-for-row with `train` — TODO confirm).
# NOTE(review): despite the `*_test` names, this sample is sliced from the
# training data, so the score is presumably not a held-out estimate — confirm intent.
X_test = train[:100].drop(['TARGET'],axis=1)
y_test = y_train[:100]
# %time prints the wall-clock time of the call; the return value is stored in
# `auc_score` (presumably an AUC, judging by the name — verify against final2).
%time auc_score = final2(X_test,y_test)

34 features were found to have zero variance and these were all removed.
188 features were found to be sparse and these were all removed.
12 were found to be duplicated columns and 6 of these were removed.
Preprocessing completed
Creating 'var15_below_23' feature
binning 'var15' feature
Creating 'no_zeros' and 'no_nonzeroes' feature
Creating 'no_zeros' and 'no_nonzeroes' feature for saldo keyword
Creating 'no_zeros' and 'no_nonzeroes' feature for ind keyword
Creating 'no_zeros' and 'no_nonzeroes' feature for num keyword
Creating 'no_zeros' and 'no_nonzeroes' feature for imp keyword


  0%|                                                                                           | 0/19 [00:00<?, ?it/s]

Creating average numerical values feature


  5%|████▎                                                                              | 1/19 [00:00<00:06,  2.75it/s]

Creating average numerical values feature


 11%|████████▋                                                                          | 2/19 [00:00<00:07,  2.32it/s]

Creating average numerical values feature


 16%|█████████████                                                                      | 3/19 [00:02<00:13,  1.15it/s]

Creating average numerical values feature


 21%|█████████████████▍                                                                 | 4/19 [00:03<00:13,  1.13it/s]

Creating average numerical values feature


 26%|█████████████████████▊                                                             | 5/19 [00:04<00:11,  1.19it/s]

Creating average numerical values feature


 32%|██████████████████████████▏                                                        | 6/19 [00:05<00:10,  1.20it/s]

Creating average numerical values feature


 37%|██████████████████████████████▌                                                    | 7/19 [00:06<00:10,  1.17it/s]

Creating average numerical values feature


 42%|██████████████████████████████████▉                                                | 8/19 [00:06<00:09,  1.21it/s]

Creating average numerical values feature


 47%|███████████████████████████████████████▎                                           | 9/19 [00:07<00:08,  1.21it/s]

Creating average numerical values feature


 53%|███████████████████████████████████████████▏                                      | 10/19 [00:08<00:07,  1.24it/s]

Creating average numerical values feature


 58%|███████████████████████████████████████████████▍                                  | 11/19 [00:09<00:06,  1.23it/s]

Creating average numerical values feature


 63%|███████████████████████████████████████████████████▊                              | 12/19 [00:10<00:06,  1.08it/s]

Creating average numerical values feature


 68%|████████████████████████████████████████████████████████                          | 13/19 [00:11<00:05,  1.12it/s]

Creating average numerical values feature


 74%|████████████████████████████████████████████████████████████▍                     | 14/19 [00:12<00:04,  1.09it/s]

Creating average numerical values feature


 79%|████████████████████████████████████████████████████████████████▋                 | 15/19 [00:14<00:04,  1.13s/it]

Creating average numerical values feature


 84%|█████████████████████████████████████████████████████████████████████             | 16/19 [00:17<00:05,  1.70s/it]

Creating average numerical values feature


 89%|█████████████████████████████████████████████████████████████████████████▎        | 17/19 [00:20<00:04,  2.08s/it]

Creating average numerical values feature


 95%|█████████████████████████████████████████████████████████████████████████████▋    | 18/19 [00:22<00:02,  2.16s/it]

Creating average numerical values feature


100%|██████████████████████████████████████████████████████████████████████████████████| 19/19 [00:24<00:00,  1.29s/it]


Applying standardization
adding kmeans cluster features
for n=2:
for n=4:
for n=6:
for n=8:
for n=10:
Removing features based on correlation and variance
There are 46 features that have a correlation values less than 0.001 with 'TARGET'. We will remove all of this.
Removing.........
There are 553 features that have high correlation with another feature with threshold being kept as 0.950 and above. We will remove all of this.
Removing.........
The features were changed from 957 to 358. 599 features were removed.


100%|██████████████████████████████████████████████████████████████████████████████████| 16/16 [00:12<00:00,  1.24it/s]


adding pca features to the datasets
Feature Engineering done.....
Wall time: 5min 35s


In [108]:
auc_score  # display the value returned by the final2 call on the 100-row sample

0.7091836734693877