In [1]:
import numpy as np
import pandas as pd
import datetime
import gc
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from   sklearn.linear_model import BayesianRidge
from   sklearn.model_selection import StratifiedKFold,RepeatedKFold
from   sklearn.metrics import mean_squared_error
from   sklearn.ensemble import RandomTreesEmbedding
import utils
import tqdm
import pickle
import multiprocessing as mp
import time
import math
import warnings
import matplotlib.gridspec as gridspec
import scipy.stats as st
import statsmodels as sm
warnings.filterwarnings('ignore')

from scipy.stats import ks_2samp
np.random.seed(0)

%matplotlib inline

#matplotlib.rcParams['figure.figsize'] = (20, 5)

matplotlib.style.use('ggplot')

  from numpy.core.umath_tests import inner1d


In [2]:
# Load the training data from the directory of pickled chunks.
# NOTE(review): `utils.read_pickles` is a project helper not shown in this notebook;
# presumably it reads every pickle under the path and combines them into a single
# DataFrame -- confirm against utils before relying on row order or index.
train_path   = '../data/input/input_pkl/train/'
df_train     = utils.read_pickles(train_path)

100%|██████████| 5/5 [00:00<00:00, 12.12it/s]


In [3]:
# Load the test data from its own directory of pickled chunks, using the same
# project helper as for the training data.
# NOTE(review): `utils.read_pickles` is not shown here; its return type is assumed
# to be a DataFrame -- confirm against utils.
test_path   = '../data/input/input_pkl/test/'
df_test     = utils.read_pickles(test_path)

100%|██████████| 5/5 [00:00<00:00, 15.68it/s]


In [12]:
def get_feature_importances(data,i,shuffle,seed=None):
    """Fit one LightGBM model on ``data`` and report per-feature importances.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the feature columns plus a ``target`` column. Columns
        named ``ID_code`` and ``target`` are excluded from the features.
    i : int
        Run identifier; stored verbatim in the ``run`` column of the result.
    shuffle : bool
        When true, the target is randomly permuted before fitting (a
        "null importance" run). When false, the real target is used.
    seed : int or None, optional
        Random state used for the permutation when ``shuffle`` is true.
        ``None`` (the default) leaves the choice to pandas, which then draws
        from NumPy's global random state.

    Returns
    -------
    pandas.DataFrame
        One row per feature with the columns ``feature``, ``importance_gain``,
        ``importance_split``, ``trn_score`` and ``run``. ``trn_score`` is the
        RMSE of the model's predictions on the rows of ``data`` against the
        target it was fitted on (shuffled or not); it is the same value on
        every row.
    """
    # Model configuration. Note that 'boost': 'gbdt' selects ordinary gradient
    # boosting, and that lgb.train is called without num_boost_round, so the
    # library default number of rounds applies.
    # NOTE(review): 'num_threads': 8 is requested per fit; when this function is
    # run from several worker processes at once the threads multiply -- confirm
    # this is intended for the target machine.
    param = {
        'bagging_freq': 5,
        'bagging_fraction': 0.4,
        'boost_from_average':'false',
        'boost': 'gbdt',
        'feature_fraction': 0.05,
        'learning_rate': 0.01,
        'max_depth': -1,
        'metric':'auc',
        'min_data_in_leaf': 80,
        'min_sum_hessian_in_leaf': 10.0,
        'num_leaves': 13,
        'num_threads': 8,
        'tree_learner': 'serial',
        'objective': 'binary',
        'verbosity': 1
    }

    # Derive features and target from the frame that was passed in, not from a
    # notebook-level global, so the function works on any compatible frame.
    train_columns = [c for c in data.columns if c not in ('ID_code', 'target')]

    if shuffle:
        # Permute the labels; `seed` makes the permutation reproducible.
        # (A binomial draw with the same positive rate would also work.)
        y = data['target'].sample(frac=1.0, random_state=seed).values
    else:
        y = data['target'].values
    # `.values` drops the (possibly permuted) index so the labels are matched
    # to the feature rows strictly by position.

    dtrain = lgb.Dataset(data[train_columns], y, free_raw_data=False, silent=True)

    # Fit the model
    clf = lgb.train(params=param,
                    train_set=dtrain,
                   )

    # In-sample RMSE against the target that was actually fitted.
    trn_rmse = mean_squared_error(y, clf.predict(data[train_columns])) ** 0.5

    # Collect the importances; the scalar columns broadcast to every feature row.
    imp_df                     = pd.DataFrame()
    imp_df["feature"]          = list(train_columns)
    imp_df["importance_gain"]  = clf.feature_importance(importance_type='gain')
    imp_df["importance_split"] = clf.feature_importance(importance_type='split')
    imp_df['trn_score']        = trn_rmse
    imp_df['run']              = i

    return imp_df

In [13]:
# Baseline run: importances obtained with the real (unshuffled) target of df_train,
# tagged as run 0.
actual_imp_df = get_feature_importances(data=df_train,i=0,shuffle=False)

In [None]:
def creation_null_imp_data(df_train,start_run,interval):
    """Run ``interval`` shuffled-target fits in parallel and stack their importances.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training frame handed unchanged to ``get_feature_importances`` for
        every run.
    start_run : int
        Identifier of the first run; runs are numbered
        ``start_run .. start_run + interval - 1``.
    interval : int
        Number of runs to execute. A value of zero or less performs no work.

    Returns
    -------
    pandas.DataFrame
        The per-run frames returned by ``get_feature_importances``,
        concatenated row-wise in run order (indices are not reset). Empty
        when no runs were requested.

    Notes
    -----
    Prints the elapsed wall-clock time of the parallel section, in minutes.
    """
    run_ids = range(start_run, start_run + interval)
    if len(run_ids) == 0:
        # Nothing to do: skip spawning worker processes altogether.
        return pd.DataFrame()

    start = time.time()

    # One argument tuple per run: (data, run id, shuffle=True, seed).
    # The run id is also forwarded as `seed` so each run can be given its own
    # random state instead of relying on global RNG state that worker
    # processes may inherit identically from the parent.
    # NOTE(review): this only has an effect if get_feature_importances uses
    # its `seed` argument.
    tasks = [(df_train, run, True, run) for run in run_ids]

    # The context manager tears the pool down even if a worker raises, so no
    # processes are leaked on failure. starmap blocks until all results are in.
    with mp.Pool() as mp_pool:
        results = mp_pool.starmap(get_feature_importances, tasks)

    time_taken = (time.time() - start) / 60

    # Concatenate once rather than growing a frame inside a loop.
    null_imp_df = pd.concat(results, axis=0)

    print(time_taken)

    return null_imp_df