In [36]:
import os
import gc
import pandas as pd
import numpy as np
import lightgbm as lgbm
import xgboost as xgb
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
import fancyimpute as fi
from featuretools import selection
from sklearn.base import TransformerMixin

from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
from h2o.grid.grid_search import H2OGridSearch
from __future__ import print_function
import h2o
from h2o.automl import H2OAutoML

h2o.init(max_mem_size = '8g')


Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,33 mins 59 secs
H2O cluster timezone:,America/Regina
H2O data parsing timezone:,UTC
H2O cluster version:,3.18.0.11
H2O cluster version age:,27 days
H2O cluster name:,H2O_from_python_TheShogun_8anshe
H2O cluster total nodes:,1
H2O cluster free memory:,6.143 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8


Function Section

In [2]:
def process_dataframe(input_df, encoder_dict=None):
    """ Process a dataframe into a form useable by LightGBM """

    # Label encode categoricals
    print('Label encoding categorical features...')
    categorical_feats = input_df.columns[input_df.dtypes == 'object']
    for feat in categorical_feats:
        encoder = LabelEncoder()
        input_df[feat] = encoder.fit_transform(input_df[feat].fillna('NULL'))
    print('Label encoding complete.')

    return input_df, categorical_feats.tolist(), encoder_dict

In [3]:
def missing_val_ratio(df):
    perc_na = (df.isnull().sum()/len(df))*100
    ratio_na = perc_na.sort_values(ascending=False)
    missing_data_table = pd.DataFrame({'% of Total Values' :ratio_na})
    return missing_data_table

In [4]:
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [5]:
class DataFrameImputer_Categorical(TransformerMixin):
    
    def __init__(self):
        """Impute missing values.

        Columns of dtype object are imputed with the most frequent value 
        in column.

        Columns of other types are imputed with mean of column.

        """
    def fit(self, X, y=None):

        self.fill = pd.Series([X[c].value_counts().index[0]
            if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],
            index=X.columns)

        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)

In [6]:
def separe_numeric_categoric(df):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    df_n = df.select_dtypes(include=numerics)
    df_c = df.select_dtypes(exclude=numerics)
   

    print(f'The DF have {len(list(df_n))} numerical features and {len(list(df_c))} categorical fets')
    return df_n, df_c

In [7]:
def readImputeNumerical(numerical_df):
    del numerical_df
    gc.collect()
    return reduce_mem_usage(pd.read_csv('processed_input_data_Impute.csv'))



In [8]:
def find_missing(df):
    total = df.isnull().sum().sort_values(ascending=False)
    percent = (df.isnull().sum()/df.isnull().count()).sort_values(ascending=False)
    filter(lambda x: x>=.3, percent)
    return percent


In [9]:

def remove_missing_data(df,minimum=.7):
    
    percent = find_missing(df)
    number = len(list(filter(lambda x: x>=(1.0-minimum), percent)))
    names = list(percent.keys()[:number])

    global noImputeDf 
    noImputeDf = df[names]

    print(noImputeDf.columns)
    df = df.drop(names, 1, errors='ignore')
    print(f'{number} columns exclude because haven`t minimium data.')
    return df


In [10]:
def impute_NumericData(numerical_df):
    columsNum = numerical_df.columns
    #print(np.any(np.isnan(numerical_df.values)))
    #print(np.all(np.isfinite(numerical_df.values)))

    df_numerical_complete = fi.MICE(n_imputations=2, impute_type='col').complete(numerical_df)
    #At the end by default it seems n_imputations is set to a mim of 12
    
    
    print(df_numerical_complete.head())
    
    #n_missing = count_missing(df)
    #print(f'{columns_missing-n_missing} numerical features imputated')

    #Complete the columns name.
    temp = pd.DataFrame(columns=columsNum, data=df_numerical_complete)
    temp.to_csv("MiceCompletedFULL.csv")
    return temp

In [11]:
def impute_CategoricalData( categorical_df):
    return categorical_df.apply(lambda x:x.fillna(x.value_counts().index[0]))

In [12]:
def impute_missing_data(df,minimium_data=.3):
    numerical_df, categorical_df = prepareDataToImpute(df,minimium_data=.3)
   
    df_Numerical = impute_NumericData(numerical_df)
    ##SINCE FILE IS BIG, READ FROM FILE
    #df_Numerical = pd.read_csv('MiceCompleted.csv')
    
    df_Categorical = impute_CategoricalData(categorical_df)
    #df = readImputeNumerical(numerical_df)
        
    df = pd.concat([df_Numerical,df_Categorical, noImputeDf], axis=1)
    return df

In [13]:
def prepareDataToImpute(df,minimium_data=.8):

    # remove features without minimium size of information
    df = remove_missing_data(df,minimium_data)
    
    numerical_df, categorical_df = separe_numeric_categoric(df)


    numerical_df = numerical_df.replace([np.inf, -np.inf], np.nan)
    del df
    gc.collect()
    return numerical_df, categorical_df

In [14]:

merged_df = reduce_mem_usage(pd.read_csv('processed_input_data.csv'))

# Separate metadata
meta_cols = ['SK_ID_CURR']


Memory usage of dataframe is 1193.21 MB
Memory usage after optimization is: 484.15 MB
Decreased by 59.4%


In [15]:
meta_df = merged_df[meta_cols]

In [16]:
meta_df.to_csv("meta_df.csv")

In [17]:

merged_df.drop(columns=meta_cols, inplace=True)

merged_df.shape

(356255, 438)

In [18]:
merged_df.TARGET.unique()

array([ 1.,  0., nan])

In [19]:
# Process the data set.
targetCol = ['TARGET']
targetDF = merged_df[targetCol]
merged_df.drop(columns=targetCol, inplace=True)

merged_df, categorical_feats, encoder_dict = process_dataframe(input_df=merged_df)
merged_df.shape


Label encoding categorical features...
Label encoding complete.


(356255, 437)

In [21]:
#merged_df = DataFrameImputer().fit_transform(merged_df)

In [22]:
#Global Variables //Ugly thing to do. 
noImputeDf =  pd.DataFrame()

#Do not include TARGET WHEN IMPUTING. 
merged_df= impute_missing_data(merged_df)
merged_df.shape

Index(['RATE_INTEREST_PRIVILEGED_PRVMAX', 'RATE_INTEREST_PRIMARY_PRVMAX',
       'RATE_INTEREST_PRIMARY_PRVMIN', 'RATE_INTEREST_PRIVILEGED_PRVMIN',
       'RATE_INTEREST_PRIVILEGED', 'RATE_INTEREST_PRIMARY',
       'AMT_PAYMENT_CURRENT_CCMIN', 'AMT_PAYMENT_CURRENT_CCMEAN',
       'AMT_PAYMENT_CURRENT', 'AMT_PAYMENT_CURRENT_CCMAX',
       'AMT_DRAWINGS_OTHER_CURRENT_CCMEAN', 'CNT_DRAWINGS_OTHER_CURRENT_CCMAX',
       'AMT_DRAWINGS_POS_CURRENT_CCMAX', 'CNT_DRAWINGS_ATM_CURRENT_CCMAX',
       'AMT_DRAWINGS_POS_CURRENT_CCMIN', 'AMT_DRAWINGS_ATM_CURRENT_CCMIN',
       'AMT_DRAWINGS_OTHER_CURRENT_CCMIN', 'AMT_DRAWINGS_ATM_CURRENT_CCMAX',
       'CNT_DRAWINGS_ATM_CURRENT_CCMIN', 'AMT_DRAWINGS_OTHER_CURRENT_CCMAX',
       'AMT_DRAWINGS_POS_CURRENT_CCMEAN', 'CNT_DRAWINGS_POS_CURRENT_CCMEAN',
       'CNT_DRAWINGS_OTHER_CURRENT_CCMEAN', 'CNT_DRAWINGS_ATM_CURRENT_CCMEAN',
       'CNT_DRAWINGS_POS_CURRENT_CCMIN', 'AMT_DRAWINGS_ATM_CURRENT_CCMEAN',
       'CNT_DRAWINGS_POS_CURRENT', 'CNT_DRAWINGS_OT

(356255, 437)

In [59]:
merged_df.shape

(356255, 437)

In [23]:
merged_df.head

<bound method NDFrame.head of         AMT_ANNUITY  AMT_CREDIT  AMT_GOODS_PRICE  AMT_INCOME_TOTAL  \
0           24700.5    406597.5         351000.0      202500.00000   
1           35698.5   1293502.5        1129500.0      270000.00000   
2            6750.0    135000.0         135000.0       67500.00000   
3           29686.5    312682.5         297000.0      135000.00000   
4           21865.5    513000.0         513000.0      121500.00000   
5           27517.5    490495.5         454500.0       99000.00000   
6           41301.0   1560726.0        1395000.0      171000.00000   
7           42075.0   1530000.0        1530000.0      360000.00000   
8           33826.5   1019610.0         913500.0      112500.00000   
9           20250.0    405000.0         405000.0      135000.00000   
10          21177.0    652500.0         652500.0      112500.00000   
11          10678.5    148365.0         135000.0       38419.15625   
12           5881.5     80865.0          67500.0       67500

In [24]:

merged_df = pd.concat([merged_df,targetDF],axis=1)
merged_df.TARGET.unique()
merged_df.to_csv('ImputedData.csv', index=False)

In [25]:
#merged_df = pd.read_csv('ImputedData.csv')

In [26]:
# Re-separate into train and test
# 307511 -> represent the len_train (hard coded)

merged_df = merged_df.replace([np.inf, -np.inf], np.nan)


train_df = merged_df[:307511]
test_df = merged_df[307511:]

In [27]:
#map_dict = { 1.: "yes",0. :"no" }
#train_df["TARGET"] = train_df["TARGET"].map(map_dict)

In [28]:
merged_df.TARGET.unique()

array([ 1.,  0., nan])

In [29]:
train_df.TARGET.unique()

array([1., 0.])

In [30]:
test_df.TARGET.unique()

array([nan])

In [31]:
train_df.to_csv("trainDF.csv")
test_df.to_csv("ToPredictDF.csv")

del train_df
del test_df
gc.collect()

7

In [32]:
train_df = pd.read_csv('trainDF.csv')


from sklearn.cross_validation import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(train_df['TARGET'], n_iter=3, test_size=0.2)
for train_index, test_index in sss:
    xtrain, xtest = train_df.iloc[train_index], train_df.iloc[test_index]
    
xtrain.to_csv("trainDF.csv")

xtest.to_csv("testDF.csv")




In [2]:
train_hf = h2o.import_file('trainDF.csv')

test_hf = h2o.import_file('testDF.csv')

predict_hf = h2o.import_file('ToPredictDF.csv')




Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [10]:
predict_hf.columns

['C1',
 'AMT_ANNUITY',
 'AMT_CREDIT',
 'AMT_GOODS_PRICE',
 'AMT_INCOME_TOTAL',
 'AMT_REQ_CREDIT_BUREAU_DAY',
 'AMT_REQ_CREDIT_BUREAU_HOUR',
 'AMT_REQ_CREDIT_BUREAU_MON',
 'AMT_REQ_CREDIT_BUREAU_QRT',
 'AMT_REQ_CREDIT_BUREAU_WEEK',
 'AMT_REQ_CREDIT_BUREAU_YEAR',
 'APARTMENTS_AVG',
 'APARTMENTS_MEDI',
 'APARTMENTS_MODE',
 'BASEMENTAREA_AVG',
 'BASEMENTAREA_MEDI',
 'BASEMENTAREA_MODE',
 'CNT_FAM_MEMBERS',
 'CODE_GENDER',
 'COMMONAREA_AVG',
 'COMMONAREA_MEDI',
 'COMMONAREA_MODE',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_ID_PUBLISH',
 'DAYS_LAST_PHONE_CHANGE',
 'DAYS_REGISTRATION',
 'DEF_30_CNT_SOCIAL_CIRCLE',
 'DEF_60_CNT_SOCIAL_CIRCLE',
 'ELEVATORS_AVG',
 'ELEVATORS_MEDI',
 'ELEVATORS_MODE',
 'EMERGENCYSTATE_MODE',
 'ENTRANCES_AVG',
 'ENTRANCES_MEDI',
 'ENTRANCES_MODE',
 'EXT_SOURCE_1',
 'EXT_SOURCE_2',
 'EXT_SOURCE_3',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'FLOORSMAX_AVG',
 'FLOORSMAX_MEDI',
 'FLOORSMAX_MODE',
 'FLOORSMIN_AVG',
 'FLOORSMIN_MEDI',
 'FLOORSMIN_MODE',
 'FONDKAPREMONT_MODE',


In [11]:
#drop undesire columns
train_hf =train_hf.drop('C1')
train_hf =train_hf.drop('Unnamed: 0')

test_hf =test_hf.drop('C1')
test_hf =test_hf.drop('Unnamed: 0')

predict_hf =predict_hf.drop('C1')

In [12]:
# Identify predictors and response
x = train_hf.columns


y = "TARGET"
x.remove(y)

# For binary classification, response should be a factor

#train_hf[y] = train_hf[y].asfactor()
#test_hf[y] = test_hf[y].asfactor()

In [13]:
#aml = H2OAutoML(max_runtime_secs = 25200)
#aml.train(x = x, y = y,
#          training_frame = train_hf
#         )
#aml.leaderboard


In [14]:
# Number of CV folds (to generate level-one data for stacking)
nfolds = 5

In [15]:
my_gbm = H2OGradientBoostingEstimator(ntrees=10,
                                      max_depth=3,
                                      min_rows=2,
                                      learn_rate=0.2,
                                      nfolds=nfolds,
                                      fold_assignment="Modulo",
                                      balance_classes = True,
                                      keep_cross_validation_predictions=True,
                                      stopping_metric = "AUC",
                                      seed=1)
my_gbm.train(x=x, y=y, training_frame=train_hf)


gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [16]:
perf_gbm = my_gbm.model_performance(train=True)
perf_gbm


ModelMetricsRegression: gbm
** Reported on train data. **

MSE: 0.06983625953188011
RMSE: 0.26426550953894856
MAE: 0.14196945099212288
RMSLE: 0.18374713301091475
Mean Residual Deviance: 0.06983625953188011




In [17]:
model_pathGBM = h2o.save_model(model=my_gbm ,force=True)

In [18]:
#pred = my_gbm.predict(predict_hf)
#pred


In [19]:
# Train and cross-validate a RF
my_rf = H2ORandomForestEstimator(ntrees=50,
                                 nfolds=nfolds,
                                 fold_assignment="Modulo",
                                 balance_classes = True,
                                 keep_cross_validation_predictions=True,
                                 stopping_metric = "AUC",
                                 seed=1)
my_rf.train(x=x, y=y, training_frame=train_hf)


# Train a stacked ensemble using the GBM and GLM above
ensemble = H2OStackedEnsembleEstimator(model_id="my_ensemble_binomial",
                                       base_models=[my_gbm, my_rf])
ensemble.train(x=x, y=y, training_frame=train_hf)


drf Model Build progress: |███████████████████████████████████████████████| 100%
stackedensemble Model Build progress: |███████████████████████████████████| 100%


In [20]:
model_pathRF = h2o.save_model(model=my_rf ,force=True)
model_pathStack = h2o.save_model(model=ensemble ,force=True)

In [21]:
#ensemble = h2o.load_model('my_ensemble_binomial')

In [22]:
# Eval ensemble performance on the test data
perf_stack_test = ensemble.model_performance(test_hf)

# Compare to base learner performance on the test set
perf_gbm_test = my_gbm.model_performance(test_hf)

perf_gbm_test





ModelMetricsRegression: gbm
** Reported on test data. **

MSE: 0.0700860035493424
RMSE: 0.2647376126456957
MAE: 0.14239889031168884
RMSLE: 0.1841870808025383
Mean Residual Deviance: 0.0700860035493424




In [16]:
perf_rf_test = my_rf.model_performance(test_hf)
baselearner_best_auc_test = max(perf_gbm_test.auc(), perf_rf_test.auc())
stack_auc_test = perf_stack_test.auc()
print("Best Base-learner Test AUC:  {0}".format(baselearner_best_auc_test))
print("Ensemble Test AUC:  {0}".format(stack_auc_test))

KeyError: 'AUC'

In [23]:
test_hf.shape

(61503, 438)

In [24]:
pred = ensemble.predict(predict_hf)

stackedensemble prediction progress: |████████████████████████████████████| 100%


In [25]:
pred.shape

(48744, 1)

In [26]:
pred.columns

['predict']

In [27]:
pred

predict
0.0602848
0.0900025
0.0158962
0.0451389
0.0926713
0.0332346
0.0294519
0.0592977
0.0174415
0.175248




In [28]:
pd = pred.as_data_frame()
dfPred = pd['predict'].tolist()

In [31]:
meta_df = pd.read_csv("meta_df.csv")
meta_df.columns

Index(['Unnamed: 0', 'SK_ID_CURR'], dtype='object')

In [32]:
ids = meta_df['SK_ID_CURR'][307511:].tolist()

In [33]:
out_df = pd.DataFrame({'SK_ID_CURR': ids, 'TARGET': dfPred})
out_df.to_csv('submission-Stack.csv', index=False)

In [39]:
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
model = H2ODeepLearningEstimator()
model.train(x=x, y=y, training_frame=train_hf)




deeplearning Model Build progress: |██████████████████████████████████████| 100%


In [40]:
perf_model_test = model.model_performance(test_hf)

perf_model_test



ModelMetricsRegression: deeplearning
** Reported on test data. **

MSE: 0.0728139662540467
RMSE: 0.2698406312141422
MAE: 0.14255725385074933
RMSLE: NaN
Mean Residual Deviance: 0.0728139662540467


