# Functions and useful definitions

In [1]:
# !pip install missingno
# (gc is part of the Python standard library -- no pip install needed)
# !pip install dask
# !pip install xgboost
# !pip install lightgbm
# !pip install catboost
# !pip install hyperopt
# !pip install imbalanced-learn
# !conda install seaborn=0.9.0 -y

In [2]:
import sys
import collections
import numpy as np
import pandas as pd
import missingno as msno
import gc
import os.path
from IPython.display import set_matplotlib_formats
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.core.interactiveshell import InteractiveShell
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, SVR
from dask import dataframe as dd
from dask.multiprocessing import get
from multiprocessing import cpu_count
from functools import reduce
import ast
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from hyperopt.pyll.base import scope
from hyperopt.pyll.stochastic import sample
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
import random
import itertools
from imblearn.over_sampling import SMOTE

import xgboost as xgb
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeRegressor

from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate
import warnings

In [3]:
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Suppress all warnings for the rest of the session.
warnings.filterwarnings('ignore')

# pandas: no truncation of columns or rows when displaying frames,
# and treat +/-inf as missing values.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('mode.use_inf_as_na', True)

In [4]:
# Plot-related defaults: inline figure formats, float display format,
# seaborn theme and the colours shared by the plotting helpers.

set_matplotlib_formats('pdf', 'png')
pd.set_option('display.float_format', '{:.2f}'.format)

# Matplotlib rc overrides handed to seaborn.
rc = {
    'savefig.dpi': 75,
    'figure.autolayout': False,
    'figure.figsize': [18, 10],
    'axes.labelsize': 12,
    'axes.titlesize': 18,
    'font.size': 14,
    'lines.linewidth': 2.0,
    'lines.markersize': 8,
    'legend.fontsize': 10,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
}

sns.set(rc=rc)
sns.set_palette(sns.light_palette("purple", reverse=True))

# Colours used for annotations and colour maps.
default_color = 'purple'
default_light_color = 'white'
default_dark_color = 'rebeccapurple'
colormap = 'BuPu'  # plt.cm.cool

In [5]:
def plot_count(df, col, null_replace='NULL', x_adjust=0, y_adjust=0.45):
    """Draw a horizontal count plot of ``df[col]`` with a label on every bar.

    Missing values are replaced by ``null_replace`` before counting so they
    get their own bar. Each bar is labelled ``"<count> - (<percent>%)"``,
    where the percentage is relative to ``len(df[col])``.

    Parameters
    ----------
    df : DataFrame holding the column to plot.
    col : name of the column to count.
    null_replace : value substituted for missing entries.
    x_adjust : horizontal shift applied to the label of bars that are at
        least ``x_adjust + 500`` long; those labels use the light colour.
    y_adjust : vertical offset of the label from the bar's ``y`` position.
    """
    plt.figure(figsize=(15, 5))
    filled = df.fillna(null_replace)
    ax = sns.countplot(y=col, data=filled)
    print(ax)

    total = len(df[col])
    for bar in ax.patches:
        # For horizontal bars the width is the number of observations.
        count = bar.get_width()
        percent = 100 * count / total

        # Short bars keep the label at the bar end; long bars pull it back by x_adjust.
        if count < x_adjust + 500:
            shift = 0
        else:
            shift = x_adjust

        if shift == 0:
            color = default_dark_color
        else:
            color = default_light_color

        label = '{} - ({:.2f}%)'.format(count, percent)
        ax.annotate(label, (count - shift, bar.get_y() + y_adjust), color=color)

In [6]:
def plot_count_v(df, col, null_replace='NULL', x_adjust=0, y_adjust=0.45):
    """Draw a vertical count plot of ``df[col]`` with a label above every bar.

    Missing values are replaced by ``null_replace`` before counting so they
    get their own bar. Each bar is labelled ``"<count> - (<percent>%)"``,
    where the percentage is relative to ``len(df[col])``. Tick labels on the
    x axis are rotated by 45 degrees.

    Parameters
    ----------
    df : DataFrame holding the column to plot.
    col : name of the column to count.
    null_replace : value substituted for missing entries.
    x_adjust : leftward shift applied to the label of bars whose count is at
        least ``x_adjust + 500``; those labels use the light colour.
    y_adjust : vertical gap (in count units) between the bar top and its label.
    """
    plt.figure(figsize=(15, 5))
    # Pass the column as ``x=``: newer seaborn releases make every argument
    # after ``data`` keyword-only, so a positional column is rejected.
    ax = sns.countplot(x=col, data=df.fillna(null_replace))
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
    print(ax)

    total = len(df[col])
    for p in ax.patches:
        # Bars are vertical here, so the count is the bar HEIGHT
        # (the width is only the bar's thickness on the category axis).
        count = p.get_height()
        percent = 100 * count / total

        axis_init = 0 if count < x_adjust + 500 else x_adjust
        color = default_dark_color if axis_init == 0 else default_light_color
        ax.annotate('{} - ({:.2f}%)'.format(count, percent),
                    (p.get_x() - axis_init, count + y_adjust),
                    color=color)

In [7]:
def OHE_by_unique(train, one_hot, limit):
    """One-hot encode selected columns of ``train`` and return a new frame.

    A column ``c`` listed in ``one_hot`` is encoded only when the number of
    values given for it is greater than 2 and less than ``limit``. For every
    such value ``val`` an integer 0/1 column named ``c + '_oh_' + str(val)``
    is added. The original columns are kept and ``train`` is not modified.

    Parameters
    ----------
    train : DataFrame to encode.
    one_hot : mapping from column name to the collection of values to encode
        for that column (iterated for its keys and indexed by them).
    limit : exclusive upper bound on the number of values per column.

    Returns
    -------
    DataFrame
        A copy of ``train`` with the indicator columns appended.
    """
    # One-hot encode features with more than 2 and fewer than 'limit' unique values
    df = train.copy()
    for c in one_hot:
        values = one_hot[c]
        if 2 < len(values) < limit:
            for val in values:
                # Builtin int: the ``np.int`` alias no longer exists in current NumPy.
                df[c + '_oh_' + str(val)] = (df[c].values == val).astype(int)
    return df

In [8]:
def cross_val_model(X,y, model, n_splits=3, scoring='roc_auc', model_type='clas'):
    """Fit ``model`` on each fold and print cross-validated scores for the hold-out part.

    ``X`` and ``y`` are cast to float32 arrays and split into ``n_splits``
    shuffled folds (``KFold`` when ``model_type == 'reg'``, otherwise
    ``StratifiedKFold``; fixed ``random_state=2017``). For every fold the
    model is fitted on the training part, and then a separate 3-fold
    cross-validation is run on the hold-out part to produce the printed
    scores (scikit-learn refits clones of ``model`` for that step).

    Parameters
    ----------
    X, y : features and target; both must provide ``astype``.
    model : estimator with a ``fit`` method.
    n_splits : number of outer folds.
    scoring : a single scorer name, or a list/tuple of scorer names. With a
        list/tuple, fit/score times and mean test/train scores are printed
        for every metric.
    model_type : ``'clas'`` additionally prints the class distribution of
        each split and a confusion matrix; ``'reg'`` switches to plain KFold.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_holdout, y_holdout)`` of the last fold.
    """
    X = np.array(X.astype('float32'))
    y = np.array(y.astype('float32'))

    if model_type == 'reg':
        splitter = KFold(n_splits=n_splits, shuffle=True, random_state=2017)
    else:
        splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2017)
    folds = list(splitter.split(X, y))

    # Lists are accepted as well as tuples: previously a list fell through to
    # cross_val_score, which only supports a single metric.
    multi_metric = isinstance(scoring, (list, tuple))

    for j, (train_idx, test_idx) in enumerate(folds):
        X_train = X[train_idx]
        y_train = y[train_idx]
        X_holdout = X[test_idx]
        y_holdout = y[test_idx]

        print ("Fit %s fold %d" % (str(model).split('(')[0], j+1))
        model.fit(X_train, y_train)
        if model_type == 'clas':
            print("    y train: ", collections.Counter(y_train))
            print("    y test:  ", collections.Counter(y_holdout))

        if multi_metric:
            # return_train_score must be requested explicitly; without it the
            # 'train_<metric>' keys read below are missing from the result.
            cross_score = cross_validate(model, X_holdout, y_holdout, cv=3, scoring=scoring,
                                         return_train_score=True)
            print("    Fit Time:   ", cross_score['fit_time'])
            print("    Score Time: ", cross_score['score_time'])
            for s in scoring:
                print("    {} test cross_score: {:.5f}".format(s, cross_score['test_'+s].mean()))
                print("    {} train cross_score: {:.5f}".format(s, cross_score['train_'+s].mean()))
        else:
            cross_score = cross_val_score(model, X_holdout, y_holdout, cv=3, scoring=scoring)
            print("    cross_score: {:.5f}".format(cross_score.mean()))

        if model_type == 'clas':
            y_pred = cross_val_predict(model, X_holdout, y_holdout, cv=3)
            conf_mat = confusion_matrix(y_holdout, y_pred)
            print(conf_mat)

    return X_train, y_train, X_holdout, y_holdout

In [9]:
class Ensemble(object):
    """Two-level stacking ensemble built on ``predict_proba`` outputs.

    Each base model contributes one feature to the second level: the second
    column of its ``predict_proba`` output (the positive class in a binary
    problem). The ``stacker`` is trained on those features.

    Parameters
    ----------
    n_splits : number of stratified folds used to build out-of-fold features.
    stacker : second-level estimator; needs ``fit`` and ``predict_proba``.
    base_models : sequence of first-level estimators; each needs ``fit`` and
        ``predict_proba``.
    """

    def __init__(self, n_splits, stacker, base_models):
        self.n_splits = n_splits
        self.stacker = stacker
        self.base_models = base_models

    def fit(self, X, y):
        """Train the base models and the stacker on ``X`` / ``y``.

        Out-of-fold predictions of every base model form the stacker's
        training matrix. The stacker's 3-fold cross-validated ROC AUC and
        confusion matrix on that matrix are printed. Returns ``None``.
        """
        X = np.array(X)
        y = np.array(y)

        folds = list(StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=42).split(X, y))

        # One column of out-of-fold probabilities per base model.
        S_train = np.zeros((X.shape[0], len(self.base_models)))
        for i, clf in enumerate(self.base_models):

            for j, (train_idx, test_idx) in enumerate(folds):
                print ("Fit %s fold %d" % (str(clf).split('(')[0], j+1))
                clf.fit(X[train_idx], y[train_idx])
                S_train[test_idx, i] = clf.predict_proba(X[test_idx])[:,1]

            # Refit on every row so that predict() uses a model trained on the
            # full data instead of whatever the last fold happened to leave.
            print ("Fit %s full data" % (str(clf).split('(')[0]))
            clf.fit(X, y)

        results = cross_val_score(self.stacker, S_train, y, cv=3, scoring='roc_auc')

        y_pred = cross_val_predict(self.stacker, S_train, y, cv=3)
        conf_mat = confusion_matrix(y, y_pred)

        self.stacker.fit(S_train, y)
        print("Stacker score: %.5f" % (results.mean()))
        print(conf_mat)

    def predict(self, T):
        """Return the stacker's second-column probability for each row of ``T``.

        The fitted base models produce one probability column each; the
        stacker turns that matrix into the final probabilities.
        """
        T = np.array(T)

        S_test = np.zeros((T.shape[0], len(self.base_models)))
        for i, clf in enumerate(self.base_models):
            S_test[:, i] = clf.predict_proba(T)[:,1]

        res = self.stacker.predict_proba(S_test)[:,1]
        return res

    def fit_predict(self, X, y, T):
        """Fit on ``X`` / ``y`` and return the predictions for ``T``."""
        self.fit(X, y)
        return self.predict(T)