In [1]:
import pandas as pd
import numpy as np

from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score, recall_score, precision_score, roc_curve, accuracy_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, Imputer, LabelBinarizer
import category_encoders

import time 
import pickle

import matplotlib.pyplot as plt 
%matplotlib inline

Functions

In [45]:
def ginic(actual, pred):
    actual = np.asarray(actual)
    n = len(actual)
    a_s = actual[np.argsort(pred)]
    a_c = a_s.cumsum()
    giniSum = a_c.sum() / a_s.sum() - (n + 1) / 2
    return 2 * giniSum / n

def gini_normalizedc(a, p):
    if p.ndim == 2:  # Required for sklearn wrapper
        p = p[:, 1]  # If proba array contains proba for both 0 and 1 classes, just pick class 1
    return ginic(a, p) / ginic(a, a)


def cross_val(X, y, model, kf):
    X, y = np.array(X), np.array(y).reshape(-1)
    cv_scores = np.zeros((5,4), dtype=np.float32)

    for i, (train_index, val_index) in enumerate(kf.split(X, y)):

        print( "Fold ", i)

        y_train, y_val = y[train_index].copy(), y[val_index].copy()
        X_train, X_val = X[train_index, :].copy(), X[val_index, :].copy()
        
        fit_model = model.fit(X_train, y_train)
        pred = fit_model.predict(X_val)

        cv_scores[i, :] = [f1_score(y_val, pred), 
                           precision_score(y_val, pred), 
                           recall_score(y_val, pred), 
                           roc_auc_score(y_val, pred)]
        
    return cv_scores

def print_metrics(cv_scores):
    metrics = ['f1', 'precision', 'recall', 'roc auc']
    cvmean = cv_scores.mean(0)
    for i in range(4):
        print("{} = {:.5f}".format(metrics[i], cvmean[i]))

def cutoff_metrics(cutoff, predict_proba):
    for cut in cutoff:
        print("f1: {:.3}\trecall: {:.3}\tprecision: {:.3}\tauc: {:.3}\tacc: {:.3}\tcutoff: {:.2} ".format(
            f1_score(y_valid, predict_proba > cut), 
            recall_score(y_valid, predict_proba > cut), 
            precision_score(y_valid, predict_proba > cut), 
            roc_auc_score(y_valid, predict_proba > cut),
            accuracy_score(y_valid, predict_proba>cut),
            cut
        ))

In [7]:
def transform(train, test, target, to_drop=False, 
              high_cardinality="smoothing", hc_treshold = 10, hc_drop=False, # high cardinality categorical
              eb_k=50, eb_f=10,  # parameters for hc smoothing function 
              encode=False,  # categorical 
              fill_num=-1, scaling=False  # continuous 
             ):
    
    """ 
    data preprocessing 
    
    :train, test: pandas DataFrame
    :high_cardinality: way to handle categorical features with high number of levels
    :encode: category encoding, 'ohe' = one hot, 'bin' = binary
    :fill_num: fill nan for continuous features, -1 = with -1, ('mean', 'median') = strategy
    :scaling: 'standard' = StandartScaler
    
    category features should have type 'object'
    """

    # remove duplicates 
    if to_drop:
        train = train.drop(to_drop, axis=1)
        test = test.drop(to_drop, axis=1)
    
    ######## categorical features 
    
    cat_features = train.columns[train.dtypes=='object']
    num_features = train.columns[train.dtypes!='object']      
        
    # factorize 
    le = LabelEncoder()
    train[cat_features] = train[cat_features].fillna('-1')
    test[cat_features] = test[cat_features].fillna('-1')
    for c in cat_features:
        data=train[c].append(test[c])
        le.fit(data.values.tolist())  # nan = 0 level
        train[c] = le.transform(train[c].values.tolist())
        test[c] = le.transform(test[c].values.tolist())       
    
    # mark nan with -1, if encoding not necessary 
    if not encode:
        train[cat_features] = train[cat_features].replace(0, -1)
        test[cat_features] = test[cat_features].replace(0, -1)        
    ######## high cardinality
    
    if high_cardinality:

        hc_features = train[cat_features].columns[train[cat_features].apply(lambda x: len(x.value_counts())) > hc_treshold]
        target_mean = target.mean()
        S = {}

        for c in hc_features:

            if high_cardinality == "sr":
                # supervised ratio 
                group_means = pd.concat([train[c], pd.DataFrame(target, columns=['target'], index=train.index)], axis=1).groupby(c).mean()
                group_means = group_means.target.to_dict()
                for group in train[c].value_counts().index:
                    S[group] = group_means[group]

            if high_cardinality=="woe":
                # weight of evidence
                group_y1 = pd.concat([train[c], pd.DataFrame(target, columns=['target'], index=train.index)], axis=1).\
                groupby([c]).agg('sum')
                group_y0 = pd.concat([train[c], pd.DataFrame(target, columns=['target'], index=train.index)], axis=1).\
                groupby([c]).agg('count') - group_y1
                y1 = (target==1).sum()
                y0 = (target==0).sum()
                woe = np.log(((group_y1) / y1) / ((group_y0) / y0))
                for i,v in zip(woe.index, np.where(np.isinf(woe), 0, woe)):
                    S[i] = v[0]

            if high_cardinality=="smoothing":
                # empirical bayes (smoothing for small group)
                group_means = pd.concat([train[c], pd.DataFrame(target, columns=['target'], index=train.index)], axis=1).groupby(c).mean()
                group_means = group_means.target.to_dict()
                group_counts = pd.concat([train[c], pd.DataFrame(target, columns=['target'], index=train.index)], axis=1).groupby(c).agg('count')
                group_counts = group_counts.target.to_dict()

                def smoothing_function(n, k, f):
                    return 1 / (1 + np.exp(-(n-k)/f))

                for group in train[c].value_counts().index:
                    lam = smoothing_function(n=group_counts[group], k=eb_k, f=eb_f)
                    S[group] = lam*group_means[group] + (1-lam)*target_mean

            # transform train
            train[c+'_avg'] = train[c].apply(lambda x: S[x]).copy()

            # transform test
            def hc_transform_test(x):
                if x in S: 
                    return S[x]
                else:
                    return target_mean

            test[c+'_avg'] = test[c].apply(hc_transform_test).copy()

        # drop hc features 
        if hc_drop:
            train.drop(hc_features, axis=1, inplace=True)
            test.drop(hc_features, axis=1, inplace=True)

        # update cat features 
        cat_features = sorted(list(set(cat_features).difference(hc_features)))

    ######## for linear models 
    
    # fill missings
    if fill_num in ['mean', 'median']:
        imputer = Imputer(strategy=fill_num)
        train[num_features] = imputer.fit_transform(train[num_features])
        test[num_features] = imputer.transform(test[num_features])
    elif fill_num < 0:
        train[num_features] = train[num_features].fillna(fill_num)
        test[num_features] = test[num_features].fillna(fill_num)
        
    # scaling
    if scaling=='standard':
        scaler = StandardScaler()
        train[num_features] = scaler.fit_transform(train[num_features])
        test[num_features] = scaler.transform(test[num_features])
    
    ######## encoding 
    if encode=='ohe':
        # one hot encoding, memory inefficient
        oh = OneHotEncoder(sparse=False)
        for c in cat_features:
            data=train[c].append(test[c])
            oh.fit(data.reshape(-1,1))            
            train_temp = oh.transform(train[c].reshape(-1,1))
            test_temp = oh.transform(test[c].reshape(-1,1))
            train = pd.concat([train, pd.DataFrame(train_temp, 
                                                   columns=[(c+"_"+str(i)) for i in data.value_counts().index],
                                                   index = train.index
                                                  )], axis=1)
            test = pd.concat([test, pd.DataFrame(test_temp, 
                                                 columns=[(c+"_"+str(i)) for i in data.value_counts().index],
                                                 index = test.index
                                                )], axis=1)
            # drop column
            train.drop(c, axis=1, inplace=True)
            test.drop(c, axis=1, inplace=True)
    
    if encode=='bin':
        # binary encoding 
        pass
            
    return train, test

In [8]:
# train, test = transform(train, test, target, 
#                         encode='ohe', scaling=True, fill_num='median', hc_drop=True,
#                         to_drop=['Var214', 'Var220', 'Var222'])
# train.shape, test.shape

In [9]:
# X.isnull().sum().value_counts()[:5]
# nan_group_1 = X[X.columns[X.isnull().sum() == 34506]].dropna().index
# nan_group_2 = X[X.columns[X.isnull().sum() == 34141]].dropna().index
# nan_group_3 = X[X.columns[X.isnull().sum() == 33975]].dropna().index
# nan_group_4 = X[X.columns[X.isnull().sum() == 33881]].dropna().index
# set(nan_group_3).intersection(list(nan_group_2))
# выставим порог отсева пустых значений на 99%
# X = X[X.columns[(X.isnull().sum() / X.shape[0] ) < 0.75]].copy()

Model fit

In [115]:
MISS_THERSHOLD = 0.95
ENCODE = False
FILL_NUM = -1

# high cardinality 
HC = "smoothing"
HC_DROP = False
HC_K = 50
HC_F = 2

############################################################################################################
# xgb hyperparameters

# model 
model = XGBClassifier(n_jobs=4, tree_method='gpu_hist', predictor = "cpu_predictor", objective="binary:logistic",
                      n_estimators=200, 
                      learning_rate=0.05,
                      max_depth=5,
                      gamma=10, min_child_weight=2,
                      subsample=.8, colsample_bytree=.8,
#                       scale_pos_weight=1.5,
                      reg_alpha=2,
                      reg_lambda=1.3,
                     )

############################################################################################################
# load 
test = pd.read_csv('./input/orange_small_churn_test_data.csv')
train, target = pd.read_csv('./input/orange_small_churn_data.train'), \
np.where(pd.read_csv('./input/orange_small_churn_labels.train', header=-1)==1, 1, 0).ravel()

test_id= test['ID']
test.drop(['ID'], axis=1, inplace=True)

# выставим порог отсева пустых значений на 95%
to_drop = train.columns[(train.isnull().sum() / train.shape[0] ) >= MISS_THERSHOLD]
train.drop(to_drop, axis=1, inplace=True)
test.drop(to_drop, axis=1, inplace=True)

# split data 
X_train, X_valid, y_train, y_valid = train_test_split(train, target, test_size=5000, random_state=42, stratify=target)

# transform
X_train, X_valid = transform(X_train, X_valid, y_train, 
                        encode=ENCODE, scaling=True, fill_num=FILL_NUM, 
                        hc_drop=HC_DROP, high_cardinality=HC, eb_k=HC_K, eb_f=HC_F,
                        to_drop=['Var214', 'Var220', 'Var222'])
print(train.shape, test.shape )

# cv 
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2017)
cv_scores = cross_val(X_train, y_train, model, kf)
print_metrics(cv_scores)

(40000, 77) (10000, 77)
Fold  0
Fold  1
Fold  2
Fold  3
Fold  4
f1 = 0.08469
precision = 0.59812
recall = 0.04569
roc auc = 0.52160


In [116]:
# model 
model = XGBClassifier(n_jobs=4, tree_method='gpu_hist', predictor = "cpu_predictor", objective="binary:logistic",
                      n_estimators=211, 
                      learning_rate=0.05,
                      max_depth=5,
                      gamma=5, min_child_weight=2,
                      subsample=.8, colsample_bytree=.8,
#                       scale_pos_weight=1.5,
                      reg_alpha=2,
                      reg_lambda=1.3,
                     )

In [117]:
model.fit(X_train, y_train, early_stopping_rounds=20, eval_metric="auc", eval_set=[(X_valid, y_valid)])
predict_valid = model.predict_proba(X_valid)[:, 1]

[0]	validation_0-auc:0.655619
Will train until validation_0-auc hasn't improved in 20 rounds.
[1]	validation_0-auc:0.666381
[2]	validation_0-auc:0.666151
[3]	validation_0-auc:0.671602
[4]	validation_0-auc:0.672713
[5]	validation_0-auc:0.676607
[6]	validation_0-auc:0.679461
[7]	validation_0-auc:0.687289
[8]	validation_0-auc:0.688947
[9]	validation_0-auc:0.687634
[10]	validation_0-auc:0.688155
[11]	validation_0-auc:0.689897
[12]	validation_0-auc:0.689954
[13]	validation_0-auc:0.68649
[14]	validation_0-auc:0.683896
[15]	validation_0-auc:0.684376
[16]	validation_0-auc:0.686283
[17]	validation_0-auc:0.687387
[18]	validation_0-auc:0.689326
[19]	validation_0-auc:0.690723
[20]	validation_0-auc:0.68946
[21]	validation_0-auc:0.685064
[22]	validation_0-auc:0.685426
[23]	validation_0-auc:0.685907
[24]	validation_0-auc:0.686403
[25]	validation_0-auc:0.687594
[26]	validation_0-auc:0.686322
[27]	validation_0-auc:0.687625
[28]	validation_0-auc:0.686105
[29]	validation_0-auc:0.688048
[30]	validation_0-

Prediction

In [118]:
# cutoff = np.linspace(0.1, 0.3, 19)
# cutoff_metrics(cutoff, predict_valid)
roc_auc_score(y_valid, predict_valid)

0.71754996468434307

In [121]:
train, test = transform(train, test, target, 
                        encode=ENCODE, scaling=True, fill_num=FILL_NUM, 
                        hc_drop=HC_DROP, high_cardinality=HC, eb_k=HC_K, eb_f=HC_F,
                        to_drop=['Var214', 'Var220', 'Var222'])

In [122]:
model.fit(train, target)
predict_proba = model.predict_proba(test)[:, 1]

In [124]:
# save submission
# sub = pd.concat([test_id, pd.DataFrame(np.where(predict1==0, -1, 1), columns=['result'])], axis=1)
sub = pd.concat([test_id, pd.DataFrame(predict_proba, columns=['result'])], axis=1)
sub.to_csv('./input/sub.csv', index=False)

In [85]:
# model.fit(X_train, y_train)
# predict_proba = model.predict_proba(X_valid)[:, 1]
# fpr, tpr, _ = roc_curve(y_valid, predict_proba)
# roc_auc_score(y_valid, predict_proba)
# plt.plot(fpr, tpr)
# plt.plot([0,1],[0,1],'r--')