In [1]:
import xgboost

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import ensemble
from sklearn import metrics
from sklearn import decomposition
import math

import re
%matplotlib inline

In [3]:
def read_data(filename):
    data = pd.read_csv(filename, header=0, sep=';', encoding='pt154')
    data = data.rename(index=str, columns={'client_id': '_ID_', 'open_acount_flg': '_VAL_'})
    return data

In [5]:
train = read_data('../data/credit_train.csv')
test = read_data('../data/credit_test.csv')

In [6]:
train.head()

Unnamed: 0,_ID_,gender,age,marital_status,job_position,credit_sum,credit_month,tariff_id,score_shk,education,living_region,monthly_income,credit_count,overdue_credit_count,open_account_flg
0,1,M,48,MAR,UMN,5999800,10,1.6,770249,GRD,КРАСНОДАРСКИЙ КРАЙ,30000.0,1.0,1.0,0
1,2,F,28,MAR,UMN,1088900,6,1.1,248514,GRD,МОСКВА,43000.0,2.0,0.0,0
2,3,M,32,MAR,SPC,1072800,12,1.1,459589,SCH,ОБЛ САРАТОВСКАЯ,23000.0,5.0,0.0,0
3,4,F,27,DIV,SPC,1200909,12,1.1,362536,GRD,ОБЛ ВОЛГОГРАДСКАЯ,17000.0,2.0,0.0,0
4,5,M,45,MAR,SPC,1690889,10,1.1,421385,SCH,ЧЕЛЯБИНСКАЯ ОБЛАСТЬ,25000.0,1.0,0.0,0


In [7]:
bad_names = []

def parse_living_region(data):
    def get_good_name(s_name):
        if s_name.find('КРАЙ') != -1:
            tokens = [x for x in s_name.split() for x in x.split('.')]
            good_name = list(filter(lambda x: x != '' and x != 'КРАЙ', tokens))
            return good_name[0]
        if s_name.find('АО') != -1:
            tokens = [x for x in s_name.split() for x in x.split('.')]
            good_name = list(filter(lambda x: x != '' and x.find('АО') == -1, tokens))
            return good_name[0]
        if s_name.find('ОБЛ') != -1:
            tokens = [x for x in s_name.split() for x in x.split('.')]
            good_name = list(filter(lambda x: x != '' and x != 'ОБЛ' and x != 'ОБЛАСТЬ', tokens))
            return good_name[-1]
        if s_name.find('РЕСП') != -1:
            tokens = [x for x in s_name.split() for x in x.split('.')]
            good_name = list(filter(lambda x: x != '' and x != 'РЕСП' and x != 'РЕСПУБЛИКА', tokens))
            return good_name[0]
        if len(s_name.split()) == 1 and len(s_name.split('.')) == 1:
            return s_name
        if s_name.find('МОСКВА') != -1:
            return 'МОСКВА'
        if s_name.find('ПЕТЕРБУРГ'):
            return 'САНКТ-ПЕТЕРБУРГ'
        if s_name.find('АВТОНОМНЫЙ'):
            return s_name.split()[0]
        if s_name.find('ЕВРЕЙ'):
            return 'ЕВРЕЙСКАЯ'
        if s_name.find('ДАЛЬНИЙ'):
            return 'ДАЛЬНИЙ ВОСТОК'
        if s_name.find('ФЕДЕРАЛЬНЫЙ'):
            return s_name.split()[0]
        bad_names.append(s_name)
        return np.nan
        
    return [get_good_name(x) if x is not np.nan else 'BAD NAME' for x in data]


def reduction_living_region(data):
    def get_good_name(s_name):
        if s_name == 'BAD NAME':
            bad_names.append(np.nan)
        if s_name == '74':
            return 'ЧЕЛЯБИН'
        if s_name == '98':
            return 'САНКТ-ПЕТЕРБУРГ'
        if s_name == 'РЕСПУБЛИКАТАТАРСТАН':
            return 'ТАТАРСТАН'
        if s_name == 'МОСКВОСКАЯ':
            return 'МОСКОВ'
        if s_name == 'РОССИЯ':
            bad_names.append(s_name)
            return 'BAD NAME'
        if s_name == 'КАМЧАТС??ИЙ':
            return 'КАМЧАТ'
        if s_name == 'ХАНТЫ-МАНСИЙСКИЙ-ЮГРА':
            return 'ХАНТЫ-МАНСИЙ' 
        if s_name.find('СКИЙ') != -1 or s_name.find('СКАЯ') != -1:
            return s_name[:-4]
        return s_name
    
    return [get_good_name(x) for x in data]

In [40]:
class Features:
    def __init__(self, train):
        self._train = train
    
    def get_gender_feature(self, data):
        poss = ['F', 'M']
        enc = preprocessing.OneHotEncoder()
        enc.fit([[poss.index(s_gender)] for s_gender in self._train.gender])
        return enc.transform([[poss.index(s_gender)] for s_gender in data.gender]).toarray()
    
    def get_age_feature(self, data):
        return data.age.values.reshape((-1, 1))
    
    def get_marital_status_feature(self, data):
        poss = ['UNM', 'DIV', 'MAR', 'WID', 'CIV']
        enc = preprocessing.OneHotEncoder()
        enc.fit([[poss.index(s_marital_status)] for s_marital_status in self._train.marital_status])
        return enc.transform([[poss.index(s_marital_status)] for s_marital_status in data.marital_status]).toarray()
    
    def get_job_position_feature(self, data):
        poss = ['SPC', 'DIR', 'HSK', 'INV', 'WOI', 'WRK', 'ATP', 'WRP', 'UMN', 'NOR', 'PNS', 'BIS', 'INP',
                'BIU', 'ONB', 'PNA', 'PNI', 'PNV']
        enc = preprocessing.OneHotEncoder()
        enc.fit([[poss.index(s_job_position)] for s_job_position in self._train.job_position])
        return enc.transform([[poss.index(s_job_position)] for s_job_position in data.job_position]).toarray()
    
    def get_credit_sum_feature(self, data):
        return np.array([[round(float(s_credit_sum.replace(',', '.')) / 1000)] 
                         for s_credit_sum in data.credit_sum])
    
    def get_credit_month_feature(self, data):
        enc = preprocessing.OneHotEncoder()
        enc.fit([[credit_month] for credit_month in self._train.credit_month])
        return enc.transform([[credit_month] for credit_month in data.credit_month]).toarray()
    
    def get_tariff_id_feature(self, data):
        poss = list(np.unique(self._train.tariff_id))
        enc = preprocessing.OneHotEncoder()
        enc.fit([[poss.index(tariff_id)] for tariff_id in poss])
        return enc.transform([[poss.index(tariff_id)] for tariff_id in data.tariff_id]).toarray()
    
    def get_score_shk_feature(self, data):
        return np.array([[float(s_score_shk.replace(',', '.'))] for s_score_shk in data['score_shk']])
    
    def get_education_feature(self, data):
        poss = ['SCH', 'UGR', 'GRD', 'PGR', 'ACD']
        enc = preprocessing.OneHotEncoder()
        enc.fit([[poss.index(s_education)] for s_education in self._train.education])
        return enc.transform([[poss.index(s_education)] for s_education in data.education]).toarray()
    
    def get_living_region_feature(self, data):        
        tr_liv_reg = reduction_living_region(parse_living_region([s_name for s_name in self._train.living_region]))
        da_liv_reg = reduction_living_region(parse_living_region([s_name for s_name in data.living_region]))
        poss = list(np.unique(tr_liv_reg))
        
        enc = preprocessing.OneHotEncoder()
        enc.fit([[poss.index(s_tr_liv_reg)] for s_tr_liv_reg in tr_liv_reg])
        da_liv_reg = enc.transform([[poss.index(s_living_region)]
                                     for s_living_region in da_liv_reg]).toarray()
        tr_liv_reg = enc.transform([[poss.index(s_living_region)]
                                     for s_living_region in tr_liv_reg]).toarray()
        pca = decomposition.PCA(8)
        pca.fit(tr_liv_reg)
        return pca.transform(da_liv_reg)
        
    
    def get_monthly_income_feature(self, data):
        monthly_incomes = data.monthly_income.fillna(data.monthly_income.mean())
        return monthly_incomes.values.reshape((-1, 1))
    
    def get_credit_count(self, data):
        credit_counts = data.credit_count.fillna(data.credit_count.mean())
        return credit_counts.values.reshape((-1, 1))
    
    def get_overdue_credit_count(self, data):
        overdue_credit_counts = data.overdue_credit_count.fillna(data.overdue_credit_count.mean())
        return overdue_credit_counts.values.reshape((-1, 1))

In [41]:
make_features = Features(train)
train_features = np.concatenate((make_features.get_gender_feature(train),
                                 make_features.get_age_feature(train),
                                 make_features.get_marital_status_feature(train),
                                 make_features.get_job_position_feature(train),
                                 make_features.get_credit_sum_feature(train),
                                 make_features.get_credit_month_feature(train),
                                 make_features.get_tariff_id_feature(train),
                                 make_features.get_score_shk_feature(train),
                                 make_features.get_education_feature(train),
                                 make_features.get_monthly_income_feature(train),
                                 make_features.get_credit_count(train),
                                 make_features.get_overdue_credit_count(train)), axis=1)

In [42]:
test_features = np.concatenate((make_features.get_gender_feature(test),
                                make_features.get_age_feature(test),
                                make_features.get_marital_status_feature(test),
                                make_features.get_job_position_feature(test),
                                make_features.get_credit_sum_feature(test),
                                make_features.get_credit_month_feature(test),
                                make_features.get_tariff_id_feature(test),
                                make_features.get_score_shk_feature(test),
                                make_features.get_education_feature(test),
                                make_features.get_monthly_income_feature(test),
                                make_features.get_credit_count(test),
                                make_features.get_overdue_credit_count(test)), axis=1)

In [43]:
X_train, X_valid, y_train, y_valid = model_selection.train_test_split(train_features, train.open_account_flg,
                                                                      test_size=0.3)

In [44]:
X_test = test_features

In [45]:
print(X_train.shape, X_valid.shape, X_test.shape)

(119522, 99) (51224, 99) (91940, 99)


# Xgboost

### parameter information: https://github.com/dmlc/xgboost/blob/master/doc/parameter.md

In [46]:
Dtrain = xgboost.DMatrix(data=X_train, label=y_train)

In [47]:
param = {
    "max_depth" : 9,
    "eta" : 0.05,
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "max_delta_step": 2,
    "alpha": 2,
    "lambda": 2,
}

In [48]:
numround = 401
history = xgboost.cv(params=param, dtrain=Dtrain, num_boost_round=numround, nfold=3, stratified=True,\
                     early_stopping_rounds=25, verbose_eval=25)

[0]	train-auc:0.742456+0.00143356	test-auc:0.730283+0.00197619
[25]	train-auc:0.772235+0.000596052	test-auc:0.748172+0.0030136
[50]	train-auc:0.789698+0.00136587	test-auc:0.753645+0.00166419
[75]	train-auc:0.806248+0.000775621	test-auc:0.756928+0.000822251
[100]	train-auc:0.814623+0.00102504	test-auc:0.75827+0.000979602
[125]	train-auc:0.819947+0.00132763	test-auc:0.758736+0.00105339
[150]	train-auc:0.824507+0.000920317	test-auc:0.758883+0.00119966
[175]	train-auc:0.828961+0.000892694	test-auc:0.75889+0.00122084


In [50]:
numround = 175
bst = xgboost.train(params=param, dtrain=Dtrain)
pred_valid = bst.predict(xgboost.DMatrix(data=X_valid))
print(metrics.roc_auc_score(y_valid, pred_valid))

0.745145370905


In [30]:
pred_test = bst.predict(xgboost.DMatrix(data=X_test))
prediction = pd.DataFrame(data={'_ID_': test['_ID_'], '_VAL_': pred_test}, index=test.index)
prediction.to_csv('../data/prediction.csv', index=False)
print(prediction.head())

     _ID_     _VAL_
0  170747  0.325587
1  170748  0.338372
2  170749  0.394299
3  170750  0.498730
4  170751  0.338825
