In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import ensemble
from sklearn import metrics
import math
import re
%matplotlib inline

In [3]:
def read_data(filename):
    data = pd.read_csv(filename, header=0, sep=';', encoding='pt154', index_col='client_id')
    return data

In [4]:
train = read_data('data/credit_train.csv')
test = read_data('data/credit_test.csv')

In [5]:
train.head()

Unnamed: 0_level_0,gender,age,marital_status,job_position,credit_sum,credit_month,tariff_id,score_shk,education,living_region,monthly_income,credit_count,overdue_credit_count,open_account_flg
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,M,48,MAR,UMN,5999800,10,1.6,770249,GRD,КРАСНОДАРСКИЙ КРАЙ,30000.0,1.0,1.0,0
2,F,28,MAR,UMN,1088900,6,1.1,248514,GRD,МОСКВА,43000.0,2.0,0.0,0
3,M,32,MAR,SPC,1072800,12,1.1,459589,SCH,ОБЛ САРАТОВСКАЯ,23000.0,5.0,0.0,0
4,F,27,DIV,SPC,1200909,12,1.1,362536,GRD,ОБЛ ВОЛГОГРАДСКАЯ,17000.0,2.0,0.0,0
5,M,45,MAR,SPC,1690889,10,1.1,421385,SCH,ЧЕЛЯБИНСКАЯ ОБЛАСТЬ,25000.0,1.0,0.0,0


In [21]:
def parse_living_region(data):
    def get_good_name(s_name):
        if s_name.find('КРАЙ') != -1:
            tokens = [x for x in s_name.split() for x in x.split('.')]
            good_name = list(filter(lambda x: x != '' and x != 'КРАЙ', tokens))
            return good_name[0]
        if s_name.find('АО') != -1:
            tokens = [x for x in s_name.split() for x in x.split('.')]
            good_name = list(filter(lambda x: x != '' and x.find('АО') == -1, tokens))
            return good_name[0]
        if s_name.find('ОБЛ') != -1:
            tokens = [x for x in s_name.split() for x in x.split('.')]
            good_name = list(filter(lambda x: x != '' and x != 'ОБЛ' and x != 'ОБЛАСТЬ', tokens))
            return good_name[-1]
        if s_name.find('РЕСП') != -1:
            tokens = [x for x in s_name.split() for x in x.split('.')]
            good_name = list(filter(lambda x: x != '' and x != 'РЕСП' and x != 'РЕСПУБЛИКА', tokens))
            return good_name[0]
        if len(s_name.split()) == 1 and len(s_name.split('.')) == 1:
            return s_name
        if s_name.find('МОСКВА') != -1:
            return 'МОСКВА'
        if s_name.find('ПЕТЕРБУРГ'):
            return 'САНКТ-ПЕТЕРБУРГ'
        if s_name.find('АВТОНОМНЫЙ'):
            return s_name.split()[0]
        if s_name.find('ЕВРЕЙ'):
            return 'ЕВРЕЙСКАЯ'
        if s_name.find('ДАЛЬНИЙ'):
            return 'ДАЛЬНИЙ ВОСТОК'
        if s_name.find('ФЕДЕРАЛЬНЫЙ'):
            return s_name.split()[0]
        return np.nan
        
    return [get_good_name(x) if x is not np.nan else 'BAD NAME' for x in data]


def reduction_living_region(data):
    def get_good_name(s_name):
        if s_name == '74' or s_name == '98':
            return 'BAD NAME'
        if s_name == 'РЕСПУБЛИКАТАТАРСТАН':
            return 'ТАТАРСТАН'
        if s_name == 'МОСКВОСКАЯ':
            return 'МОСКОВ'
        if s_name == 'РОССИЯ':
            return 'BAD NAME'
        if s_name == 'КАМЧАТС??ИЙ':
            return 'КАМЧАТ'
        if s_name == 'ХАНТЫ-МАНСИЙСКИЙ-ЮГРА':
            return 'ХАНТЫ-МАНСИЙ' 
        if s_name.find('СКИЙ') != -1 or s_name.find('СКАЯ') != -1:
            return s_name[:-4]
        return s_name
    
    return [get_good_name(x) for x in data]

print(np.unique(reduction_living_region(parse_living_region([s_name for s_name in train['living_region']]))))
print(np.unique(reduction_living_region(parse_living_region([s_name for s_name in test['living_region']]))))

['BAD NAME' 'АДЫГЕЯ' 'АЛТАЙ' 'АМУР' 'АРХАНГЕЛЬ' 'АСТРАХАН' 'БАШКОРТОСТАН'
 'БЕЛГОРОД' 'БРЯН' 'БУРЯТИЯ' 'ВЛАДИМИР' 'ВОЛГОГРАД' 'ВОЛОГОД' 'ВОРОНЕЖ'
 'ГОРЬКОВ' 'ДАГЕСТАН' 'ЕВРЕЙ' 'ЗАБАЙКАЛЬ' 'ИВАНОВ' 'ИНГУШЕТИЯ' 'ИРКУТ'
 'КАБАРДИНО-БАЛКАР' 'КАЛИНИНГРАД' 'КАЛМЫКИЯ' 'КАЛУЖ' 'КАМЧАТ'
 'КАРАЧАЕВО-ЧЕРКЕС' 'КАРЕЛИЯ' 'КЕМЕРОВ' 'КИРОВ' 'КОМИ' 'КОСТРОМ'
 'КРАСНОДАР' 'КРАСНОЯР' 'КУР' 'КУРГАН' 'ЛЕНИНГРАД' 'ЛИПЕЦКАЯ' 'МАГАДАН'
 'МАРИЙ' 'МОРДОВИЯ' 'МОСКВА' 'МОСКОВ' 'МУРМАН' 'НЕНЕЦКИЙ' 'НИЖЕГОРОД'
 'НОВГОРОД' 'НОВОСИБИР' 'ОМ' 'ОРЁЛ' 'ОРЕНБУРГ' 'ОРЛОВ' 'ПЕНЗЕН' 'ПЕРМ'
 'ПРИМОР' 'ПСКОВ' 'РОСТОВ' 'РЯЗАН' 'САМАР' 'САНКТ-ПЕТЕРБУРГ' 'САРАТОВ'
 'САХА' 'САХАЛИН' 'СВЕРДЛОВ' 'СЕВЕРНАЯ' 'СМОЛЕН' 'СТАВРОПОЛЬ' 'ТАМБОВ'
 'ТАТАРСТАН' 'ТВЕР' 'ТОМ' 'ТУЛЬ' 'ТЫВА' 'ТЮМЕН' 'УДМУРТ' 'УЛЬЯНОВ'
 'ХАБАРОВ' 'ХАКАСИЯ' 'ХАНТЫ-МАНСИЙ' 'ЧЕЛЯБИН' 'ЧЕЧЕН' 'ЧИТИН' 'ЧУВАШ'
 'ЧУВАШИЯ' 'ЧУКОТ' 'ЭВЕНКИЙ' 'ЯМАЛО-НЕНЕЦКИЙ' 'ЯРОСЛАВ']
['BAD NAME' 'АДЫГЕЯ' 'АЛТАЙ' 'АМУР' 'АРХАНГЕЛЬ' 'АСТРАХАН' 'БАШКОРТОСТАН'
 'БЕЛГОРОД' 'БРЯН' 'БУРЯТИЯ' 'ВЛА

In [7]:
set(train['job_position'].unique()) \
- set(['SPC', 'DIR', 'HSK', 'INV', 'WOI', 'WRK', 'ATP', 'WRP', 'UMN', 'NOR', 'PNS', 'BIS', 'INP'])

{'BIU', 'ONB', 'PNA', 'PNI', 'PNV'}

In [15]:
class Features:
    def __init__(self, train):
        self._train = train
    
    def get_gender_feature(self, data):
        poss = ['F', 'M']
        enc = preprocessing.OneHotEncoder()
        enc.fit([[0], [1]])
        return enc.transform([[poss.index(s_gender)] for s_gender in data['gender']]).toarray()
    
    def get_age_feature(self, data):
        return data['age'].values.reshape((-1, 1))
    
    def get_marital_status_feature(self, data):
        poss = ['UNM', 'DIV', 'MAR', 'WID', 'CIV']
        enc = preprocessing.OneHotEncoder()
        enc.fit([[poss.index(s_marital_status)] for s_marital_status in self._train['marital_status']])
        return enc.transform([[poss.index(s_marital_status)] for s_marital_status in data['marital_status']]).toarray()
    
    def get_job_position_feature(self, data):
        poss = ['SPC', 'DIR', 'HSK', 'INV', 'WOI', 'WRK', 'ATP', 'WRP', 'UMN', 'NOR', 'PNS', 'BIS', 'INP',
                'BIU', 'ONB', 'PNA', 'PNI', 'PNV']
        enc = preprocessing.OneHotEncoder()
        enc.fit([[poss.index(s_job_position)] for s_job_position in self._train['job_position']])
        return enc.transform([[poss.index(s_job_position)] for s_job_position in data['job_position']]).toarray()
    
    def get_credit_sum_feature(self, data):
        return np.array([[round(float(s_credit_sum.replace(',', '.')) / 1000)] for s_credit_sum in data['credit_sum']])
    
    def get_credit_month_feature(self, data):
        enc = preprocessing.OneHotEncoder()
        enc.fit([[credit_month] for credit_month in self._train['credit_month']])
        return enc.transform([[credit_month] for credit_month in data['credit_month']]).toarray()
    
    def get_tariff_id_feature(self, data):
        return data['tariff_id'].values.reshape((-1, 1))
    
    def get_score_shk_feature(self, data):
        return np.array([[float(s_score_shk.replace(',', '.'))] for s_score_shk in data['score_shk']])
    
    def get_education_feature(self, data):
        poss = ['SCH', 'UGR', 'GRD', 'PGR', 'ACD']
        return np.array([[poss.index(s_education)] for s_education in data['education']])
    
    def get_living_region_feature(self, data):
        poss = list(np.unique(reduction_living_region(parse_living_region([s_name for s_name in self._train['living_region']]))))
        enc = preprocessing.OneHotEncoder()
        enc.fit([[i] for i in range(len(poss))])
        liv_reg_data = reduction_living_region(parse_living_region([s_name for s_name in data['living_region']]))
        return enc.transform([[poss.index(s_living_region)]
                              for s_living_region in liv_reg_data]).toarray()

In [22]:
make_features = Features(train)
train_features = np.concatenate((make_features.get_gender_feature(train),
                                 make_features.get_age_feature(train),
                                 make_features.get_marital_status_feature(train),
                                 make_features.get_job_position_feature(train),
                                 make_features.get_credit_sum_feature(train),
                                 make_features.get_credit_month_feature(train),
                                 make_features.get_tariff_id_feature(train),
                                 make_features.get_score_shk_feature(train),
                                 make_features.get_education_feature(train), 
                                 make_features.get_living_region_feature(train)), axis=1)

In [23]:
test_features = np.concatenate((make_features.get_gender_feature(test),
                                make_features.get_age_feature(test),
                                make_features.get_marital_status_feature(test),
                                make_features.get_job_position_feature(test),
                                make_features.get_credit_sum_feature(test),
                                make_features.get_credit_month_feature(test),
                                make_features.get_tariff_id_feature(test),
                                make_features.get_score_shk_feature(test),
                                make_features.get_education_feature(test),
                                make_features.get_living_region_feature(test)), axis=1)

In [24]:
X_train, X_valid, y_train, y_valid = model_selection.train_test_split(train_features, train['open_account_flg'],
                                                                      test_size=0.2)

In [25]:
X_test = test_features

In [27]:
print(X_train.shape, X_valid.shape, X_test.shape)

(136596, 149) (34150, 149) (91940, 149)


In [34]:
class Models:
    @staticmethod
    def model(model, param_grid, X_train, X_valid, y_train, y_valid, X_test):
        est = model_selection.GridSearchCV(model, param_grid=param_grid, n_jobs=4)
        est.fit(X_train, y_train)

        proba_train = est.predict_proba(X_train)
        print(metrics.roc_auc_score(y_train, proba_train[:, 1]))

        proba_valid = est.predict_proba(X_valid)
        print(metrics.roc_auc_score(y_valid, proba_valid[:, 1]))

        proba_test = est.predict_proba(X_test)
        return proba_train[:, 1], proba_valid[:, 1], proba_test[:, 1]

# Gradient Boosting Classifier

In [35]:
_, _, gb_proba_test = Models.model(ensemble.GradientBoostingClassifier(), 
                                   {'n_estimators': [i for i in range(80, 250, 30)],
                                    'criterion': ['friedman_mse', 'mse'],
                                    'min_samples_leaf': [i for i in range(5, 101, 10)]},
                                   X_train, X_valid, y_train, y_valid, X_test)

KeyboardInterrupt: 