# TINKOFF: https://boosters.pro/champ_3?success=0#

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import ensemble
from sklearn import metrics
from sklearn import decomposition
import math

import re
%matplotlib inline

In [2]:
def read_data(filename):
    data = pd.read_csv(filename, header=0, sep=';', encoding='pt154')
    data = data.rename(index=str, columns={'client_id': '_ID_', 'open_acount_flg': '_VAL_'})
    return data

In [3]:
train = read_data('../data/credit_train.csv')
test = read_data('../data/credit_test.csv')

In [4]:
train.head()

Unnamed: 0,_ID_,gender,age,marital_status,job_position,credit_sum,credit_month,tariff_id,score_shk,education,living_region,monthly_income,credit_count,overdue_credit_count,open_account_flg
0,1,M,48,MAR,UMN,5999800,10,1.6,770249,GRD,КРАСНОДАРСКИЙ КРАЙ,30000.0,1.0,1.0,0
1,2,F,28,MAR,UMN,1088900,6,1.1,248514,GRD,МОСКВА,43000.0,2.0,0.0,0
2,3,M,32,MAR,SPC,1072800,12,1.1,459589,SCH,ОБЛ САРАТОВСКАЯ,23000.0,5.0,0.0,0
3,4,F,27,DIV,SPC,1200909,12,1.1,362536,GRD,ОБЛ ВОЛГОГРАДСКАЯ,17000.0,2.0,0.0,0
4,5,M,45,MAR,SPC,1690889,10,1.1,421385,SCH,ЧЕЛЯБИНСКАЯ ОБЛАСТЬ,25000.0,1.0,0.0,0


In [5]:
bad_names = []

def parse_living_region(data):
    def get_good_name(s_name):
        if s_name.find('КРАЙ') != -1:
            tokens = [x for x in s_name.split() for x in x.split('.')]
            good_name = list(filter(lambda x: x != '' and x != 'КРАЙ', tokens))
            return good_name[0]
        if s_name.find('АО') != -1:
            tokens = [x for x in s_name.split() for x in x.split('.')]
            good_name = list(filter(lambda x: x != '' and x.find('АО') == -1, tokens))
            return good_name[0]
        if s_name.find('ОБЛ') != -1:
            tokens = [x for x in s_name.split() for x in x.split('.')]
            good_name = list(filter(lambda x: x != '' and x != 'ОБЛ' and x != 'ОБЛАСТЬ', tokens))
            return good_name[-1]
        if s_name.find('РЕСП') != -1:
            tokens = [x for x in s_name.split() for x in x.split('.')]
            good_name = list(filter(lambda x: x != '' and x != 'РЕСП' and x != 'РЕСПУБЛИКА', tokens))
            return good_name[0]
        if len(s_name.split()) == 1 and len(s_name.split('.')) == 1:
            return s_name
        if s_name.find('МОСКВА') != -1:
            return 'МОСКВА'
        if s_name.find('ПЕТЕРБУРГ'):
            return 'САНКТ-ПЕТЕРБУРГ'
        if s_name.find('АВТОНОМНЫЙ'):
            return s_name.split()[0]
        if s_name.find('ЕВРЕЙ'):
            return 'ЕВРЕЙСКАЯ'
        if s_name.find('ДАЛЬНИЙ'):
            return 'ДАЛЬНИЙ ВОСТОК'
        if s_name.find('ФЕДЕРАЛЬНЫЙ'):
            return s_name.split()[0]
        bad_names.append(s_name)
        return np.nan
        
    return [get_good_name(x) if x is not np.nan else 'BAD NAME' for x in data]


def reduction_living_region(data):
    def get_good_name(s_name):
        if s_name == 'BAD NAME':
            bad_names.append(np.nan)
        if s_name == '74':
            return 'ЧЕЛЯБИН'
        if s_name == '98':
            return 'САНКТ-ПЕТЕРБУРГ'
        if s_name == 'РЕСПУБЛИКАТАТАРСТАН':
            return 'ТАТАРСТАН'
        if s_name == 'МОСКВОСКАЯ':
            return 'МОСКОВ'
        if s_name == 'РОССИЯ':
            bad_names.append(s_name)
            return 'BAD NAME'
        if s_name == 'КАМЧАТС??ИЙ':
            return 'КАМЧАТ'
        if s_name == 'ХАНТЫ-МАНСИЙСКИЙ-ЮГРА':
            return 'ХАНТЫ-МАНСИЙ' 
        if s_name.find('СКИЙ') != -1 or s_name.find('СКАЯ') != -1:
            return s_name[:-4]
        return s_name
    
    return [get_good_name(x) for x in data]

In [6]:
train_living_region = \
            np.unique(reduction_living_region(parse_living_region([s_name for s_name in train['living_region']])))
test_living_region = \
            np.unique(reduction_living_region(parse_living_region([s_name for s_name in test['living_region']])))
union_living_region = list(set(train_living_region).union(set(test_living_region)))
union_living_region.sort()
print(union_living_region, len(union_living_region))
print(np.unique(bad_names), len(bad_names))

['BAD NAME', 'АДЫГЕЯ', 'АЛТАЙ', 'АМУР', 'АРХАНГЕЛЬ', 'АСТРАХАН', 'БАШКОРТОСТАН', 'БЕЛГОРОД', 'БРЯН', 'БУРЯТИЯ', 'ВЛАДИМИР', 'ВОЛГОГРАД', 'ВОЛОГОД', 'ВОРОНЕЖ', 'ГОРЬКОВ', 'ДАГЕСТАН', 'ЕВРЕЙ', 'ЗАБАЙКАЛЬ', 'ИВАНОВ', 'ИНГУШЕТИЯ', 'ИРКУТ', 'КАБАРДИНО-БАЛКАР', 'КАЛИНИНГРАД', 'КАЛМЫКИЯ', 'КАЛУЖ', 'КАМЧАТ', 'КАРАЧАЕВО-ЧЕРКЕС', 'КАРЕЛИЯ', 'КЕМЕРОВ', 'КИРОВ', 'КОМИ', 'КОСТРОМ', 'КРАСНОДАР', 'КРАСНОЯР', 'КУР', 'КУРГАН', 'ЛЕНИНГРАД', 'ЛИПЕЦКАЯ', 'МАГАДАН', 'МАРИЙ', 'МОРДОВИЯ', 'МОСКВА', 'МОСКОВ', 'МУРМАН', 'НЕНЕЦКИЙ', 'НИЖЕГОРОД', 'НОВГОРОД', 'НОВОСИБИР', 'ОМ', 'ОРЁЛ', 'ОРЕНБУРГ', 'ОРЛОВ', 'ПЕНЗЕН', 'ПЕРМ', 'ПРИМОР', 'ПСКОВ', 'РОСТОВ', 'РЯЗАН', 'САМАР', 'САНКТ-ПЕТЕРБУРГ', 'САРАТОВ', 'САХА', 'САХАЛИН', 'СВЕРДЛОВ', 'СЕВЕРНАЯ', 'СМОЛЕН', 'СТАВРОПОЛЬ', 'ТАМБОВ', 'ТАТАРСТАН', 'ТВЕР', 'ТОМ', 'ТУЛЬ', 'ТЫВА', 'ТЮМЕН', 'УДМУРТ', 'УЛЬЯНОВ', 'ХАБАРОВ', 'ХАКАСИЯ', 'ХАНТЫ-МАНСИЙ', 'ЧЕЛЯБИН', 'ЧЕЧЕН', 'ЧИТИН', 'ЧУВАШ', 'ЧУВАШИЯ', 'ЧУКОТ', 'ЭВЕНКИЙ', 'ЯМАЛО-НЕНЕЦКИЙ', 'ЯРОСЛАВ'] 88
['nan' 'РОССИЯ'] 310


In [7]:
set(train['job_position'].unique()) \
- set(['SPC', 'DIR', 'HSK', 'INV', 'WOI', 'WRK', 'ATP', 'WRP', 'UMN', 'NOR', 'PNS', 'BIS', 'INP'])

{'BIU', 'ONB', 'PNA', 'PNI', 'PNV'}

In [8]:
class Features:
    def __init__(self, train):
        self._train = train
    
    def get_gender_feature(self, data):
        poss = ['F', 'M']
        enc = preprocessing.OneHotEncoder()
        enc.fit([[poss.index(s_gender)] for s_gender in self._train.gender])
        return enc.transform([[poss.index(s_gender)] for s_gender in data.gender]).toarray()
    
    def get_age_feature(self, data):
        return data.age.values.reshape((-1, 1))
    
    def get_marital_status_feature(self, data):
        poss = ['UNM', 'DIV', 'MAR', 'WID', 'CIV']
        enc = preprocessing.OneHotEncoder()
        enc.fit([[poss.index(s_marital_status)] for s_marital_status in self._train.marital_status])
        return enc.transform([[poss.index(s_marital_status)] for s_marital_status in data.marital_status]).toarray()
    
    def get_job_position_feature(self, data):
        poss = ['SPC', 'DIR', 'HSK', 'INV', 'WOI', 'WRK', 'ATP', 'WRP', 'UMN', 'NOR', 'PNS', 'BIS', 'INP',
                'BIU', 'ONB', 'PNA', 'PNI', 'PNV']
        enc = preprocessing.OneHotEncoder()
        enc.fit([[poss.index(s_job_position)] for s_job_position in self._train.job_position])
        return enc.transform([[poss.index(s_job_position)] for s_job_position in data.job_position]).toarray()
    
    def get_credit_sum_feature(self, data):
        return np.array([[round(float(s_credit_sum.replace(',', '.')) / 1000)] 
                         for s_credit_sum in data.credit_sum])
    
    def get_credit_month_feature(self, data):
        enc = preprocessing.OneHotEncoder()
        enc.fit([[credit_month] for credit_month in self._train.credit_month])
        return enc.transform([[credit_month] for credit_month in data.credit_month]).toarray()
    
    def get_tariff_id_feature(self, data):
        poss = list(np.unique(self._train.tariff_id))
        enc = preprocessing.OneHotEncoder()
        enc.fit([[poss.index(tariff_id)] for tariff_id in poss])
        return enc.transform([[poss.index(tariff_id)] for tariff_id in data.tariff_id]).toarray()
    
    def get_score_shk_feature(self, data):
        return np.array([[float(s_score_shk.replace(',', '.'))] for s_score_shk in data['score_shk']])
    
    def get_education_feature(self, data):
        poss = ['SCH', 'UGR', 'GRD', 'PGR', 'ACD']
        enc = preprocessing.OneHotEncoder()
        enc.fit([[poss.index(s_education)] for s_education in self._train.education])
        return enc.transform([[poss.index(s_education)] for s_education in data.education]).toarray()
    
    def get_living_region_feature(self, data):        
        tr_liv_reg = reduction_living_region(parse_living_region([s_name for s_name in self._train.living_region]))
        da_liv_reg = reduction_living_region(parse_living_region([s_name for s_name in data.living_region]))
        poss = list(np.unique(tr_liv_reg))
        
        enc = preprocessing.OneHotEncoder()
        enc.fit([[poss.index(s_tr_liv_reg)] for s_tr_liv_reg in tr_liv_reg])
        da_liv_reg = enc.transform([[poss.index(s_living_region)]
                                     for s_living_region in da_liv_reg]).toarray()
        tr_liv_reg = enc.transform([[poss.index(s_living_region)]
                                     for s_living_region in tr_liv_reg]).toarray()
        pca = decomposition.PCA(8)
        pca.fit(tr_liv_reg)
        return pca.transform(da_liv_reg)
        
    
    def get_monthly_income_feature(self, data):
        monthly_incomes = data.monthly_income.fillna(data.monthly_income.mean()).apply(lambda x: np.round(x / 1000))
        return monthly_incomes.values.reshape((-1, 1))
    
    def get_credit_count_feature(self, data):
        credit_counts = data.credit_count.fillna(data.credit_count.mean())
        return credit_counts.values.reshape((-1, 1))
    
    def get_overdue_credit_count_feature(self, data):
        overdue_credit_counts = data.overdue_credit_count.fillna(data.overdue_credit_count.mean())
        return overdue_credit_counts.values.reshape((-1, 1))

In [9]:
make_features = Features(train)
train_features = np.concatenate((make_features.get_gender_feature(train),
                                 make_features.get_age_feature(train),
                                 make_features.get_marital_status_feature(train),
                                 make_features.get_job_position_feature(train),
                                 make_features.get_credit_sum_feature(train),
                                 make_features.get_credit_month_feature(train),
                                 make_features.get_tariff_id_feature(train),
                                 make_features.get_score_shk_feature(train),
                                 make_features.get_education_feature(train),
                                 make_features.get_living_region_feature(train),
                                 make_features.get_monthly_income_feature(train),
                                 make_features.get_credit_count_feature(train),
                                 make_features.get_overdue_credit_count_feature(train)), axis=1)

In [10]:
test_features = np.concatenate((make_features.get_gender_feature(test),
                                make_features.get_age_feature(test),
                                make_features.get_marital_status_feature(test),
                                make_features.get_job_position_feature(test),
                                make_features.get_credit_sum_feature(test),
                                make_features.get_credit_month_feature(test),
                                make_features.get_tariff_id_feature(test),
                                make_features.get_score_shk_feature(test),
                                make_features.get_education_feature(test),
                                make_features.get_living_region_feature(test),
                                make_features.get_monthly_income_feature(test),
                                make_features.get_credit_count_feature(test),
                                make_features.get_overdue_credit_count_feature(test)), axis=1)

In [10]:
pca = decomposition.PCA(120)
pca.fit(train_features)
t_train_features = pca.transform(train_features)
t_test_features = pca.transform(test_features)

PCA(copy=True, iterated_power='auto', n_components=120, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [11]:
X_train, X_valid, y_train, y_valid = model_selection.train_test_split(train_features, train.open_account_flg,
                                                                      test_size=0.3)

In [12]:
X_test = test_features

In [13]:
print(X_train.shape, X_valid.shape, X_test.shape)

(119522, 107) (51224, 107) (91940, 107)


In [14]:
def model(model, param_grid, X_train, X_valid, y_train, y_valid, X_test):
    est = model_selection.GridSearchCV(model, param_grid=param_grid, n_jobs=-1)
    est.fit(X_train, y_train)
    print(est.best_estimator_)
    return est


def estimate(est):
    proba_train = est.predict_proba(X_train)
    print(metrics.roc_auc_score(y_train, proba_train[:, 1]))

    proba_valid = est.predict_proba(X_valid)
    print(metrics.roc_auc_score(y_valid, proba_valid[:, 1]))

    proba_test = est.predict_proba(X_test)
    return proba_train[:, 1], proba_valid[:, 1], proba_test[:, 1]

# Gradient Boosting

In [17]:
est = model(ensemble.GradientBoostingClassifier(min_samples_leaf=30), 
            {'n_estimators': [i for i in range(300, 401, 20)]},
            X_train, X_valid, y_train, y_valid, X_test)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=30,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=380, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)


In [18]:
gb_pred_train, gb_pred_valid, gb_pred_test = estimate(est)

0.779200492863
0.763781316104


# Random Forest

In [20]:
rf_est = model(ensemble.RandomForestClassifier(min_samples_leaf=10), 
               {'n_estimators': [i for i in range(50, 401, 30)]},
               X_train, X_valid, y_train, y_valid, X_test)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=10,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=80, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)


In [21]:
rf_pred_train, rf_pred_valid, rf_pred_test = estimate(rf_est)

0.860952776977
0.755441472141


In [30]:
print(metrics.roc_auc_score(y_train, gb_pred_train * 0.1 + rf_pred_train * 0.1))
print(metrics.roc_auc_score(y_valid, gb_pred_valid * 0.1 + rf_pred_valid * 0.1))

0.823910825414
0.764645292232


In [31]:
prediction = pd.DataFrame(data={'_ID_': test._ID_, '_VAL_': gb_pred_test * 0.8 + rf_pred_test * 0.2}, 
                          index=test.index)
prediction.to_csv('../data/prediction.csv', index=False)
print(prediction.head())

     _ID_     _VAL_
0  170747  0.072488
1  170748  0.147013
2  170749  0.235742
3  170750  0.157034
4  170751  0.099641
