In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import ensemble
from sklearn import metrics
from sklearn import decomposition
import math
import copy

import re
%matplotlib inline

In [2]:
def read_data(filename):
    data = pd.read_csv(filename, header=0, sep=';', encoding='pt154')
    data = data.rename(index=str, columns={'client_id': '_ID_', 'open_acount_flg': '_VAL_'})
    return data

In [27]:
train = read_data('../data/credit_train.csv')
test = read_data('../data/credit_test.csv')

In [28]:
train.head()

Unnamed: 0,_ID_,gender,age,marital_status,job_position,credit_sum,credit_month,tariff_id,score_shk,education,living_region,monthly_income,credit_count,overdue_credit_count,open_account_flg
0,1,M,48,MAR,UMN,5999800,10,1.6,770249,GRD,КРАСНОДАРСКИЙ КРАЙ,30000.0,1.0,1.0,0
1,2,F,28,MAR,UMN,1088900,6,1.1,248514,GRD,МОСКВА,43000.0,2.0,0.0,0
2,3,M,32,MAR,SPC,1072800,12,1.1,459589,SCH,ОБЛ САРАТОВСКАЯ,23000.0,5.0,0.0,0
3,4,F,27,DIV,SPC,1200909,12,1.1,362536,GRD,ОБЛ ВОЛГОГРАДСКАЯ,17000.0,2.0,0.0,0
4,5,M,45,MAR,SPC,1690889,10,1.1,421385,SCH,ЧЕЛЯБИНСКАЯ ОБЛАСТЬ,25000.0,1.0,0.0,0


In [29]:
bad_names = []

def parse_living_region(data):
    def get_good_name(s_name):
        if s_name.find('КРАЙ') != -1:
            tokens = [x for x in s_name.split() for x in x.split('.')]
            good_name = list(filter(lambda x: x != '' and x != 'КРАЙ', tokens))
            return good_name[0]
        if s_name.find('АО') != -1:
            tokens = [x for x in s_name.split() for x in x.split('.')]
            good_name = list(filter(lambda x: x != '' and x.find('АО') == -1, tokens))
            return good_name[0]
        if s_name.find('ОБЛ') != -1:
            tokens = [x for x in s_name.split() for x in x.split('.')]
            good_name = list(filter(lambda x: x != '' and x != 'ОБЛ' and x != 'ОБЛАСТЬ', tokens))
            return good_name[-1]
        if s_name.find('РЕСП') != -1:
            tokens = [x for x in s_name.split() for x in x.split('.')]
            good_name = list(filter(lambda x: x != '' and x != 'РЕСП' and x != 'РЕСПУБЛИКА', tokens))
            return good_name[0]
        if len(s_name.split()) == 1 and len(s_name.split('.')) == 1:
            return s_name
        if s_name.find('МОСКВА') != -1:
            return 'МОСКВА'
        if s_name.find('ПЕТЕРБУРГ'):
            return 'САНКТ-ПЕТЕРБУРГ'
        if s_name.find('АВТОНОМНЫЙ'):
            return s_name.split()[0]
        if s_name.find('ЕВРЕЙ'):
            return 'ЕВРЕЙСКАЯ'
        if s_name.find('ДАЛЬНИЙ'):
            return 'ДАЛЬНИЙ ВОСТОК'
        if s_name.find('ФЕДЕРАЛЬНЫЙ'):
            return s_name.split()[0]
        bad_names.append(s_name)
        return np.nan
        
    return [get_good_name(x) if x is not np.nan else 'BAD NAME' for x in data]


def reduction_living_region(data):
    def get_good_name(s_name):
        if s_name == 'BAD NAME':
            bad_names.append(np.nan)
        if s_name == '74':
            return 'ЧЕЛЯБИН'
        if s_name == '98':
            return 'САНКТ-ПЕТЕРБУРГ'
        if s_name == 'РЕСПУБЛИКАТАТАРСТАН':
            return 'ТАТАРСТАН'
        if s_name == 'МОСКВОСКАЯ':
            return 'МОСКОВ'
        if s_name == 'РОССИЯ':
            bad_names.append(s_name)
            return 'BAD NAME'
        if s_name == 'КАМЧАТС??ИЙ':
            return 'КАМЧАТ'
        if s_name == 'ХАНТЫ-МАНСИЙСКИЙ-ЮГРА':
            return 'ХАНТЫ-МАНСИЙ' 
        if s_name.find('СКИЙ') != -1 or s_name.find('СКАЯ') != -1:
            return s_name[:-4]
        return s_name
    
    return [get_good_name(x) for x in data]

In [31]:
class Features:
    def __init__(self, train):
        self._train = train.copy()
    
    def get_gender_feature(self, data):
        poss = ['F', 'M']
        enc = preprocessing.OneHotEncoder()
        enc.fit([[poss.index(s_gender)] for s_gender in self._train.gender])
        encode_genders = enc.transform([[poss.index(s_gender)] for s_gender in data.gender]).toarray()
        tmp = pd.DataFrame(encode_genders, 
                           columns=['gender=' + str(i) for i in range(encode_genders.shape[1])],
                           index=data.index)
        data = pd.concat([data, tmp], axis=1)
        return data
    
    def get_age_feature(self, data):
        return data
    
    def get_marital_status_feature(self, data):
        poss = ['UNM', 'DIV', 'MAR', 'WID', 'CIV']
        enc = preprocessing.OneHotEncoder()
        enc.fit([[poss.index(s_marital_status)] for s_marital_status in self._train.marital_status])
        encode_marital_status = enc.transform([[poss.index(s_marital_status)] 
                                               for s_marital_status in data.marital_status]).toarray()
        tmp = pd.DataFrame(encode_marital_status, 
                           columns=['marital_status=' + str(i) for i in range(encode_marital_status.shape[1])],
                           index=data.index)
        data = pd.concat([data, tmp], axis=1)
        return data
    
    def get_job_position_feature(self, data):
        poss = ['SPC', 'DIR', 'HSK', 'INV', 'WOI', 'WRK', 'ATP', 'WRP', 'UMN', 'NOR', 'PNS', 'BIS', 'INP',
                'BIU', 'ONB', 'PNA', 'PNI', 'PNV']
        enc = preprocessing.OneHotEncoder()
        enc.fit([[poss.index(s_job_position)] for s_job_position in self._train.job_position])
        encode_job_positions = enc.transform([[poss.index(s_job_position)] 
                                              for s_job_position in data.job_position]).toarray()
        tmp = pd.DataFrame(encode_job_positions, 
                           columns=['job_position=' + str(i) for i in range(encode_job_positions.shape[1])],
                           index=data.index)
        data = pd.concat([data, tmp], axis=1)
        return data
    
    def get_credit_sum_feature(self, data):
        data.credit_sum = (data.credit_sum.apply(lambda x: x.replace(',', '.'))
                                          .astype(np.float64)
                                          .apply(lambda x: np.round(x / 1000)))
        return data
    
    def get_credit_month_feature(self, data):
        enc = preprocessing.OneHotEncoder()
        enc.fit([[credit_month] for credit_month in self._train.credit_month])
        encode_credit_months = enc.transform([[credit_month] for credit_month in data.credit_month]).toarray()
        tmp = pd.DataFrame(encode_credit_months, 
                           columns=['credit_month=' + str(i) for i in range(encode_credit_months.shape[1])],
                           index=data.index)
        data = pd.concat([data, tmp], axis=1)
        return data
    
    def get_tariff_id_feature(self, data):
        poss = list(np.unique(self._train.tariff_id))
        enc = preprocessing.OneHotEncoder()
        enc.fit([[poss.index(tariff_id)] for tariff_id in poss])
        encode_tariff_ids = enc.transform([[poss.index(tariff_id)] for tariff_id in data.tariff_id]).toarray()
        tmp = pd.DataFrame(encode_tariff_ids, 
                           columns=['tariff_id=' + str(i) for i in range(encode_tariff_ids.shape[1])],
                           index=data.index)
        data = pd.concat([data, tmp], axis=1)
        return data
    
    def get_score_shk_feature(self, data):
        data.score_shk = (data.score_shk.apply(lambda x: x.replace(',', '.'))
                                        .astype(np.float64))
        return data
    
    def get_education_feature(self, data):
        poss = ['SCH', 'UGR', 'GRD', 'PGR', 'ACD']
        enc = preprocessing.OneHotEncoder()
        enc.fit([[poss.index(s_education)] for s_education in self._train.education])
        encode_educations = enc.transform([[poss.index(s_education)] for s_education in data.education]).toarray()
        tmp = pd.DataFrame(encode_educations, 
                           columns=['education=' + str(i) for i in range(encode_educations.shape[1])],
                           index=data.index)
        data = pd.concat([data, tmp], axis=1)
        return data
    
    def get_living_region_feature(self, data):        
        tr_liv_reg = reduction_living_region(parse_living_region([s_name for s_name in self._train.living_region]))
        da_liv_reg = reduction_living_region(parse_living_region([s_name for s_name in data.living_region]))
        poss = list(np.unique(tr_liv_reg))
        
        enc = preprocessing.OneHotEncoder()
        enc.fit([[poss.index(s_tr_liv_reg)] for s_tr_liv_reg in tr_liv_reg])
        da_liv_reg = enc.transform([[poss.index(s_living_region)]
                                     for s_living_region in da_liv_reg]).toarray()
        tr_liv_reg = enc.transform([[poss.index(s_living_region)]
                                     for s_living_region in tr_liv_reg]).toarray()
        pca = decomposition.PCA(8)
        pca.fit(tr_liv_reg)
        encode_living_regions = pca.transform(da_liv_reg)
        tmp = pd.DataFrame(encode_living_regions, 
                           columns=['living_region=' + str(i) for i in range(encode_living_regions.shape[1])],
                           index=data.index)
        data = pd.concat([data, tmp], axis=1)
        return data
        
    def get_monthly_income_feature(self, data):
        data.monthly_income = data.monthly_income.fillna(data.monthly_income.mean())
        return data
    
    def get_credit_count_feature(self, data):
        data.credit_count = data.credit_count.fillna(data.credit_count.mean())
        return data
    
    def get_overdue_credit_count_feature(self, data):
        data.overdue_credit_count = data.overdue_credit_count.fillna(data.overdue_credit_count.mean())
        return data
    
    def get_living_region_count_feature(self, data):
        tmp = data.copy()
        tmp.living_region = reduction_living_region(parse_living_region([s_name for s_name in tmp.living_region]))
        data['living_region_count'] = tmp.living_region.map(tmp.groupby('living_region').size())
        return data
    
    def get_city_mean_income_feature(self, data):
        tmp = data.copy()
        tmp.living_region = reduction_living_region(parse_living_region([s_name for s_name in tmp.living_region]))
        data['city_mean_income'] = tmp.living_region.map(tmp.groupby('living_region')['monthly_income'].mean())
        return data
    
    def get_credit_pay_feature(self, data):
        tmp = data.copy()
        tmp.credit_sum.fillna(tmp.credit_sum.mean(), inplace=True)
        tmp.credit_month.fillna(tmp.credit_month.mean(), inplace=True)
        data['credit_pay'] = tmp.credit_sum / tmp.credit_month
        return data
    
    def get_money_for_life_feature(self, data):
        tmp = data.copy()
        tmp = self.get_credit_pay_feature(tmp)
        tmp.monthly_income.fillna(tmp.monthly_income.mean(), inplace=True)
        data['money_for_life'] = tmp.monthly_income - tmp.credit_pay
        return data
    
    def get_dif_city_feature(self, data):
        tmp = data.copy()
        tmp = self.get_city_mean_income_feature(tmp)
        tmp.monthly_income.fillna(tmp.monthly_income.mean(), inplace=True)
        data['dif_city'] = (tmp.monthly_income - tmp.city_mean_income).values
        return data

In [32]:
make_features = Features(train)
train = make_features.get_gender_feature(train)
train = make_features.get_age_feature(train)
train = make_features.get_marital_status_feature(train)
train = make_features.get_job_position_feature(train)
train = make_features.get_credit_sum_feature(train)
train = make_features.get_credit_month_feature(train)
train = make_features.get_tariff_id_feature(train)
train = make_features.get_score_shk_feature(train)
train = make_features.get_education_feature(train)
train = make_features.get_living_region_feature(train)
train = make_features.get_monthly_income_feature(train)
train = make_features.get_credit_count_feature(train)
train = make_features.get_overdue_credit_count_feature(train)
train = make_features.get_living_region_count_feature(train)
train = make_features.get_city_mean_income_feature(train)
train = make_features.get_credit_pay_feature(train)
train = make_features.get_money_for_life_feature(train)
train = make_features.get_dif_city_feature(train)

In [33]:
test = make_features.get_gender_feature(test)
test = make_features.get_age_feature(test)
test = make_features.get_marital_status_feature(test)
test = make_features.get_job_position_feature(test)
test = make_features.get_credit_sum_feature(test)
test = make_features.get_credit_month_feature(test)
test = make_features.get_tariff_id_feature(test)
test = make_features.get_score_shk_feature(test)
test = make_features.get_education_feature(test)
test = make_features.get_living_region_feature(test)
test = make_features.get_monthly_income_feature(test)
test = make_features.get_credit_count_feature(test)
test = make_features.get_overdue_credit_count_feature(test)
test = make_features.get_living_region_count_feature(test)
test = make_features.get_city_mean_income_feature(test)
test = make_features.get_credit_pay_feature(test)
test = make_features.get_money_for_life_feature(test)
test = make_features.get_dif_city_feature(test)

In [34]:
train.head()

Unnamed: 0,_ID_,gender,age,marital_status,job_position,credit_sum,credit_month,tariff_id,score_shk,education,...,living_region=3,living_region=4,living_region=5,living_region=6,living_region=7,living_region_count,city_mean_income,credit_pay,money_for_life,dif_city
0,1,M,48,MAR,UMN,60.0,10,1.6,0.770249,GRD,...,0.826619,0.140592,0.176357,0.121125,0.005595,8355,37260.771993,6.0,29994.0,-7260.771993
1,2,F,28,MAR,UMN,11.0,6,1.1,0.248514,GRD,...,0.16956,0.067989,0.108327,0.086369,0.004368,9261,62696.116618,1.833333,42998.166667,-19696.116618
2,3,M,32,MAR,SPC,11.0,12,1.1,0.459589,SCH,...,-0.033089,-0.022835,-0.055063,-0.071585,-0.006817,2282,31110.902717,0.916667,22999.083333,-8110.902717
3,4,F,27,DIV,SPC,12.0,12,1.1,0.362536,GRD,...,-0.033572,-0.023153,-0.055825,-0.07255,-0.00657,2361,31319.365523,1.0,16999.0,-14319.365523
4,5,M,45,MAR,SPC,17.0,10,1.1,0.421385,SCH,...,-0.065156,-0.050721,-0.144774,-0.287428,0.751925,5155,33759.671775,1.7,24998.3,-8759.671775


In [35]:
del train['gender']
del train['marital_status']
del train['job_position']
del train['credit_month']
del train['tariff_id']
del train['education']
del train['living_region']

In [36]:
del test['gender']
del test['marital_status']
del test['job_position']
del test['credit_month']
del test['tariff_id']
del test['education']
del test['living_region']

In [37]:
train.head()

Unnamed: 0,_ID_,age,credit_sum,score_shk,monthly_income,credit_count,overdue_credit_count,open_account_flg,gender=0,gender=1,...,living_region=3,living_region=4,living_region=5,living_region=6,living_region=7,living_region_count,city_mean_income,credit_pay,money_for_life,dif_city
0,1,48,60.0,0.770249,30000.0,1.0,1.0,0,0.0,1.0,...,0.826619,0.140592,0.176357,0.121125,0.005595,8355,37260.771993,6.0,29994.0,-7260.771993
1,2,28,11.0,0.248514,43000.0,2.0,0.0,0,1.0,0.0,...,0.16956,0.067989,0.108327,0.086369,0.004368,9261,62696.116618,1.833333,42998.166667,-19696.116618
2,3,32,11.0,0.459589,23000.0,5.0,0.0,0,0.0,1.0,...,-0.033089,-0.022835,-0.055063,-0.071585,-0.006817,2282,31110.902717,0.916667,22999.083333,-8110.902717
3,4,27,12.0,0.362536,17000.0,2.0,0.0,0,1.0,0.0,...,-0.033572,-0.023153,-0.055825,-0.07255,-0.00657,2361,31319.365523,1.0,16999.0,-14319.365523
4,5,45,17.0,0.421385,25000.0,1.0,0.0,0,0.0,1.0,...,-0.065156,-0.050721,-0.144774,-0.287428,0.751925,5155,33759.671775,1.7,24998.3,-8759.671775


In [38]:
train.to_csv('../data/normally_encoding_train.csv', index=False)
test.to_csv('../data/normally_encoding_test.csv', index=False)