In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import manifold, decomposition, linear_model, ensemble, neighbors, cross_validation

In [2]:
# Read both raw dumps with identical parser settings:
# ';'-separated fields, cp1251-encoded text. Train is read first, then test.
train, test = (
    pd.read_csv(csv_path, sep=";", encoding='cp1251')
    for csv_path in ("../credit_train.csv", "../credit_test.csv")
)

In [3]:
# Peek at the first two rows of the training frame to see the raw column layout.
train.head(2)

Unnamed: 0,client_id,gender,age,marital_status,job_position,credit_sum,credit_month,tariff_id,score_shk,education,living_region,monthly_income,credit_count,overdue_credit_count,open_account_flg
0,1,M,48,MAR,UMN,5999800,10,1.6,770249,GRD,КРАСНОДАРСКИЙ КРАЙ,30000.0,1.0,1.0,0
1,2,F,28,MAR,UMN,1088900,6,1.1,248514,GRD,МОСКВА,43000.0,2.0,0.0,0


In [4]:
# Keep the label column (open_account_flg) available under the name `y`.
y = train['open_account_flg']

In [5]:
# Summarise dtype and non-null count for every column of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170746 entries, 0 to 170745
Data columns (total 15 columns):
client_id               170746 non-null int64
gender                  170746 non-null object
age                     170746 non-null int64
marital_status          170746 non-null object
job_position            170746 non-null object
credit_sum              170746 non-null object
credit_month            170746 non-null int64
tariff_id               170746 non-null float64
score_shk               170746 non-null object
education               170746 non-null object
living_region           170554 non-null object
monthly_income          170745 non-null float64
credit_count            161516 non-null float64
overdue_credit_count    161516 non-null float64
open_account_flg        170746 non-null int64
dtypes: float64(4), int64(4), object(7)
memory usage: 19.5+ MB


In [6]:
# Summarise dtype and non-null count for every column of the test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91940 entries, 0 to 91939
Data columns (total 14 columns):
client_id               91940 non-null int64
gender                  91940 non-null object
age                     91940 non-null int64
marital_status          91940 non-null object
job_position            91940 non-null object
credit_sum              91940 non-null object
credit_month            91940 non-null int64
tariff_id               91940 non-null float64
score_shk               91940 non-null object
education               91940 non-null object
living_region           91824 non-null object
monthly_income          91940 non-null int64
credit_count            87237 non-null float64
overdue_credit_count    87237 non-null float64
dtypes: float64(3), int64(4), object(7)
memory usage: 9.8+ MB


I работа с пропусками в данных

In [7]:
# Show the training rows whose monthly_income is missing.
train.loc[train["monthly_income"].isnull()]

Unnamed: 0,client_id,gender,age,marital_status,job_position,credit_sum,credit_month,tariff_id,score_shk,education,living_region,monthly_income,credit_count,overdue_credit_count,open_account_flg
19639,19640,F,33,MAR,BIS,1300900,10,1.6,535834,SCH,ПРИМОРСКИЙ КРАЙ,,2.0,0.0,0


In [8]:
# Impute missing monthly_income values in train with the column mean.
# The filled column is assigned back to the frame instead of calling
# fillna(inplace=True) on the `train.monthly_income` accessor: that chained
# in-place call is deprecated in pandas 2.x and leaves `train` unchanged once
# Copy-on-Write is enabled.
train["monthly_income"] = train["monthly_income"].fillna(train["monthly_income"].mean())

In [9]:
# Take living_region from the "*_clean" copies of both datasets and use it
# in place of the raw column.
# NOTE(review): column assignment aligns on the row index, so this presumably
# relies on the clean files holding the same rows in the same order as the
# raw ones - confirm.
city_train, city_test = (
    pd.read_csv(clean_path)
    for clean_path in (
        "../datasets/credit_train_clean.csv",
        "../datasets/credit_test_clean.csv",
    )
)

train["living_region"] = city_train["living_region"]
test["living_region"] = city_test["living_region"]

In [10]:
# Impute missing credit-history counts in train with each column's own mean.
# Results are assigned back to the frame: fillna(inplace=True) on an attribute
# accessor such as `train.credit_count` is a chained in-place call, which is
# deprecated in pandas 2.x and has no effect on `train` under Copy-on-Write.
train["credit_count"] = train["credit_count"].fillna(train["credit_count"].mean())
train["overdue_credit_count"] = train["overdue_credit_count"].fillna(
    train["overdue_credit_count"].mean()
)

In [11]:
# Impute missing credit-history counts in test using the *train* means, so the
# test frame is filled with the same statistics as the training frame.
# Results are assigned back to the frame: fillna(inplace=True) on an attribute
# accessor such as `test.credit_count` is a chained in-place call, which is
# deprecated in pandas 2.x and has no effect on `test` under Copy-on-Write.
test["credit_count"] = test["credit_count"].fillna(train["credit_count"].mean())
test["overdue_credit_count"] = test["overdue_credit_count"].fillna(
    train["overdue_credit_count"].mean()
)

In [12]:
# Binary-encode gender in both frames: "M" -> 1, "F" -> 0.
train["gender"] = train["gender"].map({"M": 1, "F": 0})
test["gender"] = test["gender"].map({"M": 1, "F": 0})

In [13]:
# Rename the label column open_account_flg to `target`; `train` is mutated in place.
train.rename(columns={"open_account_flg":"target"}, inplace=True)

II кодировка категориальных признаков

In [14]:
# Mean-target encoding: each category of job_position, marital_status and
# education is replaced by the average `target` observed for it in train.
# Every lookup is nested as {"target": {category: mean}}; the train-derived
# lookup is applied to the test frame as well.
job_position_dict = train.groupby("job_position")[["target"]].mean().to_dict()
marital_status_dict = train.groupby("marital_status")[["target"]].mean().to_dict()
edu_dict = train.groupby("education")[["target"]].mean().to_dict()

train["job_position"] = train["job_position"].map(job_position_dict["target"])
test["job_position"] = test["job_position"].map(job_position_dict["target"])

train["marital_status"] = train["marital_status"].map(marital_status_dict["target"])
test["marital_status"] = test["marital_status"].map(marital_status_dict["target"])

train["education"] = train["education"].map(edu_dict["target"])
test["education"] = test["education"].map(edu_dict["target"])

In [15]:
# Store monthly_income as float64 in both frames so the column has the same dtype in train and test.
train["monthly_income"] = train.monthly_income.astype("float64")
test["monthly_income"] = test.monthly_income.astype("float64")

In [16]:
# Replace living_region by the mean monthly_income of that region, where the
# mean is taken over the train and test rows stacked together.
income_region = pd.concat(
    [
        train[["monthly_income", "living_region"]],
        test[["monthly_income", "living_region"]],
    ],
    axis=0,
)

# Nested lookup: {"monthly_income": {region: mean income}}.
income_dict = income_region.groupby("living_region")[["monthly_income"]].mean().to_dict()
train["living_region"] = train["living_region"].map(income_dict["monthly_income"])
test["living_region"] = test["living_region"].map(income_dict["monthly_income"])

V Генерация данных о зарплате и выплатах по кредиту

In [17]:
# credit_sum holds text values; swap any "," for "." in each value and parse
# the result as float, for both frames.
train['credit_sum'] = train['credit_sum'].map(lambda raw: raw.replace(",", ".")).astype(float)
test['credit_sum'] = test['credit_sum'].map(lambda raw: raw.replace(",", ".")).astype(float)

In [18]:
# Inspect the first rows of train after the encoding and numeric-conversion steps.
train.head()

Unnamed: 0,client_id,gender,age,marital_status,job_position,credit_sum,credit_month,tariff_id,score_shk,education,living_region,monthly_income,credit_count,overdue_credit_count,target
0,1,1,48,0.152923,0.188978,59998.0,10,1.6,770249,0.139543,37297.641479,30000.0,1.0,1.0,0
1,2,0,28,0.152923,0.188978,10889.0,6,1.1,248514,0.139543,57219.334894,43000.0,2.0,0.0,0
2,3,1,32,0.152923,0.166342,10728.0,12,1.1,459589,0.20512,30885.555556,23000.0,5.0,0.0,0
3,4,0,27,0.182086,0.166342,12009.09,12,1.1,362536,0.139543,31275.988932,17000.0,2.0,0.0,0
4,5,1,45,0.152923,0.166342,16908.89,10,1.1,421385,0.20512,33936.121586,25000.0,1.0,0.0,0


credit_pay - ежемесячный платёж по кредиту (сумма кредита, делённая на срок в месяцах)

money_for_life - разность между ежемесячным доходом и ежемесячным платежом по кредиту

dif_city - разность между ежемесячным доходом и средним доходом по региону

In [19]:
# Derived affordability features, computed the same way for train and test:
#   credit_pay     - monthly instalment: credit_sum spread evenly over credit_month
#   money_for_life - monthly_income left after paying credit_pay
#   dif_city       - monthly_income minus living_region, which at this point
#                    holds the mean monthly income of the client's region
train["credit_pay"] = train["credit_sum"] / train["credit_month"]
test["credit_pay"] = test["credit_sum"] / test["credit_month"]

train["money_for_life"] = train["monthly_income"] - train["credit_pay"]
test["money_for_life"] = test["monthly_income"] - test["credit_pay"]

train["dif_city"] = train["monthly_income"] - train["living_region"]
# This must be a subtraction, exactly as for train. A chained assignment
# (`a = b = c`) here would overwrite test["monthly_income"] with the regional
# mean and turn dif_city into a plain copy of living_region.
test["dif_city"] = test["monthly_income"] - test["living_region"]

VI анализ score_shk

In [20]:
# score_shk holds text values; swap any "," for "." in each value and parse
# the result as float, for both frames.
train['score_shk'] = train['score_shk'].map(lambda raw: raw.replace(",", ".")).astype(float)
test['score_shk'] = test['score_shk'].map(lambda raw: raw.replace(",", ".")).astype(float)

In [21]:
# Write the labels held in `y` into the `target` column of train.
# NOTE(review): `target` already exists via the earlier rename of
# open_account_flg, so this looks like an overwrite with the same values - confirm.
train['target'] = y

In [22]:
# Re-check dtypes and non-null counts of train after feature engineering.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170746 entries, 0 to 170745
Data columns (total 18 columns):
client_id               170746 non-null int64
gender                  170746 non-null int64
age                     170746 non-null int64
marital_status          170746 non-null float64
job_position            170746 non-null float64
credit_sum              170746 non-null float64
credit_month            170746 non-null int64
tariff_id               170746 non-null float64
score_shk               170746 non-null float64
education               170746 non-null float64
living_region           170746 non-null float64
monthly_income          170746 non-null float64
credit_count            170746 non-null float64
overdue_credit_count    170746 non-null float64
target                  170746 non-null int64
credit_pay              170746 non-null float64
money_for_life          170746 non-null float64
dif_city                170746 non-null float64
dtypes: float64(13), int64(5)
memor

In [23]:
# Re-check dtypes and non-null counts of test after feature engineering.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91940 entries, 0 to 91939
Data columns (total 17 columns):
client_id               91940 non-null int64
gender                  91940 non-null int64
age                     91940 non-null int64
marital_status          91940 non-null float64
job_position            91940 non-null float64
credit_sum              91940 non-null float64
credit_month            91940 non-null int64
tariff_id               91940 non-null float64
score_shk               91940 non-null float64
education               91940 non-null float64
living_region           91940 non-null float64
monthly_income          91940 non-null float64
credit_count            91940 non-null float64
overdue_credit_count    91940 non-null float64
credit_pay              91940 non-null float64
money_for_life          91940 non-null float64
dif_city                91940 non-null float64
dtypes: float64(13), int64(4)
memory usage: 11.9 MB


In [24]:
# Persist the engineered frames: every column is written in its current order
# (the to_csv default), without the row index.
train.to_csv("../datasets/train_credit_pay.csv", index=False)
test.to_csv("../datasets/test_credit_pay.csv", index=False)