In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import manifold, decomposition, linear_model, ensemble, neighbors, cross_validation

In [2]:
# Read the raw train and test tables; both files are ';'-separated and cp1251-encoded.
train, test = (
    pd.read_csv(csv_path, sep=";", encoding='cp1251')
    for csv_path in ("../credit_train.csv", "../credit_test.csv")
)

In [3]:
# Preview the first two rows of the training frame.
train.head(2)

Unnamed: 0,client_id,gender,age,marital_status,job_position,credit_sum,credit_month,tariff_id,score_shk,education,living_region,monthly_income,credit_count,overdue_credit_count,open_account_flg
0,1,M,48,MAR,UMN,5999800,10,1.6,770249,GRD,КРАСНОДАРСКИЙ КРАЙ,30000.0,1.0,1.0,0
1,2,F,28,MAR,UMN,1088900,6,1.1,248514,GRD,МОСКВА,43000.0,2.0,0.0,0


In [4]:
# Keep the target column (open_account_flg) as a separate Series.
y = train['open_account_flg']

In [5]:
# Summarize dtypes and non-null counts of the raw training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170746 entries, 0 to 170745
Data columns (total 15 columns):
client_id               170746 non-null int64
gender                  170746 non-null object
age                     170746 non-null int64
marital_status          170746 non-null object
job_position            170746 non-null object
credit_sum              170746 non-null object
credit_month            170746 non-null int64
tariff_id               170746 non-null float64
score_shk               170746 non-null object
education               170746 non-null object
living_region           170554 non-null object
monthly_income          170745 non-null float64
credit_count            161516 non-null float64
overdue_credit_count    161516 non-null float64
open_account_flg        170746 non-null int64
dtypes: float64(4), int64(4), object(7)
memory usage: 19.5+ MB


In [6]:
# Summarize dtypes and non-null counts of the raw test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91940 entries, 0 to 91939
Data columns (total 14 columns):
client_id               91940 non-null int64
gender                  91940 non-null object
age                     91940 non-null int64
marital_status          91940 non-null object
job_position            91940 non-null object
credit_sum              91940 non-null object
credit_month            91940 non-null int64
tariff_id               91940 non-null float64
score_shk               91940 non-null object
education               91940 non-null object
living_region           91824 non-null object
monthly_income          91940 non-null int64
credit_count            87237 non-null float64
overdue_credit_count    87237 non-null float64
dtypes: float64(3), int64(4), object(7)
memory usage: 9.8+ MB


I работа с пропусками в данных

In [7]:
# Display the training rows whose monthly_income is missing.
train.loc[train["monthly_income"].isnull()]

Unnamed: 0,client_id,gender,age,marital_status,job_position,credit_sum,credit_month,tariff_id,score_shk,education,living_region,monthly_income,credit_count,overdue_credit_count,open_account_flg
19639,19640,F,33,MAR,BIS,1300900,10,1.6,535834,SCH,ПРИМОРСКИЙ КРАЙ,,2.0,0.0,0


In [8]:
# Impute missing monthly_income values with the training-set column mean.
# The filled Series is assigned back to the column instead of calling
# fillna(inplace=True) on `train.monthly_income`: that form is chained
# assignment on an intermediate Series and does not update `train` when
# pandas Copy-on-Write is active.
train["monthly_income"] = train["monthly_income"].fillna(train["monthly_income"].mean())

Возьмём данные о регионах проживания (living_region) из ../datasets/credit_{train,test}_clean.csv

In [9]:
# Load the pre-cleaned copies of the train and test tables.
city_train, city_test = (
    pd.read_csv(clean_path)
    for clean_path in ("../datasets/credit_train_clean.csv", "../datasets/credit_test_clean.csv")
)

In [10]:
# Overwrite living_region in both frames with the values from the cleaned tables.
# NOTE(review): the assignment aligns on the row index, so this assumes the
# cleaned files keep the same rows in the same order as the raw ones — confirm.
train["living_region"] = city_train["living_region"]
test["living_region"] = city_test["living_region"]

In [11]:
# Fill missing credit_count / overdue_credit_count in the training set with
# the respective training-set means.
# The result is assigned back to the column; fillna(inplace=True) on an
# attribute-accessed column is chained assignment and is not guaranteed to
# modify `train` (it is a no-op under pandas Copy-on-Write).
train["credit_count"] = train["credit_count"].fillna(train["credit_count"].mean())
train["overdue_credit_count"] = train["overdue_credit_count"].fillna(
    train["overdue_credit_count"].mean()
)

In [12]:
# Fill missing credit_count / overdue_credit_count in the test set using the
# *training-set* means, so test rows are imputed with the same values as train.
# The result is assigned back to the column; fillna(inplace=True) on an
# attribute-accessed column is chained assignment and is not guaranteed to
# modify `test` (it is a no-op under pandas Copy-on-Write).
test["credit_count"] = test["credit_count"].fillna(train["credit_count"].mean())
test["overdue_credit_count"] = test["overdue_credit_count"].fillna(
    train["overdue_credit_count"].mean()
)

In [13]:
# Encode gender as a number: "M" -> 1, "F" -> 0; any other value becomes NaN.
# NOTE: not idempotent — re-running this cell on already-encoded data yields NaN.
train["gender"] = train["gender"].map({"M":1, "F":0})
test["gender"] = test["gender"].map({"M":1, "F":0})

In [14]:
# Give the label column the shorter name "target" for the steps that follow.
train = train.rename(columns={"open_account_flg":"target"})

II выделим группы людей по наличию высшего образования, работе, пенсии и семейному положению

In [15]:
### by education
# True for every education code other than "SCH" and "UGR"
train["high_edu"] = ~train["education"].isin(["SCH", "UGR"])

### by job position
# 1. non-working citizens
train["not_work"] = train["job_position"].isin(["HSK", "INV", "NOR", "PNS"])

# 2. ordinary workers
train["usual_workers"] = train["job_position"].isin(["SPC", "WOI", "WRK", "ATP", "WRP"])

# 3. managers (this group also contains the BIS and INP codes used for group 4)
train["head_workers"] = train["job_position"].isin(["DIR", "UMN", "BIS", "INP"])

# 4. own a business
train["have_business"] = train["job_position"].isin(["BIS", "INP"])

### by marital status
train["single"] = train["marital_status"].isin(["UNM", "DIV", "WID"])

In [16]:
### по образованию
test["high_edu"] = ((test["education"] != "SCH") & (test["education"] != "UGR"))

### по работе
# 1. не работающие граждание
test["not_work"] = ((test.job_position == "HSK") | (test.job_position == "INV") 
                     | (test.job_position == "NOR") | (test.job_position == "PNS"))

# 2. обычные работники
test["usual_workers"] = ((test.job_position == "SPC") | (test.job_position == "WOI") 
                     | (test.job_position == "WRK") | (test.job_position == "ATP") 
                     | (test.job_position == "WRP"))

# 3. начальники
test["head_workers"] = ((test.job_position == "DIR") | (test.job_position == "UMN")
                         | (test.job_position == "BIS") | (test.job_position == "INP"))

# 4. имеют собственный бизнес
test["have_business"] = ((test.job_position == "BIS") | (test.job_position == "INP"))

### по браку
test["single"] = ((test.marital_status == "UNM") | (test.marital_status == "DIV")
                   | (test.marital_status == "WID"))

III кодировка категориальных признаков, кроме living region

In [18]:
# Mean-target encoding: for each categorical column, compute the mean of
# `target` per category on the training set and map it onto both frames.
# Each *_dict has the shape {"target": {category: mean_target}}.
# NOTE(review): the means are computed from the same training rows they are
# mapped onto, and a category that occurs only in `test` maps to NaN.

# job_position
job_position_dict = train.groupby("job_position")[["target"]].mean().to_dict()
train["target_job_position"] = train["job_position"].map(job_position_dict["target"])
test["target_job_position"] = test["job_position"].map(job_position_dict["target"])

# marital_status
marital_status_dict = train.groupby("marital_status")[["target"]].mean().to_dict()
train["target_marital_status"] = train["marital_status"].map(marital_status_dict["target"])
test["target_marital_status"] = test["marital_status"].map(marital_status_dict["target"])

# education
edu_dict = train.groupby("education")[["target"]].mean().to_dict()
train["target_education"] = train["education"].map(edu_dict["target"])
test["target_education"] = test["education"].map(edu_dict["target"])

# tariff_id
tariff_dict = train.groupby("tariff_id")[["target"]].mean().to_dict()
train["target_tariff_id"] = train["tariff_id"].map(tariff_dict["target"])
test["target_tariff_id"] = test["tariff_id"].map(tariff_dict["target"])

In [21]:
# One-hot encode education, job_position and marital_status on the stacked
# train+test frame so that both parts end up with the same dummy columns,
# then split the result back into train and test by row position.
train_col = train.columns.drop("target")
# Remember the split point before `train` is rebound below.
n_train = train.shape[0]
dummy_df = pd.concat([train[train_col], test], axis=0)
dummy_df = pd.get_dummies(dummy_df, columns=["education","job_position", "marital_status"])
# Take explicit copies: later cells add and overwrite columns on `train` and
# `test`, and writing into a slice of `dummy_df` triggers SettingWithCopyWarning
# and leaves it ambiguous which object is actually modified.
train = dummy_df.iloc[:n_train].copy()
test = dummy_df.iloc[n_train:].copy()

In [22]:
# List the training-frame columns after one-hot encoding.
train.columns

Index(['client_id', 'gender', 'age', 'credit_sum', 'credit_month', 'tariff_id',
       'score_shk', 'living_region', 'monthly_income', 'credit_count',
       'overdue_credit_count', 'high_edu', 'not_work', 'usual_workers',
       'head_workers', 'have_business', 'single', 'target_job_position',
       'target_marital_status', 'target_education', 'target_tariff_id',
       'education_ACD', 'education_GRD', 'education_PGR', 'education_SCH',
       'education_UGR', 'job_position_ATP', 'job_position_BIS',
       'job_position_BIU', 'job_position_DIR', 'job_position_HSK',
       'job_position_INP', 'job_position_INV', 'job_position_NOR',
       'job_position_ONB', 'job_position_PNA', 'job_position_PNI',
       'job_position_PNS', 'job_position_PNV', 'job_position_SPC',
       'job_position_UMN', 'job_position_WOI', 'job_position_WRK',
       'job_position_WRP', 'marital_status_CIV', 'marital_status_DIV',
       'marital_status_MAR', 'marital_status_UNM', 'marital_status_WID'],
      dtype='

IV кодировка региона через среднюю зарплату

In [None]:
# Replace living_region with the mean monthly_income of that region.
# astype() returns a new Series, so the result has to be assigned back;
# the original bare `train['monthly_income'].astype(float)` calls were no-ops.
train['monthly_income'] = train['monthly_income'].astype(float)
test['monthly_income'] = test['monthly_income'].astype(float)

# The regional mean is computed over train and test rows together.
income_region = pd.concat(
    [train[["monthly_income", "living_region"]], test[["monthly_income", "living_region"]]],
    axis=0,
)

# income_dict has the shape {"monthly_income": {region: mean_income}}.
income_dict = pd.DataFrame(income_region.groupby("living_region").monthly_income.mean()).to_dict()
# NOTE: not idempotent — after this cell living_region holds numbers, so
# re-running it would map those numbers through the region dict and yield NaN.
train["living_region"] = train["living_region"].map(income_dict["monthly_income"])
test["living_region"] = test["living_region"].map(income_dict["monthly_income"])

V Генерация данных о зарплате и выплатах по кредиту

In [None]:
# credit_sum is stored as text with a decimal comma; swap the comma for a dot
# and convert to float. The vectorized .str accessor replaces the per-row
# Python lambda and the throwaway `tmp` variable, and it passes NaN through
# instead of raising AttributeError on a missing value.
# "," has no special meaning in a regular expression, so the replacement is
# the same whether pandas treats the pattern as a regex or as a literal.
train['credit_sum'] = train['credit_sum'].str.replace(",", ".").astype(float)
test['credit_sum'] = test['credit_sum'].str.replace(",", ".").astype(float)

credit_pay - ежемесячный расход на кредит

money_for_life - разность между зп и платой по кредиту

dif_city - разница между зп и средней зп по региону

In [None]:
# Derived money features, built the same way for train and test:
#   credit_pay     - monthly spending on the credit (credit_sum / credit_month)
#   money_for_life - monthly_income minus credit_pay
#   dif_city       - monthly_income minus the value stored in living_region
train["credit_pay"] = train["credit_sum"].div(train["credit_month"])
train["money_for_life"] = train["monthly_income"].sub(train["credit_pay"])
train["dif_city"] = train["monthly_income"].sub(train["living_region"])

test["credit_pay"] = test["credit_sum"].div(test["credit_month"])
test["money_for_life"] = test["monthly_income"].sub(test["credit_pay"])
test["dif_city"] = test["monthly_income"].sub(test["living_region"])

VI анализ score_shk

In [None]:
# score_shk is stored as text with a decimal comma; swap the comma for a dot
# and convert to float. The vectorized .str accessor replaces the per-row
# Python lambda and the throwaway `tmp` variable, and it passes NaN through
# instead of raising AttributeError on a missing value.
# "," has no special meaning in a regular expression, so the replacement is
# the same whether pandas treats the pattern as a regex or as a literal.
train['score_shk'] = train['score_shk'].str.replace(",", ".").astype(float)
test['score_shk'] = test['score_shk'].str.replace(",", ".").astype(float)

In [None]:
# Put the label back onto the training frame under the name "target".
# `y` is the open_account_flg Series captured earlier; the assignment aligns
# it to `train` by row index.
train['target'] = y

In [28]:
# Check dtypes and non-null counts of the engineered training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 170746 entries, 0 to 170745
Data columns (total 53 columns):
client_id                170746 non-null int64
gender                   170746 non-null int64
age                      170746 non-null int64
credit_sum               170746 non-null float64
credit_month             170746 non-null int64
tariff_id                170746 non-null float64
score_shk                170746 non-null float64
living_region            170746 non-null float64
monthly_income           170746 non-null float64
credit_count             170746 non-null float64
overdue_credit_count     170746 non-null float64
high_edu                 170746 non-null bool
not_work                 170746 non-null bool
usual_workers            170746 non-null bool
head_workers             170746 non-null bool
have_business            170746 non-null bool
single                   170746 non-null bool
target_job_position      170746 non-null float64
target_marital_status    170746 n

In [29]:
# Check dtypes and non-null counts of the engineered test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 91940 entries, 0 to 91939
Data columns (total 52 columns):
client_id                91940 non-null int64
gender                   91940 non-null int64
age                      91940 non-null int64
credit_sum               91940 non-null float64
credit_month             91940 non-null int64
tariff_id                91940 non-null float64
score_shk                91940 non-null float64
living_region            91940 non-null float64
monthly_income           91940 non-null float64
credit_count             91940 non-null float64
overdue_credit_count     91940 non-null float64
high_edu                 91940 non-null bool
not_work                 91940 non-null bool
usual_workers            91940 non-null bool
head_workers             91940 non-null bool
have_business            91940 non-null bool
single                   91940 non-null bool
target_job_position      91940 non-null float64
target_marital_status    91940 non-null float64
targe

In [30]:
# Persist the engineered frames as CSV. All columns are written in their
# current order (the to_csv default) and the row index is omitted.
train.to_csv("../datasets/train_base.csv", index=False)
test.to_csv("../datasets/test_base.csv", index=False)