In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import manifold, decomposition, linear_model, ensemble, neighbors, cross_validation

In [2]:
train = pd.read_csv("../credit_train.csv", sep = ";", encoding='cp1251')
test = pd.read_csv("../credit_test.csv", sep = ";", encoding='cp1251')

In [3]:
train.head(2)

Unnamed: 0,client_id,gender,age,marital_status,job_position,credit_sum,credit_month,tariff_id,score_shk,education,living_region,monthly_income,credit_count,overdue_credit_count,open_account_flg
0,1,M,48,MAR,UMN,5999800,10,1.6,770249,GRD,КРАСНОДАРСКИЙ КРАЙ,30000.0,1.0,1.0,0
1,2,F,28,MAR,UMN,1088900,6,1.1,248514,GRD,МОСКВА,43000.0,2.0,0.0,0


In [4]:
y = train['open_account_flg']

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170746 entries, 0 to 170745
Data columns (total 15 columns):
client_id               170746 non-null int64
gender                  170746 non-null object
age                     170746 non-null int64
marital_status          170746 non-null object
job_position            170746 non-null object
credit_sum              170746 non-null object
credit_month            170746 non-null int64
tariff_id               170746 non-null float64
score_shk               170746 non-null object
education               170746 non-null object
living_region           170554 non-null object
monthly_income          170745 non-null float64
credit_count            161516 non-null float64
overdue_credit_count    161516 non-null float64
open_account_flg        170746 non-null int64
dtypes: float64(4), int64(4), object(7)
memory usage: 19.5+ MB


In [6]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91940 entries, 0 to 91939
Data columns (total 14 columns):
client_id               91940 non-null int64
gender                  91940 non-null object
age                     91940 non-null int64
marital_status          91940 non-null object
job_position            91940 non-null object
credit_sum              91940 non-null object
credit_month            91940 non-null int64
tariff_id               91940 non-null float64
score_shk               91940 non-null object
education               91940 non-null object
living_region           91824 non-null object
monthly_income          91940 non-null int64
credit_count            87237 non-null float64
overdue_credit_count    87237 non-null float64
dtypes: float64(3), int64(4), object(7)
memory usage: 9.8+ MB


I работа с пропусками в данных

In [7]:
train[train.monthly_income.isnull()]

Unnamed: 0,client_id,gender,age,marital_status,job_position,credit_sum,credit_month,tariff_id,score_shk,education,living_region,monthly_income,credit_count,overdue_credit_count,open_account_flg
19639,19640,F,33,MAR,BIS,1300900,10,1.6,535834,SCH,ПРИМОРСКИЙ КРАЙ,,2.0,0.0,0


In [8]:
train.monthly_income.fillna(value=train.monthly_income.mean(), inplace=True)

In [9]:
city_train = pd.read_csv("../datasets/credit_train_clean.csv")
city_test = pd.read_csv("../datasets/credit_test_clean.csv")

train.living_region = city_train.living_region
test.living_region = city_test.living_region

In [11]:
train.credit_count.fillna(value = train.credit_count.mean(), inplace=True)
train.overdue_credit_count.fillna(value = train.overdue_credit_count.mean(), inplace=True)

In [12]:
test.credit_count.fillna(value = train.credit_count.mean(), inplace=True)
test.overdue_credit_count.fillna(value = train.overdue_credit_count.mean(), inplace=True)

In [13]:
train.gender = train.gender.map({"M":1, "F":0})
test.gender = test.gender.map({"M":1, "F":0})
train.rename(columns={"open_account_flg":"target"}, inplace=True)

II кодировка категориальных признаков

In [15]:
train_col = train.columns.drop("target")
dummy_df = pd.concat([train[train_col], test], axis=0)
dummy_df = pd.get_dummies(dummy_df, columns=["education","job_position",\
                                             "marital_status", "living_region"], drop_first=True)
train = dummy_df[:train.shape[0]]
test = dummy_df[train.shape[0]:]

In [16]:
train.columns

Index(['client_id', 'gender', 'age', 'credit_sum', 'credit_month', 'tariff_id',
       'score_shk', 'monthly_income', 'credit_count', 'overdue_credit_count',
       ...
       'living_region_ХАКАСИЯ', 'living_region_ХАНТЫ-МАНСИЙСКИЙ',
       'living_region_ЧЕЛЯБИНСКАЯ', 'living_region_ЧЕЧЕНСКАЯ',
       'living_region_ЧИТИНСКАЯ', 'living_region_ЧУВАШСКАЯ',
       'living_region_ЧУКОТСКИЙ', 'living_region_ЭВЕНКИЙСКИЙ',
       'living_region_ЯМАЛО-НЕНЕЦКИЙ', 'living_region_ЯРОСЛАВСКАЯ'],
      dtype='object', length=117)

In [None]:
train['monthly_income'] = train['monthly_income'].astype(float)
test['monthly_income'] = test['monthly_income'].astype(float)

V Генерация данных о зарплате и выплатам по кредиту

In [None]:
tmp = train['credit_sum'].apply(lambda x: x.replace(",","."))
train['credit_sum'] = tmp
tmp = test['credit_sum'].apply(lambda x: x.replace(",","."))
test['credit_sum'] = tmp

VI анализ score_shk

In [None]:
tmp = train['score_shk'].apply(lambda x: x.replace(",","."))
train.score_shk = tmp

tmp = test['score_shk'].apply(lambda x: x.replace(",","."))
test.score_shk = tmp

train['score_shk'] = train['score_shk'].astype(float)
test['score_shk'] = test['score_shk'].astype(float)

In [None]:
train['target'] = y

In [28]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 170746 entries, 0 to 170745
Columns: 353 entries, client_id to target
dtypes: float64(5), int64(5), object(1), uint8(342)
memory usage: 71.3+ MB


In [29]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 91940 entries, 0 to 91939
Columns: 352 entries, client_id to living_region_ЯРОСЛАВСКАЯ ОБЛАСТЬ
dtypes: float64(5), int64(4), object(1), uint8(342)
memory usage: 37.7+ MB


In [30]:
train.to_csv("../datasets/train_simple_without_oversampling.csv", columns=train.columns, index=False)
test.to_csv("../datasets/test_simple_without_oversampling.csv", columns=test.columns, index=False)