In [1]:
import pandas as pd

In [2]:
data_train = pd.read_csv('../data/csv/credit_train.csv')
data_test = pd.read_csv('../data/csv/credit_test.csv')

In [3]:
from sklearn.preprocessing import OneHotEncoder

def preproc(data_init):
    preproc_data = data_init.copy()
    
    preproc_data = preproc_data.drop(['Months since last delinquent', 'Customer ID', 'Purpose'], axis=1)
    
    preproc_data = preproc_data.dropna()
    
    #cтатус займа в бинарные значения
    preproc_data.loc[:, 'Loan Status'] = \
    preproc_data.loc[:, 'Loan Status'].replace({'Fully Paid': 1, 'Charged Off': 0})
    
    #срок в бинарные значения
    preproc_data.loc[:, 'Term'] = \
    preproc_data.loc[:, 'Term'].replace({'Long Term': 1, 'Short Term': 0})
    
    #стаж на текущем месте работы в числовые значения
    preproc_data.loc[:, 'Years in current job'] = \
    preproc_data.loc[:, 'Years in current job'].replace({
        '10+ years': 10,
        '2 years': 2,
        '3 years': 3,
        '< 1 year': 0,
        '5 years': 5,
        '1 year': 1,
        '4 years': 4,
        '6 years': 6,
        '7 years': 7,
        '8 years': 8,
        '9 years': 9,
    })
    
    preproc_data = pd.get_dummies(preproc_data, columns = ["Home Ownership"] , prefix_sep = "_", drop_first = True)
    
    preproc_data.set_index('Loan ID', inplace=True)
    
    return preproc_data

In [4]:
data_train_preproc = data_train.pipe(preproc)

In [5]:
data_train_preproc.head()

Unnamed: 0_level_0,Loan Status,Current Loan Amount,Term,Credit Score,Annual Income,Years in current job,Monthly Debt,Years of Credit History,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens,Home Ownership_Home Mortgage,Home Ownership_Own Home,Home Ownership_Rent
Loan ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
14dd8831-6af5-400b-83ec-68e61888a048,1,445412,0,709.0,1167493.0,8,5214.74,17.2,6,1,228190,416746.0,1.0,0.0,1,0,0
4eed4e6a-aa2f-4c91-8651-ce984ee8fb26,1,99999999,0,741.0,2231892.0,8,29200.53,14.9,18,1,297996,750090.0,0.0,0.0,0,1,0
77598f7b-32e7-4e3b-a6e5-06ba0d98fe8a,1,347666,1,721.0,806949.0,3,8741.9,12.0,9,0,256329,386958.0,0.0,0.0,0,1,0
89d8cb0c-e5c2-4f54-b056-48a645c543dd,0,206602,0,7290.0,896857.0,10,16367.74,17.3,6,0,215308,272448.0,0.0,0.0,1,0,0
273581de-85d8-4332-81a5-19b04ce68666,1,217646,0,730.0,1184194.0,0,10855.08,19.6,13,1,122170,272052.0,1.0,0.0,1,0,0


In [6]:
X, y = data_train_preproc.iloc[:, :-1].values, data_train_preproc.iloc[:, -1].values

In [7]:
from scipy.stats import randint as randint
from scipy.stats import uniform
from sklearn.tree import DecisionTreeClassifier

try:
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import RandomizedSearchCV
    from sklearn.model_selection import StratifiedKFold
except ImportError:
    from sklearn.cross_validation import GridSearchCV
    from sklearn.cross_validation import RandomizedSearchCV
    from sklearn.cross_validation import StratifiedKFold


RND_SEED = 123

In [8]:
# Определим пространство поиска

param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': randint(2, 15),
    'min_samples_leaf': randint(5, 10),
    'class_weight': [None, 'balanced']}

cv = StratifiedKFold(n_splits=5, random_state=123, shuffle=True)

model = DecisionTreeClassifier(random_state=123)
random_search = RandomizedSearchCV(model, param_distributions=param_grid, n_iter=200, n_jobs=-1,
                                   cv=cv, scoring='roc_auc', random_state=123)

random_search.fit(X, y)

RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=123, shuffle=True),
                   estimator=DecisionTreeClassifier(random_state=123),
                   n_iter=200, n_jobs=-1,
                   param_distributions={'class_weight': [None, 'balanced'],
                                        'criterion': ['gini', 'entropy'],
                                        'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000025D5926F880>,
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000025D5926FA60>},
                   random_state=123, scoring='roc_auc')

In [9]:
random_search.best_params_

{'class_weight': None,
 'criterion': 'entropy',
 'max_depth': 6,
 'min_samples_leaf': 8}

In [10]:
model = random_search.best_estimator_
data = model.feature_importances_

In [11]:
pd.Series(index=data_train_preproc.columns[:-1], data=data)

Loan Status                     0.000000
Current Loan Amount             0.002154
Term                            0.000000
Credit Score                    0.000082
Annual Income                   0.000777
Years in current job            0.000489
Monthly Debt                    0.000294
Years of Credit History         0.000000
Number of Open Accounts         0.000000
Number of Credit Problems       0.000000
Current Credit Balance          0.000419
Maximum Open Credit             0.000146
Bankruptcies                    0.000000
Tax Liens                       0.000000
Home Ownership_Home Mortgage    0.659249
Home Ownership_Own Home         0.336391
dtype: float64

In [12]:
from sklearn.model_selection import train_test_split

y = data_train_preproc['Loan Status']
X = data_train_preproc.drop(['Loan Status'], axis=1)

In [13]:
tree = DecisionTreeClassifier(criterion='entropy', max_depth=6, min_samples_leaf=8)
tree.fit(X, y)

DecisionTreeClassifier(criterion='entropy', max_depth=6, min_samples_leaf=8)

In [14]:
from sklearn.metrics import accuracy_score

y_true = y
y_pred = tree.predict(X)

In [15]:
accuracy_score(y_true, y_pred)

0.8457377282551022