In [1]:
import pandas as pd

In [2]:
data_train = pd.read_csv('../data/csv/credit_train.csv')
data_test = pd.read_csv('../data/csv/credit_test.csv')

In [3]:
from sklearn.preprocessing import OneHotEncoder

def preproc(data_init):
    preproc_data = data_init.copy()
    
    preproc_data = preproc_data.drop(['Months since last delinquent', 'Customer ID', 'Purpose'], axis=1)
    
    preproc_data = preproc_data.dropna()
    
    #cтатус займа в бинарные значения
    preproc_data.loc[:, 'Loan Status'] = \
    preproc_data.loc[:, 'Loan Status'].replace({'Fully Paid': 1, 'Charged Off': 0})
    
    #срок в бинарные значения
    preproc_data.loc[:, 'Term'] = \
    preproc_data.loc[:, 'Term'].replace({'Long Term': 1, 'Short Term': 0})
    
    #стаж на текущем месте работы в числовые значения
    preproc_data.loc[:, 'Years in current job'] = \
    preproc_data.loc[:, 'Years in current job'].replace({
        '10+ years': 10,
        '2 years': 2,
        '3 years': 3,
        '< 1 year': 0,
        '5 years': 5,
        '1 year': 1,
        '4 years': 4,
        '6 years': 6,
        '7 years': 7,
        '8 years': 8,
        '9 years': 9,
    })
    
    preproc_data = pd.get_dummies(preproc_data, columns = ["Home Ownership"] , prefix_sep = "_", drop_first = True)
    
    preproc_data.set_index('Loan ID', inplace=True)
    
    return preproc_data

In [4]:
data_train_preproc = data_train.pipe(preproc)

In [5]:
data_train_preproc.info()

<class 'pandas.core.frame.DataFrame'>
Index: 77271 entries, 14dd8831-6af5-400b-83ec-68e61888a048 to 81ab928b-d1a5-4523-9a3c-271ebb01b4fb
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Loan Status                   77271 non-null  int64  
 1   Current Loan Amount           77271 non-null  int64  
 2   Term                          77271 non-null  int64  
 3   Credit Score                  77271 non-null  float64
 4   Annual Income                 77271 non-null  float64
 5   Years in current job          77271 non-null  int64  
 6   Monthly Debt                  77271 non-null  float64
 7   Years of Credit History       77271 non-null  float64
 8   Number of Open Accounts       77271 non-null  int64  
 9   Number of Credit Problems     77271 non-null  int64  
 10  Current Credit Balance        77271 non-null  int64  
 11  Maximum Open Credit           77271 non-null  float64
 12 

In [6]:
X, y = data_train_preproc.iloc[:, :-1].values, data_train_preproc.iloc[:, -1].values

In [7]:
from scipy.stats import randint as randint
from scipy.stats import uniform
from sklearn.tree import DecisionTreeClassifier

try:
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import RandomizedSearchCV
    from sklearn.model_selection import StratifiedKFold
except ImportError:
    from sklearn.cross_validation import GridSearchCV
    from sklearn.cross_validation import RandomizedSearchCV
    from sklearn.cross_validation import StratifiedKFold


RND_SEED = 123

In [8]:
# Определим пространство поиска

param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': randint(2, 15),
    'min_samples_leaf': randint(5, 10),
    'class_weight': [None, 'balanced']}

cv = StratifiedKFold(n_splits=5, random_state=123, shuffle=True)

model = DecisionTreeClassifier(random_state=123)
random_search = RandomizedSearchCV(model, param_distributions=param_grid, n_iter=200, n_jobs=-1,
                                   cv=cv, scoring='roc_auc', random_state=123)

random_search.fit(X, y)

RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=123, shuffle=True),
                   estimator=DecisionTreeClassifier(random_state=123),
                   n_iter=200, n_jobs=-1,
                   param_distributions={'class_weight': [None, 'balanced'],
                                        'criterion': ['gini', 'entropy'],
                                        'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001F505665D30>,
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001F5722B1160>},
                   random_state=123, scoring='roc_auc')

In [9]:
model = random_search.best_estimator_
data = model.feature_importances_

In [10]:
pd.Series(index=data_train_preproc.columns[:-1], data=data)

Loan Status                     0.000000
Current Loan Amount             0.002154
Term                            0.000000
Credit Score                    0.000082
Annual Income                   0.000777
Years in current job            0.000489
Monthly Debt                    0.000294
Years of Credit History         0.000000
Number of Open Accounts         0.000000
Number of Credit Problems       0.000000
Current Credit Balance          0.000419
Maximum Open Credit             0.000146
Bankruptcies                    0.000000
Tax Liens                       0.000000
Home Ownership_Home Mortgage    0.659249
Home Ownership_Own Home         0.336391
dtype: float64