In [2322]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

import warnings
warnings.simplefilter('ignore')

In [2323]:
TRAIN_DATASET_PATH = './course_project_train.csv'
TEST_DATASET_PATH = './course_project_test.csv'

In [2324]:
df_train = pd.read_csv(TRAIN_DATASET_PATH)

df_test = pd.read_csv(TEST_DATASET_PATH)

df_train.shape, df_test.shape

((7500, 17), (2500, 16))

In [2397]:
class Data:
    """Подготовка исходных данных"""
    
    def __init__(self):
        """Параметры класса"""
        self.medians = None
        self.years_mode = None
        self.years_to_numbers = None
        self.Years_of_Credit_History_max = None
        self.Annual_Income_condition = None
        self.Maximum_Open_Credit_min = None
        self.Maximum_Open_Credit_max = None
        self.condition_last_delinquent = None
        self.Current_Loan_Amount_max = None
        self.Credit_Score_max = None
        self.condition_Credit_Score = None
        self.Credit_Score_min = None
        self.Annual_Income_max = None
        self.Monthly_Debt_max = None
        self.Months_since_last_delinquent_max = None
        
        
    def fit(self, df):
        
        
        # Расчет медиан
        self.medians = df.median()
        
        
        self.years_mode = df['Years in current job'].mode()[0]
        
        self.years_to_numbers = {'< 1 year': 0,
                                 '1 year': 1,
                                 '2 years': 2,
                                 '3 years': 3,
                                 '4 years': 4,
                                 '5 years': 5,
                                 '6 years': 6,
                                 '7 years': 7,
                                 '8 years': 8,
                                 '9 years': 9,
                                 '10+ years': 10}
        
        self.Years_of_Credit_History_max = df['Years of Credit History'].quantile(0.99)
        
        self.Annual_Income_max = 4000000
       
        self.Maximum_Open_Credit_max = 1400000
        
        
        self.Credit_Score_max = 751
        self.Credit_Score_min = 600
       
        self.Monthly_Debt_max = 48000
        self.Monthly_Debt_min = 2000
        
        self.Current_Loan_Amount_max = 800000
        
        self.Current_Credit_Balance_max = 1000000
        
        self.Number_of_Open_Accounts_max = 25
        
        self.Months_since_last_delinquent_max = 95
       
        
    def transform(self, df):
        """Трансформация данных"""
            
    # 1. Изменение типа

        # Term, Home Ownership
#         df.loc[(df['Home Ownership'] == 'Have Mortgage'), 'Home Ownership'] = 'Home Mortgage' # объединение одинаковых по смыслу свойства признака
#         df = pd.get_dummies(df, columns=['Term'])
#         df = pd.get_dummies(df, columns=['Home Ownership'])
        df.drop(['Home Ownership'], axis=1, inplace=True)
        df.drop(['Term'], axis=1, inplace=True)
        
        # Years in current job
#         df['Years in current job'].fillna(self.years_mode, inplace=True) # заполнение пропусков наиболее часто встречающимися значениями
#         df['Years in current job'].replace(self.years_to_numbers, inplace=True)
        df.drop('Years in current job', axis=1, inplace=True)
    
    
    # 2. Выбросы (outliers)
    
        # Number of Open Accounts
#         df.loc[df['Number of Open Accounts'] > self.Number_of_Open_Accounts_max, 'Number of Open Accounts'] = None
        df['Number of Open Accounts'].clip(upper=self.Number_of_Open_Accounts_max, axis=0, inplace=True)
        
        # Current Credit Balance
#         df.loc[(df['Current Credit Balance'] > self.Current_Credit_Balance_max), 'Current Credit Balance'] = None
        df['Current Credit Balance'].clip(upper=self.Current_Credit_Balance_max, axis=0, inplace=True)

    
        # Years of Credit History
        df['Years of Credit History'].clip(upper=self.Years_of_Credit_History_max, axis=0, inplace=True)
        
        # Maximum Open Credit
        df['Maximum Open Credit'].clip(upper=self.Maximum_Open_Credit_max, axis=0, inplace=True)
        
        # Monthly Debt
#         df.loc[(df['Monthly Debt'] > self.Monthly_Debt_max), 'Monthly Debt'] = None
        df['Monthly Debt'].clip(upper=self.Monthly_Debt_max, lower= self.Monthly_Debt_min, axis=0, inplace=True)
        
        # Purpose
#         df.loc[(df['Purpose'] != 'debt consolidation') & (df['Purpose'] != 'home improvements'), 'Purpose'] = 'other'
#         df = pd.get_dummies(df, columns=['Purpose'])
        df.drop('Purpose', axis=1, inplace=True)
        
        # Credit Score
        df.loc[(df['Credit Score'] > self.Credit_Score_max), 'Credit Score'] = None 
#         df['Credit Score'].clip(upper=self.Credit_Score_max, lower=self.Credit_Score_min, axis=0, inplace=True)
        
    
        # Annual Income
#         df.loc[df['Annual Income'] > self.Annual_Income_max, 'Annual Income'] = None
        df['Months since last delinquent'].clip(upper=self.Months_since_last_delinquent_max, axis=0, inplace=True)
        
        # Current Loan Amount
#         df.loc[df['Current Loan Amount'] > self.Current_Loan_Amount_max, 'Current Loan Amount'] = None
        df['Current Loan Amount'].clip(upper=self.Current_Loan_Amount_max, axis=0, inplace=True)
    
        # Months since last delinquent
        df['Months since last delinquent'].clip(upper=self.Months_since_last_delinquent_max, axis=0, inplace=True)
        
        
        
        df.drop('Bankruptcies', axis=1, inplace=True)
        df.drop('Tax Liens', axis=1, inplace=True)
#         df.drop('Months since last delinquent', axis=1, inplace=True)
        
        
        
         
        
    # 3. Пропуски
        
        
        # Заполнение оставшихся пропусков медианами
        
        df.fillna(self.medians, inplace=True)
        
        

        return df

In [2398]:
X = df_train.drop(columns='Credit Default')

In [2399]:
y = df_train['Credit Default']

In [2400]:
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score,  RandomizedSearchCV

# 3. Модели
from sklearn.linear_model import LogisticRegression
import xgboost as xgb, lightgbm as lgbm, catboost as catb
from sklearn.preprocessing import StandardScaler

# 4. Метрики качества
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score

In [2401]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False, random_state=42)

In [2402]:
data = Data()

In [2403]:
data.fit(X_train)

X_train = data.transform(X_train)
X_test = data.transform(X_test)

X_train.isna().sum().sum(), X_test.isna().sum().sum()

(0, 0)

# Нормализация

In [2404]:
NUMERIC_FEATURE_NAMES  = X_train.select_dtypes(include={'float', 'int'}).columns.to_list()

In [2405]:
NUMERIC_FEATURE_NAMES

['Annual Income',
 'Number of Open Accounts',
 'Years of Credit History',
 'Maximum Open Credit',
 'Number of Credit Problems',
 'Months since last delinquent',
 'Current Loan Amount',
 'Current Credit Balance',
 'Monthly Debt',
 'Credit Score']

In [2406]:
scaler = StandardScaler()

def scal(df):
    df_norm = df.copy()
    df_norm[NUMERIC_FEATURE_NAMES] = scaler.fit_transform(df_norm[NUMERIC_FEATURE_NAMES])

    df = df_norm.copy()
    
    return df

In [2407]:
X_train = scal(X_train)

In [2408]:
X_test = scal(X_test)

# Балансировка

In [2409]:
y_train.value_counts()

0    4069
1    1556
Name: Credit Default, dtype: int64

In [2410]:
TARGET_NAME = 'Credit Default'

In [2411]:
def balance_df_by_target(df, target_name):

    target_counts = df[target_name].value_counts()

    major_class_name = target_counts.argmax()
    minor_class_name = target_counts.argmin()

    disbalance_coeff = int(target_counts[major_class_name] / target_counts[minor_class_name]) - 1

    for i in range(disbalance_coeff):
        sample = df[df[target_name] == minor_class_name].sample(target_counts[minor_class_name])
        df = df.append(sample, ignore_index=True)

    return df.sample(frac=1)

In [2412]:
df_for_balancing = pd.concat([X_train, y_train], axis=1)
df_for_balancing

Unnamed: 0,Annual Income,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default
0,-1.108504,-0.010576,1.159191,0.287924,1.637151,-0.081792,1.854409,-0.997476,-0.933013,1.035914,0
1,-0.390097,0.836270,-0.429969,1.585242,-0.341997,-0.081792,-0.433690,0.536181,0.042528,0.543944,1
2,-0.752440,-0.010576,2.416071,1.587085,-0.341997,-0.081792,1.854409,0.154150,-0.397907,0.748932,0
3,-0.681504,-0.645711,0.610208,-1.121366,1.637151,-0.081792,-1.047685,-0.783616,-0.613647,-1.218949,0
4,-0.719585,0.412847,-0.675566,-0.497433,1.637151,-0.081792,-1.028680,-0.794849,-1.001475,-0.194011,0
...,...,...,...,...,...,...,...,...,...,...,...
5620,0.006785,-0.222288,0.277929,-0.330770,-0.341997,-0.081792,-0.881814,-0.189066,-0.249230,0.666937,0
5621,-0.249254,-0.433999,-0.357734,0.081309,-0.341997,0.527856,0.282951,0.128078,-0.307339,0.338956,1
5622,-0.202532,-0.433999,0.234589,-0.238430,-0.341997,-0.081792,-0.830068,-0.456496,-0.543506,0.338956,0
5623,-0.075555,-1.915981,0.249036,-1.104843,-0.341997,-0.081792,1.854409,-1.179060,-1.484629,0.789929,0


In [2413]:
df_balanced = balance_df_by_target(df_for_balancing, TARGET_NAME)
    
df_balanced[TARGET_NAME].value_counts()

0    4069
1    3112
Name: Credit Default, dtype: int64

In [2414]:
df_balanced

Unnamed: 0,Annual Income,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default
6059,-0.103136,1.683117,1.347000,2.156406,-0.341997,-2.181690,0.643388,1.882133,0.279720,-0.276006,1
2052,-0.202532,0.624559,-0.559991,-0.259730,-0.341997,-0.081792,-0.157835,-0.200216,0.148019,0.338956,0
600,-0.202532,-0.433999,-0.285500,0.617219,-0.341997,-0.081792,-0.610757,1.518797,0.778077,0.338956,0
1758,-0.578239,1.047982,-0.256606,1.690997,-0.341997,-0.081792,0.159230,0.050867,0.031242,0.871924,1
2336,-0.202532,-1.069134,-0.054349,0.156321,-0.341997,-0.014054,-0.540194,1.048657,-0.668398,0.338956,0
...,...,...,...,...,...,...,...,...,...,...,...
5718,0.617356,-0.433999,0.306823,-0.630993,-0.341997,-1.842997,-0.460410,-0.555839,-0.852799,0.338956,1
6917,-0.202532,0.412847,0.740230,0.682848,-0.341997,-1.572042,1.774848,1.752861,1.749514,0.338956,1
4527,-0.202532,-0.222288,0.220142,2.156406,-0.341997,1.137503,1.685280,1.349704,1.531722,0.338956,0
2481,0.210577,0.624559,0.075673,-0.442512,-0.341997,-0.081792,-0.299338,-1.187946,0.779943,0.871924,0


In [2415]:
X_train = df_balanced.drop(columns=TARGET_NAME)
y_train = df_balanced[TARGET_NAME]

# Построение модели

In [2416]:
def get_classification_report(y_train_true, y_train_pred, y_test_true, y_test_pred):
    print('TRAIN\n\n' + classification_report(y_train_true, y_train_pred))
    print('TEST\n\n' + classification_report(y_test_true, y_test_pred))
    print('CONFUSION MATRIX\n')
    print(pd.crosstab(y_test_true, y_test_pred))

In [2417]:
model_catb = catb.CatBoostClassifier(silent=True, random_state=21)
model_catb.fit(X_train, y_train)

y_train_pred = model_catb.predict(X_train)
y_test_pred = model_catb.predict(X_test)

get_classification_report(y_train, y_train_pred, y_test, y_test_pred)

TRAIN

              precision    recall  f1-score   support

           0       0.90      0.92      0.91      4069
           1       0.89      0.86      0.88      3112

    accuracy                           0.90      7181
   macro avg       0.90      0.89      0.89      7181
weighted avg       0.90      0.90      0.89      7181

TEST

              precision    recall  f1-score   support

           0       0.81      0.72      0.76      1318
           1       0.48      0.59      0.53       557

    accuracy                           0.69      1875
   macro avg       0.64      0.66      0.65      1875
weighted avg       0.71      0.69      0.69      1875

CONFUSION MATRIX

col_0             0    1
Credit Default          
0               955  363
1               227  330


In [2418]:
model_xgb = xgb.XGBClassifier()
model_xgb.fit(X_train, y_train)

y_train_pred = model_xgb.predict(X_train)
y_test_pred = model_xgb.predict(X_test)

get_classification_report(y_train, y_train_pred, y_test, y_test_pred)

TRAIN

              precision    recall  f1-score   support

           0       0.98      0.97      0.98      4069
           1       0.97      0.98      0.97      3112

    accuracy                           0.98      7181
   macro avg       0.97      0.98      0.98      7181
weighted avg       0.98      0.98      0.98      7181

TEST

              precision    recall  f1-score   support

           0       0.80      0.73      0.76      1318
           1       0.47      0.57      0.52       557

    accuracy                           0.68      1875
   macro avg       0.64      0.65      0.64      1875
weighted avg       0.70      0.68      0.69      1875

CONFUSION MATRIX

col_0             0    1
Credit Default          
0               961  357
1               239  318


In [2419]:
model_lgbm = lgbm.LGBMClassifier(random_state=21)
model_lgbm.fit(X_train, y_train)

y_train_pred = model_lgbm.predict(X_train)
y_test_pred = model_lgbm.predict(X_test)

get_classification_report(y_train, y_train_pred, y_test, y_test_pred)

TRAIN

              precision    recall  f1-score   support

           0       0.93      0.93      0.93      4069
           1       0.91      0.91      0.91      3112

    accuracy                           0.92      7181
   macro avg       0.92      0.92      0.92      7181
weighted avg       0.92      0.92      0.92      7181

TEST

              precision    recall  f1-score   support

           0       0.82      0.71      0.76      1318
           1       0.47      0.62      0.54       557

    accuracy                           0.68      1875
   macro avg       0.64      0.67      0.65      1875
weighted avg       0.71      0.68      0.69      1875

CONFUSION MATRIX

col_0             0    1
Credit Default          
0               932  386
1               210  347


In [2420]:
model_catb = catb.CatBoostClassifier(silent=True, random_state=21)

In [2421]:
params = {'n_estimators':[50, 100, 200, 300, 400, 500, 1000, 1500, 2000, 2500, 3000], 
          'max_depth':[3, 4, 5, 6, 7, 8, 10],
          'learning_rate':[0.001, 0.01, 0.05, 0.1]}

In [2422]:
cv=KFold(n_splits=3, shuffle=True)

In [2423]:
%%time

rs = RandomizedSearchCV(model_catb, params, scoring='f1', cv=cv, n_jobs=-1)
rs.fit(X_train, y_train)

Wall time: 4min 28s


RandomizedSearchCV(cv=KFold(n_splits=3, random_state=None, shuffle=True),
                   estimator=<catboost.core.CatBoostClassifier object at 0x0000019BFA6422B0>,
                   n_jobs=-1,
                   param_distributions={'learning_rate': [0.001, 0.01, 0.05,
                                                          0.1],
                                        'max_depth': [3, 4, 5, 6, 7, 8, 10],
                                        'n_estimators': [50, 100, 200, 300, 400,
                                                         500, 1000, 1500, 2000,
                                                         2500, 3000]},
                   scoring='f1')

In [2424]:
rs.best_params_

{'n_estimators': 1500, 'max_depth': 10, 'learning_rate': 0.01}

In [2425]:
rs.best_score_

0.7838171647626737

In [2444]:
%%time

final_model = lgbm.LGBMClassifier(n_estimators=1500, max_depth=10, learning_rate=0.01,
                                      silent=True)
final_model.fit(X_train, y_train)

y_train_pred = final_model.predict(X_train)
y_test_pred = final_model.predict(X_test)

get_classification_report(y_train, y_train_pred, y_test, y_test_pred)

TRAIN

              precision    recall  f1-score   support

           0       0.96      0.95      0.96      4069
           1       0.94      0.95      0.95      3112

    accuracy                           0.95      7181
   macro avg       0.95      0.95      0.95      7181
weighted avg       0.95      0.95      0.95      7181

TEST

              precision    recall  f1-score   support

           0       0.81      0.72      0.76      1318
           1       0.48      0.61      0.54       557

    accuracy                           0.69      1875
   macro avg       0.65      0.66      0.65      1875
weighted avg       0.71      0.69      0.69      1875

CONFUSION MATRIX

col_0             0    1
Credit Default          
0               943  375
1               215  342
Wall time: 2.72 s


In [2445]:
feature_importances = pd.DataFrame(zip(X_train.columns, 
                                       final_model.feature_importances_), 
                                   columns=['feature_name', 'importance'])

feature_importances.sort_values(by='importance', ascending=False, inplace=True)

In [2446]:
feature_importances

Unnamed: 0,feature_name,importance
6,Current Loan Amount,6333
8,Monthly Debt,5874
0,Annual Income,5777
2,Years of Credit History,5673
3,Maximum Open Credit,5325
7,Current Credit Balance,5079
9,Credit Score,4739
1,Number of Open Accounts,2885
5,Months since last delinquent,2769
4,Number of Credit Problems,529
