In [353]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

import warnings
warnings.simplefilter('ignore')

In [354]:
TRAIN_DATASET_PATH = './course_project_train.csv'
TEST_DATASET_PATH = './course_project_test.csv'

In [355]:
df_train = pd.read_csv(TRAIN_DATASET_PATH)

df_test = pd.read_csv(TEST_DATASET_PATH)

df_train.shape, df_test.shape

((7500, 17), (2500, 16))

In [376]:
class Data:
    """Подготовка исходных данных"""
    
    def __init__(self):
        """Параметры класса"""
        self.medians = None
        self.years_mode = None
        self.years_to_numbers = None
        self.Years_of_Credit_History_max = None
        self.Annual_Income_condition = None
        self.Maximum_Open_Credit_min = None
        self.Maximum_Open_Credit_max = None
        self.condition_last_delinquent = None
        self.Current_Loan_Amount_max = None
        self.Credit_Score_max = None
        self.condition_Credit_Score = None
        self.Credit_Score_min = None
        
        
    def fit(self, df):
        
        
        # Расчет медиан
        self.medians = df.median()
        
        df.loc[df['Home Ownership'] == 'Have Mortgage', 'Home Ownership'] = 'Home Mortgage' # объединение одинаковых по смыслу свойства признака
        df = pd.get_dummies(df, columns=['Term', 'Home Ownership'])
        
        
        self.years_mode = df['Years in current job'].mode()[0]
        
        self.years_to_numbers = {'< 1 year': 0,
                                 '1 year': 1,
                                 '2 years': 2,
                                 '3 years': 3,
                                 '4 years': 4,
                                 '5 years': 5,
                                 '6 years': 6,
                                 '7 years': 7,
                                 '8 years': 8,
                                 '9 years': 9,
                                 '10+ years': 10}
        
        self.Years_of_Credit_History_max = df['Years of Credit History'].quantile(0.99)
        
        
        self.Annual_Income_condition = df.groupby(['Number of Open Accounts', 'Home Ownership_Home Mortgage'], as_index=False).agg({'Annual Income' : 'median'}).rename(columns={'Annual Income': 'Average Annual Income'})
        
        self.Maximum_Open_Credit_min = df['Maximum Open Credit'].quantile(0.01)
        self.Maximum_Open_Credit_max = df['Maximum Open Credit'].quantile(0.99)
        
        self.condition_last_delinquent = df.groupby(['Bankruptcies',
                                                     'Home Ownership_Home Mortgage',
                                                     'Tax Liens'], as_index=False).agg({'Months since last delinquent' : 'median'}).rename(columns={'Months since last delinquent' : 'Average Delinquent'})
        
        self.Credit_Score_max = 751
        self.Credit_Score_min = 625
        self.condition_Credit_Score = df.groupby(['Term_Long Term',
                                                  'Home Ownership_Home Mortgage'], as_index=False).agg({'Credit Score' : 'median'}).rename(columns={'Credit Score' : 'Median Score'})
        
        self.Current_Loan_Amount_max = 800000
        self.condition_Current_Loan_Amount = df.groupby(['Credit Score'],as_index=False).agg({'Current Loan Amount' : 'median'}).rename(columns={'Current Loan Amount' : 'Median Current Loan Amount'})
        
        
    def transform(self, df):
        """Трансформация данных"""
            
    # 1. Изменение типа

        # Term, Home Ownership
        df.loc[(df['Home Ownership'] == 'Have Mortgage'), 'Home Ownership'] = 'Home Mortgage' # объединение одинаковых по смыслу свойства признака
        df = pd.get_dummies(df, columns=['Term', 'Home Ownership'])
        
        # Years in current job
        df['Years in current job'].fillna(self.years_mode, inplace=True) # заполнение пропусков наиболее часто встречающимися значениями
        df['Years in current job'].replace(self.years_to_numbers, inplace=True)
        
    
    # 2. Выбросы (outliers)
    
        # Years of Credit History
        df['Years of Credit History'].clip(upper=self.Years_of_Credit_History_max, axis=0, inplace=True)
        
        # Maximum Open Credit
        df['Maximum Open Credit'].clip(upper=self.Maximum_Open_Credit_max, lower=self.Maximum_Open_Credit_min, axis=0, inplace=True)
        
        # Purpose
        df.loc[(df['Purpose'] != 'debt consolidation') & (df['Purpose'] != 'home improvements'), 'Purpose'] = 'other'
        df = pd.get_dummies(df, columns=['Purpose'])
        
        # Credit Score
        df.loc[(df['Credit Score'] > self.Credit_Score_max) | (df['Credit Score'] < self.Credit_Score_min), 'Credit Score'] = None 
        
        # Current Loan Amount
        df.loc[df['Current Loan Amount'] > self.Current_Loan_Amount_max, 'Current Loan Amount'] = None
        
         
        
    # 3. Пропуски
        
        # Annual Income
        df = df.merge(self.Annual_Income_condition,  on=['Number of Open Accounts', 'Home Ownership_Home Mortgage'], how='left')
        df['Annual Income'].fillna(df['Average Annual Income'], inplace=True)
        df.drop('Average Annual Income', axis=1, inplace=True)
        
        # Bankruptcies
        df['Bankruptcies'].fillna(df['Number of Credit Problems'], inplace=True)
        
        # Months since last delinquent
        df = df.merge(self.condition_last_delinquent, on=['Bankruptcies',
                                                          'Home Ownership_Home Mortgage',
                                                          'Tax Liens'], how='left')
        df['Months since last delinquent'].fillna(df['Average Delinquent'], inplace=True)
        df.drop('Average Delinquent', axis=1, inplace=True)
        
        # Credit Score
        df = df.merge(self.condition_Credit_Score, on=['Term_Long Term', 'Home Ownership_Home Mortgage'], how='left')
        df['Credit Score'].fillna(df['Median Score'], inplace=True)
        df.drop('Median Score', axis=1, inplace=True)
        
        # Current Loan Amount
        df = df.merge(self.condition_Current_Loan_Amount, on=['Credit Score'], how='left')
        df['Current Loan Amount'].fillna(df['Median Current Loan Amount'])
        df.drop('Median Current Loan Amount', axis=1, inplace=True)
        
        
        # Заполнение оставшихся пропусков медианами
        
        df.fillna(self.medians, inplace=True)
        
        

        return df

In [463]:
X = df_train.drop(columns='Credit Default')

In [464]:
y = df_train['Credit Default']

In [465]:
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score,  RandomizedSearchCV

# 3. Модели
from sklearn.linear_model import LogisticRegression
import xgboost as xgb, lightgbm as lgbm, catboost as catb
from sklearn.preprocessing import StandardScaler

# 4. Метрики качества
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score

In [466]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False, random_state=42)

In [467]:
data = Data()

In [468]:
data.fit(X_train)

X_train = data.transform(X_train)
X_test = data.transform(X_test)

X_train.isna().sum().sum(), X_test.isna().sum().sum()

(0, 0)

# Нормализация

In [469]:
NUMERIC_FEATURE_NAMES  = X_train.select_dtypes(include={'float', 'int'}).columns.to_list()

In [470]:
NUMERIC_FEATURE_NAMES

['Annual Income',
 'Years in current job',
 'Tax Liens',
 'Number of Open Accounts',
 'Years of Credit History',
 'Maximum Open Credit',
 'Number of Credit Problems',
 'Months since last delinquent',
 'Bankruptcies',
 'Current Loan Amount',
 'Current Credit Balance',
 'Monthly Debt',
 'Credit Score',
 'Years in current job NaN',
 'Annual Income NaN']

In [471]:
scaler = StandardScaler()

def scal(df):
    df_norm = df.copy()
    df_norm[NUMERIC_FEATURE_NAMES] = scaler.fit_transform(df_norm[NUMERIC_FEATURE_NAMES])

    df = df_norm.copy()
    
    return df

In [472]:
X_train = scal(X_train)

In [473]:
X_test = scal(X_test)

# Балансировка

In [474]:
y_train.value_counts()

0    4069
1    1556
Name: Credit Default, dtype: int64

In [475]:
TARGET_NAME = 'Credit Default'

In [476]:
def balance_df_by_target(df, target_name):

    target_counts = df[target_name].value_counts()

    major_class_name = target_counts.argmax()
    minor_class_name = target_counts.argmin()

    disbalance_coeff = int(target_counts[major_class_name] / target_counts[minor_class_name]) - 1

    for i in range(disbalance_coeff):
        sample = df[df[target_name] == minor_class_name].sample(target_counts[minor_class_name])
        df = df.append(sample, ignore_index=True)

    return df.sample(frac=1)

In [477]:
df_for_balancing = pd.concat([X_train, y_train], axis=1)
df_for_balancing

Unnamed: 0,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Current Loan Amount,...,Annual Income NaN,Term_Long Term,Term_Short Term,Home Ownership_Home Mortgage,Home Ownership_Own Home,Home Ownership_Rent,Purpose_debt consolidation,Purpose_home improvements,Purpose_other,Credit Default
0,-1.102058,1.087052,-0.115543,-0.022069,1.159191,0.087769,1.637151,1.320797,2.545244,0.000054,...,-0.513577,0,1,0,1,0,1,0,0,0
1,-0.391468,1.087052,-0.115543,0.787978,-0.429969,0.952660,-0.341997,-0.084804,-0.337179,-0.253273,...,-0.513577,1,0,0,1,0,1,0,0,1
2,-0.749869,0.537715,-0.115543,-0.022069,2.416071,0.953888,-0.341997,-0.280934,-0.337179,0.000054,...,-0.513577,0,1,1,0,0,1,0,0,0
3,-0.679704,-0.011622,-0.115543,-0.629605,0.610208,-0.851771,1.637151,1.320797,2.545244,-1.087807,...,-0.513577,0,1,0,1,0,1,0,0,0
4,-0.717371,0.537715,-0.115543,0.382954,-0.675566,-0.435810,1.637151,-0.084804,-0.337179,-1.061976,...,-0.513577,0,1,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5620,0.001095,0.537715,-0.115543,-0.224581,0.277929,-0.324700,-0.341997,-0.084804,-0.337179,-0.862358,...,-0.513577,0,1,0,0,1,1,0,0,0
5621,-0.252158,-0.835626,-0.115543,-0.427093,-0.357734,-0.049977,-0.341997,0.503587,-0.337179,0.720777,...,-0.513577,1,0,1,0,0,1,0,0,1
5622,-0.015924,-0.286290,-0.115543,-0.427093,0.234589,-0.263138,-0.341997,-0.280934,-0.337179,-0.792024,...,1.947129,0,1,1,0,0,1,0,0,0
5623,-0.080350,-1.110295,-0.115543,-1.844676,0.249036,-0.840756,-0.341997,-0.280934,-0.337179,0.000054,...,-0.513577,1,0,1,0,0,1,0,0,0


In [478]:
df_balanced = balance_df_by_target(df_for_balancing, TARGET_NAME)
    
df_balanced[TARGET_NAME].value_counts()

0    4069
1    3112
Name: Credit Default, dtype: int64

In [479]:
df_balanced

Unnamed: 0,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Current Loan Amount,...,Annual Income NaN,Term_Long Term,Term_Short Term,Home Ownership_Home Mortgage,Home Ownership_Own Home,Home Ownership_Rent,Purpose_debt consolidation,Purpose_home improvements,Purpose_other,Credit Default
1286,1.063650,-0.011622,-0.115543,-0.427093,0.595761,-0.120749,-0.341997,-0.084804,-0.337179,1.335105,...,-0.513577,1,0,0,0,1,1,0,0,1
2746,0.320140,-1.110295,-0.115543,0.787978,-0.039903,0.434646,-0.341997,-0.280934,-0.337179,0.000054,...,-0.513577,0,1,1,0,0,1,0,0,0
3387,0.117299,-0.286290,-0.115543,-1.034628,0.451292,-0.901895,-0.341997,-0.280934,-0.337179,1.660811,...,-0.513577,0,1,1,0,0,0,0,1,1
3188,-0.975419,-1.659631,-0.115543,-0.832117,-1.816872,-0.805331,-0.341997,-0.084804,-0.337179,-1.143946,...,-0.513577,0,1,0,0,1,1,0,0,0
1371,0.003430,1.087052,-0.115543,0.382954,-0.011009,-0.494224,1.637151,0.176703,2.545244,-0.772331,...,-0.513577,0,1,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
455,1.308356,1.087052,-0.115543,0.180443,0.956934,0.058754,-0.341997,0.634341,-0.337179,-0.591511,...,-0.513577,0,1,1,0,0,1,0,0,0
4492,-0.809300,-0.011622,-0.115543,0.787978,0.509080,-0.311881,-0.341997,2.922529,-0.337179,0.526018,...,-0.513577,1,0,1,0,0,1,0,0,1
3334,0.531105,1.087052,-0.115543,1.598025,0.480186,1.706019,-0.341997,-0.280934,-0.337179,-1.535254,...,1.947129,0,1,1,0,0,0,0,1,0
2140,-0.411631,-0.835626,-0.115543,0.382954,-1.383465,-0.024454,-0.341997,-0.084804,-0.337179,-0.439208,...,1.947129,0,1,0,0,1,1,0,0,0


In [480]:
X_train = df_balanced.drop(columns=TARGET_NAME)
y_train = df_balanced[TARGET_NAME]

# Построение модели

In [481]:
def get_classification_report(y_train_true, y_train_pred, y_test_true, y_test_pred):
    print('TRAIN\n\n' + classification_report(y_train_true, y_train_pred))
    print('TEST\n\n' + classification_report(y_test_true, y_test_pred))
    print('CONFUSION MATRIX\n')
    print(pd.crosstab(y_test_true, y_test_pred))

In [482]:
model_catb = catb.CatBoostClassifier(silent=True, random_state=21)
model_catb.fit(X_train, y_train)

y_train_pred = model_catb.predict(X_train)
y_test_pred = model_catb.predict(X_test)

get_classification_report(y_train, y_train_pred, y_test, y_test_pred)

TRAIN

              precision    recall  f1-score   support

           0       0.92      0.93      0.93      4069
           1       0.91      0.89      0.90      3112

    accuracy                           0.92      7181
   macro avg       0.92      0.91      0.91      7181
weighted avg       0.92      0.92      0.92      7181

TEST

              precision    recall  f1-score   support

           0       0.78      0.82      0.80      1318
           1       0.51      0.44      0.47       557

    accuracy                           0.71      1875
   macro avg       0.64      0.63      0.64      1875
weighted avg       0.70      0.71      0.70      1875

CONFUSION MATRIX

col_0              0    1
Credit Default           
0               1080  238
1                310  247


In [483]:
model_xgb = xgb.XGBClassifier()
model_xgb.fit(X_train, y_train)

y_train_pred = model_xgb.predict(X_train)
y_test_pred = model_xgb.predict(X_test)

get_classification_report(y_train, y_train_pred, y_test, y_test_pred)

TRAIN

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4069
           1       0.98      0.99      0.99      3112

    accuracy                           0.99      7181
   macro avg       0.99      0.99      0.99      7181
weighted avg       0.99      0.99      0.99      7181

TEST

              precision    recall  f1-score   support

           0       0.76      0.81      0.79      1318
           1       0.48      0.40      0.44       557

    accuracy                           0.69      1875
   macro avg       0.62      0.61      0.61      1875
weighted avg       0.68      0.69      0.68      1875

CONFUSION MATRIX

col_0              0    1
Credit Default           
0               1072  246
1                333  224


In [484]:
model_lgbm = lgbm.LGBMClassifier(random_state=21)
model_lgbm.fit(X_train, y_train)

y_train_pred = model_lgbm.predict(X_train)
y_test_pred = model_lgbm.predict(X_test)

get_classification_report(y_train, y_train_pred, y_test, y_test_pred)

TRAIN

              precision    recall  f1-score   support

           0       0.93      0.93      0.93      4069
           1       0.91      0.91      0.91      3112

    accuracy                           0.92      7181
   macro avg       0.92      0.92      0.92      7181
weighted avg       0.92      0.92      0.92      7181

TEST

              precision    recall  f1-score   support

           0       0.78      0.79      0.79      1318
           1       0.49      0.46      0.48       557

    accuracy                           0.70      1875
   macro avg       0.63      0.63      0.63      1875
weighted avg       0.69      0.70      0.69      1875

CONFUSION MATRIX

col_0              0    1
Credit Default           
0               1047  271
1                299  258


In [423]:
model_catb = catb.CatBoostClassifier(silent=True, random_state=21)

In [448]:
params = {'n_estimators':[50, 100, 200, 300, 400, 500], 
          'max_depth':[3, 4, 5, 6, 7, 8, 10],
          'learning_rate':[0.001, 0.01, 0.05, 0.1]}

In [456]:
cv=KFold(n_splits=3, shuffle=False)

In [490]:
%%time

rs = GridSearchCV(model_catb, params, scoring='f1', cv=cv, n_jobs=-1)
rs.fit(X_train, y_train)

Wall time: 18min 38s


GridSearchCV(cv=KFold(n_splits=3, random_state=None, shuffle=False),
             estimator=<catboost.core.CatBoostClassifier object at 0x0000028CE28181F0>,
             n_jobs=-1,
             param_grid={'learning_rate': [0.001, 0.01, 0.05, 0.1],
                         'max_depth': [3, 4, 5, 6, 7, 8, 10],
                         'n_estimators': [50, 100, 200, 300, 400, 500]},
             scoring='f1')

In [491]:
rs.best_params_

{'learning_rate': 0.05, 'max_depth': 10, 'n_estimators': 500}

In [492]:
rs.best_score_

0.8086809244983008

In [508]:
%%time

final_model = catb.CatBoostClassifier(n_estimators=2000, max_depth=3, learning_rate=0.01,
                                      silent=True)
final_model.fit(X_train, y_train)

y_train_pred = final_model.predict(X_train)
y_test_pred = final_model.predict(X_test)

get_classification_report(y_train, y_train_pred, y_test, y_test_pred)

TRAIN

              precision    recall  f1-score   support

           0       0.76      0.82      0.79      4069
           1       0.74      0.67      0.70      3112

    accuracy                           0.75      7181
   macro avg       0.75      0.74      0.75      7181
weighted avg       0.75      0.75      0.75      7181

TEST

              precision    recall  f1-score   support

           0       0.79      0.79      0.79      1318
           1       0.50      0.50      0.50       557

    accuracy                           0.70      1875
   macro avg       0.64      0.64      0.64      1875
weighted avg       0.70      0.70      0.70      1875

CONFUSION MATRIX

col_0              0    1
Credit Default           
0               1038  280
1                280  277
Wall time: 7.7 s


In [351]:
feature_importances = pd.DataFrame(zip(X_train.columns, 
                                       model_catb.feature_importances_), 
                                   columns=['feature_name', 'importance'])

feature_importances.sort_values(by='importance', ascending=False, inplace=True)

In [352]:
feature_importances

Unnamed: 0,feature_name,importance
9,Current Loan Amount,27.447412
12,Credit Score,12.601095
0,Annual Income,8.35259
11,Monthly Debt,7.870863
5,Maximum Open Credit,7.653885
10,Current Credit Balance,7.394546
4,Years of Credit History,6.313922
3,Number of Open Accounts,5.001087
7,Months since last delinquent,4.687898
1,Years in current job,3.848486
