In [521]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

import warnings
warnings.simplefilter('ignore')

In [522]:
TRAIN_DATASET_PATH = './course_project_train.csv'
TEST_DATASET_PATH = './course_project_test.csv'

In [523]:
df_train = pd.read_csv(TRAIN_DATASET_PATH)

df_test = pd.read_csv(TEST_DATASET_PATH)

df_train.shape, df_test.shape

((7500, 17), (2500, 16))

In [524]:
class Data:
    """Подготовка исходных данных"""
    
    def __init__(self):
        """Параметры класса"""
        self.medians = None
        self.years_mode = None
        self.years_to_numbers = None
        self.Years_of_Credit_History_max = None
        self.Annual_Income_condition = None
        self.Maximum_Open_Credit_min = None
        self.Maximum_Open_Credit_max = None
        self.condition_last_delinquent = None
        self.Current_Loan_Amount_max = None
        self.Credit_Score_max = None
        self.condition_Credit_Score = None
        self.Credit_Score_min = None
        self.Annual_Income_max = None
        self.Monthly_Debt_max = None
        self.Months_since_last_delinquent_max = None
        
        
    def fit(self, df):
        
        
        # Расчет медиан
        self.medians = df.median()
        
        
        self.years_mode = df['Years in current job'].mode()[0]
        
        self.years_to_numbers = {'< 1 year': 0,
                                 '1 year': 1,
                                 '2 years': 2,
                                 '3 years': 3,
                                 '4 years': 4,
                                 '5 years': 5,
                                 '6 years': 6,
                                 '7 years': 7,
                                 '8 years': 8,
                                 '9 years': 9,
                                 '10+ years': 10}
        
        self.Years_of_Credit_History_max = 45
        
        self.Annual_Income_max = 6000000
       
        self.Maximum_Open_Credit_max = 1500000
        
        
        self.Credit_Score_max = 751
        self.Credit_Score_min = 600
       
        self.Monthly_Debt_max = 48000
        self.Monthly_Debt_min = 0
        
        self.Current_Loan_Amount_max = 900000
        
        self.Current_Credit_Balance_max = 800000
        
        self.Number_of_Open_Accounts_max = 25
        
        self.Months_since_last_delinquent_max = 95
       
        
    def transform(self, df):
        """Трансформация данных"""
            
    # 1. Изменение типа

        # Term, Home Ownership
        df.loc[(df['Home Ownership'] == 'Have Mortgage'), 'Home Ownership'] = 'Home Mortgage' # объединение одинаковых по смыслу свойства признака
        df = pd.get_dummies(df, columns=['Term'])
        df = pd.get_dummies(df, columns=['Home Ownership'])
#         df.drop(['Home Ownership'], axis=1, inplace=True)
#         df.drop(['Term'], axis=1, inplace=True)
        
        # Years in current job
        df['Years in current job'].fillna(self.years_mode, inplace=True) # заполнение пропусков наиболее часто встречающимися значениями
        df['Years in current job'].replace(self.years_to_numbers, inplace=True)
#         df.drop('Years in current job', axis=1, inplace=True)
    
    
    # 2. Выбросы (outliers)
    
        # Number of Open Accounts
#         df.loc[df['Number of Open Accounts'] > self.Number_of_Open_Accounts_max, 'Number of Open Accounts'] = None
        df['Number of Open Accounts'].clip(upper=self.Number_of_Open_Accounts_max, axis=0, inplace=True)
        
        # Current Credit Balance
        df['Current Credit Balance outlier'] = 0
        df.loc[df['Current Credit Balance'] > self.Current_Credit_Balance_max, 'Current Credit Balance outlier'] = 1
#         df.loc[(df['Current Credit Balance'] > self.Current_Credit_Balance_max), 'Current Credit Balance'] = None
        df['Current Credit Balance'].clip(upper=self.Current_Credit_Balance_max, axis=0, inplace=True)

    
        # Years of Credit History
#         df.loc[(df['Years of Credit History'] > self.Years_of_Credit_History_max), 'Years of Credit History'] = None
        df['Years of Credit History'].clip(upper=self.Years_of_Credit_History_max, axis=0, inplace=True)
        
        # Maximum Open Credit
#         df.loc[df['Maximum Open Credit'] > self.Maximum_Open_Credit_max, 'Maximum Open Credit'] = None
        df['Maximum Open Credit'].clip(upper=self.Maximum_Open_Credit_max, axis=0, inplace=True)
        
        # Monthly Debt
#         df.loc[(df['Monthly Debt'] > self.Monthly_Debt_max), 'Monthly Debt'] = None
        df['Monthly Debt'].clip(upper=self.Monthly_Debt_max, lower= self.Monthly_Debt_min, axis=0, inplace=True)
        
        # Purpose
        df.loc[(df['Purpose'] != 'debt consolidation') & (df['Purpose'] != 'home improvements'), 'Purpose'] = 'other'
        df = pd.get_dummies(df, columns=['Purpose'])
#         df.drop('Purpose', axis=1, inplace=True)
        
        # Credit Score
#         df.loc[(df['Credit Score'] > self.Credit_Score_max) | (df['Credit Score'] < self.Credit_Score_min), 'Credit Score'] = None 
        df['Credit Score nan'] = 0
        df.loc[df['Credit Score'].isna(), 'Credit Score nan'] = 1
        df['Credit Score'].clip(upper=self.Credit_Score_max, lower=self.Credit_Score_min, axis=0, inplace=True)
        
    
        # Annual Income
#         df.loc[df['Annual Income'] > self.Annual_Income_max, 'Annual Income'] = None
        df['Annual Income nan'] = 0
        df.loc[df['Annual Income'].isna(), 'Annual Income nan'] = 1
        df['Annual Income'].clip(upper=self.Annual_Income_max, axis=0, inplace=True)
        
        # Current Loan Amount
#         df.loc[df['Current Loan Amount'] > self.Current_Loan_Amount_max, 'Current Loan Amount'] = None
        df['Current Loan Amount outlier'] = 0
        df.loc[df['Current Loan Amount'] > self.Current_Loan_Amount_max, 'Current Loan Amount outlier'] = 1
        df['Current Loan Amount'].clip(upper=self.Current_Loan_Amount_max, axis=0, inplace=True)
    
        # Months since last delinquent
        df['Months since last delinquent nan'] = 0
        df.loc[df['Months since last delinquent'].isna(), 'Months since last delinquent nan'] = 1
        df['Months since last delinquent'].clip(upper=self.Months_since_last_delinquent_max, axis=0, inplace=True)
        
        

        
        
        
         
        
    # 3. Пропуски
        
        
        # Заполнение оставшихся пропусков медианами
        
        df.fillna(self.medians, inplace=True)
        
        

        return df

In [525]:
X = df_train.drop(columns='Credit Default')

In [526]:
y = df_train['Credit Default']

# Разбиение выборки

In [527]:
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score,  RandomizedSearchCV

# 3. Модели
from sklearn.linear_model import LogisticRegression
import xgboost as xgb, lightgbm as lgbm, catboost as catb
from sklearn.preprocessing import StandardScaler

# 4. Метрики качества
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score

In [528]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=True)

In [529]:
data = Data()

In [530]:
data.fit(X_train)

X_train = data.transform(X_train)
X_test = data.transform(X_test)

X_train.isna().sum().sum(), X_test.isna().sum().sum()

(0, 0)

# Балансировка

In [531]:
y_train.value_counts()

0    4046
1    1579
Name: Credit Default, dtype: int64

In [532]:
TARGET_NAME = 'Credit Default'

In [533]:
def balance_df_by_target(df, target_name):

    target_counts = df[target_name].value_counts()

    major_class_name = target_counts.argmax()
    minor_class_name = target_counts.argmin()

    disbalance_coeff = int(target_counts[major_class_name] / target_counts[minor_class_name]) - 1

    for i in range(disbalance_coeff):
        sample = df[df[target_name] == minor_class_name].sample(target_counts[minor_class_name])
        df = df.append(sample, ignore_index=True)

    return df.sample(frac=1)

In [534]:
df_for_balancing = pd.concat([X_train, y_train], axis=1)
df_for_balancing

Unnamed: 0,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Current Loan Amount,...,Home Ownership_Rent,Current Credit Balance outlier,Purpose_debt consolidation,Purpose_home improvements,Purpose_other,Credit Score nan,Annual Income nan,Current Loan Amount outlier,Months since last delinquent nan,Credit Default
2678,1175093.0,2,0.0,19.0,12.0,476476.0,0.0,32.0,0.0,258896.0,...,0,0,1,0,0,1,1,0,1,0
2638,1505408.0,10,0.0,13.0,18.6,616990.0,0.0,51.0,0.0,653664.0,...,0,0,1,0,0,0,0,0,0,0
6509,1128942.0,5,0.0,19.0,15.0,507584.0,0.0,5.0,0.0,544676.0,...,1,0,1,0,0,0,0,0,0,1
1313,767600.0,6,0.0,12.0,10.5,259842.0,0.0,32.0,0.0,177760.0,...,1,0,1,0,0,0,0,0,1,1
2620,1175093.0,7,0.0,11.0,11.0,184206.0,0.0,11.0,0.0,199892.0,...,0,0,1,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3704,603269.0,4,0.0,7.0,6.1,272008.0,0.0,32.0,0.0,900000.0,...,0,0,1,0,0,0,0,1,1,0
722,1175093.0,1,0.0,17.0,22.0,556798.0,1.0,32.0,1.0,336644.0,...,1,0,1,0,0,1,1,0,1,1
1226,2049264.0,1,0.0,5.0,25.4,594022.0,0.0,32.0,0.0,776160.0,...,0,0,1,0,0,0,0,0,1,1
2487,1300208.0,2,0.0,6.0,15.7,549626.0,0.0,7.0,0.0,309694.0,...,0,0,1,0,0,0,0,0,0,1


In [535]:
df_balanced = balance_df_by_target(df_for_balancing, TARGET_NAME)
    
df_balanced[TARGET_NAME].value_counts()

0    4046
1    3158
Name: Credit Default, dtype: int64

In [536]:
df_balanced

Unnamed: 0,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Current Loan Amount,...,Home Ownership_Rent,Current Credit Balance outlier,Purpose_debt consolidation,Purpose_home improvements,Purpose_other,Credit Score nan,Annual Income nan,Current Loan Amount outlier,Months since last delinquent nan,Credit Default
6206,1730349.0,0,0.0,12.0,17.2,1500000.0,0.0,32.0,0.0,534292.0,...,0,0,1,0,0,0,0,0,1,1
5676,788272.0,1,0.0,9.0,15.6,637098.0,0.0,32.0,0.0,362912.0,...,1,0,1,0,0,0,0,0,1,1
921,1073557.0,1,0.0,7.0,19.7,527252.0,0.0,32.0,0.0,432432.0,...,1,0,1,0,0,0,0,0,0,0
3382,1400357.0,0,0.0,10.0,20.0,242022.0,0.0,24.0,0.0,127556.0,...,0,0,1,0,0,0,0,0,0,0
2399,1170229.0,8,0.0,6.0,11.9,171622.0,0.0,32.0,0.0,550506.0,...,1,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
422,1175093.0,10,0.0,11.0,17.4,330968.0,0.0,61.0,0.0,486508.0,...,0,0,1,0,0,1,1,0,0,1
3965,626981.0,2,0.0,4.0,10.0,266222.0,0.0,32.0,0.0,122430.0,...,0,0,1,0,0,0,0,0,1,0
192,1168462.0,7,0.0,17.0,10.8,968880.0,1.0,32.0,1.0,157124.0,...,0,0,0,1,0,0,0,0,1,0
2590,1619199.0,7,0.0,13.0,19.0,485276.0,1.0,70.0,1.0,900000.0,...,0,0,0,0,1,0,0,1,0,0


In [537]:
X_train = df_balanced.drop(columns=TARGET_NAME)
y_train = df_balanced[TARGET_NAME]

# Нормализация

In [538]:
NUMERIC_FEATURE_NAMES  = X_train.select_dtypes(include={'float', 'int'}).columns.to_list()

In [539]:
NUMERIC_FEATURE_NAMES

['Annual Income',
 'Years in current job',
 'Tax Liens',
 'Number of Open Accounts',
 'Years of Credit History',
 'Maximum Open Credit',
 'Number of Credit Problems',
 'Months since last delinquent',
 'Bankruptcies',
 'Current Loan Amount',
 'Current Credit Balance',
 'Monthly Debt',
 'Credit Score',
 'Current Credit Balance outlier',
 'Credit Score nan',
 'Annual Income nan',
 'Current Loan Amount outlier',
 'Months since last delinquent nan']

In [540]:
scaler = StandardScaler()

def scal(df):
    df_norm = df.copy()
    df_norm[NUMERIC_FEATURE_NAMES] = scaler.fit_transform(df_norm[NUMERIC_FEATURE_NAMES])

    df = df_norm.copy()
    
    return df

In [541]:
X_train = scal(X_train)
X_test = scal(X_test)

In [542]:
X_test.head(2)

Unnamed: 0,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Current Loan Amount,...,Home Ownership_Own Home,Home Ownership_Rent,Current Credit Balance outlier,Purpose_debt consolidation,Purpose_home improvements,Purpose_other,Credit Score nan,Annual Income nan,Current Loan Amount outlier,Months since last delinquent nan
406,0.414191,1.0812,-0.125656,-1.074145,0.230531,0.228494,-0.344744,-0.109254,-0.351935,0.644909,...,0,0,-0.219252,1,0,0,-0.506658,-0.506658,-0.356094,0.888325
2994,1.356236,-0.846629,-0.125656,0.634644,-0.466065,2.31879,-0.344744,-0.109254,-0.351935,1.606211,...,0,0,4.560957,1,0,0,-0.506658,-0.506658,-0.356094,0.888325


# Подбор признаков

In [543]:
features = ['Annual Income',
            'Annual Income nan',
#             'Years in current job',
#             'Number of Open Accounts',
#             'Years of Credit History',
#             'Maximum Open Credit',
            'Number of Credit Problems',
            'Months since last delinquent',
            'Months since last delinquent nan',
            'Current Loan Amount',
            'Current Loan Amount outlier',
            'Current Credit Balance',
            'Current Credit Balance outlier',
#             'Term_Short Term',
            'Bankruptcies',
            'Tax Liens',
            'Term_Long Term',
            'Home Ownership_Home Mortgage',
#             'Home Ownership_Own Home',
#             'Home Ownership_Rent',
#             'Monthly Debt',
#             'Purpose_debt consolidation',
#             'Purpose_other',
#             'Purpose_home improvements',
            'Credit Score nan',
            'Credit Score']

In [544]:
X_train, X_test = X_train[features], X_test[features]

# Построение модели

In [545]:
def get_classification_report(y_train_true, y_train_pred, y_test_true, y_test_pred):
    print('TRAIN\n\n' + classification_report(y_train_true, y_train_pred))
    print('TEST\n\n' + classification_report(y_test_true, y_test_pred))
    print('CONFUSION MATRIX\n')
    print(pd.crosstab(y_test_true, y_test_pred))

In [546]:
model_catb = catb.CatBoostClassifier(silent=True, random_state=21)
model_catb.fit(X_train, y_train)

y_train_pred = model_catb.predict(X_train)
y_test_pred = model_catb.predict(X_test)

get_classification_report(y_train, y_train_pred, y_test, y_test_pred)

TRAIN

              precision    recall  f1-score   support

           0       0.82      0.92      0.87      4046
           1       0.88      0.74      0.80      3158

    accuracy                           0.84      7204
   macro avg       0.85      0.83      0.83      7204
weighted avg       0.84      0.84      0.84      7204

TEST

              precision    recall  f1-score   support

           0       0.79      0.81      0.80      1341
           1       0.49      0.46      0.47       534

    accuracy                           0.71      1875
   macro avg       0.64      0.64      0.64      1875
weighted avg       0.71      0.71      0.71      1875

CONFUSION MATRIX

col_0              0    1
Credit Default           
0               1088  253
1                289  245


In [547]:
model_xgb = xgb.XGBClassifier()
model_xgb.fit(X_train, y_train)

y_train_pred = model_xgb.predict(X_train)
y_test_pred = model_xgb.predict(X_test)

get_classification_report(y_train, y_train_pred, y_test, y_test_pred)

TRAIN

              precision    recall  f1-score   support

           0       0.93      0.96      0.94      4046
           1       0.94      0.90      0.92      3158

    accuracy                           0.93      7204
   macro avg       0.93      0.93      0.93      7204
weighted avg       0.93      0.93      0.93      7204

TEST

              precision    recall  f1-score   support

           0       0.78      0.79      0.79      1341
           1       0.46      0.46      0.46       534

    accuracy                           0.69      1875
   macro avg       0.62      0.62      0.62      1875
weighted avg       0.69      0.69      0.69      1875

CONFUSION MATRIX

col_0              0    1
Credit Default           
0               1056  285
1                290  244


In [548]:
model_lgbm = lgbm.LGBMClassifier(random_state=21)
model_lgbm.fit(X_train, y_train)

y_train_pred = model_lgbm.predict(X_train)
y_test_pred = model_lgbm.predict(X_test)

get_classification_report(y_train, y_train_pred, y_test, y_test_pred)

TRAIN

              precision    recall  f1-score   support

           0       0.84      0.92      0.88      4046
           1       0.89      0.78      0.83      3158

    accuracy                           0.86      7204
   macro avg       0.86      0.85      0.85      7204
weighted avg       0.86      0.86      0.86      7204

TEST

              precision    recall  f1-score   support

           0       0.79      0.80      0.79      1341
           1       0.48      0.46      0.47       534

    accuracy                           0.70      1875
   macro avg       0.63      0.63      0.63      1875
weighted avg       0.70      0.70      0.70      1875

CONFUSION MATRIX

col_0              0    1
Credit Default           
0               1072  269
1                288  246


# Финальная модель

In [549]:
model_catb = catb.CatBoostClassifier(silent=True, random_state=21)

In [550]:
params = {'n_estimators':[500, 1000, 1500, 2000, 2500, 3000], 
          'max_depth':[5, 6, 7, 8, 10],
          'learning_rate':[0.001, 0.01, 0.05]}

In [551]:
cv=KFold(n_splits=3, shuffle=True)

In [552]:
# %%time

# rs = RandomizedSearchCV(model_catb, params, scoring='f1', cv=cv, n_jobs=-1)
# rs.fit(X_train, y_train)

In [553]:
# rs.best_params_

In [554]:
# rs.best_score_

In [555]:
%%time

final_model = catb.CatBoostClassifier(n_estimators=7000, max_depth=7, learning_rate=0.001,
                                      silent=True)
final_model.fit(X_train, y_train)

y_train_pred = final_model.predict(X_train)
y_test_pred = final_model.predict(X_test)

get_classification_report(y_train, y_train_pred, y_test, y_test_pred)

TRAIN

              precision    recall  f1-score   support

           0       0.74      0.89      0.80      4046
           1       0.81      0.59      0.68      3158

    accuracy                           0.76      7204
   macro avg       0.77      0.74      0.74      7204
weighted avg       0.77      0.76      0.75      7204

TEST

              precision    recall  f1-score   support

           0       0.79      0.82      0.81      1341
           1       0.51      0.45      0.48       534

    accuracy                           0.72      1875
   macro avg       0.65      0.64      0.64      1875
weighted avg       0.71      0.72      0.71      1875

CONFUSION MATRIX

col_0              0    1
Credit Default           
0               1105  236
1                293  241
Wall time: 35.5 s


In [556]:
feature_importances = pd.DataFrame(zip(X_train.columns, 
                                       final_model.feature_importances_), 
                                   columns=['feature_name', 'importance'])

feature_importances.sort_values(by='importance', ascending=False, inplace=True)

In [557]:
feature_importances

Unnamed: 0,feature_name,importance
14,Credit Score,27.207227
6,Current Loan Amount outlier,21.184455
5,Current Loan Amount,17.549619
0,Annual Income,7.782512
11,Term_Long Term,7.554376
7,Current Credit Balance,5.656047
3,Months since last delinquent,4.478865
12,Home Ownership_Home Mortgage,2.810815
2,Number of Credit Problems,1.564533
4,Months since last delinquent nan,1.058181
