In [18]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

import warnings
warnings.simplefilter('ignore')

In [19]:
TRAIN_DATASET_PATH = './course_project_train.csv'
TEST_DATASET_PATH = './course_project_test.csv'

In [20]:
df_train = pd.read_csv(TRAIN_DATASET_PATH)

df_test = pd.read_csv(TEST_DATASET_PATH)

df_train.shape, df_test.shape

((7500, 17), (2500, 16))

In [21]:
class Data:
    """Подготовка исходных данных"""
    
    def __init__(self):
        """Параметры класса"""
        self.medians = None
        self.years_mode = None
        self.years_to_numbers = None
        self.Years_of_Credit_History_max = None
        self.Annual_Income_condition = None
        self.Maximum_Open_Credit_min = None
        self.Maximum_Open_Credit_max = None
        self.condition_last_delinquent = None
        self.Current_Loan_Amount_max = None
        self.Credit_Score_max = None
        self.condition_Credit_Score = None
        self.Credit_Score_min = None
        
        
    def fit(self, df):
        
        
        # Расчет медиан
        self.medians = df[['Annual Income', 'Months since last delinquent', 'Current Loan Amount']].median()
        
        df.loc[df['Home Ownership'] == 'Have Mortgage', 'Home Ownership'] = 'Home Mortgage' # объединение одинаковых по смыслу свойства признака
        df = pd.get_dummies(df, columns=['Term', 'Home Ownership'])
        
        
        self.years_mode = df['Years in current job'].mode()[0]
        
        self.years_to_numbers = {'< 1 year': 0,
                                 '1 year': 1,
                                 '2 years': 2,
                                 '3 years': 3,
                                 '4 years': 4,
                                 '5 years': 5,
                                 '6 years': 6,
                                 '7 years': 7,
                                 '8 years': 8,
                                 '9 years': 9,
                                 '10+ years': 10}
        
        self.Years_of_Credit_History_max = df['Years of Credit History'].quantile(0.99)
        
        
        self.Annual_Income_condition = df.groupby(['Number of Open Accounts', 'Home Ownership_Home Mortgage'], as_index=False).agg({'Annual Income' : 'median'}).rename(columns={'Annual Income': 'Average Annual Income'})
        
        self.Maximum_Open_Credit_min = df['Maximum Open Credit'].quantile(0.01)
        self.Maximum_Open_Credit_max = df['Maximum Open Credit'].quantile(0.99)
        
        self.condition_last_delinquent = df.groupby(['Bankruptcies',
                                                     'Home Ownership_Home Mortgage',
                                                     'Tax Liens'], as_index=False).agg({'Months since last delinquent' : 'median'}).rename(columns={'Months since last delinquent' : 'Average Delinquent'})
        
        self.Credit_Score_max = 751
        self.Credit_Score_min = 625
        self.condition_Credit_Score = df.groupby(['Term_Long Term',
                                                  'Home Ownership_Home Mortgage'], as_index=False).agg({'Credit Score' : 'median'}).rename(columns={'Credit Score' : 'Median Score'})
        
        self.Current_Loan_Amount_max = 800000
        self.condition_Current_Loan_Amount = df.groupby(['Credit Score'],as_index=False).agg({'Current Loan Amount' : 'median'}).rename(columns={'Current Loan Amount' : 'Median Current Loan Amount'})
        
        
    def transform(self, df):
        """Трансформация данных"""
            
    # 1. Изменение типа

        # Term, Home Ownership
        df.loc[(df['Home Ownership'] == 'Have Mortgage'), 'Home Ownership'] = 'Home Mortgage' # объединение одинаковых по смыслу свойства признака
        df = pd.get_dummies(df, columns=['Term', 'Home Ownership'])
        
        # Years in current job
        df['Years in current job'].fillna(self.years_mode, inplace=True) # заполнение пропусков наиболее часто встречающимися значениями
        df['Years in current job'].replace(self.years_to_numbers, inplace=True)
        
    
    # 2. Выбросы (outliers)
    
        # Years of Credit History
        df['Years of Credit History'].clip(upper=self.Years_of_Credit_History_max, axis=0, inplace=True)
        
        # Maximum Open Credit
        df['Maximum Open Credit'].clip(upper=self.Maximum_Open_Credit_max, lower=self.Maximum_Open_Credit_min, axis=0, inplace=True)
        
        # Purpose
        df.loc[(df['Purpose'] != 'debt consolidation') & (df['Purpose'] != 'home improvements'), 'Purpose'] = 'other'
        df = pd.get_dummies(df, columns=['Purpose'])
        
        # Credit Score
        df.loc[(df['Credit Score'] > self.Credit_Score_max) | (df['Credit Score'] < self.Credit_Score_min), 'Credit Score'] = None 
        
        # Current Loan Amount
        df.loc[df['Current Loan Amount'] > self.Current_Loan_Amount_max, 'Current Loan Amount'] = None
        
         
        
    # 3. Пропуски
        
        # Annual Income
        df = df.merge(self.Annual_Income_condition,  on=['Number of Open Accounts', 'Home Ownership_Home Mortgage'], how='left')
        df['Annual Income'].fillna(df['Average Annual Income'], inplace=True)
        df.drop('Average Annual Income', axis=1, inplace=True)
        
        # Bankruptcies
        df['Bankruptcies'].fillna(df['Number of Credit Problems'], inplace=True)
        
        # Months since last delinquent
        df = df.merge(self.condition_last_delinquent, on=['Bankruptcies',
                                                          'Home Ownership_Home Mortgage',
                                                          'Tax Liens'], how='left')
        df['Months since last delinquent'].fillna(df['Average Delinquent'], inplace=True)
        df.drop('Average Delinquent', axis=1, inplace=True)
        
        # Credit Score
        df = df.merge(self.condition_Credit_Score, on=['Term_Long Term', 'Home Ownership_Home Mortgage'], how='left')
        df['Credit Score'].fillna(df['Median Score'], inplace=True)
        df.drop('Median Score', axis=1, inplace=True)
        
        # Current Loan Amount
        df = df.merge(self.condition_Current_Loan_Amount, on=['Credit Score'], how='left')
        df['Current Loan Amount'].fillna(df['Median Current Loan Amount'])
        df.drop('Median Current Loan Amount', axis=1, inplace=True)
        
        
        # Заполнение оставшихся пропусков медианами
        
        df.fillna(self.medians, inplace=True)
        
        
        
    # 4. Исключение ненужных столбцов
    
        
        
        


        

        
    # 5. Новые фичи (features)
    

        

        return df

In [22]:
X = df_train.drop(columns='Credit Default')

In [23]:
y = df_train['Credit Default']

In [24]:
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score

# 3. Модели
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor, plot_tree, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# 4. Метрики качества
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle=True, random_state=100)

In [26]:
data = Data()

In [27]:
data.fit(X_train)

X_train = data.transform(X_train)
X_test = data.transform(X_test)

X_train.isna().sum().sum(), X_test.isna().sum().sum()

(0, 0)

# Нормализация

In [28]:
NUMERIC_FEATURE_NAMES  = X_train.select_dtypes(include={'float', 'int'}).columns.to_list()

In [29]:
NUMERIC_FEATURE_NAMES

['Annual Income',
 'Years in current job',
 'Tax Liens',
 'Number of Open Accounts',
 'Years of Credit History',
 'Maximum Open Credit',
 'Number of Credit Problems',
 'Months since last delinquent',
 'Bankruptcies',
 'Current Loan Amount',
 'Current Credit Balance',
 'Monthly Debt',
 'Credit Score']

In [30]:
scaler = StandardScaler()

def scal(df):
    df_norm = df.copy()
    df_norm[NUMERIC_FEATURE_NAMES] = scaler.fit_transform(df_norm[NUMERIC_FEATURE_NAMES])

    df = df_norm.copy()
    
    return df

In [31]:
scal(X_train)

Unnamed: 0,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Current Loan Amount,...,Monthly Debt,Credit Score,Term_Long Term,Term_Short Term,Home Ownership_Home Mortgage,Home Ownership_Own Home,Home Ownership_Rent,Purpose_debt consolidation,Purpose_home improvements,Purpose_other
0,-0.511820,1.078440,-0.115625,-0.848066,0.264145,-0.816401,-0.345647,-1.526691,-0.341457,-0.920352,...,0.280956,-0.502142,0,1,1,0,0,1,0,0
1,0.209972,-0.020336,-0.115625,0.778318,1.175005,2.241261,-0.345647,-0.284741,-0.341457,-1.300497,...,-0.487293,0.625379,0,1,1,0,0,0,0,1
2,-0.440520,1.078440,-0.115625,0.575020,-0.270805,-0.269035,-0.345647,-0.284741,-0.341457,0.934650,...,0.248507,-3.624508,1,0,1,0,0,1,0,0
3,0.523854,0.254358,-0.115625,0.778318,-0.314180,-0.257504,-0.345647,1.284038,-0.341457,1.309226,...,0.730235,-1.629663,1,0,1,0,0,1,0,0
4,0.098101,1.078440,-0.115625,0.575020,0.914759,-0.420483,-0.345647,-0.284741,-0.341457,-1.545680,...,0.846950,0.625379,0,1,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5020,-0.459284,0.529052,-0.115625,-0.034874,-0.039476,-0.716708,5.685171,1.153306,8.472684,-1.172271,...,-0.662706,0.235084,0,1,0,0,1,1,0,0
5021,6.587218,1.078440,-0.115625,-0.238172,0.567765,0.489673,-0.345647,-1.069130,-0.341457,1.503119,...,1.303704,-2.106691,1,0,1,0,0,0,1,0
5022,0.653390,1.078440,-0.115625,-0.238172,0.177396,-0.029270,-0.345647,-0.284741,-0.341457,1.440949,...,-0.702716,-0.415409,1,0,1,0,0,0,1,0
5023,1.354448,0.529052,-0.115625,0.778318,-0.661174,0.367689,-0.345647,-0.284741,-0.341457,0.044969,...,2.158598,-1.109269,1,0,1,0,0,1,0,0


In [32]:
scal(X_test)

Unnamed: 0,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Current Loan Amount,...,Monthly Debt,Credit Score,Term_Long Term,Term_Short Term,Home Ownership_Home Mortgage,Home Ownership_Own Home,Home Ownership_Rent,Purpose_debt consolidation,Purpose_home improvements,Purpose_other
0,-0.025476,-1.127493,-0.101015,-0.419205,0.248222,-0.292867,-0.331586,0.499164,-0.329073,-0.535375,...,0.650256,-2.427375,0,1,1,0,0,0,0,1
1,-0.360255,-0.852806,-0.101015,-0.828487,-0.159104,-0.113399,-0.331586,-0.090415,-0.329073,0.198659,...,-0.052368,-3.195238,1,0,0,0,1,1,0,0
2,-0.211176,-0.303432,-0.101015,-0.828487,-0.551882,-0.013691,-0.331586,2.660954,-0.329073,-0.235707,...,-0.231991,0.373063,0,1,0,0,1,1,0,0
3,-0.867662,1.070003,-0.101015,-0.214563,1.266535,-0.310814,1.665191,0.171620,2.450640,-0.373307,...,-0.247859,-0.575472,1,0,1,0,0,1,0,0
4,0.380374,-0.028745,-0.101015,-0.214563,-1.672027,0.105025,-0.331586,-0.090415,-0.329073,1.293153,...,1.146837,-0.891651,1,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2470,1.794526,-0.852806,-0.101015,-0.828487,-0.711903,-0.069141,-0.331586,-1.597117,-0.329073,-0.029623,...,-0.331700,0.915084,0,1,1,0,0,1,0,0
2471,-0.390222,0.795316,-0.101015,-0.214563,0.131843,-0.034544,-0.331586,-0.155924,-0.329073,0.025493,...,-0.058584,0.824747,0,1,1,0,0,1,0,0
2472,-0.343997,0.245942,-0.101015,0.194719,0.030012,0.434703,-0.331586,-0.483468,-0.329073,-0.001371,...,0.005952,-0.891651,1,0,0,0,1,1,0,0
2473,0.218695,1.070003,-0.101015,-0.828487,0.131843,0.230338,-0.331586,-0.286942,-0.329073,1.825643,...,-1.185319,1.050589,0,1,1,0,0,0,1,0


# Балансировка

In [36]:
import xgboost as xgb, lightgbm as lgbm, catboost as catb

In [37]:
def get_classification_report(y_train_true, y_train_pred, y_test_true, y_test_pred):
    print('TRAIN\n\n' + classification_report(y_train_true, y_train_pred))
    print('TEST\n\n' + classification_report(y_test_true, y_test_pred))
    print('CONFUSION MATRIX\n')
    print(pd.crosstab(y_test_true, y_test_pred))

In [38]:
model_catb = catb.CatBoostClassifier(silent=True, random_state=21)
model_catb.fit(X_train, y_train)

y_train_pred = model_catb.predict(X_train)
y_test_pred = model_catb.predict(X_test)

get_classification_report(y_train, y_train_pred, y_test, y_test_pred)

TRAIN

              precision    recall  f1-score   support

           0       0.86      0.99      0.92      3629
           1       0.94      0.58      0.72      1396

    accuracy                           0.87      5025
   macro avg       0.90      0.78      0.82      5025
weighted avg       0.88      0.87      0.86      5025

TEST

              precision    recall  f1-score   support

           0       0.75      0.94      0.84      1758
           1       0.62      0.25      0.36       717

    accuracy                           0.74      2475
   macro avg       0.69      0.59      0.60      2475
weighted avg       0.72      0.74      0.70      2475

CONFUSION MATRIX

col_0              0    1
Credit Default           
0               1647  111
1                536  181


In [39]:
model_xgb = xgb.XGBClassifier(random_state=21)
model_xgb.fit(X_train, y_train)

y_train_pred = model_xgb.predict(X_train)
y_test_pred = model_xgb.predict(X_test)

get_classification_report(y_train, y_train_pred, y_test, y_test_pred)

TRAIN

              precision    recall  f1-score   support

           0       0.97      1.00      0.98      3629
           1       0.99      0.92      0.96      1396

    accuracy                           0.98      5025
   macro avg       0.98      0.96      0.97      5025
weighted avg       0.98      0.98      0.98      5025

TEST

              precision    recall  f1-score   support

           0       0.76      0.90      0.83      1758
           1       0.56      0.32      0.41       717

    accuracy                           0.73      2475
   macro avg       0.66      0.61      0.62      2475
weighted avg       0.71      0.73      0.70      2475

CONFUSION MATRIX

col_0              0    1
Credit Default           
0               1582  176
1                490  227


In [40]:
model_lgbm = lgbm.LGBMClassifier(random_state=21)
model_lgbm.fit(X_train, y_train)

y_train_pred = model_lgbm.predict(X_train)
y_test_pred = model_lgbm.predict(X_test)

get_classification_report(y_train, y_train_pred, y_test, y_test_pred)

TRAIN

              precision    recall  f1-score   support

           0       0.89      0.99      0.94      3629
           1       0.95      0.68      0.80      1396

    accuracy                           0.90      5025
   macro avg       0.92      0.84      0.87      5025
weighted avg       0.91      0.90      0.90      5025

TEST

              precision    recall  f1-score   support

           0       0.76      0.91      0.83      1758
           1       0.57      0.29      0.38       717

    accuracy                           0.73      2475
   macro avg       0.67      0.60      0.61      2475
weighted avg       0.71      0.73      0.70      2475

CONFUSION MATRIX

col_0              0    1
Credit Default           
0               1605  153
1                511  206


In [1232]:
feature_importances = pd.DataFrame(zip(X_train.columns, 
                                       clf.feature_importances_), 
                                   columns=['feature_name', 'importance'])

feature_importances.sort_values(by='importance', ascending=False, inplace=True)

In [1233]:
feature_importances

Unnamed: 0,feature_name,importance
9,Current Loan Amount,0.223414
5,Maximum Open Credit,0.158764
11,Monthly Debt,0.096732
3,Number of Open Accounts,0.090353
10,Current Credit Balance,0.084567
6,Number of Credit Problems,0.072555
0,Annual Income,0.071868
12,Credit Score,0.06723
7,Months since last delinquent,0.037237
4,Years of Credit History,0.036636
