In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import re

from matplotlib import pyplot as plt, rcParams

%matplotlib inline
%config InlineBackend.figure_format = 'svg'
rcParams['figure.autolayout'] = False

pd.options.display.max_columns = 100

### Data preparation

In [2]:
DATA_DIR = 'course_project_data/'
TRAIN_DATA_FNAME = 'course_project_train.csv'
TEST_DATA_FNAME = 'course_project_test.csv'

TARGET_NAME = 'Credit Default'

In [3]:
def check_data(df):
    print(df.info())
    print()
    for col_name in df.select_dtypes(include='object').columns:
        print(str(col_name)+'   \n'+str(df[col_name].value_counts()))
        print()
    print('Check NULLs')
    print(len(df)-df.count())

In [4]:
df_train = pd.read_csv(f'{DATA_DIR}{TRAIN_DATA_FNAME}')
df_test = pd.read_csv(f'{DATA_DIR}{TEST_DATA_FNAME}')

In [5]:
check_data(df_train)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 17 columns):
Home Ownership                  7500 non-null object
Annual Income                   5943 non-null float64
Years in current job            7129 non-null object
Tax Liens                       7500 non-null float64
Number of Open Accounts         7500 non-null float64
Years of Credit History         7500 non-null float64
Maximum Open Credit             7500 non-null float64
Number of Credit Problems       7500 non-null float64
Months since last delinquent    3419 non-null float64
Bankruptcies                    7486 non-null float64
Purpose                         7500 non-null object
Term                            7500 non-null object
Current Loan Amount             7500 non-null float64
Current Credit Balance          7500 non-null float64
Monthly Debt                    7500 non-null float64
Credit Score                    5943 non-null float64
Credit Default                  7

In [6]:
def prepare_years(df):
    df.loc[df['Years in current job'] == '< 1 year', 'Years in current job'] = 0
    df['Years in current job'] = list(map(float, df['Years in current job'].str.replace(r'\D', '')))
    
    # df['Years in current job'].value_counts()/len(df['Years in current job'])
    
    df.loc[df['Years in current job'].isna(), 'flag_empty_job_years'] = 1
    df.loc[~df['Years in current job'].isna(), 'flag_empty_job_years'] = 0
    return df

In [7]:
def gen_flags_fill_nas(df):
    
    df.loc[df['Months since last delinquent'].isna(), 'flag_empty_delinquent'] = 1
    df.loc[~df['Months since last delinquent'].isna(), 'flag_empty_delinquent'] = 0
    
    df.loc[df['Current Loan Amount'] < 1e+7, 'flag_max_loan'] = 0
    df.loc[~(df['Current Loan Amount'] < 1e+7), 'flag_max_loan'] = 1
    
    df['Months since last delinquent'] = df['Months since last delinquent'].fillna(118 + 12)  # max + year
    
    df.loc[df['Number of Credit Problems']==0, 'flag_no_problems'] = 1
    df.loc[df['Number of Credit Problems']>0, 'flag_no_problems'] = 0
    
    df['Bankruptcies'] = df['Bankruptcies'].fillna(0)
    df.loc[df['Bankruptcies']==0, 'flag_no_bancruptcies'] = 1
    df.loc[df['Bankruptcies']>0, 'flag_no_bancruptcies'] = 0
    
    df['flag_long_term']=df['Term'].map({'Long Term':'1','Short Term':'0'}).astype(int)
    
    df.loc[df['Annual Income'].isna(), 'flag_empty_income'] = 1
    df.loc[~(df['Annual Income'].isna()), 'flag_empty_income'] = 0
    
    
    # использование на выбор: заполнение медианным значением или нулями
    df['annual_income_zerona'] = df['Annual Income'].fillna(0)
    df['annual_income_medna'] = df['Annual Income'].fillna(df['Annual Income'].median())
    
    df['Credit Score medna'] = df['Credit Score'].fillna(df['Credit Score'].median())
    df['Credit Score zerona'] = df['Credit Score'].fillna(0)
    
    df['Years in current job'] = df['Years in current job'].fillna(0)
    
    features = pd.DataFrame(index=df.index)
    
    old_features_names = ['Years in current job', 'Tax Liens',
       'Number of Open Accounts', 'Years of Credit History',
       'Maximum Open Credit', 'Number of Credit Problems',
       'Months since last delinquent', 'Bankruptcies',
       'Current Loan Amount', 'Current Credit Balance', 'Monthly Debt',
        'annual_income_zerona', 'annual_income_medna', 'Credit Score medna',
       'Credit Score zerona']
    
    features = df[old_features_names]
    features.columns = [x.lower().replace(' ', '_') for x in features.columns]
    
    features['monthly_debt_annual_income_medna'] = features['monthly_debt']/features['annual_income_medna']
    
    return df, features

In [8]:
def get_col_dummies(df, colname, drop_val=None, drop_first=False):
    if drop_val:
        dummies_df = pd.get_dummies(df[colname])
        dummies_df.drop(drop_val, axis=1)
    else:
        dummies_df = pd.get_dummies(df[colname], drop_first=drop_first)
        
    dummies_df.columns = [f"{colname.lower().replace(' ', '_')}_{x.lower().replace(' ', '_')}" for x in dummies_df.columns]
    return dummies_df

def add_col_dummies(from_df, colname, to_df=None, drop_val=None, drop_first=False):
    if type(to_df)!=pd.core.frame.DataFrame:
        to_df = from_df
    dummies = get_col_dummies(from_df, colname, drop_val, drop_first)
    to_df = pd.concat([to_df, dummies], axis=1)
    return to_df

In [9]:
def gen_dummies(features, df):
    features = add_col_dummies(df, 'Purpose', drop_val='other', to_df=features)
    features = add_col_dummies(df, 'Home Ownership', drop_val='Have Mortgage', to_df=features)
    features = pd.concat([features, df[df.columns[[bool(x.find('flag') + 1) for x in df.columns]]]], axis=1)
    return features

In [10]:
def features_prep(df, set_name):
    df = prepare_years(df)
    df, features = gen_flags_fill_nas(df)
    features = gen_dummies(features, df)
    features.to_pickle(f'features_{set_name}.pkl')
    if set_name=='train':
        df[TARGET_NAME].to_pickle(f'target_{set_name}.pkl')

In [11]:
def corr_matrix(features, df):
    plt.figure(figsize = (15,10))

    sns.set(font_scale=0.5)
    sns.heatmap(pd.concat([features, df[TARGET_NAME]], axis=1).corr().round(3), annot=True, linewidths=.5, cmap='GnBu')

    plt.title('Correlation matrix')
    plt.show()

In [12]:
features_prep(df_train, 'train')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [13]:
features_prep(df_test, 'test')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### Model generation

In [14]:
import pandas as pd
import numpy as np
import pickle
import random

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, ShuffleSplit, cross_val_score, learning_curve
from sklearn.model_selection import KFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb, lightgbm as lgbm, catboost as catb

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [15]:
import warnings
warnings.simplefilter('ignore')

In [16]:
def get_classification_report(y_train_true, y_train_pred, y_test_true, y_test_pred):
    print('TRAIN\n\n' + classification_report(y_train_true, y_train_pred))
    print('TEST\n\n' + classification_report(y_test_true, y_test_pred))
    print('CONFUSION MATRIX\n')
    print(pd.crosstab(y_test_true, y_test_pred))

In [17]:
def balance_df_by_target(df, target_name):

    target_counts = df[target_name].value_counts()

    major_class_name = target_counts.argmax()
    minor_class_name = target_counts.argmin()

    disbalance_coeff = int(target_counts[major_class_name] / target_counts[minor_class_name]) - 1

    for i in range(disbalance_coeff):
        sample = df[df[target_name] == minor_class_name].sample(target_counts[minor_class_name])
        df = df.append(sample, ignore_index=True)

    return df.sample(frac=1)

In [18]:
def new_balansing(df, target_name, k=1):

    target_counts = df[target_name].value_counts()

    major_class_name = target_counts.argmax()
    minor_class_name = target_counts.argmin()

    disbalance_coeff = int(target_counts[major_class_name] / target_counts[minor_class_name])


    major_sample = df[df[target_name] == major_class_name].sample(int(target_counts[major_class_name]/disbalance_coeff * k))
    minor_sample = df[df[target_name] == minor_class_name]

    new_df = major_sample
    new_df = new_df.append(minor_sample)
    return new_df.sample(frac=1)

In [19]:
# input
# DATASET_PATH = '../training_project_data.csv'
FEATURES_PATH = 'features_train.pkl'
TARGET_PATH = 'target_train.pkl'

# output
TRAIN_FULL_PATH = '../training_project_train_full.csv'
TRAIN_PART_PATH = '../training_project_train_part_b.csv'
TEST_PART_PATH = '../training_project_test_part.csv'

SCALER_FILE_PATH = 'scaler.pkl'

TARGET_NAME = 'Credit Default'

In [20]:
features_df = pd.read_pickle(FEATURES_PATH)
features_df.head()

target = pd.read_pickle(TARGET_PATH)

In [21]:
features_df.columns

Index(['years_in_current_job', 'tax_liens', 'number_of_open_accounts',
       'years_of_credit_history', 'maximum_open_credit',
       'number_of_credit_problems', 'months_since_last_delinquent',
       'bankruptcies', 'current_loan_amount', 'current_credit_balance',
       'monthly_debt', 'annual_income_zerona', 'annual_income_medna',
       'credit_score_medna', 'credit_score_zerona',
       'monthly_debt_annual_income_medna', 'purpose_business_loan',
       'purpose_buy_a_car', 'purpose_buy_house', 'purpose_debt_consolidation',
       'purpose_educational_expenses', 'purpose_home_improvements',
       'purpose_major_purchase', 'purpose_medical_bills', 'purpose_moving',
       'purpose_other', 'purpose_renewable_energy', 'purpose_small_business',
       'purpose_take_a_trip', 'purpose_vacation', 'purpose_wedding',
       'home_ownership_have_mortgage', 'home_ownership_home_mortgage',
       'home_ownership_own_home', 'home_ownership_rent',
       'flag_empty_job_years', 'flag_empty_d

In [22]:
# не забыть "ВЫБИРАТЬ" 'annual_income_zerona', 'annual_income_medna'

NUM_FEATURE_NAMES = ['years_in_current_job', 'tax_liens', 'number_of_open_accounts',
       'years_of_credit_history', 'maximum_open_credit',
       'number_of_credit_problems', 'months_since_last_delinquent',
       'bankruptcies', 'current_loan_amount', 'current_credit_balance',
       'monthly_debt', 'credit_score_medna', 'credit_score_zerona', 'annual_income_zerona',
       'annual_income_medna', 'monthly_debt_annual_income_medna']

# CAT_FEATURE_NAMES = [] # надо заполнить и добавть в prep категориальные данные без one-hot!!!

FLAG_FEATURE_NAME = ['flag_empty_job_years', 'flag_empty_delinquent', 'flag_max_loan',
       'flag_no_problems', 'flag_no_bancruptcies', 'flag_long_term',
       'flag_empty_income']

ONE_HOT_FEATURE_NAME = ['annual_income_medna', 'purpose_business_loan', 'purpose_buy_a_car',
       'purpose_buy_house', 'purpose_debt_consolidation',
       'purpose_educational_expenses', 'purpose_home_improvements',
       'purpose_major_purchase', 'purpose_medical_bills', 'purpose_moving',
       'purpose_other', 'purpose_renewable_energy', 'purpose_small_business',
       'purpose_take_a_trip', 'purpose_vacation', 'purpose_wedding',
       'home_ownership_have_mortgage', 'home_ownership_home_mortgage',
       'home_ownership_own_home', 'home_ownership_rent']

In [23]:
def select_x_y(features_df, target):
    X = features_df.drop('annual_income_zerona', axis=1)
    X = features_df.drop('credit_score_medna', axis=1)
    # X = features_df.drop(ONE_HOT_FEATURE_NAME, axis=1)
    y = target
    return X, y

In [24]:
X, y = select_x_y(features_df, target)

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.3, random_state=21)

In [26]:
df_for_balancing = pd.concat([X_train, y_train], axis=1)
df_for_balancing[TARGET_NAME].value_counts()

0    3771
1    1479
Name: Credit Default, dtype: int64

In [27]:
df_for_balancing = pd.concat([X_train, y_train], axis=1)
df_balanced = new_balansing(df_for_balancing, TARGET_NAME, k=2)
    
df_balanced[TARGET_NAME].value_counts()

0    3771
1    1479
Name: Credit Default, dtype: int64

In [28]:
X_train = df_balanced.drop(columns=TARGET_NAME)
y_train = df_balanced[TARGET_NAME]

In [29]:
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

In [30]:
train.to_csv(TRAIN_PART_PATH, index=False, encoding='utf-8')
test.to_csv(TEST_PART_PATH, index=False, encoding='utf-8')

In [31]:
%%time

final_model = xgb.XGBClassifier(colsample_bytree=1,
                       scale_pos_weight=2,
                       subsample=0.5,
                        random_state=21, reg_lambda=20,
                        nrounds=700,
                        max_depth=4)

final_model.fit(X_train, y_train)

y_train_pred = final_model.predict(X_train)
y_test_pred = final_model.predict(X_test)

get_classification_report(y_train, y_train_pred, y_test, y_test_pred)

TRAIN

              precision    recall  f1-score   support

           0       0.86      0.86      0.86      3771
           1       0.64      0.64      0.64      1479

    accuracy                           0.80      5250
   macro avg       0.75      0.75      0.75      5250
weighted avg       0.80      0.80      0.80      5250

TEST

              precision    recall  f1-score   support

           0       0.82      0.82      0.82      1616
           1       0.54      0.53      0.54       634

    accuracy                           0.74      2250
   macro avg       0.68      0.68      0.68      2250
weighted avg       0.74      0.74      0.74      2250

CONFUSION MATRIX

col_0              0    1
Credit Default           
0               1333  283
1                298  336
CPU times: user 1.3 s, sys: 9.47 ms, total: 1.31 s
Wall time: 1.34 s


In [32]:
final_model = xgb.XGBClassifier(colsample_bytree=1,
                       scale_pos_weight=2,
                       subsample=0.5,
                        random_state=21, reg_lambda=20,
                        nrounds=700,
                        max_depth=4)

final_model.fit(X, y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=4,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nrounds=700, nthread=None, objective='binary:logistic',
              random_state=21, reg_alpha=0, reg_lambda=20, scale_pos_weight=2,
              seed=None, silent=None, subsample=0.5, verbosity=1)

In [33]:
TEST_FEATURES_PATH = 'features_test.pkl'

In [34]:
test_features = pd.read_pickle(TEST_FEATURES_PATH)

In [35]:
check_data(test_features)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 41 columns):
years_in_current_job                2500 non-null float64
tax_liens                           2500 non-null float64
number_of_open_accounts             2500 non-null float64
years_of_credit_history             2500 non-null float64
maximum_open_credit                 2500 non-null float64
number_of_credit_problems           2500 non-null float64
months_since_last_delinquent        2500 non-null float64
bankruptcies                        2500 non-null float64
current_loan_amount                 2500 non-null float64
current_credit_balance              2500 non-null float64
monthly_debt                        2500 non-null float64
annual_income_zerona                2500 non-null float64
annual_income_medna                 2500 non-null float64
credit_score_medna                  2500 non-null float64
credit_score_zerona                 2500 non-null float64
monthly_debt_annual_inc

In [36]:
X = test_features.drop('annual_income_zerona', axis=1)
X = test_features.drop('credit_score_medna', axis=1)


In [39]:
X['purpose_renewable_energy'] = 0

In [44]:
preds = final_model.predict(X[X_train.columns])

In [48]:
preds = pd.DataFrame(preds, columns=['Credit Default'])

In [49]:
preds.to_csv('ANikitina_predictions.csv', index=False)