## Library Imports

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
# Notebook-wide presentation settings: seaborn dark-grid plots and
# two-decimal float rendering in pandas output.
sns.set_style(style='darkgrid')
pd.set_option('display.float_format', '{:.2f}'.format)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from typing import List, Tuple, Optional

from sklearn.pipeline import make_pipeline
import missingno as msno

In [2]:
# Read the three auxiliary tables (previous applications, credit bureau
# records, payments) from the local data directory.
applications_history_data, bki_data, payments_data = map(
    pd.read_csv,
    ('data/applications_history.csv', 'data/bki.csv', 'data/payments.csv'),
)

In [3]:
# Read the train / test frames stored in the "missing_clean" CSV files.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/df_train_missing_clean.csv', 'data/df_test_missing_clean.csv')
)

### Feature engineering

#### Process `applications_history_data` features

In [None]:
# Preview the first five rows of the previous-applications table.
applications_history_data.head(n=5)

In [None]:
def create_numerical_aggs(data: pd.DataFrame,
                          groupby_id: str,
                          aggs: dict,
                          prefix: Optional[str] = None,
                          suffix: Optional[str] = None,
                          ) -> pd.DataFrame:
    """Aggregate columns per group and flatten the result into named features.

    Parameters
    ----------
    data : pd.DataFrame
        Rows to aggregate.
    groupby_id : str
        Name of the column to group by. It is returned as a regular column
        (not as the index) and its name is left untouched.
    aggs : dict
        Mapping ``column -> aggregation(s)``. A value may be a list of
        aggregations or a single aggregation given as a name or a callable.
    prefix : str, optional
        Text placed before every generated feature name (``None`` -> ``""``).
    suffix : str, optional
        Text placed after every generated feature name (``None`` -> ``""``).

    Returns
    -------
    pd.DataFrame
        One row per group: the ``groupby_id`` column followed by one column per
        (column, aggregation) pair, named
        ``f"{prefix}{column}_{aggregation}{suffix}".upper()``.
    """
    prefix = prefix or ""
    suffix = suffix or ""

    # A scalar spec (e.g. {"col": "sum"}) makes ``agg`` return flat column
    # labels that cannot be unpacked into (feature, stat) pairs. Wrapping it
    # in a list guarantees two-level columns for every entry.
    normalized_aggs = {
        column: [funcs] if isinstance(funcs, str) or callable(funcs) else funcs
        for column, funcs in aggs.items()
    }

    stats = data.groupby(groupby_id).agg(normalized_aggs)
    # Collapse the (feature, stat) MultiIndex into flat upper-case names.
    stats.columns = [
        f"{prefix}{feature}_{stat}{suffix}".upper()
        for feature, stat in stats.columns
    ]

    return stats.reset_index()

In [None]:
# Statistics computed per client over previous applications whose
# NAME_CONTRACT_STATUS is "Approved".
aggs = {
    'PREV_APPLICATION_NUMBER': ['count'],
    'AMT_APPLICATION': ['sum'],
    'AMOUNT_CREDIT': ['mean', 'sum'],
    'AMOUNT_ANNUITY': ['mean', 'sum'],
    'AMOUNT_PAYMENT': ['sum'],
    'AMOUNT_GOODS_PAYMENT': ['sum'],
    "CNT_PAYMENT": ["mean"],
    "DAYS_DECISION": ["mean", "min", "max"],

}

mask = applications_history_data["NAME_CONTRACT_STATUS"] == "Approved"
stats_approved = create_numerical_aggs(
    applications_history_data[mask], groupby_id="APPLICATION_NUMBER", aggs=aggs, prefix="PREV_APP_", suffix="_APPROVED_APPS"
)

# The indicator column gets a status-specific name: pandas refuses to add an
# indicator whose name already exists in the frame, so a generic name would
# break any later merge that also asks for an indicator.
# The test set receives the same features so that the saved train / test files
# share one schema (assumes df_test has APPLICATION_NUMBER like df_train —
# TODO confirm).
df_train = pd.merge(df_train, stats_approved, how='left', on='APPLICATION_NUMBER', indicator="_merge_PREV_APPS_APPROVED")
df_test = pd.merge(df_test, stats_approved, how='left', on='APPLICATION_NUMBER', indicator="_merge_PREV_APPS_APPROVED")
df_train.head()


In [None]:
# Statistics computed per client over previous applications whose
# NAME_CONTRACT_STATUS is "Refused".
aggs = {
    'PREV_APPLICATION_NUMBER': ['count'],
    'AMT_APPLICATION': ["mean", "min", "max"],
    "DAYS_DECISION": ["mean"],

}

mask = applications_history_data["NAME_CONTRACT_STATUS"] == "Refused"
stats_refused = create_numerical_aggs(
    applications_history_data[mask], groupby_id="APPLICATION_NUMBER", aggs=aggs, prefix="PREV_APP_", suffix="_REFUSED_APPS"
)

# The indicator column gets a status-specific name: pandas raises ValueError
# when the requested indicator name already exists in the frame, which happens
# if an earlier merge into df_train used a generic indicator name.
# The test set receives the same features so that the saved train / test files
# share one schema (assumes df_test has APPLICATION_NUMBER like df_train —
# TODO confirm).
df_train = pd.merge(df_train, stats_refused, how='left', on='APPLICATION_NUMBER', indicator="_merge_PREV_APPS_REFUSED")
df_test = pd.merge(df_test, stats_refused, how='left', on='APPLICATION_NUMBER', indicator="_merge_PREV_APPS_REFUSED")
df_train.head()

In [None]:
# Persist the train / test frames. `index` is a boolean flag, so pass False
# explicitly (instead of relying on None being falsy) to keep the row index
# out of the CSV files.
df_train.to_csv('data/df_train_new_features.csv', index=False)
df_test.to_csv('data/df_test_new_features.csv', index=False)