In [1]:
import warnings
from functools import reduce

import numpy as np
import pandas as pd
from scipy import stats
from sklearn import metrics
from sklearn import metrics, svm
from sklearn import preprocessing
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler

%matplotlib inline
warnings.filterwarnings("ignore")

In [3]:
# Load the application table and the five auxiliary tables from CSV files under
# the relative directory 'home-credit-default-risk/'. The files must exist at
# that path relative to the kernel's working directory.
app_train = pd.read_csv('home-credit-default-risk/application_train.csv')
pos_cash = pd.read_csv('home-credit-default-risk/POS_CASH_balance.csv')
bureau = pd.read_csv('home-credit-default-risk/bureau.csv')
cc_balance = pd.read_csv('home-credit-default-risk/credit_card_balance.csv')
installments_payments = pd.read_csv('home-credit-default-risk/installments_payments.csv')
previous_app = pd.read_csv('home-credit-default-risk/previous_application.csv')

# Tag every column of each auxiliary table with a table-specific suffix so that
# column names cannot collide once the tables are merged.
previous_app = previous_app.add_suffix('_PREVIOUS')
pos_cash = pos_cash.add_suffix('_POS_CASH')
bureau = bureau.add_suffix('_BUREAU')
cc_balance = cc_balance.add_suffix('_CC')
installments_payments = installments_payments.add_suffix('_INSTALLMENT_PAYMENTS')

# The suffix was also applied to the join key; restore its plain name so all
# tables share the same 'SK_ID_CURR' column.
pos_cash = pos_cash.rename(columns={'SK_ID_CURR_POS_CASH':'SK_ID_CURR'})
bureau = bureau.rename(columns={'SK_ID_CURR_BUREAU':'SK_ID_CURR'})
cc_balance = cc_balance.rename(columns={'SK_ID_CURR_CC':'SK_ID_CURR'})
installments_payments = installments_payments.rename(
    columns={'SK_ID_CURR_INSTALLMENT_PAYMENTS':'SK_ID_CURR'}
)
previous_app = previous_app.rename(columns={'SK_ID_CURR_PREVIOUS':'SK_ID_CURR'})

# Keep only the freshest data from each auxiliary table.
# pos_cash / cc_balance: keep the rows whose month offset equals -1.
# NOTE(review): these two filters do not de-duplicate on SK_ID_CURR. If a table
# holds several rows per applicant for that month, the merge below will repeat
# that applicant's row -- verify against the data.
# bureau / installments_payments / previous_app: sort ascending by a day-offset
# column and keep the last row per SK_ID_CURR.
# NOTE(review): sort_values puts missing sort keys last by default, so
# keep='last' would prefer rows whose sort key is NaN -- confirm this is intended.
pos_cash = pos_cash[pos_cash.MONTHS_BALANCE_POS_CASH == -1]
bureau = bureau.sort_values('DAYS_CREDIT_UPDATE_BUREAU') \
    .drop_duplicates('SK_ID_CURR',keep='last')
cc_balance = cc_balance[cc_balance.MONTHS_BALANCE_CC == -1]
installments_payments = installments_payments.sort_values(
    'DAYS_INSTALMENT_INSTALLMENT_PAYMENTS').drop_duplicates('SK_ID_CURR',keep='last'
)
previous_app = previous_app.sort_values('DAYS_FIRST_DRAWING_PREVIOUS') \
    .drop_duplicates('SK_ID_CURR',keep='last')

# Fold all tables into one wide frame with successive outer joins on SK_ID_CURR.
dfs = [
    app_train, pos_cash, bureau,
    cc_balance, installments_payments, previous_app
]
full_application = reduce(lambda  left,right: pd.merge(left,right,on=['SK_ID_CURR'],
                                            how='outer'), dfs)

# TARGET comes from app_train only, so this removes rows that the outer joins
# introduced for ids present solely in auxiliary tables (plus any unlabeled rows).
full_application = full_application.dropna(subset='TARGET')

# Cap the working set at the first 50,000 rows of the merged frame.
full_application = full_application[:50000]

# Previous-application rows (one per SK_ID_CURR after the de-duplication above)
# joined with the current application's TARGET.
# train_set is not referenced again in the cells visible here.
current_app_results = app_train[['SK_ID_CURR','TARGET']]
train_set = pd.merge(previous_app, current_app_results, on='SK_ID_CURR')

FileNotFoundError: [Errno 2] No such file or directory: 'home-credit-default-risk/application_train.csv'

In [None]:
def handle_null(dataset: pd.DataFrame, null_threshold: float = 0.5) -> pd.DataFrame:
    """Drop sparsely populated columns and impute the remaining missing values.

    Columns whose share of missing values exceeds ``null_threshold`` are
    removed. In the columns that remain, missing values are replaced by the
    column mean for numeric (numpy integer/float) columns and by the literal
    string ``'Unknown'`` for object/string columns. Columns of any other dtype
    are left untouched.

    The previous implementation compared the null share against
    ``len(dataset) / len(column_name)`` and called ``DataFrame.drop`` with an
    unsupported ``subset`` keyword, so sparse columns were never actually
    dropped (or the call raised ``TypeError`` on very small frames).

    Args:
        dataset: Frame to clean. It is not modified.
        null_threshold: Maximum tolerated fraction of missing values per
            column, between 0 and 1. Columns strictly above it are dropped.

    Returns:
        A new DataFrame with sparse columns removed and missing values filled.
    """
    # Fraction of missing values per column. For a frame without rows this is
    # NaN, which never compares greater than the threshold, so nothing is dropped.
    null_share = dataset.isna().mean()
    keep_mask = ~(null_share > null_threshold).to_numpy()

    # Explicit copy so the caller's frame is never mutated by the fills below.
    cleaned = dataset.loc[:, keep_mask].copy()

    for column in cleaned.columns:
        values = cleaned[column]
        if not values.isna().any():
            continue

        dtype = values.dtype
        if isinstance(dtype, np.dtype) and dtype.kind in 'iuf':
            # Any numpy numeric width (not only float64/int64) gets the mean.
            cleaned[column] = values.fillna(values.mean())
        elif pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
            cleaned[column] = values.fillna('Unknown')

    return cleaned


def handle_outliers(dataset: pd.DataFrame, y: str) -> pd.DataFrame:
    """Append a 0/1 outlier indicator column for every numeric feature.

    For each numeric (non-boolean) column other than ``y`` a new column named
    ``'<column>_outlier'`` is produced, holding 1 where the value's absolute
    z-score exceeds 3 and 0 elsewhere. The original values are left unchanged.

    All indicator columns are built first and attached in a single concat,
    which avoids fragmenting the frame with one insertion per column.

    Args:
        dataset: Frame to analyse. It is not modified.
        y: Name of the label column, which is excluded from outlier flagging.

    Returns:
        A new DataFrame made of the input columns followed by the indicator
        columns. An existing column with the same name as an indicator is
        replaced by the fresh indicator.
    """
    indicators = {}
    for column in dataset.columns:
        if column == y:
            continue
        dtype = dataset[column].dtype
        is_numeric = pd.api.types.is_numeric_dtype(dtype)
        if is_numeric and not pd.api.types.is_bool_dtype(dtype):
            flags = _outlier_detection(dataset[column])
            indicators[f'{column}_outlier'] = flags.to_numpy().astype(int)

    if not indicators:
        return dataset.copy()

    indicator_frame = pd.DataFrame(indicators, index=dataset.index)
    # Drop stale indicator columns so the concat cannot create duplicate names.
    stale = [name for name in indicators if name in dataset.columns]
    return pd.concat([dataset.drop(columns=stale), indicator_frame], axis=1)


def _outlier_detection(column: pd.Series, threshold: float = 3.0) -> pd.Series:
    """Flag the values of ``column`` whose absolute z-score exceeds ``threshold``.

    Missing values are ignored when computing the mean and standard deviation
    and are never flagged themselves. A constant column yields no outliers.

    Args:
        column: Numeric series to inspect.
        threshold: Absolute z-score above which a value counts as an outlier.

    Returns:
        Boolean Series sharing ``column``'s index; True marks an outlier.
    """
    values = column.to_numpy(dtype=float, na_value=np.nan)
    if values.size == 0:
        return pd.Series([], index=column.index, dtype=bool)

    # nan_policy='omit' keeps a single NaN from turning every z-score into NaN.
    z_scores = np.abs(stats.zscore(values, nan_policy='omit'))
    # NaN z-scores compare False, so they are reported as "not an outlier".
    return pd.Series(z_scores > threshold, index=column.index)


def create_encoding(
        dataset: pd.DataFrame,
        encoder = None
) -> pd.DataFrame:
    """Replace every categorical column with integer codes.

    Object, string and boolean columns are passed one at a time through
    ``encoder.fit_transform`` and the result replaces the column. All other
    columns are returned unchanged.

    The encoder default is now ``None`` instead of a ``LabelEncoder`` instance
    created at function-definition time, so separate calls no longer share
    (and re-fit) one module-lifetime object.

    Args:
        dataset: Frame to encode. It is not modified.
        encoder: Object exposing ``fit_transform(values)``. When omitted, a
            fresh ``preprocessing.LabelEncoder`` is created for this call. The
            encoder is re-fitted for each column, so after the call it only
            holds the mapping of the last encoded column.

    Returns:
        A new DataFrame with categorical columns replaced by their codes.
    """
    if encoder is None:
        # NOTE(review): integer codes imply an ordering between categories;
        # confirm that is acceptable for the downstream models.
        encoder = preprocessing.LabelEncoder()

    encoded = dataset.copy()
    categorical_columns = [
        column for column in encoded.columns
        if pd.api.types.is_object_dtype(encoded[column].dtype)
        or pd.api.types.is_string_dtype(encoded[column].dtype)
        or pd.api.types.is_bool_dtype(encoded[column].dtype)
    ]
    for column in categorical_columns:
        encoded[column] = encoder.fit_transform(encoded[column])

    return encoded


def full_cleanse(dataset: pd.DataFrame, y: str):
    """Run the complete cleansing pipeline on ``dataset``.

    The stages run in a fixed order: missing-value handling, outlier
    flagging (skipping the label column ``y``), then categorical encoding.

    Args:
        dataset: Frame to cleanse.
        y: Name of the label column, forwarded to ``handle_outliers``.

    Returns:
        The frame produced by the final encoding stage.
    """
    without_nulls = handle_null(dataset)
    with_outlier_flags = handle_outliers(without_nulls, y=y)
    return create_encoding(with_outlier_flags)

# Cleanse the merged application table; 'TARGET' is the label column and is
# therefore excluded from outlier flagging.
# NOTE: this rebinds full_application to the cleansed result, so re-running the
# cell feeds the already-cleansed frame back through the pipeline (the integer
# *_outlier columns would likely be flagged again as *_outlier_outlier).
# Re-run the loading cell first to start from the raw merge.
full_application = full_cleanse(full_application, y='TARGET')

### GBT

In [None]:
# Separate the label from the feature matrix.
# NOTE(review): identifier columns such as SK_ID_CURR are still part of X;
# confirm whether they should be used as features.
y = full_application.TARGET
X = full_application.drop(columns=['TARGET'])

# Split BEFORE scaling so that no statistic of the test rows leaks into the
# training features. stratify=y keeps the class ratio equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1, stratify=y
)

# Fit the scaler on the training rows only, then apply it to both parts.
scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

param_grid = {
    'n_estimators': [1000],
    'learning_rate': [0.01],
    'max_depth': [7],
    'max_features': ['sqrt']
}

# A fixed seed makes the feature sub-sampling (max_features='sqrt') repeatable.
model = GradientBoostingClassifier(random_state=1)

# The search is fitted exactly once: it cross-validates and, with the default
# refit=True, retrains the best configuration on the full training set.
clf = GridSearchCV(model, param_grid, cv=5)
clf.fit(X_train, y_train)

# predict() already returns class labels, so no rounding is needed.
y_hat = clf.predict(X_train)
y_pred = clf.predict(X_test)
print("Train Precision:", metrics.precision_score(y_train, y_hat))
print("Test Precision:", metrics.precision_score(y_test, y_pred))

### SVM

In [None]:
svc = svm.SVC()

# One sub-grid per kernel: gamma has no effect on the linear kernel, so
# searching it there only repeats identical fits (15 candidates instead of 24).
param_grid = [
    {'kernel': ['linear'], 'C': [0.1, 1, 5]},
    {'kernel': ['rbf'], 'C': [0.1, 1, 5], 'gamma': ['scale', 'auto', 0.1, 1]},
]
grid_search = GridSearchCV(
    svc,
    param_grid,
    cv=3
)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_score = grid_search.best_score_
# With the default refit=True the search has already retrained the best
# configuration on all of X_train, so a second manual fit is unnecessary.
best_model = grid_search.best_estimator_

# predict() already returns class labels, so no rounding is needed.
y_hat = best_model.predict(X_train)
y_pred = best_model.predict(X_test)
print("Train Precision:", metrics.precision_score(y_train, y_hat))
print("Test Precision:", metrics.precision_score(y_test, y_pred))