In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)
%matplotlib inline

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, MaxAbsScaler, MinMaxScaler, PowerTransformer, QuantileTransformer, FunctionTransformer, RobustScaler, StandardScaler
from sklearn.experimental import enable_iterative_imputer  # must be imported before IterativeImputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer

from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.ensemble import EasyEnsembleClassifier, RUSBoostClassifier, BalancedBaggingClassifier, BalancedRandomForestClassifier

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier,HistGradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression, Perceptron, RidgeClassifier, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score

import warnings
warnings.simplefilter('ignore')

In [None]:
# Extra tokens that pandas should read as missing values, on top of its defaults.
na_values = ['nan', 'na', '#VALUE!', 'missing']

# Both CSVs are parsed with the same missing-value tokens so that NaN handling
# is consistent between the training frame and the test frame.
train, test = (
    pd.read_csv(csv_path, na_values=na_values)
    for csv_path in ('data/TrainingData.csv', 'data/testX.csv')
)

In [None]:
# Encode the `mvar47` column as integer class codes.
# The encoder is fitted on the training column only and then reused for the
# test column, so both frames share one value -> code mapping. `le.transform`
# raises ValueError if the test column contains a value never seen in training.
# (LabelEncoder is imported in the notebook's import cell.)
le = LabelEncoder()
train['mvar47'] = le.fit_transform(train['mvar47'])
test['mvar47'] = le.transform(test['mvar47'])

In [None]:
# Target: kept as a one-column DataFrame (not a Series) because the
# cross-validation cells index it with two-axis selectors.
y = train[['default_ind']]

# Predictors: every column except the target and `application_key`
# (presumably a row identifier with no predictive meaning - confirm).
X = train.drop(columns=['application_key', 'default_ind'])

In [None]:
# X.dropna(thresh=len(X)*0.20, axis=1, inplace=True) # drop high nullity features; 0.20 for no dropping

In [None]:
# X.drop(['mvar18', 'mvar20'], axis=1, inplace=True) # drop high correlation features

In [None]:
# 5-fold stratified cross-validation of a class-weighted logistic regression.
# Preprocessing is fitted on the training fold only and then applied to the
# held-out fold, so no statistics leak from the validation rows.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
f1_score_list, accuracy_score_list = [], []

classifier = LogisticRegression(
    penalty='l2',
    C=1.0,
    # NOTE(review): with standardized inputs and no intercept, the decision
    # boundary is forced through the origin of the scaled space - confirm intended.
    fit_intercept=False,
    class_weight='balanced',
    random_state=0,
    solver='lbfgs',
    max_iter=100,
    n_jobs=-1
)

for train_index, test_index in skf.split(X, y):
    # skf.split yields positional row numbers, so select with .iloc; .loc is
    # label-based and only matches while X and y carry a default 0..n-1 index.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = np.ravel(y.iloc[train_index]), np.ravel(y.iloc[test_index])

    # Scale first, then impute: StandardScaler ignores NaN when fitting and
    # keeps them in its output, so the median is imputed on the scaled values.
    # Other options imported for experimentation:
    #   scalers  - MaxAbsScaler, MinMaxScaler, RobustScaler
    #   imputers - IterativeImputer, KNNImputer
    scaler = StandardScaler()
    imputer = SimpleImputer(strategy='median')
    # The pipeline returns new arrays instead of writing float results back
    # into the (partly integer-typed) DataFrame slice.
    preprocessor = make_pipeline(scaler, imputer)
    X_train = preprocessor.fit_transform(X_train)
    X_test = preprocessor.transform(X_test)

    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    f1_score_list.append(f1_score(y_test, y_pred))
    accuracy_score_list.append(accuracy_score(y_test, y_pred))
    print('Iteration Done')

# Mean and standard deviation across the folds, as percentages.
print("Accuracy: %.2f%% (%.2f%%)" % (np.mean(accuracy_score_list)*100, np.std(accuracy_score_list)*100))
print("F1 Score: %.2f%% (%.2f%%)" % (np.mean(f1_score_list)*100, np.std(f1_score_list)*100))

In [None]:
# 5-fold stratified cross-validation of Gaussian naive Bayes on log-transformed,
# standardized, iteratively imputed and SMOTEENN-resampled training folds.
# Preprocessing is fitted on the training fold only; the held-out fold is only
# transformed and is never resampled.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
f1_score_list, accuracy_score_list = [], []

classifier = GaussianNB()

for train_index, test_index in skf.split(X, y):
    # skf.split yields positional row numbers, so select with .iloc; .loc is
    # label-based and only matches while X and y carry a default 0..n-1 index.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = np.ravel(y.iloc[train_index]), np.ravel(y.iloc[test_index])

    # NOTE(review): np.log1p yields NaN below -1 and -inf at -1; this presumably
    # relies on every feature being greater than -1 - confirm against the data.
    # Other options imported for experimentation:
    #   transformers - PowerTransformer, QuantileTransformer(output_distribution='normal')
    #   scalers      - MaxAbsScaler, MinMaxScaler, RobustScaler
    #   imputers     - SimpleImputer(strategy='median'), KNNImputer
    transformer = FunctionTransformer(func=np.log1p)
    scaler = StandardScaler()
    imputer = IterativeImputer(n_nearest_features=None, initial_strategy='median', random_state=0)
    # The pipeline returns new arrays instead of writing float results back
    # into the (partly integer-typed) DataFrame slice.
    preprocessor = make_pipeline(transformer, scaler, imputer)
    X_train = preprocessor.fit_transform(X_train)
    X_test = preprocessor.transform(X_test)

    # Rebalance the training fold only (SMOTETomek is the imported alternative).
    over_under_sampler = SMOTEENN(random_state=0, n_jobs=-1)
    X_train, y_train = over_under_sampler.fit_resample(X_train, y_train)

    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    f1_score_list.append(f1_score(y_test, y_pred))
    accuracy_score_list.append(accuracy_score(y_test, y_pred))
    print('Iteration Done')

# Mean and standard deviation across the folds, as percentages.
print("Accuracy: %.2f%% (%.2f%%)" % (np.mean(accuracy_score_list)*100, np.std(accuracy_score_list)*100))
print("F1 Score: %.2f%% (%.2f%%)" % (np.mean(f1_score_list)*100, np.std(f1_score_list)*100))

In [None]:
# 5-fold stratified cross-validation of a class-weighted linear SVM on
# log-transformed, standardized and median-imputed features.
# Preprocessing is fitted on the training fold only and then applied to the
# held-out fold.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
f1_score_list, accuracy_score_list = [], []

classifier = LinearSVC(
    penalty='l2',
    dual=False,
    C=1.0,
    # NOTE(review): with standardized inputs and no intercept, the separating
    # hyperplane is forced through the origin of the scaled space - confirm intended.
    fit_intercept=False,
    class_weight='balanced',
    random_state=0,
    max_iter=1000
)

for train_index, test_index in skf.split(X, y):
    # skf.split yields positional row numbers, so select with .iloc; .loc is
    # label-based and only matches while X and y carry a default 0..n-1 index.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = np.ravel(y.iloc[train_index]), np.ravel(y.iloc[test_index])

    # NOTE(review): np.log1p yields NaN below -1 and -inf at -1; this presumably
    # relies on every feature being greater than -1 - confirm against the data.
    # Other options imported for experimentation:
    #   transformers - PowerTransformer, QuantileTransformer(output_distribution='normal')
    #   scalers      - MaxAbsScaler, MinMaxScaler, RobustScaler
    #   imputers     - IterativeImputer, KNNImputer
    transformer = FunctionTransformer(func=np.log1p)
    scaler = StandardScaler()
    imputer = SimpleImputer(strategy='median')
    # The pipeline returns new arrays instead of writing float results back
    # into the (partly integer-typed) DataFrame slice.
    preprocessor = make_pipeline(transformer, scaler, imputer)
    X_train = preprocessor.fit_transform(X_train)
    X_test = preprocessor.transform(X_test)

    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    f1_score_list.append(f1_score(y_test, y_pred))
    accuracy_score_list.append(accuracy_score(y_test, y_pred))
    print('Iteration Done')

# Mean and standard deviation across the folds, as percentages.
print("Accuracy: %.2f%% (%.2f%%)" % (np.mean(accuracy_score_list)*100, np.std(accuracy_score_list)*100))
print("F1 Score: %.2f%% (%.2f%%)" % (np.mean(f1_score_list)*100, np.std(f1_score_list)*100))

In [None]:
# 5-fold stratified cross-validation of imbalanced-learn's balanced random
# forest on log-transformed, standardized and median-imputed features.
# No external resampler is used in this cell; class balancing is left to the
# classifier itself. Preprocessing is fitted on the training fold only.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
f1_score_list, accuracy_score_list = [], []

classifier = BalancedRandomForestClassifier(
    n_estimators=100,
    replacement=False,
    n_jobs=-1,
    random_state=0,
    class_weight=None
)

for train_index, test_index in skf.split(X, y):
    # skf.split yields positional row numbers, so select with .iloc; .loc is
    # label-based and only matches while X and y carry a default 0..n-1 index.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = np.ravel(y.iloc[train_index]), np.ravel(y.iloc[test_index])

    # NOTE(review): np.log1p yields NaN below -1 and -inf at -1; this presumably
    # relies on every feature being greater than -1 - confirm against the data.
    # Other options imported for experimentation:
    #   transformers - PowerTransformer, QuantileTransformer(output_distribution='normal')
    #   scalers      - MaxAbsScaler, MinMaxScaler, RobustScaler
    #   imputers     - IterativeImputer, KNNImputer
    transformer = FunctionTransformer(func=np.log1p)
    scaler = StandardScaler()
    imputer = SimpleImputer(strategy='median')
    # The pipeline returns new arrays instead of writing float results back
    # into the (partly integer-typed) DataFrame slice.
    preprocessor = make_pipeline(transformer, scaler, imputer)
    X_train = preprocessor.fit_transform(X_train)
    X_test = preprocessor.transform(X_test)

    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    f1_score_list.append(f1_score(y_test, y_pred))
    accuracy_score_list.append(accuracy_score(y_test, y_pred))
    print('Iteration Done')

# Mean and standard deviation across the folds, as percentages.
print("Accuracy: %.2f%% (%.2f%%)" % (np.mean(accuracy_score_list)*100, np.std(accuracy_score_list)*100))
print("F1 Score: %.2f%% (%.2f%%)" % (np.mean(f1_score_list)*100, np.std(f1_score_list)*100))

In [None]:
# 5-fold stratified cross-validation of a plain random forest trained on
# SMOTEENN-resampled folds. Features are log-transformed, standardized and
# median-imputed first. Preprocessing is fitted on the training fold only; the
# held-out fold is only transformed and is never resampled.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
f1_score_list, accuracy_score_list = [], []

classifier = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=0,
    class_weight=None
)

for train_index, test_index in skf.split(X, y):
    # skf.split yields positional row numbers, so select with .iloc; .loc is
    # label-based and only matches while X and y carry a default 0..n-1 index.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = np.ravel(y.iloc[train_index]), np.ravel(y.iloc[test_index])

    # NOTE(review): np.log1p yields NaN below -1 and -inf at -1; this presumably
    # relies on every feature being greater than -1 - confirm against the data.
    # Other options imported for experimentation:
    #   transformers - PowerTransformer, QuantileTransformer(output_distribution='normal')
    #   scalers      - MaxAbsScaler, MinMaxScaler, RobustScaler
    #   imputers     - IterativeImputer, KNNImputer
    transformer = FunctionTransformer(func=np.log1p)
    scaler = StandardScaler()
    imputer = SimpleImputer(strategy='median')
    # The pipeline returns new arrays instead of writing float results back
    # into the (partly integer-typed) DataFrame slice.
    preprocessor = make_pipeline(transformer, scaler, imputer)
    X_train = preprocessor.fit_transform(X_train)
    X_test = preprocessor.transform(X_test)

    # Rebalance the training fold only (SMOTETomek is the imported alternative).
    over_under_sampler = SMOTEENN(random_state=0, n_jobs=-1)
    X_train, y_train = over_under_sampler.fit_resample(X_train, y_train)

    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    f1_score_list.append(f1_score(y_test, y_pred))
    accuracy_score_list.append(accuracy_score(y_test, y_pred))
    print('Iteration Done')

# Mean and standard deviation across the folds, as percentages.
print("Accuracy: %.2f%% (%.2f%%)" % (np.mean(accuracy_score_list)*100, np.std(accuracy_score_list)*100))
print("F1 Score: %.2f%% (%.2f%%)" % (np.mean(f1_score_list)*100, np.std(f1_score_list)*100))

In [None]:
# 5-fold stratified cross-validation of XGBoost on the raw features.
# No transform, scaling, imputation or resampling is applied in this cell: the
# features, including any NaN, are passed to the model unchanged.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
f1_score_list, accuracy_score_list = [], []

classifier = XGBClassifier(
    n_jobs=-1,
    random_state=0,
    # NOTE(review): presumably the negative/positive row counts of the full
    # training set; hard-coded, so it must be updated if the data changes - confirm.
    scale_pos_weight=59145/23855,
)

for train_index, test_index in skf.split(X, y):
    # skf.split yields positional row numbers, so select with .iloc; .loc is
    # label-based and only matches while X and y carry a default 0..n-1 index.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = np.ravel(y.iloc[train_index]), np.ravel(y.iloc[test_index])

    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    f1_score_list.append(f1_score(y_test, y_pred))
    accuracy_score_list.append(accuracy_score(y_test, y_pred))
    print('Iteration Done')

# Mean and standard deviation across the folds, as percentages.
print("Accuracy: %.2f%% (%.2f%%)" % (np.mean(accuracy_score_list)*100, np.std(accuracy_score_list)*100))
print("F1 Score: %.2f%% (%.2f%%)" % (np.mean(f1_score_list)*100, np.std(f1_score_list)*100))

In [None]:
# 5-fold stratified cross-validation of LightGBM on the raw features.
# No transform, scaling, imputation or resampling is applied in this cell: the
# features, including any NaN, are passed to the model unchanged, and class
# imbalance is delegated to the model via `is_unbalance=True`.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
f1_score_list, accuracy_score_list = [], []

classifier = LGBMClassifier(
    learning_rate=0.1,
    n_estimators=100,
    is_unbalance=True,
    random_state=0,
    n_jobs=-1,
)

for train_index, test_index in skf.split(X, y):
    # skf.split yields positional row numbers, so select with .iloc; .loc is
    # label-based and only matches while X and y carry a default 0..n-1 index.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = np.ravel(y.iloc[train_index]), np.ravel(y.iloc[test_index])

    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    f1_score_list.append(f1_score(y_test, y_pred))
    accuracy_score_list.append(accuracy_score(y_test, y_pred))
    print('Iteration Done')

# Mean and standard deviation across the folds, as percentages.
print("Accuracy: %.2f%% (%.2f%%)" % (np.mean(accuracy_score_list)*100, np.std(accuracy_score_list)*100))
print("F1 Score: %.2f%% (%.2f%%)" % (np.mean(f1_score_list)*100, np.std(f1_score_list)*100))

In [None]:
# 5-fold stratified cross-validation of CatBoost on the raw features.
# No transform, scaling, imputation or resampling is applied in this cell: the
# features, including any NaN, are passed to the model unchanged, and class
# imbalance is delegated to the model via `auto_class_weights='Balanced'`.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
f1_score_list, accuracy_score_list = [], []

classifier = CatBoostClassifier(
    n_estimators=800,
    learning_rate=0.05,
    random_state=0,
    auto_class_weights='Balanced',
    verbose=False
)

for train_index, test_index in skf.split(X, y):
    # skf.split yields positional row numbers, so select with .iloc; .loc is
    # label-based and only matches while X and y carry a default 0..n-1 index.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = np.ravel(y.iloc[train_index]), np.ravel(y.iloc[test_index])

    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    f1_score_list.append(f1_score(y_test, y_pred))
    accuracy_score_list.append(accuracy_score(y_test, y_pred))
    print('Iteration Done')

# Mean and standard deviation across the folds, as percentages.
print("Accuracy: %.2f%% (%.2f%%)" % (np.mean(accuracy_score_list)*100, np.std(accuracy_score_list)*100))
print("F1 Score: %.2f%% (%.2f%%)" % (np.mean(f1_score_list)*100, np.std(f1_score_list)*100))