In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)
%matplotlib inline

from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, PowerTransformer, QuantileTransformer, FunctionTransformer, RobustScaler, StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer

from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.ensemble import EasyEnsembleClassifier, RUSBoostClassifier, BalancedBaggingClassifier, BalancedRandomForestClassifier

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier,HistGradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression, Perceptron, RidgeClassifier, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import f1_score, accuracy_score

In [2]:
na_values = ['nan', 'na','#VALUE!','missing']
train = pd.read_csv('data/TrainingData.csv', na_values=na_values)
test = pd.read_csv('data/testX.csv', na_values=na_values)

In [3]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
train['mvar47'] = le.fit_transform(train['mvar47'])
test['mvar47'] = le.transform(test['mvar47'])

In [4]:
X_train = train.drop(['application_key', 'default_ind'], axis=1)
y_train = train[['default_ind']]
X_test = test.drop(['application_key'], axis=1)
test_index = test['application_key']

X_train.dropna(thresh=len(X_train)*0.50, axis=1, inplace=True) # 0.20 for no dropping
X_test = X_test[X_train.columns]

In [5]:
cat_cols = [col for col in X_train.columns if X_train[col].nunique() < 30]
num_cols = [col for col in X_train.columns if col not in cat_cols]

In [6]:
# train[['mvar11','mvar12','mvar13','mvar44']].corr() # amount paid
# train[['mvar21','mvar22','mvar23','mvar24','mvar40','mvar41']].corr() # average utilisation
# train[['mvar6','mvar7','mvar8','mvar9','mvar10']].corr() # credit available
# train[['mvar3','mvar4','mvar5','mvar28']].corr() # default severity
# train[['mvar16','mvar17','mvar18','mvar19','mvar20','mvar34','mvar35','mvar36','mvar37','mvar38','mvar39','mvar43','mvar45','mvar46']].corr() # no. of credit cards/lines
# train[['mvar25','mvar26','mvar27','mvar29','mvar30','mvar31','mvar32']].corr() # tenure
# train[['mvar47']].corr()
# train[['mvar14','mvar15','mvar33']].corr() # wealth indicator
# train[['mvar1','mvar2','mvar42','mvar48','mvar49','mvar50']].corr() # worthiness/riskiness score

In [7]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0, shuffle=True, stratify=y)
# cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# cat_pipeline = Pipeline(
#     steps=[
#         ('scaler', StandardScaler()),
#         # ('imputer', SimpleImputer(strategy='most_frequent'))
#     ]
# )

# num_pipeline = Pipeline(
#     steps=[
#         # ('transformer', FunctionTransformer(np.log1p)),
#         ('scaler', StandardScaler()),
#         # ('imputer', SimpleImputer(strategy='median'))
#     ]
# )

# col_pipeline = ColumnTransformer(
#     transformers=[
#         ('cat', cat_pipeline, cat_cols),
#         ('num', num_pipeline, num_cols)
#     ]
# )

# xgb_pipeline = Pipeline(
#     steps=[
#         ('preprocessing', col_pipeline),
#         ('xgb', XGBClassifier(
#             random_state=0,
#             n_jobs=-1,
#             scale_pos_weight=59145/23855
#         ))
#     ]
# )

# lgbm_pipeline = Pipeline(
#     steps=[
#         ('preprocessing', col_pipeline),
#         ('lgbm', LGBMClassifier(
#             random_state=0,
#             n_jobs=-1,
#             is_unbalance=True
#         ))
#     ]
# )

# catb_pipeline = Pipeline(
#     steps=[
#         ('preprocessing', col_pipeline),
#         ('catb', CatBoostClassifier(
#             random_state=0,
#             auto_class_weights='Balanced',
#             verbose=False
#         ))
#     ]
# )

# classifier = StackingClassifier(
#     estimators=[
#         ('xgb', xgb_pipeline),
#         ('lgbm', lgbm_pipeline),
#         ('catb', catb_pipeline)
#     ],
#     final_estimator=CatBoostClassifier(
#         random_state=0,
#         auto_class_weights='Balanced',
#         verbose=False
#     ),
#     cv=cv,
#     stack_method='predict_proba',
#     n_jobs=-1,
#     passthrough=True
# )

# # over_under_sampler = SMOTEENN(random_state=0, n_jobs=-1)
# # over_under_sampler = SMOTETomek(random_state=0, n_jobs=-1)
# # X_train, y_train = over_under_sampler.fit_resample(X_train, y_train)

# classifier.fit(X_train, y_train.values.ravel())
# y_pred = classifier.predict(X_test)

# print("Accuracy: %.2f%%" % (accuracy_score(y_test.values.ravel(), y_pred)*100))
# print("F1 Score: %.2f%%" % (f1_score(y_test.values.ravel(), y_pred)*100))

In [8]:
# cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# cat_pipeline = Pipeline(
#     steps=[
#         ('scaler', StandardScaler()),
#         # ('imputer', SimpleImputer(strategy='most_frequent'))
#     ]
# )

# num_pipeline = Pipeline(
#     steps=[
#         # ('transformer', FunctionTransformer(np.log1p)),
#         ('scaler', StandardScaler()),
#         # ('imputer', SimpleImputer(strategy='median'))
#     ]
# )

# col_pipeline = ColumnTransformer(
#     transformers=[
#         ('cat', cat_pipeline, cat_cols),
#         ('num', num_pipeline, num_cols)
#     ]
# )

# xgb_pipeline = Pipeline(
#     steps=[
#         ('preprocessing', col_pipeline),
#         ('xgb', XGBClassifier(
#             random_state=0,
#             n_jobs=-1,
#             scale_pos_weight=59145/23855
#         ))
#     ]
# )

# lgbm_pipeline = Pipeline(
#     steps=[
#         ('preprocessing', col_pipeline),
#         ('lgbm', LGBMClassifier(
#             random_state=0,
#             n_jobs=-1,
#             is_unbalance=True
#         ))
#     ]
# )

# catb_pipeline = Pipeline(
#     steps=[
#         ('preprocessing', col_pipeline),
#         ('catb', CatBoostClassifier(
#             random_state=0,
#             auto_class_weights='Balanced',
#             verbose=False
#         ))
#     ]
# )

# classifier = StackingClassifier(
#     estimators=[
#         ('xgb', xgb_pipeline),
#         ('lgbm', lgbm_pipeline),
#         ('catb', catb_pipeline)
#     ],
#     final_estimator=CatBoostClassifier(
#         random_state=0,
#         auto_class_weights='Balanced',
#         verbose=False
#     ),
#     cv=cv,
#     stack_method='predict',
#     n_jobs=-1,
#     passthrough=True
# )

# # over_under_sampler = SMOTEENN(random_state=0, n_jobs=-1)
# # over_under_sampler = SMOTETomek(random_state=0, n_jobs=-1)
# # X_train, y_train = over_under_sampler.fit_resample(X_train, y_train)

# classifier.fit(X_train, y_train.values.ravel())
# y_test_pred = classifier.predict(X_test)

In [9]:
# sample = pd.read_csv('submissions/sample_submission.csv', header=None)
# sample[1] = y_test_pred
# sample.to_csv('submissions/FirstDegreeBurn_stack.csv', index=False, header=False)

In [10]:
clf1 = LGBMClassifier(
    boosting_type='gbdt',
    num_leaves=31,
    max_depth=-1,
    learning_rate=0.1,
    n_estimators=100,
    subsample_for_bin=200000,
    objective=None,
    is_unbalance=True,
    min_split_gain=0.,
    min_child_weight=1e-3,
    min_child_samples=20,
    subsample=1.,
    subsample_freq=0,
    colsample_bytree=1.,
    reg_alpha=0.,
    reg_lambda=0.,
    random_state=0,
    n_jobs=-1,
    importance_type='split',
)

clf2 = CatBoostClassifier(
    n_estimators=800,
    learning_rate=0.05,
    random_state=0,
    auto_class_weights='Balanced',
    verbose=False
)

clf1.fit(X_train, y_train)
y_test_pred_1 = clf1.predict(X_test)

clf2.fit(X_train, y_train)
y_test_pred_2 = clf2.predict(X_test)

preds = np.maximum(y_test_pred_1, y_test_pred_2)

sample = pd.read_csv('submissions/sample_submission.csv', header=None)
sample[1] = preds
sample.to_csv('submissions/FirstDegreeBurn_npmax.csv', index=False, header=False)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
