### Importing necessary libraries

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd


import seaborn as sns
import matplotlib.pyplot as plt

# enable_iterative_imputer must be imported before IterativeImputer.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

from xgboost import XGBClassifier
from xgboost import plot_importance

from lightgbm import LGBMClassifier
import lightgbm

from scipy.stats import mode

### Reading in the data

In [None]:
df_train = pd.read_csv('TrainingData.csv', index_col = 'application_key',low_memory = False)

In [None]:
label_dict = pd.read_csv('Data_Dictionary.csv')

In [None]:
label_dict = label_dict.set_index('Name').T.to_dict('list')

In [None]:
label_dict

In [None]:
df_train.head()

In [None]:
df_train.info()

### Data cleaning

Function for extracting non-numeric elements:

In [None]:
def get_non_numeric(elements):
    """Return the items of ``elements`` that ``float()`` cannot convert.

    Parameters
    ----------
    elements : iterable
        Values to probe, e.g. the unique values of a DataFrame column.

    Returns
    -------
    list
        The offending items, in their original order. Items that ``float()``
        accepts (numbers, numeric strings, ``'nan'``, ``'inf'``) are omitted.
    """
    non_numerics = []

    for el in elements:
        try:
            float(el)
        # Only conversion failures mean "non-numeric"; a bare ``except`` would
        # also swallow KeyboardInterrupt / SystemExit.
        except (TypeError, ValueError):
            non_numerics.append(el)

    return non_numerics

#### Variable 1

In [None]:
label_dict['mvar1'][0]

In [None]:
var1_non = get_non_numeric(list(df_train['mvar1'].unique()))

In [None]:
var1_non

In [None]:
# Blank out every non-numeric token found in mvar1, across the whole frame
# (as the original loop did), in one pass.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
df_train = df_train.replace(var1_non, np.nan)

In [None]:
df_train['mvar1'] = pd.to_numeric(df_train['mvar1'])

In [None]:
var6_non = get_non_numeric(list(df_train['mvar6'].unique()))
var6_non

In [None]:
# Blank out the non-numeric tokens found in mvar6 (whole frame, single pass);
# np.nan replaces the np.NaN alias that NumPy 2.0 removed.
df_train = df_train.replace(var6_non, np.nan)

df_train['mvar6'] = pd.to_numeric(df_train['mvar6'])

In [None]:
# Non-numeric tokens remaining in column 'mvar15'.
# NOTE(review): the variable is called var11_non but the column read is
# 'mvar15' — confirm which one was intended.
var11_non = get_non_numeric(list(df_train['mvar15'].unique()))
var11_non

In [None]:
# For every column position, print the unique values that float() rejects.
# The column count is read from the frame instead of being hard-coded to 51.
for col_pos in range(df_train.shape[1]):
    leftovers = get_non_numeric(list(df_train.iloc[:, col_pos].unique()))
    print(col_pos, leftovers)

In [None]:
# Replace the spreadsheet error token everywhere in the frame.
# np.nan replaces the np.NaN alias that NumPy 2.0 removed.
df_train = df_train.replace('#VALUE!', np.nan)


In [None]:
# Re-check after the '#VALUE!' cleanup: print, per column position, the unique
# values that float() still rejects (column count taken from the frame).
for col_pos in range(df_train.shape[1]):
    leftovers = get_non_numeric(list(df_train.iloc[:, col_pos].unique()))
    print(col_pos, leftovers)

In [None]:
# One-hot encode mvar47: the column is replaced by one indicator column per
# distinct value, named with the 'type' prefix (e.g. type_C, type_L used later).
df_train = pd.get_dummies(df_train, columns = ['mvar47'], prefix = ['type'])

In [None]:
# Final check after one-hot encoding, which changes the number of columns:
# scan every column position actually present instead of a fixed 51.
for col_pos in range(df_train.shape[1]):
    leftovers = get_non_numeric(list(df_train.iloc[:, col_pos].unique()))
    print(col_pos, leftovers)

In [None]:
# Cast every column to a numeric dtype; raises if a non-numeric token is left.
df_train = df_train.apply(pd.to_numeric)

In [None]:
df_train.info()

#### Data exploration

In [None]:
sns.boxplot(x = 'mvar1', data = df_train)

In [None]:
sns.boxplot(x = 'mvar2', data = df_train)

In [None]:
# Mean-filled view of mvar2. df_train_imputed is only built further down (in the
# "Simple imputation" section), so the same column-mean fill is applied inline
# here to keep the cell runnable on a fresh kernel.
sns.boxplot(x = df_train['mvar2'].fillna(df_train['mvar2'].mean()))

In [None]:
df_train['mvar2'].describe()

In [None]:
sns.boxplot(y = 'mvar2', data = df_train, x = 'default_ind')

In [None]:
sns.boxplot(y = 'mvar9', data = df_train, x = 'default_ind')

In [None]:
sns.boxplot(x = 'mvar15', data = df_train)

In [None]:
df_train['mvar15'].describe()

In [None]:
# Correlation of mvar3/4/5 after a column-mean fill. The fill is done inline
# because df_train_imputed does not exist yet at this point of the notebook.
corr_cols = ['mvar3', 'mvar4', 'mvar5']
df_train[corr_cols].fillna(df_train[corr_cols].mean()).corr()

#### Simple imputation

In [None]:
# Mean imputation: every NaN is replaced by the mean of its column.
# NOTE(review): the imputer is fit on all rows and all columns of df_train
# (the frame still contains default_ind) before any train/test split —
# confirm this is acceptable.
mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
mean_imputer.fit(df_train)

In [None]:
# transform() returns a bare ndarray; rebuild the frame with the original
# column names AND the original application_key index so rows stay traceable
# and line up with df_train.
df_train_imputed = pd.DataFrame(
    mean_imputer.transform(df_train),
    columns = df_train.columns,
    index = df_train.index,
)

In [None]:
df_train_imputed.info()

In [None]:
df_train_imputed['default_ind'] = df_train_imputed['default_ind'].astype('int')

#### Todo: Better strategies

#### Iterative imputer

In [None]:
# Feature columns = every column of df_train except the target.
non_label_cols = df_train.columns.tolist()
non_label_cols.remove('default_ind')
x = non_label_cols.copy()


df_train_data = df_train[non_label_cols]

In [None]:
imp_iterative = IterativeImputer(random_state=0)
imp_iterative.fit(df_train_data)

In [None]:
# transform() returns a bare ndarray; rebuild the frame with the feature names
# and the original application_key index instead of a fresh 0..n-1 index.
df_train_data_imputed = pd.DataFrame(
    imp_iterative.transform(df_train_data),
    columns = df_train_data.columns,
    index = df_train_data.index,
)

In [None]:
df_train_data_imputed.info()

### Data visualization

In [None]:
sns.countplot(x = 'default_ind', data = df_train_imputed)

In [None]:
df_train_imputed['default_ind'].sum()/len(df_train_imputed['default_ind'])

Imbalanced dataset, though it is not that bad.

In [None]:
# Scratch note: this tuple is only displayed, nothing is computed from it.
# The numbers match the columns plotted in the following cells
# (mvar2, mvar49, mvar33, mvar29, mvar1).
2, 49, 33, 29, 1

In [None]:
sns.boxplot(x = 'default_ind', y = 'mvar2', data = df_train)

In [None]:
sns.boxplot(x = 'default_ind', y = 'mvar49', data = df_train)

In [None]:
df_train['mvar49'].unique()

In [None]:
sns.boxplot(x = 'default_ind', y = 'mvar33', data = df_train)

In [None]:
sns.boxplot(x = 'default_ind', y = 'mvar29', data = df_train)

In [None]:
sns.boxplot(x = 'default_ind', y = 'mvar1', data = df_train)

In [None]:
df_feat = df_train_imputed.copy()

In [None]:
df_feat['gen1'] = (df_feat['mvar1'] - df_feat['mvar1'].mean())/df_feat['mvar1'].std()

In [None]:
# gen1 = z-score(mvar1) - z-score(mvar2).
# Computed from the source columns rather than from the current gen1 value, so
# re-running this cell no longer subtracts the mvar2 term a second time.
df_feat['gen1'] = (
    (df_feat['mvar1'] - df_feat['mvar1'].mean()) / df_feat['mvar1'].std()
    - (df_feat['mvar2'] - df_feat['mvar2'].mean()) / df_feat['mvar2'].std()
)

In [None]:
sns.boxplot(x = 'default_ind', y = 'gen1', data = df_feat)

In [None]:
# gen2 = z-score(mvar1) * z-score(mvar49)
mvar1_z = (df_feat['mvar1'] - df_feat['mvar1'].mean()) / df_feat['mvar1'].std()
mvar49_z = (df_feat['mvar49'] - df_feat['mvar49'].mean()) / df_feat['mvar49'].std()
df_feat['gen2'] = mvar1_z * mvar49_z

In [None]:
sns.boxplot(x = 'default_ind', y = 'gen2', data = df_feat)

In [None]:
sns.boxplot(x = 'default_ind', y = 'mvar12', data = df_feat)

In [None]:
sns.boxplot(x = 'default_ind', y = 'mvar14', data = df_feat)

In [None]:
df_feat['inc_per_owed'] = df_feat['mvar14']/(df_feat['mvar12']+1)

In [None]:
sns.boxplot(x = 'default_ind', y = 'inc_per_owed', data = df_feat)

In [None]:
df_feat['type_C'] = df_feat['type_C'].astype("category")
df_feat['type_L'] = df_feat['type_L'].astype("category")

In [None]:
df_feat.info()

In [None]:
df_feat['type_C'] = df_feat['type_C'].astype("int")
df_feat['type_L'] = df_feat['type_L'].astype("int")

In [None]:
df_feat.info()

In [None]:
sns.boxplot(x = 'default_ind', y = 'mvar45', data = df_train)

In [None]:
df_feat['mvar45'].value_counts()

### Features, features, features

In [None]:
df_eng = pd.read_csv('TrainingData.csv', index_col = 'application_key', low_memory = False)

In [None]:
df_eng['mvar46'].value_counts()

In [None]:
df_train.info()

In [None]:
df_train['mvar48'].value_counts()

In [None]:
# savefig does not create missing folders, so make sure Images/ exists first.
Path('Images').mkdir(parents = True, exist_ok = True)
sns.boxplot(x = 'default_ind', y = 'mvar7', data = df_train)
plt.savefig('Images/no_log.png', bbox_inches = 'tight')

In [None]:
# Same plot on log(mvar7 + 1). savefig does not create missing folders, so
# make sure Images/ exists first.
Path('Images').mkdir(parents = True, exist_ok = True)
sns.boxplot(x = df_train['default_ind'], y = np.log(df_train['mvar7']+1))
plt.savefig('Images/yes_log.png', bbox_inches = 'tight')

In [None]:
df_eng['mvar2'].value_counts()

In [None]:
df_eng = df_train.copy()

In [None]:
df_eng['log_mvar2'] = np.log(df_eng['mvar2']+1)

In [None]:
sns.boxplot(x = 'default_ind', y = 'log_mvar2', data = df_eng)

48, 24, 25, 22

In [None]:
sns.boxplot(x = 'default_ind', y = 'mvar31', data = df_eng)

In [None]:
sns.boxplot(x = df_eng['default_ind'], y = df_eng['mvar30']+df_eng['mvar31'])

In [None]:
df_eng['log_mvar13'] = np.log(df_eng['mvar13']+1)

In [None]:
# No cell adds a 'log_mvar6' column to df_eng, so the log transform is derived
# here (same log(x + 1) form as the other log features) and plotted directly.
sns.boxplot(x = df_eng['default_ind'], y = np.log(df_eng['mvar6']+1).rename('log_mvar6'))

In [None]:
# Remove the scratch log columns if they are present. errors='ignore' keeps the
# cell from raising KeyError on a fresh run, where they were never created.
df_eng = df_eng.drop(columns = ['log_mvar3', 'log_mvar4'], errors = 'ignore')

In [None]:
df_eng.info()

### Building models

#### Test set generation

In [None]:
x = list(df_train_imputed.columns)
x.remove('default_ind')
print(x)

In [None]:
# Feature columns = every mean-imputed column except the target.
non_label_cols = df_train_imputed.columns.tolist()
non_label_cols.remove('default_ind')


X, y = df_train_imputed[non_label_cols], df_train_imputed['default_ind']

# Stratified 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.3, stratify = y, random_state = 42
)

In [None]:
y_test.sum()/len(y_test)

#### Iterative imputer test set

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df_train_data_imputed, y, stratify = y, test_size = 0.3, random_state = 42)

In [None]:
y_train.sum()/len(y_train)

#### First model - untuned XGBoost

In [None]:
xgb_model = XGBClassifier()
xgb_model.fit(X_train, y_train)

In [None]:
y_pred = xgb_model.predict(X_train)

In [None]:
# classification_report expects (y_true, y_pred); with the arguments swapped
# the precision and recall columns were exchanged.
print(classification_report(y_train, y_pred))

In [None]:
y_pred = xgb_model.predict(X_test)

In [None]:
# classification_report expects (y_true, y_pred); with the arguments swapped
# the precision and recall columns were exchanged.
print(classification_report(y_test, y_pred))

#### Tuning XGBoost

In [None]:
tuning_xgb_model = XGBClassifier()

eval_set = [(X_train, y_train), (X_test, y_test)]
eval_metric = ["auc"]
%time tuning_xgb_model.fit(X_train, y_train, eval_metric = eval_metric, eval_set = eval_set, verbose=True)

In [None]:
results = tuning_xgb_model.evals_result()

In [None]:
# AUC per boosting round; validation_0 / validation_1 follow the eval_set order
# (train first, test second).
for eval_key, curve_label in (('validation_0', 'train'), ('validation_1', 'test')):
    auc_curve = results[eval_key]['auc']
    plt.plot(list(range(len(auc_curve))), auc_curve, label = curve_label)
plt.legend()

In [None]:
basic_xg = XGBClassifier(silent = False, 
                      learning_rate = 0.01,  
                      colsample_bytree = 1,
                      subsample = 0.8,
                      objective = 'binary:logistic', 
                      n_estimators = 1000,
                      max_depth = 3, 
                      gamma = 1)

In [None]:
# Sweep max_depth over 3..9 and record [train F1, test F1] for each value.
depths = list(range(3, 10))
f_scores = []

for depth in depths:

    # `silent` is not an XGBClassifier parameter in the XGBoost versions this
    # notebook otherwise targets, so it is dropped.
    test_depth_model = XGBClassifier(learning_rate = 0.01,
                      colsample_bytree = 1,
                      subsample = 0.8,
                      objective = 'binary:logistic',
                      n_estimators = 1000,
                      max_depth = depth,
                      gamma = 1)

    test_depth_model.fit(X_train, y_train)

    y_train_pred = test_depth_model.predict(X_train)
    y_test_pred = test_depth_model.predict(X_test)

    train_f = f1_score(y_train, y_train_pred)
    test_f = f1_score(y_test, y_test_pred)

    f_scores.append([train_f, test_f])

In [None]:
# Plot the sweep: column 0 holds the train F1, column 1 the test F1.
# (A bare plt.plot() draws nothing.)
f_scores = np.array(f_scores)
plt.plot(depths, f_scores[:, 0], label = 'train')
plt.plot(depths, f_scores[:, 1], label = 'test')
plt.xlabel('max_depth')
plt.ylabel('F1')
plt.legend()

In [None]:
# Second max_depth sweep, over 8..13; records [train F1, test F1] per value.
depths = list(range(8, 14))
f_scores = []

for depth in depths:

    # `silent` is not an XGBClassifier parameter in the XGBoost versions this
    # notebook otherwise targets, so it is dropped.
    test_depth_model = XGBClassifier(learning_rate = 0.01,
                      colsample_bytree = 1,
                      subsample = 0.8,
                      objective = 'binary:logistic',
                      n_estimators = 1000,
                      max_depth = depth,
                      gamma = 1)

    test_depth_model.fit(X_train, y_train)

    y_train_pred = test_depth_model.predict(X_train)
    y_test_pred = test_depth_model.predict(X_test)

    train_f = f1_score(y_train, y_train_pred)
    test_f = f1_score(y_test, y_test_pred)

    f_scores.append([train_f, test_f])

In [None]:
f_scores

In [None]:
depths[2]

<b> Max Depth = 10 is a good choice </b>

In [None]:
# Sweep colsample_bytree over 10 evenly spaced values in [0.4, 0.9] at
# max_depth = 10; records [train F1, test F1] per value.
col_samples = list(np.linspace(0.4, 0.9, 10))
f_scores = []

for col_fraction in col_samples:

    # `silent` is not an XGBClassifier parameter in the XGBoost versions this
    # notebook otherwise targets, so it is dropped.
    test_depth_model = XGBClassifier(learning_rate = 0.01,
                      colsample_bytree = col_fraction,
                      subsample = 0.8,
                      objective = 'binary:logistic',
                      n_estimators = 1000,
                      max_depth = 10,
                      gamma = 1)

    test_depth_model.fit(X_train, y_train)

    y_train_pred = test_depth_model.predict(X_train)
    y_test_pred = test_depth_model.predict(X_test)

    train_f = f1_score(y_train, y_train_pred)
    test_f = f1_score(y_test, y_test_pred)

    f_scores.append([train_f, test_f])

In [None]:
f_scores

In [None]:
col_samples[4]

<b> 0.6 for columns used by each tree </b>

In [None]:
# Sweep n_estimators over 500, 1000, ..., 4000; records [train F1, test F1].
estimator_samples = list(range(500, 4001, 500))
f_scores = []

for n_trees in estimator_samples:

    # `silent` is not an XGBClassifier parameter in the XGBoost versions this
    # notebook otherwise targets, so it is dropped.
    test_depth_model = XGBClassifier(learning_rate = 0.01,
                      colsample_bytree = 0.6,
                      subsample = 0.8,
                      objective = 'binary:logistic',
                      n_estimators = n_trees,
                      max_depth = 10,
                      gamma = 1)

    test_depth_model.fit(X_train, y_train)

    y_train_pred = test_depth_model.predict(X_train)
    y_test_pred = test_depth_model.predict(X_test)

    train_f = f1_score(y_train, y_train_pred)
    test_f = f1_score(y_test, y_test_pred)

    f_scores.append([train_f, test_f])

In [None]:
f_scores

In [None]:
estimator_samples[5]

In [None]:
partly_tuned_model = XGBClassifier(silent = False, 
                      learning_rate = 0.01,  
                      colsample_bytree = 0.6,
                      subsample = 0.8,
                      objective = 'binary:logistic', 
                      n_estimators = 3000,
                      max_depth = 10, 
                      gamma = 1)

In [None]:
partly_tuned_model.fit(X_train, y_train)

y_train_pred = partly_tuned_model.predict(X_train)
y_test_pred = partly_tuned_model.predict(X_test)

# classification_report expects (y_true, y_pred); with the arguments swapped
# the precision and recall columns were exchanged.
print(classification_report(y_train, y_train_pred))


In [None]:
# classification_report expects (y_true, y_pred); with the arguments swapped
# the precision and recall columns were exchanged.
print(classification_report(y_test, y_test_pred))

In [None]:
f1_score(y_test, y_test_pred)

#### Trying class weighting 

In [None]:
# Coarse sweep of scale_pos_weight; records [train F1, test F1] per value.
weights = [1, 10, 25, 50, 75, 99, 100, 1000]

f_scores = []

for pos_weight in weights:

    # `silent` is not an XGBClassifier parameter in the XGBoost versions this
    # notebook otherwise targets, so it is dropped.
    test_depth_model = XGBClassifier(learning_rate = 0.01,
                      colsample_bytree = 0.6,
                      subsample = 0.8,
                      objective = 'binary:logistic',
                      n_estimators = 3000,
                      max_depth = 10,
                      gamma = 1,
                      scale_pos_weight = pos_weight)

    test_depth_model.fit(X_train, y_train)

    y_train_pred = test_depth_model.predict(X_train)
    y_test_pred = test_depth_model.predict(X_test)

    train_f = f1_score(y_train, y_train_pred)
    test_f = f1_score(y_test, y_test_pred)

    f_scores.append([train_f, test_f])

In [None]:
f_scores

<b>We go with a value of 10 </b>

In [None]:
# Fine sweep of scale_pos_weight over 2..9; records [train F1, test F1].
weights = [2, 3, 4, 5, 6, 7, 8, 9]

f_scores = []

for pos_weight in weights:

    # `silent` is not an XGBClassifier parameter in the XGBoost versions this
    # notebook otherwise targets, so it is dropped.
    test_depth_model = XGBClassifier(learning_rate = 0.01,
                      colsample_bytree = 0.6,
                      subsample = 0.8,
                      objective = 'binary:logistic',
                      n_estimators = 3000,
                      max_depth = 10,
                      gamma = 1,
                      scale_pos_weight = pos_weight)

    test_depth_model.fit(X_train, y_train)

    y_train_pred = test_depth_model.predict(X_train)
    y_test_pred = test_depth_model.predict(X_test)

    train_f = f1_score(y_train, y_train_pred)
    test_f = f1_score(y_test, y_test_pred)

    f_scores.append([train_f, test_f])

In [None]:
f_scores

<b>Choose 5 for scale_pos_weight.</b>

In [None]:
weighted_tuned_model = XGBClassifier(silent = False, 
                      learning_rate = 0.01,  
                      colsample_bytree = 0.6,
                      subsample = 0.8,
                      objective = 'binary:logistic', 
                      n_estimators = 3000,
                      max_depth = 10, 
                      gamma = 1,
                    scale_pos_weight = 5)

In [None]:
weighted_tuned_model.fit(X_train, y_train)

y_train_pred = weighted_tuned_model.predict(X_train)
y_test_pred = weighted_tuned_model.predict(X_test)

# classification_report expects (y_true, y_pred); with the arguments swapped
# the precision and recall columns were exchanged.
print(classification_report(y_train, y_train_pred))
print(classification_report(y_test, y_test_pred))

#### Adjusting regularization

In [None]:
# Sweep gamma over 1..5 with scale_pos_weight = 5; records [train F1, test F1].
gammas = [1, 2, 3, 4, 5]

f_scores = []

for gamma_value in gammas:

    # `silent` is not an XGBClassifier parameter in the XGBoost versions this
    # notebook otherwise targets, so it is dropped.
    test_depth_model = XGBClassifier(learning_rate = 0.01,
                      colsample_bytree = 0.6,
                      subsample = 0.8,
                      objective = 'binary:logistic',
                      n_estimators = 3000,
                      max_depth = 10,
                      gamma = gamma_value,
                      scale_pos_weight = 5)

    test_depth_model.fit(X_train, y_train)

    y_train_pred = test_depth_model.predict(X_train)
    y_test_pred = test_depth_model.predict(X_test)

    train_f = f1_score(y_train, y_train_pred)
    test_f = f1_score(y_test, y_test_pred)

    f_scores.append([train_f, test_f])

In [None]:
f_scores

No definitive conclusion

<b>Still, let's try a gamma of 5.</b>

In [None]:
regularized_xgb_model = XGBClassifier(silent = False, 
                      learning_rate = 0.01,  
                      colsample_bytree = 0.6,
                      subsample = 0.8,
                      objective = 'binary:logistic', 
                      n_estimators = 3000,
                      max_depth = 10, 
                      gamma = 5,
                    scale_pos_weight = 5)

In [None]:
regularized_xgb_model.fit(X_train, y_train)

y_train_pred = regularized_xgb_model.predict(X_train)
y_test_pred = regularized_xgb_model.predict(X_test)

# classification_report expects (y_true, y_pred); with the arguments swapped
# the precision and recall columns were exchanged.
print(classification_report(y_train, y_train_pred))
print(classification_report(y_test, y_test_pred))

In [None]:
regularized_xgb_model = XGBClassifier(silent = False, 
                      learning_rate = 0.01,  
                      colsample_bytree = 0.6,
                      subsample = 0.8,
                      objective = 'binary:logistic', 
                      n_estimators = 3000,
                      max_depth = 10, 
                      gamma = 5,
                    scale_pos_weight = 5)

In [None]:
%time regularized_xgb_model.fit(X, y)

y_train_pred = regularized_xgb_model.predict(X_train)
y_test_pred = regularized_xgb_model.predict(X_test)

print(classification_report(y_train_pred, y_train))
print(classification_report(y_test_pred, y_test))

#### Iterative imputation

In [None]:
it_regularized_xgb_model = XGBClassifier(silent = False, 
                      learning_rate = 0.01,  
                      colsample_bytree = 0.6,
                      subsample = 0.8,
                      objective = 'binary:logistic', 
                      n_estimators = 3000,
                      max_depth = 10, 
                      gamma = 5,
                    scale_pos_weight = 5)

In [None]:
%time it_regularized_xgb_model.fit(X_train, y_train)

y_train_pred = it_regularized_xgb_model.predict(X_train)
y_test_pred = it_regularized_xgb_model.predict(X_test)

print(classification_report(y_train_pred, y_train))
print(classification_report(y_test_pred, y_test))

In [None]:
f1_score(y_test, y_test_pred)

In [None]:
# Sweep gamma over 5..8 with scale_pos_weight = 5; records [train F1, test F1].
gammas = [5, 6, 7, 8]

f_scores = []

for gamma_value in gammas:

    # `silent` is not an XGBClassifier parameter in the XGBoost versions this
    # notebook otherwise targets, so it is dropped.
    test_depth_model = XGBClassifier(learning_rate = 0.01,
                      colsample_bytree = 0.6,
                      subsample = 0.8,
                      objective = 'binary:logistic',
                      n_estimators = 3000,
                      max_depth = 10,
                      gamma = gamma_value,
                      scale_pos_weight = 5)

    test_depth_model.fit(X_train, y_train)

    y_train_pred = test_depth_model.predict(X_train)
    y_test_pred = test_depth_model.predict(X_test)

    train_f = f1_score(y_train, y_train_pred)
    test_f = f1_score(y_test, y_test_pred)

    f_scores.append([train_f, test_f])

In [None]:
f_scores

#### XGBoost without explicit imputation

In [None]:
X = df_train[non_label_cols]
y = df_train_imputed['default_ind']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y,test_size = 0.3, random_state = 42)

In [None]:
regularized_nonimp_model = XGBClassifier(silent = False, 
                      learning_rate = 0.01,  
                      colsample_bytree = 0.6,
                      subsample = 0.8,
                      objective = 'binary:logistic', 
                      n_estimators = 3000,
                      max_depth = 10, 
                      gamma = 5,
                    scale_pos_weight = 5)

In [None]:
%time regularized_nonimp_model.fit(X_train, y_train)

y_train_pred = regularized_nonimp_model.predict(X_train)
y_test_pred = regularized_nonimp_model.predict(X_test)

print(classification_report(y_train_pred, y_train))
print(classification_report(y_test_pred, y_test))

In [None]:
# plot_importance opens its own axes unless one is passed in, which left the
# sized figure empty; create the axes explicitly and hand it over.
fig, ax = plt.subplots(figsize = (15, 10))
plot_importance(regularized_nonimp_model, max_num_features = 20, ax = ax)

In [None]:
print(label_dict['mvar2'][0])
print(label_dict['mvar15'][0])
print(label_dict['mvar33'][0])
print(label_dict['mvar13'][0])
print(label_dict['mvar9'][0])

In [None]:
df_train.info()

#### Considering L2 reg

In [None]:
# Sweep the L2 penalty (reg_lambda); records [train F1, test F1] per value.
lambdas = [1, 5, 6, 7, 8]

f_scores = []

for l2_penalty in lambdas:

    # `silent` is not an XGBClassifier parameter in the XGBoost versions this
    # notebook otherwise targets, so it is dropped.
    test_depth_model = XGBClassifier(learning_rate = 0.01,
                      colsample_bytree = 0.6,
                      subsample = 0.8,
                      objective = 'binary:logistic',
                      n_estimators = 3000,
                      max_depth = 10,
                      gamma = 5,
                      scale_pos_weight = 5,
                      reg_lambda = l2_penalty)

    test_depth_model.fit(X_train, y_train)

    y_train_pred = test_depth_model.predict(X_train)
    y_test_pred = test_depth_model.predict(X_test)

    train_f = f1_score(y_train, y_train_pred)
    test_f = f1_score(y_test, y_test_pred)

    f_scores.append([train_f, test_f])

In [None]:
f_scores

In [None]:
lambda_xgb_model = XGBClassifier(silent = False, 
                      learning_rate = 0.01,  
                      colsample_bytree = 0.6,
                      subsample = 0.8,
                      objective = 'binary:logistic', 
                      n_estimators = 3000,
                      max_depth = 10, 
                      gamma = 5,
                    scale_pos_weight = 5,
                    reg_lambda = 6)

In [None]:
%time lambda_xgb_model.fit(X_train, y_train)

y_train_pred = lambda_xgb_model.predict(X_train)
y_test_pred = lambda_xgb_model.predict(X_test)

print(classification_report(y_train_pred, y_train))
print(classification_report(y_test_pred, y_test))

In [None]:
f1_score(y_test, y_test_pred)

In [None]:
%time lambda_xgb_model.fit(X, y)

y_train_pred = lambda_xgb_model.predict(X_train)
y_test_pred = lambda_xgb_model.predict(X_test)

print(classification_report(y_train_pred, y_train))
print(classification_report(y_test_pred, y_test))

In [None]:
plot_importance(regularized_xgb_model, max_num_features = 15)

In [None]:
plot_importance(lambda_xgb_model, max_num_features = 15)

#### Feature engineering 

In [None]:
# Feature columns = every column of the engineered frame except the target.
non_label_cols = df_feat.columns.tolist()
non_label_cols.remove('default_ind')
x = non_label_cols.copy()

X, y = df_feat[non_label_cols], df_feat['default_ind']

# Stratified 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.3, stratify = y, random_state = 42
)

In [None]:
feat_xgb_model = XGBClassifier(silent = False, 
                      learning_rate = 0.01,  
                      colsample_bytree = 0.6,
                      subsample = 0.8,
                      objective = 'binary:logistic', 
                      n_estimators = 3000,
                      max_depth = 10, 
                      gamma = 5,
                    scale_pos_weight = 5,
                    reg_lambda = 6,
                    enable_categorical=True,
                    tree_method="hist")

In [None]:
feat_xgb_model.fit(X_train, y_train)

y_train_pred = feat_xgb_model.predict(X_train)
y_test_pred = feat_xgb_model.predict(X_test)

# classification_report expects (y_true, y_pred); with the arguments swapped
# the precision and recall columns were exchanged.
print(classification_report(y_train, y_train_pred))
print(classification_report(y_test, y_test_pred))

In [None]:
plot_importance(feat_xgb_model, max_num_features = 15)

In [None]:
f1_score(y_test, y_test_pred)

Without category

In [None]:
x = list(df_feat.columns)
x.remove('default_ind')

non_label_cols = list(df_feat.columns)
non_label_cols.remove('default_ind')

X = df_feat[non_label_cols]
y = df_feat['default_ind']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y,test_size = 0.3, random_state = 42)

In [None]:
X

In [None]:
df_train_imputed

In [None]:
feat_xgb_model = XGBClassifier(silent = False, 
                      learning_rate = 0.01,  
                      colsample_bytree = 0.6,
                      subsample = 0.8,
                      objective = 'binary:logistic', 
                      n_estimators = 3000,
                      max_depth = 10, 
                      gamma = 5,
                    scale_pos_weight = 5,
                    reg_lambda = 6)

In [None]:
feat_xgb_model.fit(X_train, y_train)

y_train_pred = feat_xgb_model.predict(X_train)
y_test_pred = feat_xgb_model.predict(X_test)

# classification_report expects (y_true, y_pred); with the arguments swapped
# the precision and recall columns were exchanged.
print(classification_report(y_train, y_train_pred))
print(classification_report(y_test, y_test_pred))

In [None]:
f1_score(y_test, y_test_pred)

In [None]:
plot_importance(feat_xgb_model, max_num_features = 15)

In [None]:
feat_xgb_model.feature_importances_.argsort()[::-1]

#### Random Forest

In [None]:
rf_model = RandomForestClassifier(class_weight = 'balanced')

In [None]:
# Sweep the forest's max_depth; records [train F1, test F1] per value.
max_depths = [7, 8, 9, 10, 11, 12]

f_scores = []

for depth in max_depths:

    # Seeded (like the final RF model in this notebook) so that differences
    # between depths are not just run-to-run noise.
    rf_model = RandomForestClassifier(class_weight = 'balanced', max_depth = depth,
                                      random_state = 42)

    rf_model.fit(X_train, y_train)

    y_train_pred = rf_model.predict(X_train)
    y_test_pred = rf_model.predict(X_test)

    train_f = f1_score(y_train, y_train_pred)
    test_f = f1_score(y_test, y_test_pred)

    f_scores.append([train_f, test_f])

    print(depth, 'done')

In [None]:
f_scores

In [None]:
rf_model = RandomForestClassifier(max_depth = 10, class_weight = 'balanced')

In [None]:
rf_model.fit(X_train, y_train)

y_train_pred = rf_model.predict(X_train)
y_test_pred = rf_model.predict(X_test)

# classification_report expects (y_true, y_pred); with the arguments swapped
# the precision and recall columns were exchanged.
print(classification_report(y_train, y_train_pred))
print(classification_report(y_test, y_test_pred))

In [None]:
f1_score(y_test, y_test_pred)

In [None]:
# Sweep the number of trees at max_depth = 10; records [train F1, test F1].
estimators = [300, 400, 500, 600]

f_scores = []

for n_trees in estimators:

    # Seeded (like the final RF model in this notebook) so that differences
    # between settings are not just run-to-run noise.
    rf_model = RandomForestClassifier(class_weight = 'balanced', max_depth = 10,
                                      n_estimators = n_trees, random_state = 42)

    rf_model.fit(X_train, y_train)

    y_train_pred = rf_model.predict(X_train)
    y_test_pred = rf_model.predict(X_test)

    train_f = f1_score(y_train, y_train_pred)
    test_f = f1_score(y_test, y_test_pred)

    f_scores.append([train_f, test_f])

    print(n_trees, 'done')

In [None]:
f_scores

### RF model

In [None]:
# Rebuild the feature list from df_train_imputed itself. The list left over from
# the feature-engineering section also names gen1 / gen2 / inc_per_owed, which
# exist only in df_feat and would raise a KeyError here.
non_label_cols = [col for col in df_train_imputed.columns if col != 'default_ind']

X = df_train_imputed[non_label_cols]
y = df_train_imputed['default_ind']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y,test_size = 0.3, random_state = 42)

In [None]:
rf_model = RandomForestClassifier(class_weight = 'balanced', max_depth = 10,
                                  min_samples_split = 2, 
                                  n_estimators = 100, random_state = 42)

In [None]:
rf_model.fit(X_train, y_train)

In [None]:
y_train_pred_rf = rf_model.predict(X_train)
y_test_pred_rf = rf_model.predict(X_test)

# classification_report expects (y_true, y_pred); with the arguments swapped
# the precision and recall columns were exchanged.
print(classification_report(y_train, y_train_pred_rf))
print(classification_report(y_test, y_test_pred_rf))

In [None]:
f1_score(y_test, y_test_pred_rf)

0.5865355430097736

In [None]:
# argsort is ascending, so the 15 most important features are the LAST 15
# indices ([:15] picked the 15 least important). Keeping ascending order puts
# the most important feature at the top of the horizontal bar chart.
sorted_idx = rf_model.feature_importances_.argsort()[-15:]
plt.barh(X.columns[sorted_idx], rf_model.feature_importances_[sorted_idx])

depth = 10, est = 400 

In [None]:
y_train_prob_rf = rf_model.predict_proba(X_train)
y_test_prob_rf = rf_model.predict_proba(X_test)

In [None]:
# Candidate values for the randomized random-forest search.
# (RandomizedSearchCV itself is imported with the other imports at the top.)

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split.
# 'auto' was only an alias of 'sqrt' for classifiers and was removed in
# scikit-learn 1.3, so it is replaced by a genuinely different option.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree (None = grow until leaves are pure)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(random_grid)

In [None]:
from sklearn.ensemble import RandomForestRegressor


# Use the random grid to search for best hyperparameters.
# Base model to tune; seeded so that repeated searches are comparable.
rf = RandomForestClassifier(class_weight = 'balanced', random_state = 42)
# Random search of parameters using 3-fold cross validation, scored by F1.
# n_iter = 10 samples 10 different combinations; n_jobs = -1 uses all cores.
rf_random = RandomizedSearchCV(estimator = rf, scoring = 'f1',
                               param_distributions = random_grid,
                               n_iter = 10, cv = 3, verbose = 2,
                               random_state = 42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X, y)

#### XGBoost:

In [None]:
# Features come from df_train while the label is read from df_train_imputed.
# NOTE(review): presumably both frames share the same row index - confirm.
X, y = df_train[non_label_cols], df_train_imputed['default_ind']

# Stratified 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [None]:
# XGBoost classifier trained on all feature columns.
pisham_xgb_model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    tree_method='hist',
    n_estimators=3000,
    learning_rate=0.01,
    max_depth=14,
    subsample=0.5,
    colsample_bytree=0.35,
    scale_pos_weight=3,
    reg_lambda=8,
    gamma=6,
    random_state=42,
)

In [None]:
%time pisham_xgb_model.fit(X_train, y_train)

In [None]:
# Hard class predictions of the full-feature XGBoost model.
y_train_pred_xg = pisham_xgb_model.predict(X_train)
y_test_pred_xg = pisham_xgb_model.predict(X_test)

# classification_report takes (y_true, y_pred); reversed arguments swap
# precision with recall and report predicted counts as support.
print(classification_report(y_train, y_train_pred_xg))
print(classification_report(y_test, y_test_pred_xg))

In [None]:
f1_score(y_test, y_test_pred_xg)

In [None]:
plot_importance(pisham_xgb_model, max_num_features = 15)

In [None]:
y_train_prob_xg = pisham_xgb_model.predict_proba(X_train)
y_test_prob_xg = pisham_xgb_model.predict_proba(X_test)

In [None]:
y_train_prob_xg

#### Half dataset (first 25 feature columns, then the remaining columns)

In [None]:
# Keep only the first 25 feature columns (positional slice); the label is read
# from the imputed frame.
X = df_train[non_label_cols].iloc[:, :25]
y = df_train_imputed['default_ind']

# Stratified 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [None]:
# XGBoost classifier for the first block of feature columns.
first_xgb_model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    tree_method='hist',
    n_estimators=2500,
    learning_rate=0.01,
    max_depth=14,
    subsample=0.5,
    colsample_bytree=0.35,
    scale_pos_weight=3,
    reg_lambda=8,
    gamma=6,
    random_state=42,
)

In [None]:
%time first_xgb_model.fit(X_train, y_train)

In [None]:
# Hard class predictions of the first-half XGBoost model.
y_train_pred_f = first_xgb_model.predict(X_train)
y_test_pred_f = first_xgb_model.predict(X_test)

# classification_report takes (y_true, y_pred); reversed arguments swap
# precision with recall and report predicted counts as support.
print(classification_report(y_train, y_train_pred_f))
print(classification_report(y_test, y_test_pred_f))

In [None]:
f1_score(y_test, y_test_pred_f)

In [None]:
plot_importance(first_xgb_model, max_num_features = 15)

In [None]:
# Keep the feature columns from position 25 onwards; the label is read from the
# imputed frame.
X = df_train[non_label_cols].iloc[:, 25:]
y = df_train_imputed['default_ind']

# Stratified 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [None]:
# XGBoost classifier for the second block of feature columns.
second_xgb_model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    tree_method='hist',
    n_estimators=3000,
    learning_rate=0.01,
    max_depth=14,
    subsample=0.5,
    colsample_bytree=0.35,
    scale_pos_weight=3,
    reg_lambda=8,
    gamma=6,
    random_state=42,
)

In [None]:
%time second_xgb_model.fit(X_train, y_train)

In [None]:
# Hard class predictions of the second-half XGBoost model.
y_train_pred_s = second_xgb_model.predict(X_train)
y_test_pred_s = second_xgb_model.predict(X_test)

# classification_report takes (y_true, y_pred); reversed arguments swap
# precision with recall and report predicted counts as support.
print(classification_report(y_train, y_train_pred_s))
print(classification_report(y_test, y_test_pred_s))

In [None]:
f1_score(y_test, y_test_pred_s)

In [None]:
plot_importance(second_xgb_model, max_num_features = 15)

In [None]:
good_feat = ['mvar2', 'mvar1', 'mvar25', 'mvar9', 'mvar7', 'mvar15', 'mvar13', 'mvar21', 'mvar24', 'mvar12',
            'mvar33', 'mvar44', 'mvar32', 'mvar27', 'mvar26', 'mvar49', 'mvar51', 'mvar29', 'mvar30', 'mvar42']

In [None]:
# Restrict the features to the hand-picked columns listed in good_feat; the
# label is read from the imputed frame.
X = df_train[non_label_cols][good_feat]
y = df_train_imputed['default_ind']

# Stratified 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [None]:
# XGBoost classifier for the selected-feature subset.
sel_xgb_model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    tree_method='hist',
    n_estimators=3000,
    learning_rate=0.01,
    max_depth=14,
    subsample=0.5,
    colsample_bytree=0.7,
    scale_pos_weight=3,
    reg_lambda=4,
    gamma=6,
    random_state=42,
)

In [None]:
%time sel_xgb_model.fit(X_train, y_train)

In [None]:
# Hard class predictions of the selected-feature XGBoost model.
y_train_pred_sel = sel_xgb_model.predict(X_train)
y_test_pred_sel = sel_xgb_model.predict(X_test)

# classification_report takes (y_true, y_pred); reversed arguments swap
# precision with recall and report predicted counts as support.
print(classification_report(y_train, y_train_pred_sel))
print(classification_report(y_test, y_test_pred_sel))

In [None]:
f1_score(y_test, y_test_pred_sel)

In [None]:
# Majority vote over the three XGBoost models (first-half, full, second-half).
# The votes are stacked as columns and the row-wise mode is taken in a single
# vectorised call instead of one scipy.stats.mode call per sample.
train_votes = np.column_stack(
    (y_train_pred_f, y_train_pred_xg, y_train_pred_s)
).astype('float')
y_train_pred = mode(train_votes, axis = 1, keepdims = False).mode

test_votes = np.column_stack(
    (y_test_pred_f, y_test_pred_xg, y_test_pred_s)
).astype('float')
y_test_pred = mode(test_votes, axis = 1, keepdims = False).mode

In [None]:
[f1_score(y_train, y_train_pred), f1_score(y_test, y_test_pred)]

#### LightGBM (GBM):

In [None]:
df_raw = pd.read_csv('TrainingData.csv', index_col = 'application_key',low_memory = False)

# Map the textual missing-value markers to NaN in one pass.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.
df_raw = df_raw.replace(['na', 'missing', '#VALUE!'], np.nan)

# Integer-encode the categorical column.
labelencoder = LabelEncoder()
df_raw['mvar47'] = labelencoder.fit_transform(df_raw['mvar47'])

df_raw['mvar47'] = df_raw['mvar47'].astype('int')


# Convert the first 51 columns to numeric, skipping position 46
# (presumably 'mvar47', already encoded above - TODO confirm).
# The column is replaced by label: `df.iloc[:, k] = ...` writes into the existing
# column on pandas 2.x and would leave the dtype as object.
for k in range(51):

    if k == 46:
        continue

    col = df_raw.columns[k]
    df_raw[col] = pd.to_numeric(df_raw[col])

In [None]:
# Every column of the raw frame except the label is a feature.
non_label_cols = list(df_raw.columns)
non_label_cols.remove('default_ind')

X, y = df_raw[non_label_cols], df_raw['default_ind']

# Stratified 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [None]:
X['mvar47'].unique()

In [None]:
# Scikit-learn style LightGBM configuration.
# NOTE(review): the following cell rebinds `lgbm_model` to the result of
# lightgbm.train(params, ...), so this estimator looks unused - confirm.
lgbm_model = LGBMClassifier(boosting_type = 'goss',
                       learning_rate = 0.003,
                       n_estimators = 3000,
                       objective = 'binary',
                       num_leaves = 40,
                       max_depth = -1,
                       is_unbalance = True,
                       reg_lambda = 3,
                       reg_alpha = 0,
                       colsample_bytree = 0.9,
                       min_split_gain = 0.0,
                       random_state = 42,
                       categorical_feature=46
                       )

# Parameter dict for the native lightgbm.train API. It differs from the
# estimator above in learning_rate (0.002 vs 0.003) and n_estimators (4000 vs 3000).
# NOTE(review): the categorical feature is given here by position (46) and on
# the Datasets below by name ('mvar47') - presumably the same column; verify
# which of the two specifications LightGBM actually honours.
params = {'boosting_type': 'goss',
          'learning_rate': 0.002,
         'n_estimators': 4000,
         'objective': 'binary',
         'num_leaves': 40,
         'max_depth': -1,
         'is_unbalance': True,
         'reg_lambda': 3,
         'reg_alpha': 0,
         'colsample_bytree': 0.9,
         'min_split_gain': 0.0,
         'random_state': 42,
         'categorical_feature': 46,
         'metric': 'auc'}

# Training partition wrapped as a LightGBM Dataset, 'mvar47' marked categorical.
train_data = lightgbm.Dataset(X_train, 
                         label=y_train, 
                         categorical_feature=['mvar47'], 
                         free_raw_data=False)

# Hold-out partition; the training cell passes it as the validation set.
test_data = lightgbm.Dataset(X_test, 
                        label=y_test, 
                        categorical_feature=['mvar47'], 
                        free_raw_data=False)

In [None]:
%time lgbm_model = lightgbm.train(params, train_data, valid_sets = [test_data], categorical_feature=['mvar47'])

In [None]:
# The model returned by lightgbm.train outputs scores rather than labels here
# (presumably P(class 1) for the binary objective - confirm); np.round turns
# them into 0/1 labels, applied to the whole array at once.
y_train_pred_lg = np.round(lgbm_model.predict(X_train))

y_test_pred_lg = np.round(lgbm_model.predict(X_test))

# classification_report takes (y_true, y_pred); reversed arguments swap
# precision with recall and report predicted counts as support.
print(classification_report(y_train, y_train_pred_lg))
print(classification_report(y_test, y_test_pred_lg))

In [None]:
f1_score(y_test, y_test_pred_lg)

In [None]:
# Every column of df_train except the label is a feature.
non_label_cols = list(df_train.columns)
non_label_cols.remove('default_ind')

X, y = df_train[non_label_cols], df_train['default_ind']

# Stratified 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [None]:
# GOSS-boosted LightGBM classifier on all df_train feature columns.
lgbm_model = LGBMClassifier(
    boosting_type='goss',
    objective='binary',
    n_estimators=3500,
    learning_rate=0.003,
    num_leaves=40,
    max_depth=-1,
    is_unbalance=True,
    # scale_pos_weight = 2.5,
    reg_lambda=3,
    reg_alpha=0,
    colsample_bytree=0.9,
    min_split_gain=0.0,
    random_state=42,
)

In [None]:
%time lgbm_model.fit(X_train, y_train)

In [None]:
# Hard class predictions of the LightGBM classifier.
y_train_pred_lgbm = lgbm_model.predict(X_train)
y_test_pred_lgbm = lgbm_model.predict(X_test)

# classification_report takes (y_true, y_pred); reversed arguments swap
# precision with recall and report predicted counts as support.
print(classification_report(y_train, y_train_pred_lgbm))
print(classification_report(y_test, y_test_pred_lgbm))

In [None]:
f1_score(y_test, y_test_pred_lgbm)

In [None]:
lightgbm.plot_importance(lgbm_model, max_num_features = 15)

#### Optuna

In [None]:
!pip install optuna

In [None]:
import optuna  # pip install optuna
from optuna.integration import LightGBMPruningCallback

from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold

In [None]:
# Move the 'application_key' index into a column and discard that column, so
# both objects carry a fresh positional index for the cross-validation folds.
# NOTE(review): if y was a Series it presumably becomes a single-column
# DataFrame here - confirm.
X = X.reset_index().drop(columns=['application_key'])
y = y.reset_index().drop(columns=['application_key'])

In [None]:
def objective(trial, X, y):
    """Optuna objective: mean cross-validated F1 of a GOSS LightGBM classifier.

    Samples one hyper-parameter configuration from ``trial``, trains a
    classifier on each of five stratified folds (with early stopping on the
    held-out fold) and scores the held-out predictions with ``f1_score``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``iloc``.
    y : pandas.Series, single-column pandas.DataFrame or 1-D array
        Binary labels, row-aligned with ``X``.

    Returns
    -------
    float
        Mean F1 score over the folds.
    """
    n_splits = 5

    param_grid = {
        # Fixed upper bound on boosting rounds; early stopping picks the actual count.
        "n_estimators": trial.suggest_categorical("n_estimators", [10000]),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1),
        "num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 200, 10000, step=100),
        "lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
        "lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        # Column subsampling is tuned once, through feature_fraction; the
        # misspelt 'col_sample_bytree' key is not a LightGBM parameter name
        # and is therefore not sampled.
        "feature_fraction": trial.suggest_float(
            "feature_fraction", 0.2, 0.9, step=0.1
        ),
    }

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state = 1121218)

    # Flatten the labels so that a Series and a single-column DataFrame are
    # handled alike and the estimator always receives a 1-D target.
    labels = np.ravel(y)

    cv_scores = np.empty(n_splits)

    for idx, (train_idx, test_idx) in enumerate(cv.split(X, labels)):

        X_train, X_test = X.iloc[train_idx, :], X.iloc[test_idx, :]
        y_train, y_test = labels[train_idx], labels[test_idx]

        # The split-gain threshold is tuned via min_gain_to_split above, so no
        # separate fixed min_split_gain is passed.
        model = lightgbm.LGBMClassifier(objective = "binary", is_unbalance = True,
                                        boosting_type = 'goss', **param_grid)
        # NOTE(review): 'f1' does not appear to be a built-in LightGBM metric
        # name - confirm which metric early stopping actually monitors.
        # The held-out fold is used both for early stopping and for scoring,
        # so the returned F1 is somewhat optimistic.
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_test, y_test)],
            eval_metric="f1",
            # Callback form of early stopping; the early_stopping_rounds
            # keyword of fit() is not available in LightGBM 4.
            # A LightGBMPruningCallback(trial, "binary_logloss") could be
            # appended here to let Optuna prune unpromising trials.
            callbacks=[lightgbm.early_stopping(stopping_rounds=100)],
        )
        preds = model.predict(X_test)
        cv_scores[idx] = f1_score(y_test, preds)

    return np.mean(cv_scores)

In [None]:
# Single-objective study: larger objective values are better.
study = optuna.create_study(study_name="LGBM Classifier", direction="maximize")


def func(trial):
    """Adapter that passes the notebook-level X and y to the objective."""
    return objective(trial, X, y)


study.optimize(func, n_trials=20)

In [None]:
# The study maximises the mean cross-validated F1 returned by objective(),
# so the best value is labelled as F1 rather than RMSE.
print(f"\tBest value (mean CV F1): {study.best_value:.5f}")
print("\tBest params:")

for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")

print(study.system_attrs)

In [None]:
print(f"Number of trials on the Pareto front: {len(study.best_trials)}")

# The objective value is the mean cross-validated F1, so the output is
# labelled as F1 (the variable name is kept as it was).
trial_with_highest_accuracy = max(study.best_trials, key=lambda t: t.values[0])
print("Trial with highest F1: ")
print(f"\tnumber: {trial_with_highest_accuracy.number}")
print(f"\tparams: {trial_with_highest_accuracy.params}")
print(f"\tvalues: {trial_with_highest_accuracy.values}")


In [None]:
study.optimize(func, n_trials=20)

In [None]:
# The study maximises the mean cross-validated F1 returned by objective(),
# so the best value is labelled as F1 rather than RMSE.
print(f"\tBest value (mean CV F1): {study.best_value:.5f}")
print("\tBest params:")

for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")

print(study.user_attrs)

### Feature engineering

In [None]:
df_last = df_train.copy()

In [None]:
sns.boxplot(x = 'default_ind', y = 'mvar28', data = df_last)

In [None]:
sns.boxplot(x = df_last['default_ind'], y = np.log(1+df_last['mvar1']/(df_last['mvar2']+1)))

In [None]:
df_last['gen2'] = df_last['mvar1']/(df_last['mvar2']+1)

In [None]:
df_last.info()

In [None]:
# Split built from df_train.
# NOTE(review): the next cell rebuilds X, y and the split from df_last, so this
# cell looks redundant when the notebook is run top to bottom - confirm.
non_label_cols = list(df_train.columns)
non_label_cols.remove('default_ind')

X, y = df_train[non_label_cols], df_train['default_ind']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [None]:
# Features: every df_last column except the label, with 'mvar28' removed.
non_label_cols = list(df_last.columns)
non_label_cols.remove('default_ind')

X = df_last[non_label_cols].drop(columns=['mvar28'])
y = df_last['default_ind']

# Stratified 70/30 hold-out split; note the seed here is 1, not 42.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=1,
)

In [None]:
# GOSS LightGBM classifier for the engineered feature set.
lgbm_model = LGBMClassifier(
    boosting_type='goss',
    objective='binary',
    n_estimators=10000,
    learning_rate=0.001,
    num_leaves=40,
    max_depth=-1,
    is_unbalance=True,
    reg_lambda=3.1,
    reg_alpha=0,
    colsample_bytree=0.91,
    min_split_gain=0,
    min_data_in_bin=4,
    min_child_samples=20,
    random_state=42,
)

In [None]:
%time lgbm_model.fit(X_train, y_train)

In [None]:
# Hard class predictions of the LightGBM classifier.
y_train_pred_lgbm = lgbm_model.predict(X_train)
y_test_pred_lgbm = lgbm_model.predict(X_test)

# classification_report takes (y_true, y_pred); reversed arguments swap
# precision with recall and report predicted counts as support.
print(classification_report(y_train, y_train_pred_lgbm))
print(classification_report(y_test, y_test_pred_lgbm))

In [None]:
f1_score(y_test, y_test_pred_lgbm)

In [None]:
y_train_prob_lgbm = lgbm_model.predict_proba(X_train)

In [None]:
y_train_prob_lgbm

In [None]:
y_train_proba_lgbm = lgbm_model.predict_proba(X_train)

In [None]:
(y_train_proba_lgbm[y_train_pred_lgbm != y_train])

In [None]:
(y_train[y_train_pred_lgbm != y_train])

In [None]:
# Unit sample weight for every training row. Created as float so that the
# fractional weight assigned to misclassified rows afterwards fits the dtype.
weights = pd.Series(1.0, index = y_train.index)

In [None]:
weights[y_train_pred_lgbm != y_train] = 1.5

In [None]:
weights

In [None]:
# GOSS LightGBM classifier that is fitted with per-sample weights.
lgbm_model_weight = LGBMClassifier(
    boosting_type='goss',
    objective='binary',
    n_estimators=10000,
    learning_rate=0.001,
    num_leaves=40,
    max_depth=-1,
    is_unbalance=True,
    reg_lambda=3.5,
    reg_alpha=0,
    colsample_bytree=0.91,
    min_split_gain=0,
    min_data_in_bin=4,
    min_child_samples=20,
    random_state=42,
)

In [None]:
%time lgbm_model_weight.fit(X_train, y_train, sample_weight = weights) 

In [None]:
# Hard class predictions of the sample-weighted LightGBM classifier.
y_train_pred_lgbmw = lgbm_model_weight.predict(X_train)
y_test_pred_lgbmw = lgbm_model_weight.predict(X_test)

# classification_report takes (y_true, y_pred); reversed arguments swap
# precision with recall and report predicted counts as support.
print(classification_report(y_train, y_train_pred_lgbmw))
print(classification_report(y_test, y_test_pred_lgbmw))

In [None]:
f1_score(y_test, y_test_pred_lgbmw)

In [None]:
lightgbm.plot_importance(lgbm_model_weight, max_num_features = 20)

In [None]:
(y_train_pred_lgbm != y_train_pred_lgbmw).sum()

In [None]:
len(y_train)

In [None]:
len(y_train_pred_lgbm)

In [None]:
y_train_pred_lgbm

In [None]:
f1_score(y_test, y_test_pred_lgbm)

In [None]:
(0.5987620936241812 + 0.6022331612438468 + 0.6004190362167017)/3

In [None]:
(0.5976707888101812+0.6008009084932161+0.6004560729716754)/3

0.6008009084932161

In [None]:
lightgbm.plot_importance(lgbm_model, max_num_features = 25)

In [None]:
# 'mvar28' is dropped from X when the engineered feature set is built, so an
# unconditional .index('mvar28') raises ValueError. Show the importance only
# when the column is actually among the features, otherwise None.
feature_names = list(X.columns)
lgbm_model.feature_importances_[feature_names.index('mvar28')] if 'mvar28' in feature_names else None

In [None]:
lightgbm.plot_importance(lgbm_model, max_num_features = 15)

In [None]:
y_train_prob_lgbm = lgbm_model.predict_proba(X_train)
y_test_prob_lgbm = lgbm_model.predict_proba(X_test)

In [None]:
# Fresh GOSS LightGBM classifier with the same settings as the engineered-feature model.
lgbm_model = LGBMClassifier(
    boosting_type='goss',
    objective='binary',
    n_estimators=10000,
    learning_rate=0.001,
    num_leaves=40,
    max_depth=-1,
    is_unbalance=True,
    reg_lambda=3.1,
    reg_alpha=0,
    colsample_bytree=0.91,
    min_split_gain=0,
    min_data_in_bin=4,
    min_child_samples=20,
    random_state=42,
)

In [None]:
%time lgbm_model.fit(X, y)

In [None]:
# NOTE: lgbm_model was fitted on the complete X, y in the preceding cell, and
# X_test is a subset of X, so the "test" report below is an in-sample figure,
# not an out-of-sample estimate.
y_train_pred_lgbm = lgbm_model.predict(X_train)
y_test_pred_lgbm = lgbm_model.predict(X_test)

# classification_report takes (y_true, y_pred); reversed arguments swap
# precision with recall and report predicted counts as support.
print(classification_report(y_train, y_train_pred_lgbm))
print(classification_report(y_test, y_test_pred_lgbm))

In [None]:
# Every column of df_feat except the label is a feature.
non_label_cols = list(df_feat.columns)
non_label_cols.remove('default_ind')

X, y = df_feat[non_label_cols], df_feat['default_ind']

# Stratified 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [None]:
# GOSS LightGBM classifier for the df_feat feature set (smaller learning rate).
lgbm_model = LGBMClassifier(
    boosting_type='goss',
    objective='binary',
    n_estimators=10000,
    learning_rate=0.0005,
    num_leaves=40,
    max_depth=-1,
    is_unbalance=True,
    reg_lambda=3.3,
    reg_alpha=0,
    colsample_bytree=0.89,
    min_split_gain=0,
    min_data_in_bin=4,
    min_child_samples=20,
    random_state=42,
)

In [None]:
%time lgbm_model.fit(X_train, y_train)

In [None]:
# Hard class predictions of the LightGBM classifier.
y_train_pred_lgbm = lgbm_model.predict(X_train)
y_test_pred_lgbm = lgbm_model.predict(X_test)

# classification_report takes (y_true, y_pred); reversed arguments swap
# precision with recall and report predicted counts as support.
print(classification_report(y_train, y_train_pred_lgbm))
print(classification_report(y_test, y_test_pred_lgbm))

In [None]:
f1_score(y_test, y_test_pred_lgbm)

In [None]:
# Every column of df_train except the label is a feature.
non_label_cols = list(df_train.columns)
non_label_cols.remove('default_ind')

X, y = df_train[non_label_cols], df_train['default_ind']

# Stratified 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [None]:
# LightGBM classifier using DART boosting.
lgbm_model2 = LGBMClassifier(
    boosting_type='dart',
    objective='binary',
    n_estimators=10000,
    learning_rate=0.001,
    num_leaves=40,
    max_depth=-1,
    is_unbalance=True,
    reg_lambda=3.1,
    reg_alpha=0,
    colsample_bytree=0.89,
    min_split_gain=0,
    min_data_in_bin=4,
    min_child_samples=20,
    random_state=42,
)

In [None]:
%time lgbm_model2.fit(X_train, y_train)

In [None]:
# Hard class predictions of the DART LightGBM classifier.
y_train_pred_lgbm2 = lgbm_model2.predict(X_train)
y_test_pred_lgbm2 = lgbm_model2.predict(X_test)

# classification_report takes (y_true, y_pred); reversed arguments swap
# precision with recall and report predicted counts as support.
print(classification_report(y_train, y_train_pred_lgbm2))
print(classification_report(y_test, y_test_pred_lgbm2))

In [None]:
f1_score(y_test, y_test_pred_lgbm2)

In [None]:
lightgbm.plot_importance(lgbm_model2, max_num_features = 15)

In [None]:
(y_train_pred_lgbm2 != y_train_pred_lgbm).sum()

In [None]:
# Features: every df_train column except the label, with 'mvar33' removed.
non_label_cols = list(df_train.columns)
non_label_cols.remove('default_ind')

X = df_train[non_label_cols].drop(columns=['mvar33'])
y = df_train['default_ind']

# Stratified 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [None]:
# GOSS LightGBM classifier for the feature set without 'mvar33'.
lgbm_model3 = LGBMClassifier(
    boosting_type='goss',
    objective='binary',
    n_estimators=10000,
    learning_rate=0.001,
    num_leaves=40,
    max_depth=-1,
    is_unbalance=True,
    reg_lambda=3.1,
    reg_alpha=0,
    colsample_bytree=0.91,
    min_split_gain=0,
    min_data_in_bin=4,
    min_child_samples=20,
    random_state=42,
)

In [None]:
%time lgbm_model3.fit(X_train, y_train)

In [None]:
# Hard class predictions of lgbm_model3.
y_train_pred_lgbm3 = lgbm_model3.predict(X_train)
y_test_pred_lgbm3 = lgbm_model3.predict(X_test)

# classification_report takes (y_true, y_pred); reversed arguments swap
# precision with recall and report predicted counts as support.
print(classification_report(y_train, y_train_pred_lgbm3))
print(classification_report(y_test, y_test_pred_lgbm3))

In [None]:
f1_score(y_test, y_test_pred_lgbm3)

In [None]:
lightgbm.plot_importance(lgbm_model3, max_num_features = 15)

In [None]:
# LightGBM classifier using plain GBDT boosting.
lgbm_model4 = LGBMClassifier(
    boosting_type='gbdt',
    objective='binary',
    n_estimators=10000,
    learning_rate=0.001,
    num_leaves=40,
    max_depth=-1,
    is_unbalance=True,
    reg_lambda=3.1,
    reg_alpha=0,
    colsample_bytree=0.89,
    min_split_gain=0,
    min_data_in_bin=4,
    min_child_samples=20,
    random_state=42,
)

In [None]:
%time lgbm_model4.fit(X_train, y_train)

In [None]:
# Hard class predictions of the GBDT LightGBM classifier.
y_train_pred_lgbm4 = lgbm_model4.predict(X_train)
y_test_pred_lgbm4 = lgbm_model4.predict(X_test)

# classification_report takes (y_true, y_pred); reversed arguments swap
# precision with recall and report predicted counts as support.
print(classification_report(y_train, y_train_pred_lgbm4))
print(classification_report(y_test, y_test_pred_lgbm4))

In [None]:
f1_score(y_test, y_test_pred_lgbm4)

In [None]:
lightgbm.plot_importance(lgbm_model4, max_num_features = 15)

In [None]:
# First feature slice: the first 18 non-label columns of df_train.
non_label_cols = list(df_train.columns)
non_label_cols.remove('default_ind')

X = df_train[non_label_cols].iloc[:, :18]
y = df_train['default_ind']

# Stratified 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [None]:
# GOSS LightGBM classifier for the first feature slice.
lgbm_model1 = LGBMClassifier(
    boosting_type='goss',
    objective='binary',
    n_estimators=10000,
    learning_rate=0.001,
    num_leaves=40,
    max_depth=-1,
    is_unbalance=True,
    reg_lambda=4,
    reg_alpha=0,
    colsample_bytree=0.89,
    min_split_gain=0,
    min_data_in_bin=4,
    min_child_samples=20,
    random_state=42,
)

In [None]:
%time lgbm_model1.fit(X_train, y_train)

In [None]:
# Hard class predictions of the first-slice LightGBM classifier.
y_train_pred_lgbm1 = lgbm_model1.predict(X_train)
y_test_pred_lgbm1 = lgbm_model1.predict(X_test)

# classification_report takes (y_true, y_pred); reversed arguments swap
# precision with recall and report predicted counts as support.
print(classification_report(y_train, y_train_pred_lgbm1))
print(classification_report(y_test, y_test_pred_lgbm1))

In [None]:
f1_score(y_test, y_test_pred_lgbm1)

In [None]:
y_train_prob_lgbm1 = lgbm_model1.predict_proba(X_train)[:, 1]
y_test_prob_lgbm1 = lgbm_model1.predict_proba(X_test)[:, 1]

In [None]:
# Second feature slice: non-label columns at positions 18 to 35 of df_train.
non_label_cols = list(df_train.columns)
non_label_cols.remove('default_ind')

X = df_train[non_label_cols].iloc[:, 18:36]
y = df_train['default_ind']

# Stratified 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [None]:
# GOSS LightGBM classifier for the second feature slice.
lgbm_model2 = LGBMClassifier(
    boosting_type='goss',
    objective='binary',
    n_estimators=10000,
    learning_rate=0.001,
    num_leaves=40,
    max_depth=-1,
    is_unbalance=True,
    reg_lambda=4,
    reg_alpha=0,
    colsample_bytree=0.89,
    min_split_gain=0,
    min_data_in_bin=4,
    min_child_samples=20,
    random_state=42,
)

In [None]:
%time lgbm_model2.fit(X_train, y_train)

In [None]:
# Hard class predictions of the second-slice LightGBM classifier.
y_train_pred_lgbm2 = lgbm_model2.predict(X_train)
y_test_pred_lgbm2 = lgbm_model2.predict(X_test)

# classification_report takes (y_true, y_pred); reversed arguments swap
# precision with recall and report predicted counts as support.
print(classification_report(y_train, y_train_pred_lgbm2))
print(classification_report(y_test, y_test_pred_lgbm2))

In [None]:
f1_score(y_test, y_test_pred_lgbm2)

In [None]:
y_train_prob_lgbm2 = lgbm_model2.predict_proba(X_train)[:, 1]
y_test_prob_lgbm2 = lgbm_model2.predict_proba(X_test)[:, 1]

In [None]:
# Third feature slice: non-label columns from position 36 onwards.
non_label_cols = list(df_train.columns)
non_label_cols.remove('default_ind')

X = df_train[non_label_cols].iloc[:, 36:]
y = df_train['default_ind']

# Stratified 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [None]:
# GOSS LightGBM classifier for the third feature slice.
lgbm_model3 = LGBMClassifier(
    boosting_type='goss',
    objective='binary',
    n_estimators=10000,
    learning_rate=0.001,
    num_leaves=40,
    max_depth=-1,
    is_unbalance=True,
    reg_lambda=4,
    reg_alpha=0,
    colsample_bytree=0.89,
    min_split_gain=0,
    min_data_in_bin=4,
    min_child_samples=20,
    random_state=42,
)

In [None]:
%time lgbm_model3.fit(X_train, y_train)

In [None]:
# Hard class predictions of the third-slice LightGBM classifier.
y_train_pred_lgbm3 = lgbm_model3.predict(X_train)
y_test_pred_lgbm3 = lgbm_model3.predict(X_test)

# classification_report takes (y_true, y_pred); reversed arguments swap
# precision with recall and report predicted counts as support.
print(classification_report(y_train, y_train_pred_lgbm3))
print(classification_report(y_test, y_test_pred_lgbm3))

In [None]:
f1_score(y_test, y_test_pred_lgbm3)

In [None]:
y_train_prob_lgbm3 = lgbm_model3.predict_proba(X_train)[:, 1]
y_test_prob_lgbm3 = lgbm_model3.predict_proba(X_test)[:, 1]

In [None]:
non_label_cols = list(df_train.columns)
non_label_cols.remove('default_ind')

X = df_train[non_label_cols]
y = df_train['default_ind']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y,test_size = 0.3, random_state = 42)

# Append the positive-class probabilities of the three slice models as
# meta-features. assign() builds new frames, so nothing is written into
# objects derived from X (avoids a possible SettingWithCopyWarning).
# NOTE(review): the training-set probabilities come from models fitted on the
# same rows, so m1..m3 are presumably optimistic on X_train - confirm whether
# out-of-fold predictions were intended.
X_train = X_train.assign(m1 = y_train_prob_lgbm1,
                         m2 = y_train_prob_lgbm2,
                         m3 = y_train_prob_lgbm3)

X_test = X_test.assign(m1 = y_test_prob_lgbm1,
                       m2 = y_test_prob_lgbm2,
                       m3 = y_test_prob_lgbm3)

In [None]:
# Final GOSS LightGBM classifier trained on the original features plus the
# three slice-model probabilities; much stronger L2 regularisation.
lgbm_modelf = LGBMClassifier(
    boosting_type='goss',
    objective='binary',
    n_estimators=10000,
    learning_rate=0.001,
    num_leaves=20,
    max_depth=-1,
    is_unbalance=True,
    reg_lambda=3000,
    reg_alpha=0,
    colsample_bytree=0.6,
    min_split_gain=0,
    min_data_in_bin=4,
    min_child_samples=40,
    random_state=42,
)

In [None]:
%time lgbm_modelf.fit(X_train, y_train)

In [None]:
# Hard class predictions of the final stacked LightGBM classifier.
y_train_pred_lgbmf = lgbm_modelf.predict(X_train)
y_test_pred_lgbmf = lgbm_modelf.predict(X_test)

# classification_report takes (y_true, y_pred); reversed arguments swap
# precision with recall and report predicted counts as support.
print(classification_report(y_train, y_train_pred_lgbmf))
print(classification_report(y_test, y_test_pred_lgbmf))

In [None]:
f1_score(y_test, y_test_pred_lgbmf)

In [None]:
lightgbm.plot_importance(lgbm_modelf, max_num_features = 15)

In [None]:
# Weighted blend of hard predictions, rounded back to a label.
# The random-forest term carries weight 0.0, so it does not affect the value.
y_train_pred = np.round(
    0.0*y_train_pred_rf
    + 0.3*y_train_pred_lgbm3
    + 0.3*y_train_pred_lgbm4
    + 0.4*y_train_pred_lgbm
)
y_test_pred = np.round(
    0.0*y_test_pred_rf
    + 0.3*y_test_pred_lgbm3
    + 0.3*y_test_pred_lgbm4
    + 0.4*y_test_pred_lgbm
)


In [None]:
[f1_score(y_train, y_train_pred), f1_score(y_test, y_test_pred)]

In [None]:
# Column-wise stack of the predict_proba outputs of XGBoost, LightGBM and the
# random forest, in that order.
# NOTE(review): y_*_prob_lgbm was last computed on a split made with
# random_state = 1, whereas the XGBoost and random-forest probabilities (and the
# current y_train) come from random_state = 42 splits, so the LightGBM columns
# are presumably not row-aligned with the others - confirm before relying on
# this stack.
y_train_stack = np.column_stack(
    (y_train_prob_xg, y_train_prob_lgbm, y_train_prob_rf)
)

y_test_stack = np.column_stack(
    (y_test_prob_xg, y_test_prob_lgbm, y_test_prob_rf)
)

In [None]:
y_train_stack

In [None]:
y_train_add_stack = np.column_stack((y_train_stack, y_train_stack[:, 0]*y_train_stack[:, 2]))
y_test_add_stack = np.column_stack((y_test_stack, y_test_stack[:, 0]*y_test_stack[:, 2]))

In [None]:
y_train_add_stack

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
predict_model = LogisticRegression(C = 1e-8, class_weight = 'balanced', solver = 'liblinear', 
                                   penalty = 'l2' ,max_iter = 5000)

In [None]:
predict_model.fit(y_train_stack, y_train)

In [None]:
y_train_pred = predict_model.predict(y_train_stack)
y_test_pred = predict_model.predict(y_test_stack)

In [None]:
[f1_score(y_train, y_train_pred), f1_score(y_test, y_test_pred)]

In [None]:
predict_model.coef_

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
predict_model = DecisionTreeClassifier(class_weight = 'balanced', max_depth = 3)

In [None]:
predict_model.fit(y_train_stack, y_train)

In [None]:
y_train_pred = predict_model.predict(y_train_stack)
y_test_pred = predict_model.predict(y_test_stack)

In [None]:
[f1_score(y_train, y_train_pred), f1_score(y_test, y_test_pred)]

In [None]:
y_train_stack[:, 0]

In [None]:
y_train_pred = np.round(0.5*(y_train_stack[:, 1]+y_train_stack[:, 3]))
y_test_pred = np.round(0.5*(y_test_stack[:, 1]+y_test_stack[:, 3]))

In [None]:
[f1_score(y_train, y_train_pred), f1_score(y_test, y_test_pred)]

In [None]:
y_train_pred = np.round((y_train_stack[:, 3]))
y_test_pred = np.round((y_test_stack[:, 3]))

In [None]:
[f1_score(y_train, y_train_pred), f1_score(y_test, y_test_pred)]

### First ensemble: xg + rf + lg

In [None]:
y_train_stack = np.column_stack((y_train_pred_rf, y_train_pred_xg, y_train_pred_lg))


y_test_stack = np.column_stack((y_test_pred_rf, y_test_pred_xg, y_test_pred_lg))

In [None]:
y_train_stack

Second-level (meta) model trained on the stacked predictions:

In [None]:
# Meta-model fitted on the stacked first-level predictions. Seeded so that the
# reported scores are reproducible between runs.
predict_model = RandomForestClassifier(random_state = 42)

predict_model.fit(y_train_stack, y_train)

In [None]:
y_train_pred = predict_model.predict(y_train_stack)
y_test_pred = predict_model.predict(y_test_stack)

In [None]:
[f1_score(y_train, y_train_pred), f1_score(y_test, y_test_pred)]

In [None]:
# Weighted majority vote: the random forest votes once, XGBoost and LightGBM
# vote twice each (their prediction columns are repeated). The row-wise mode is
# taken in one vectorised call instead of one scipy.stats.mode call per sample.
train_votes = np.column_stack(
    (y_train_pred_rf, y_train_pred_xg, y_train_pred_lg, y_train_pred_lg, y_train_pred_xg)
).astype('float')
y_train_pred = mode(train_votes, axis = 1, keepdims = False).mode

test_votes = np.column_stack(
    (y_test_pred_rf, y_test_pred_xg, y_test_pred_lg, y_test_pred_lg, y_test_pred_xg)
).astype('float')
y_test_pred = mode(test_votes, axis = 1, keepdims = False).mode

In [None]:
[f1_score(y_train, y_train_pred), f1_score(y_test, y_test_pred)]

#### SVM

In [None]:
from sklearn.svm import SVC

In [None]:
# Target first, then every remaining imputed column as a feature.
y = df_train_imputed['default_ind']
non_label_cols = [col for col in df_train_imputed.columns if col != 'default_ind']
X = df_train_imputed[non_label_cols]

# Stratified 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify = y,
    test_size = 0.3,
    random_state = 42,
)

In [None]:
# RBF-kernel SVM with class weights balanced against the label frequencies.
svm_model = SVC(
    C = 1.0,
    kernel = 'rbf',
    class_weight = 'balanced',
    verbose = True,
    random_state = 42,
)

In [None]:
# Time the SVM fit on the training split.
# NOTE(review): RBF kernels are sensitive to feature scale and no scaler is applied
# in this section — confirm the features were standardised earlier.
%time svm_model.fit(X_train, y_train)

In [None]:
# Predict on both splits so the training and hold-out reports can be compared.
y_train_pred_svm = svm_model.predict(X_train)
y_test_pred_svm = svm_model.predict(X_test)

# classification_report expects (y_true, y_pred); passing the predictions first
# swaps precision and recall in the printed table.
print(classification_report(y_train, y_train_pred_svm))
print(classification_report(y_test, y_test_pred_svm))

### Best so far

In [None]:
# "Best so far" feature set: df_eng without the columns listed here.
columns_to_remove = [
    'mvar2', 'mvar6', 'mvar7', 'mvar8', 'mvar11',
    'log_mvar15', 'log_mvar24', 'mvar33', 'log_mvar33',
    'log_mvar10', 'log_mvar13',
]
df_best = df_eng.drop(columns = columns_to_remove)

In [None]:
non_label_cols = list(df_best.columns)
non_label_cols.remove('default_ind')

X = df_best[non_label_cols]
y = df_best['default_ind']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y,test_size = 0.3, random_state = 42)

lgbm_model = LGBMClassifier(boosting_type = 'goss',
                       learning_rate = 0.001,
                       n_estimators = 10000,
                       objective = 'binary',
                       num_leaves = 40,
                       max_depth = -1,
                       is_unbalance = True,
                       reg_lambda = 3.1,
                       reg_alpha = 0,
                       colsample_bytree = 0.9,
                       min_split_gain = 0,
                       random_state = 42,
                       min_data_in_bin = 4,
                        min_child_samples = 20
                       )

%time lgbm_model.fit(X_train, y_train)

y_train_pred_lgbm = lgbm_model.predict(X_train)
y_test_pred_lgbm = lgbm_model.predict(X_test)

print(classification_report(y_train_pred_lgbm, y_train))
print(classification_report(y_test_pred_lgbm, y_test))

print(f1_score(y_test, y_test_pred_lgbm))

In [None]:
# Display the "best so far" feature frame.
df_best

Training on whole set:

In [None]:
non_label_cols = list(df_best.columns)
non_label_cols.remove('default_ind')

X = df_best[non_label_cols]
y = df_best['default_ind']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y,test_size = 0.3, random_state = 42)

lgbm_model = LGBMClassifier(boosting_type = 'goss',
                       learning_rate = 0.001,
                       n_estimators = 10000,
                       objective = 'binary',
                       num_leaves = 40,
                       max_depth = -1,
                       is_unbalance = True,
                       reg_lambda = 3.1,
                       reg_alpha = 0,
                       colsample_bytree = 0.9,
                       min_split_gain = 0,
                       random_state = 42,
                       min_data_in_bin = 4,
                        min_child_samples = 20
                       )

%time lgbm_model.fit(X, y)

y_train_pred_lgbm = lgbm_model.predict(X_train)
y_test_pred_lgbm = lgbm_model.predict(X_test)

print(classification_report(y_train_pred_lgbm, y_train))
print(classification_report(y_test_pred_lgbm, y_test))

#### Drop test

In [None]:
non_label_cols = list(df_best.columns)
non_label_cols.remove('default_ind')

# .copy() gives the loop its own frame to modify; assigning new columns into the
# bare df_best[...] selection triggers pandas' SettingWithCopyWarning.
X = df_best[non_label_cols].copy()
y = df_best['default_ind']

# For the columns at positions 6..43 of df_best, replace the raw column with
# log(x + 1) and re-fit. The replacements accumulate: X is never reset, so
# iteration k is scored with every earlier column still log-transformed.
for k in range(6, 44):

    col = df_best.columns[k]

    X['log_'+str(col)] = np.log(X[col]+1)
    X = X.drop([col], axis = 1)

    # The newly added log column is always the last one.
    print(X.columns[-1])

    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y,test_size = 0.3, random_state = 42)

    lgbm_model = LGBMClassifier(boosting_type = 'goss',
                       learning_rate = 0.001,
                       n_estimators = 10000,
                       objective = 'binary',
                       num_leaves = 40,
                       max_depth = -1,
                       is_unbalance = True,
                       reg_lambda = 3.1,
                       reg_alpha = 0,
                       colsample_bytree = 0.9,
                       min_split_gain = 0,
                       random_state = 42,
                       min_data_in_bin = 4,
                       min_child_samples = 20
                       )

    lgbm_model.fit(X_train, y_train)

    y_train_pred_lgbm = lgbm_model.predict(X_train)
    y_test_pred_lgbm = lgbm_model.predict(X_test)

    print("Dropped ", col, end = ':')
    print("F1 = ", f1_score(y_test, y_test_pred_lgbm))

In [None]:
# Name of the column at position 51 of df_best.
df_best.columns[51]

In [None]:
# drop() matches column LABELS, so a bare 51 looks for a column literally named 51
# and raises KeyError on string-named columns. Resolve the position to its label first.
# The result is only displayed; df_best itself is not modified.
df_best.drop(df_best.columns[51], axis = 1)

In [None]:
non_label_cols = list(df_eng.columns)
non_label_cols.remove('default_ind')

X = df_eng[non_label_cols]
X = X.drop(['mvar2', 'mvar6', 'mvar7', 'mvar8', 'mvar11', 'log_mvar15', 
            'log_mvar24', 'mvar33', 'log_mvar33', 'log_mvar10', 'log_mvar13'], axis = 1)
y = df_eng['default_ind']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y,test_size = 0.3, random_state = 42)

lgbm_model = LGBMClassifier(boosting_type = 'goss',
                       learning_rate = 0.001,
                       n_estimators = 10000,
                       objective = 'binary',
                       num_leaves = 40,
                       max_depth = -1,
                       is_unbalance = True,
                       reg_lambda = 3.1,
                       reg_alpha = 0,
                       colsample_bytree = 0.9,
                       min_split_gain = 0,
                       random_state = 42,
                       min_data_in_bin = 4,
                        min_child_samples = 20
                       )

%time lgbm_model.fit(X_train, y_train)

y_train_pred_lgbm = lgbm_model.predict(X_train)
y_test_pred_lgbm = lgbm_model.predict(X_test)

print(classification_report(y_train_pred_lgbm, y_train))
print(classification_report(y_test_pred_lgbm, y_test))

print(f1_score(y_test, y_test_pred_lgbm))

In [None]:
# Plot the 20 highest-ranked features of the current lgbm_model.
lightgbm.plot_importance(lgbm_model, max_num_features = 20)

In [None]:
# Same importance plot as the previous cell.
# NOTE(review): looks like a leftover duplicate — confirm whether it can be removed.
lightgbm.plot_importance(lgbm_model, max_num_features = 20)

In [None]:
# Same importance plot again.
# NOTE(review): looks like a leftover duplicate — confirm whether it can be removed.
lightgbm.plot_importance(lgbm_model, max_num_features = 20)

In [None]:
# Display the engineered training frame.
df_eng

### Generating predictions for submission

In [None]:
# Load the evaluation set, indexed by application_key. low_memory=False makes pandas
# read the file in one pass so each column gets a single inferred dtype.
df_test = pd.read_csv('testX.csv', index_col = 'application_key', low_memory = False)

In [None]:
# Display the raw evaluation frame.
df_test

In [None]:
# For every column (by position), print the distinct values that cannot be parsed
# as floats. The column count is read from the frame instead of being hard-coded.
for i in range(df_test.shape[1]):
    non_numeric_values = get_non_numeric(list(df_test.iloc[:, i].unique()))
    print(i, non_numeric_values)

In [None]:
# Turn the textual missing-value markers into real NaNs in one pass.
# np.nan is used rather than the np.NaN alias, which was removed in NumPy 2.0.
df_test = df_test.replace(['na', 'missing', '#VALUE!'], np.nan)

In [None]:
# Re-run the scan after the replacement to see which non-numeric values remain.
# The column count is read from the frame instead of being hard-coded.
for i in range(df_test.shape[1]):
    non_numeric_values = get_non_numeric(list(df_test.iloc[:, i].unique()))
    print(i, non_numeric_values)

In [None]:
# One-hot encode the categorical column mvar47 into 'type_*' indicator columns.
# NOTE(review): the dummies are derived from the test data alone; if its categories
# differ from the training data, the resulting columns will not line up — confirm.
df_test = pd.get_dummies(df_test, columns = ['mvar47'], prefix = ['type'])

In [None]:
# Final scan after one-hot encoding; every column should now report an empty list.
# The column count is read from the frame instead of being hard-coded.
for i in range(df_test.shape[1]):
    non_numeric_values = get_non_numeric(list(df_test.iloc[:, i].unique()))
    print(i, non_numeric_values)

In [None]:
# Convert every column to a numeric dtype (columns were read as strings wherever
# they contained textual missing-value markers).
df_test = df_test.apply(pd.to_numeric)

In [None]:
# Display the evaluation frame after numeric conversion.
df_test

In [None]:
# Add log(x + 1) versions of the selected columns.
for col in ['mvar2', 'mvar6', 'mvar7', 'mvar8', 'mvar11']:
    df_test['log_' + col] = np.log(df_test[col] + 1)

# Remove the raw columns; 'mvar7' is not in this list, so it stays alongside 'log_mvar7'.
df_test = df_test.drop(columns = ['mvar2', 'mvar6', 'mvar8', 'mvar11'])

In [None]:
# Remove mvar33 from the evaluation features.
df_test = df_test.drop(['mvar33'] ,axis = 1)

In [None]:
# Columns of the evaluation frame, for comparison with the training columns below.
df_test.columns

In [None]:
# Columns of the engineered training frame.
df_eng.columns

In [None]:
# Columns of the feature matrix used for the most recent model fit.
X.columns

In [None]:
# The imputer is applied to a frame carrying one extra placeholder column, 'lol'.
# NOTE(review): presumably this stands in for the target column present when
# mean_imputer was fitted — confirm the column count and order match the fit.
# assign() puts the placeholder on a temporary frame, leaving df_test itself unchanged.
df_test_padded = df_test.assign(lol = 0)

df_test_imputed = mean_imputer.transform(df_test_padded)
# Keep the application_key index so imputed rows stay traceable to their applications.
df_test_imputed = pd.DataFrame(df_test_imputed, columns = df_test_padded.columns, index = df_test_padded.index)

# drop() returns a new frame; assign it back so the placeholder is actually removed.
df_test_imputed = df_test_imputed.drop(['lol'], axis = 1)
df_test_imputed

In [None]:
# Select the model's feature columns from the imputed evaluation frame.
# NOTE(review): non_label_cols was last rebuilt from df_eng and still lists raw columns
# (e.g. 'mvar2', 'mvar33') that were dropped from df_test — confirm this selection
# matches the features the models below were trained on.
X_eval = df_test_imputed[non_label_cols]

#### Model 1

In [None]:
# Predict with partly_tuned_model (defined outside this section).
predictions = partly_tuned_model.predict(X_eval)

In [None]:
# Display the predicted labels.
predictions

In [None]:
# Load the submission template; the file has no header row, so the two columns are named here.
df_submit = pd.read_csv('submission.csv', names = ['app', 'def'],header = None )

In [None]:
# Overwrite the 'def' column with the model's predictions (assigned by row position).
df_submit['def'] = predictions

In [None]:
# Display the filled-in submission frame.
# NOTE(review): no cell in this section writes df_submit to disk — confirm where it is saved.
df_submit

#### Model 2 - class weighting

In [None]:
# Predict with the class-weighted model and put the labels into the 'def' column
# of a freshly loaded submission template.
predictions = weighted_tuned_model.predict(X_eval)

df_submit = pd.read_csv('submission.csv', header = None, names = ['app', 'def'])
df_submit = df_submit.assign(**{'def': predictions})


<b> Currently second, with score 59.64% </b>

#### Model 3 - regularization

In [None]:
# Predict with the regularised XGBoost model and put the labels into the 'def'
# column of a freshly loaded submission template.
predictions = regularized_xgb_model.predict(X_eval)

df_submit = pd.read_csv('submission.csv', header = None, names = ['app', 'def'])
df_submit = df_submit.assign(**{'def': predictions})


#### Model 4 - more regularization

In [None]:
# Predict with the more strongly regularised XGBoost model and put the labels into
# the 'def' column of a freshly loaded submission template.
predictions = lambda_xgb_model.predict(X_eval)

df_submit = pd.read_csv('submission.csv', header = None, names = ['app', 'def'])
df_submit = df_submit.assign(**{'def': predictions})


#### Model 5

In [None]:
# Select exactly the columns lgbm_model was trained on (the columns of X), in the same order.
# non_label_cols cannot be used here: as last defined it still lists raw columns such as
# 'mvar2' and 'mvar33', which were dropped from df_test, so indexing with it raises KeyError.
X_eval = df_test[list(X.columns)]

In [None]:
# Display the evaluation feature matrix (not imputed; taken straight from df_test).
X_eval

In [None]:
# Coerce every feature column to a numeric dtype. A new frame is built instead of
# assigning column-by-column into X_eval, which is a selection from df_test and
# would trigger pandas' SettingWithCopyWarning.
X_eval = X_eval.apply(pd.to_numeric)


predictions = lgbm_model.predict(X_eval)

# Put the labels into the 'def' column of a freshly loaded submission template.
df_submit = pd.read_csv('submission.csv', names = ['app', 'def'],header = None )
df_submit['def'] = predictions


In [None]:
# Predict on the training feature columns only, in training order. df_test also carries
# columns the model never saw (e.g. raw 'mvar7', and 'lol' once the imputation cell has
# run), so passing the whole frame gives the model the wrong set of features.
predictions = lgbm_model.predict(df_test[list(X.columns)])

# Put the labels into the 'def' column of a freshly loaded submission template.
df_submit = pd.read_csv('submission.csv', names = ['app', 'def'],header = None )
df_submit['def'] = predictions
