In [46]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score
from sklearn.feature_selection import RFECV
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold, train_test_split
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import LinearSVC

import lightgbm as lgbm
import xgboost as xgb

from imblearn.over_sampling import SMOTE

from IPython.display import display

In [14]:
df = pd.read_csv('data/train_final.csv', index_col='Id')
df_ult = pd.read_csv('data/test_final.csv', index_col='Id')

X = df.drop('Y', axis='columns').values
y = df['Y'].values

X_ult = df_ult.values

scale_pos_weight = len(y[y == 0])/len(y[y == 1])

In [13]:
skfold = StratifiedKFold(5, shuffle=True, random_state=0)

In [11]:
def transform(df, y=True):
    df_copy = df.copy()
    df_copy['f8-f19'] = df['f8'] - df['f19']
    df_copy['f8-f13'] = df['f8'] - df['f13']
    df_copy['f17-f4'] = df['f17'] - df['f4']
    df_copy['f4-f7'] = df['f4'] - df['f7']
    df_copy['f13-f19'] = df['f13'] - df['f19']
    df_copy['f8wf13wf19'] = PCA(n_components=1).fit_transform(df[['f8', 'f13', 'f19']])
    return df_copy.drop('Y', axis='columns') if y else df_copy

selected_cols = ['f14', 'f13', 'f15', 'f4', 'f8wf13wf19', 'f4-f7', 'f16', 'f17', 'f19', 'f1', 'f8-f19']

In [18]:
def log_reg(cols):
    base_param = {
        'C': [0.01, 0.1, 1, 10, 100, 1000],
        'tol': [1e-2, 1e-3, 1e-4, 1e-5],
        'class_weight': ['balanced']
    }

    tune_param = [{
        'penalty': ['l2'], 
        'solver': ['newton-cg', 'sag', 'lbfgs']
    }, {
        'penalty': ['elasticnet'],
        'solver': ['saga'],
        'l1_ratio': [0.1, 0.25, 0.5, 0.75, 0.9]
    }, {
        'penalty': ['l2'],
        'solver': ['liblinear', 'saga']
    }]

    [param.update(base_param) for param in tune_param]

    grid = GridSearchCV(LogisticRegression(), tune_param, cv=skfold, scoring='roc_auc', n_jobs=-1)

    grid.fit(transform(df)[cols].values, y)

    print(f'Best parameters {grid.best_params_}.')
    print(f'Best auc score is {grid.best_score_}.')
    
    return grid.best_estimator_

In [8]:
def ext_tre(cols):
    tune_param = {
        'n_estimators': [100, 500, 1000],
        'criterion': ['gini', 'entropy'],
        'max_depth': [5, None],
        'class_weight': ['balanced'],
        'random_state': [0],
        'n_jobs': [-1]
    }

    grid = GridSearchCV(ExtraTreesClassifier(), tune_param, cv=skfold, scoring='roc_auc', n_jobs=-1)

    grid.fit(transform(df)[cols].values, y)

    print(f'Best parameters {grid.best_params_}.')
    print(f'Best auc score is {grid.best_score_}.')
    
    return grid.best_estimator_

In [16]:
def rdm_for(cols):
    tune_param = {
        'n_estimators': [1000],
        'criterion': ['gini', 'entropy'],
        'max_depth': [None],
        'class_weight': ['balanced'],
        'random_state': [0],
        'n_jobs': [-1]
    }

    grid = GridSearchCV(RandomForestClassifier(), tune_param, cv=skfold, scoring='roc_auc', n_jobs=-1)

    grid.fit(transform(df)[cols].values, y)

    print(f'Best parameters {grid.best_params_}.')
    print(f'Best auc score is {grid.best_score_}.')
    
    return grid.best_estimator_

In [42]:
def lin_svc(cols):
    tune_param = {
        'tol': [0.0001, 0.00001, 0.001], 
        'C': [1.0, 0.1, 10, 0.01, 100], 
        'random_state': [0], 
        'max_iter': [500, 1000]
    }

    grid = GridSearchCV(LinearSVC(), tune_param, cv=skfold, scoring='roc_auc', n_jobs=-1)

    grid.fit(transform(df)[cols].values, y)

    print(f'Best parameters {grid.best_params_}.')
    print(f'Best auc score is {grid.best_score_}.')
    
    return grid.best_estimator_

In [43]:
lin_csv_sel = lin_svc(selected_cols)

Best parameters {'C': 1.0, 'max_iter': 1000, 'random_state': 0, 'tol': 0.0001}.
Best auc score is 0.5058158020272084.




In [52]:
def gra_bst(cols):
    tune_param = {
        'n_estimators': [500, 1000],
        'loss' : ['deviance', 'exponential'],
        'max_depth': [5, None],
#         'class_weight': ['balanced'],
        'random_state': [0],
    }

    grid = GridSearchCV(GradientBoostingClassifier(), tune_param, cv=skfold, scoring='roc_auc', n_jobs=-1)

    grid.fit(transform(df)[cols].values, y)

    print(f'Best parameters {grid.best_params_}.')
    print(f'Best auc score is {grid.best_score_}.')
    
    return grid.best_estimator_

In [56]:
def aba_bst(cols):
    tune_param = {
        'n_estimators': [100, 500, 1000],
        'learning_rate': [0.1, 1],
        'random_state': [0],
    }

    grid = GridSearchCV(AdaBoostClassifier(), tune_param, cv=skfold, scoring='roc_auc', n_jobs=-1)

    grid.fit(transform(df)[cols].values, y)

    print(f'Best parameters {grid.best_params_}.')
    print(f'Best auc score is {grid.best_score_}.')
    
    return grid.best_estimator_

In [57]:
aba_bst_sel = aba_bst(selected_cols)

Best parameters {'learning_rate': 1, 'n_estimators': 1000, 'random_state': 0}.
Best auc score is 0.8560549273161631.


In [53]:
gra_bst_sel = gra_bst(selected_cols)

Best parameters {'loss': 'exponential', 'max_depth': 5, 'n_estimators': 500, 'random_state': 0}.
Best auc score is 0.8796743696046784.


In [15]:
ext_tre_sel = ext_tre(selected_cols)

Best parameters {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': None, 'n_estimators': 1000, 'n_jobs': -1, 'random_state': 0}.
Best auc score is 0.8640559016418421.


In [27]:
rdm_for_sel = rdm_for(selected_cols)

Best parameters {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': None, 'n_estimators': 1000, 'n_jobs': -1, 'random_state': 0}.
Best auc score is 0.8824712258965254.


In [30]:
rdm_for_0t6 = rdm_for(['f1', 'f2', 'f3', 'f4', 'f5', 'f6'])

Best parameters {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': None, 'n_estimators': 1000, 'n_jobs': -1, 'random_state': 0}.
Best auc score is 0.6443147506530321.


In [34]:
rdm_for_7t12 = rdm_for(['f7', 'f8', 'f9', 'f10', 'f11', 'f12'])

Best parameters {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': None, 'n_estimators': 1000, 'n_jobs': -1, 'random_state': 0}.
Best auc score is 0.6274706873751901.


In [35]:
rdm_for_13t18 = rdm_for(['f13', 'f14', 'f15', 'f16', 'f17', 'f18'])

Best parameters {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': None, 'n_estimators': 1000, 'n_jobs': -1, 'random_state': 0}.
Best auc score is 0.8519775508449846.


In [40]:
rdm_for_19t24 = rdm_for(['f19', 'f20', 'f21', 'f22', 'f23', 'f24'])

Best parameters {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': None, 'n_estimators': 1000, 'n_jobs': -1, 'random_state': 0}.
Best auc score is 0.5741570859334137.


In [39]:
rdm_for_synth = rdm_for(['f8-f19', 'f8-f13', 'f17-f4', 'f4-f7', 'f13-f19', 'f8wf13wf19'])

Best parameters {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': None, 'n_estimators': 1000, 'n_jobs': -1, 'random_state': 0}.
Best auc score is 0.7703803960532711.


In [19]:
log_reg_sel = log_reg(selected_cols)

Best parameters {'C': 100, 'class_weight': 'balanced', 'penalty': 'l2', 'solver': 'newton-cg', 'tol': 0.01}.
Best auc score is 0.5513333054965867.


In [27]:
train = []
test = []
custom_df = transform(df)
X1 = custom_df.values
for train_idx, _ in skfold.split(X1, y):
    train.append(X1[train_idx])
    test.append(y[train_idx])
    
print(np.array(train)[0].shape)

train_df = pd.DataFrame(columns=custom_df.columns.tolist())
train_df

(13106, 30)


In [38]:
from functools import reduce
temp = np.array(train)
reduce(lambda x, y: np.append(x, y, axis=0), temp).shape

(65532, 30)

In [39]:
df.shape

(16383, 25)

In [21]:
transform(df).iloc[:, [0,1]]

Unnamed: 0_level_0,f1,f2
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,25884,1
2,34346,1
3,34923,1
4,80926,1
5,4674,1
...,...,...
16379,33328,1
16380,19944,1
16381,28359,1
16382,7542,1


In [42]:
def full_train(clf, cols): return clf.fit(transform(df)[cols].values, y)

full_train(log_reg_sel, selected_cols)
full_train(rdm_for_sel, selected_cols)
full_train(rdm_for_0t6, ['f1', 'f2', 'f3', 'f4', 'f5', 'f6'])
full_train(rdm_for_7t12, ['f7', 'f8', 'f9', 'f10', 'f11', 'f12'])
full_train(rdm_for_13t18, ['f13', 'f14', 'f15', 'f16', 'f17', 'f18'])
full_train(rdm_for_19t24, ['f19', 'f20', 'f21', 'f22', 'f23', 'f24'])
full_train(rdm_for_synth, ['f8-f19', 'f8-f13', 'f17-f4', 'f4-f7', 'f13-f19', 'f8wf13wf19']);



In [44]:
def pred_prob(clf, df, cols): return clf.predict_proba(transform(df)[cols].values)[:, 1]

log_pred = pred_prob(log_reg_sel, df, selected_cols)
rdm_sel_pred = pred_prob(rdm_for_sel, df, selected_cols)
rdm_0t6_pred = pred_prob(rdm_for_0t6, df, ['f1', 'f2', 'f3', 'f4', 'f5', 'f6'])
rdm_7t12_pred = pred_prob(rdm_for_7t12, df, ['f7', 'f8', 'f9', 'f10', 'f11', 'f12'])
rdm_13t18_pred = pred_prob(rdm_for_13t18, df, ['f13', 'f14', 'f15', 'f16', 'f17', 'f18'])
rdm_19t24_pred = pred_prob(rdm_for_19t24, df, ['f19', 'f20', 'f21', 'f22', 'f23', 'f24'])
rdm_synth_pred = pred_prob(rdm_for_synth, df, ['f8-f19', 'f8-f13', 'f17-f4', 'f4-f7', 'f13-f19', 'f8wf13wf19'])

In [50]:
temp = transform(df)

X2 = np.array([log_pred, 
               rdm_sel_pred, 
               rdm_0t6_pred, 
               rdm_7t12_pred, 
               rdm_13t18_pred, 
               rdm_19t24_pred, 
               rdm_synth_pred, 
               temp['f14'], 
               temp['f13'], 
               temp['f15'], 
               temp['f4'], 
               temp['f8wf13wf19']]).T

In [74]:
fix_param = {
    'boosting_type':                 'goss', 
    'metric':                        'auc', 
    'objective':                     'binary', 
    'scale_pos_weight':              scale_pos_weight, 
    'n_jobs':                        -1, 
#     'seed': 75
}

now_param = {
    'n_estimators':                  [1000], 
    'learning_rate':                 [0.1], 
#     'min_child_weight':              [1e-3, 1e-9], 
#     'min_split_gain':                [6e-6], 
    'colsample_bytree':              [0.01], 
    'reg_alpha':                     [0], 
    'reg_lambda':                    [0.89995, 1], 
}

more_fix_param = {}
# more_fix_param = {k:v[0] for k, v in now_param.items()}

# now_param = { 'n_estimators': [932, 933, 934], }

grid = GridSearchCV(lgbm.LGBMClassifier(**fix_param, **more_fix_param), 
                          now_param, 
                          cv=StratifiedKFold(10), 
                          scoring='roc_auc', 
                          n_jobs=-1)

grid.fit(X2, y)

print(f'Best parameters {grid.best_params_}.')
print(f'Best auc score is {grid.best_score_}.')

Best parameters {'colsample_bytree': 0.01, 'learning_rate': 0.1, 'n_estimators': 1000, 'reg_alpha': 0, 'reg_lambda': 0.89995}.
Best auc score is 1.0.


In [55]:
grid.best_estimator_.feature_importances_

array([ 858,  164,  876,  869,  126,  475,  613, 1323,  597, 1165,  876,
        674], dtype=int32)

In [60]:
def pred_ult(clf, df, cols): return clf.predict_proba(transform(df, y=False)[cols].values)[:, 1]

log_pred2 = pred_ult(log_reg_sel, df_ult, selected_cols)
rdm_sel_pred2 = pred_ult(rdm_for_sel, df_ult, selected_cols)
rdm_0t6_pred2 = pred_ult(rdm_for_0t6, df_ult, ['f1', 'f2', 'f3', 'f4', 'f5', 'f6'])
rdm_7t12_pred2 = pred_ult(rdm_for_7t12, df_ult, ['f7', 'f8', 'f9', 'f10', 'f11', 'f12'])
rdm_13t18_pred2 = pred_ult(rdm_for_13t18, df_ult, ['f13', 'f14', 'f15', 'f16', 'f17', 'f18'])
rdm_19t24_pred2 = pred_ult(rdm_for_19t24, df_ult, ['f19', 'f20', 'f21', 'f22', 'f23', 'f24'])
rdm_synth_pred2 = pred_ult(rdm_for_synth, df_ult, ['f8-f19', 'f8-f13', 'f17-f4', 'f4-f7', 'f13-f19', 'f8wf13wf19'])

In [62]:
temp2 = transform(df_ult, y=False)

X3 = np.array([log_pred2, 
               rdm_sel_pred2, 
               rdm_0t6_pred2, 
               rdm_7t12_pred2, 
               rdm_13t18_pred2, 
               rdm_19t24_pred2, 
               rdm_synth_pred2, 
               temp2['f14'], 
               temp2['f13'], 
               temp2['f15'], 
               temp2['f4'], 
               temp2['f8wf13wf19']]).T

In [64]:
grid.best_estimator_.fit(X2, y)

LGBMClassifier(boosting_type='goss', class_weight=None, colsample_bytree=0.01,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               metric='auc', min_child_samples=20, min_child_weight=0.001,
               min_split_gain=0.0, n_estimators=1000, n_jobs=-1, num_leaves=31,
               objective='binary', random_state=None, reg_alpha=0,
               reg_lambda=0.89995, scale_pos_weight=0.06141885325558795,
               silent=True, subsample=1.0, subsample_for_bin=200000,
               subsample_freq=0)

In [71]:
req_rows = 16384
clf = grid.best_estimator_
result = pd.DataFrame(clf.predict_proba(X3)[:, 1], 
                      index=list(range(req_rows, req_rows*2 + 1)), 
                      columns=['Y'])

result.index.name = 'Id'
result.to_csv(f'submission_29_0.csv', float_format='%.20f')