In [51]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

pd.set_option('display.max_columns', 50)

train_df = pd.read_csv('training_data.csv', index_col = 0)

_columns = ['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0',
       'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1',
       'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
       'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5',
       'PAY_AMT6', 'target']
train_df.columns = _columns
train_df.drop('ID', axis=0, inplace=True)

for col in _columns:
    try:
        train_df[col] = train_df[col].astype('float64')
    except:
        continue

In [52]:
conditions = [
    train_df['EDUCATION'].eq(1),
    train_df['EDUCATION'].eq(2),
    train_df['EDUCATION'].eq(3)
]
choices = [
    3,
    2,
    1
]

train_df['EDUCATION'] = np.select(conditions, choices, 4)
train_df['EDUCATION'].value_counts()

2    10516
3     7919
1     3713
4      351
Name: EDUCATION, dtype: int64

In [53]:
conditions = [
    train_df['MARRIAGE'].eq(1),
    train_df['MARRIAGE'].eq(2),
]
choices = [
    1,
    2
]

train_df['MARRIAGE'] = np.select(conditions, choices, 3)
train_df['MARRIAGE'].value_counts()

2    12026
1    10195
3      278
Name: MARRIAGE, dtype: int64

In [54]:
train_df.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,target
28835,220000.0,2.0,3,2,36.0,0.0,0.0,0.0,0.0,0.0,0.0,222598.0,222168.0,217900.0,221193.0,181859.0,184605.0,10000.0,8018.0,10121.0,6006.0,10987.0,143779.0,1.0
25329,200000.0,2.0,1,2,29.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,0.0
18894,180000.0,2.0,3,2,27.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
690,80000.0,1.0,2,2,32.0,0.0,0.0,0.0,0.0,0.0,0.0,51372.0,51872.0,47593.0,43882.0,42256.0,42527.0,1853.0,1700.0,1522.0,1548.0,1488.0,1500.0,0.0
6239,10000.0,1.0,2,2,27.0,0.0,0.0,0.0,0.0,0.0,0.0,8257.0,7995.0,4878.0,5444.0,2639.0,2697.0,2000.0,1100.0,600.0,300.0,300.0,1000.0,1.0


In [34]:
from scipy import stats
def return_slope_pay_amt(_x):
    _slope = stats.linregress(x= list(range(1,7)),
                             y= [_x['PAY_AMT6'],_x['PAY_AMT5'],_x['PAY_AMT4'], 
                                 _x['PAY_AMT3'],_x['PAY_AMT2'],_x['PAY_AMT1']])[0]
    return _slope

def return_slope_bill_amt(_x):
    _slope = stats.linregress(x= list(range(1,7)),
                             y= [_x['BILL_AMT6'],_x['BILL_AMT5'],_x['BILL_AMT4'], 
                                 _x['BILL_AMT3'],_x['BILL_AMT2'],_x['BILL_AMT1']])[0]
    return _slope

def return_slope_pay(_x):
    _slope = stats.linregress(x= list(range(1,7)),
                             y= [_x['PAY_6'],_x['PAY_5'],_x['PAY_4'], 
                                 _x['PAY_3'],_x['PAY_2'],_x['PAY_0']])[0]
    return _slope

train_df['PAY_AMT_SLOPE'] = train_df.apply(return_slope_pay_amt, axis=1)
train_df['BILL_AMT_SLOPE'] = train_df.apply(return_slope_bill_amt, axis=1)
train_df['PAY_SLOPE'] = train_df.apply(return_slope_pay, axis=1)

In [35]:
train_df.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,target,PAY_AMT_SLOPE,BILL_AMT_SLOPE,PAY_SLOPE
28835,220000.0,2.0,3,2,36.0,0.0,0.0,0.0,0.0,0.0,0.0,222598.0,222168.0,217900.0,221193.0,181859.0,184605.0,10000.0,8018.0,10121.0,6006.0,10987.0,143779.0,1.0,-19248.2,8788.542857,0.0
25329,200000.0,2.0,1,2,29.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,0.0,0.0,0.0,0.0
18894,180000.0,2.0,3,2,27.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
690,80000.0,1.0,2,2,32.0,0.0,0.0,0.0,0.0,0.0,0.0,51372.0,51872.0,47593.0,43882.0,42256.0,42527.0,1853.0,1700.0,1522.0,1548.0,1488.0,1500.0,0.0,67.857143,2193.828571,0.0
6239,10000.0,1.0,2,2,27.0,0.0,0.0,0.0,0.0,0.0,0.0,8257.0,7995.0,4878.0,5444.0,2639.0,2697.0,2000.0,1100.0,600.0,300.0,300.0,1000.0,1.0,220.0,1237.2,0.0


In [36]:
train_df['PAY_SLOPE'].value_counts()

 0.000000    11196
 0.285714      781
-0.142857      765
 0.428571      624
 0.228571      619
             ...  
 0.457143        1
-0.028571        1
 0.828571        1
-0.142857        1
-0.085714        1
Name: PAY_SLOPE, Length: 146, dtype: int64

In [37]:
from scipy import stats
def return_slope_pay_amt(_x):
    _slope = stats.linregress(x= list(range(1,7)),
                             y= [_x['PAY_AMT6'],_x['PAY_AMT5'],_x['PAY_AMT4'], 
                                 _x['PAY_AMT3'],_x['PAY_AMT2'],_x['PAY_AMT1']])[0]
    return _slope

def return_slope_bill_amt(_x):
    _slope = stats.linregress(x= list(range(1,7)),
                             y= [_x['BILL_AMT6'],_x['BILL_AMT5'],_x['BILL_AMT4'], 
                                 _x['BILL_AMT3'],_x['BILL_AMT2'],_x['BILL_AMT1']])[0]
    return _slope

def return_slope_pay(_x):
    _slope = stats.linregress(x= list(range(1,7)),
                             y= [_x['PAY_6'],_x['PAY_5'],_x['PAY_4'], 
                                 _x['PAY_3'],_x['PAY_2'],_x['PAY_0']])[0]
    return _slope

train_df['PAY_AMT_SLOPE'] = train_df.apply(return_slope_pay_amt, axis=1)
train_df['BILL_AMT_SLOPE'] = train_df.apply(return_slope_bill_amt, axis=1)
train_df['PAY_SLOPE'] = train_df.apply(return_slope_pay, axis=1)

def return_std_pay_amt(_x):
    _std = np.std([_x['PAY_AMT6'],_x['PAY_AMT5'],_x['PAY_AMT4'], 
                                 _x['PAY_AMT3'],_x['PAY_AMT2'],_x['PAY_AMT1']])
    return _std

def return_std_bill_amt(_x):
    _std = np.std([_x['BILL_AMT6'],_x['BILL_AMT5'],_x['BILL_AMT4'], 
                                 _x['BILL_AMT3'],_x['BILL_AMT2'],_x['BILL_AMT1']])
    return _std
train_df['PAY_AMT_STD'] = train_df.apply(return_std_pay_amt, axis=1)
train_df['BILL_AMT_STD'] = train_df.apply(return_std_bill_amt, axis=1)


def return_mean_pay_amt(_x):
    _mean = np.mean([_x['PAY_AMT6'],_x['PAY_AMT5'],_x['PAY_AMT4'], 
                                 _x['PAY_AMT3'],_x['PAY_AMT2'],_x['PAY_AMT1']])
    return _mean

def return_mean_bill_amt(_x):
    _mean = np.mean([_x['BILL_AMT6'],_x['BILL_AMT5'],_x['BILL_AMT4'], 
                                 _x['BILL_AMT3'],_x['BILL_AMT2'],_x['BILL_AMT1']])
    return _mean
train_df['PAY_AMT_MEAN'] = train_df.apply(return_mean_pay_amt, axis=1)
train_df['BILL_AMT_MEAN'] = train_df.apply(return_std_bill_amt, axis=1)

In [38]:
conditions = [
    (train_df['BILL_AMT6'] == train_df['PAY_AMT5']) & (train_df['BILL_AMT6'] != 0)
]
choices = [
    1
]

train_df['PAID_BILL_AMT6_FULL'] = np.select(conditions, choices, 0)

conditions = [
    (train_df['BILL_AMT5'] == train_df['PAY_AMT4']) & (train_df['BILL_AMT5'] != 0)
]
choices = [
    1
]

train_df['PAID_BILL_AMT5_FULL'] = np.select(conditions, choices, 0)

conditions = [
    (train_df['BILL_AMT4'] == train_df['PAY_AMT3']) & (train_df['BILL_AMT4'] != 0)
]
choices = [
    1
]

train_df['PAID_BILL_AMT4_FULL'] = np.select(conditions, choices, 0)

conditions = [
    (train_df['BILL_AMT3'] == train_df['PAY_AMT2']) & (train_df['BILL_AMT3'] != 0)
]
choices = [
    1
]

train_df['PAID_BILL_AMT3_FULL'] = np.select(conditions, choices, 0)

conditions = [
    (train_df['BILL_AMT2'] == train_df['PAY_AMT1']) & (train_df['BILL_AMT2'] != 0)
]
choices = [
    1
]

train_df['PAID_BILL_AMT2_FULL'] = np.select(conditions, choices, 0)

In [39]:
train_df['TOTAL_PAID_BILL_FULL'] = train_df['PAID_BILL_AMT6_FULL'] + train_df['PAID_BILL_AMT5_FULL'] + train_df['PAID_BILL_AMT4_FULL'] + train_df['PAID_BILL_AMT3_FULL'] + train_df['PAID_BILL_AMT2_FULL']

In [40]:
conditions = [
    (train_df['BILL_AMT6'] != train_df['PAY_AMT5']) & (train_df['BILL_AMT6'] < 0)
]
choices = [
    1
]

train_df['PAID_BILL_AMT6_PARTIAL'] = np.select(conditions, choices, 0)


conditions = [
    (train_df['BILL_AMT5'] != train_df['PAY_AMT4']) & (train_df['BILL_AMT5'] < 0)
]
choices = [
    1
]

train_df['PAID_BILL_AMT5_PARTIAL'] = np.select(conditions, choices, 0)


conditions = [
    (train_df['BILL_AMT4'] != train_df['PAY_AMT3']) & (train_df['BILL_AMT4'] < 0)
]
choices = [
    1
]

train_df['PAID_BILL_AMT4_PARTIAL'] = np.select(conditions, choices, 0)



conditions = [
    (train_df['BILL_AMT3'] != train_df['PAY_AMT2']) & (train_df['BILL_AMT3'] < 0)
]
choices = [
    1
]

train_df['PAID_BILL_AMT3_PARTIAL'] = np.select(conditions, choices, 0)

conditions = [
    (train_df['BILL_AMT2'] != train_df['PAY_AMT1']) & (train_df['BILL_AMT2'] < 0)
]
choices = [
    1
]

train_df['PAID_BILL_AMT2_PARTIAL'] = np.select(conditions, choices, 0)

In [41]:
train_df['TOTAL_PAY'] = train_df['PAY_0'] + train_df['PAY_2'] + train_df['PAY_3'] +train_df['PAY_4'] + train_df['PAY_5'] + train_df['PAY_6']

In [42]:
train_df['BILL_AMT1_SIN'] = np.sin(train_df['BILL_AMT1'])
train_df['BILL_AMT2_SIN'] = np.sin(train_df['BILL_AMT2'])
train_df['BILL_AMT3_SIN'] = np.sin(train_df['BILL_AMT3'])
train_df['BILL_AMT4_SIN'] = np.sin(train_df['BILL_AMT4'])
train_df['BILL_AMT5_SIN'] = np.sin(train_df['BILL_AMT5'])
train_df['BILL_AMT6_SIN'] = np.sin(train_df['BILL_AMT6'])
train_df['BILL_AMT1_COS'] = np.cos(train_df['BILL_AMT1'])
train_df['BILL_AMT2_COS'] = np.cos(train_df['BILL_AMT2'])
train_df['BILL_AMT3_COS'] = np.cos(train_df['BILL_AMT3'])
train_df['BILL_AMT4_COS'] = np.cos(train_df['BILL_AMT4'])
train_df['BILL_AMT5_COS'] = np.cos(train_df['BILL_AMT5'])
train_df['BILL_AMT6_COS'] = np.cos(train_df['BILL_AMT6'])

In [43]:
train_df['PAYL_AMT1_SIN'] = np.sin(train_df['BILL_AMT1'])
train_df['PAYL_AMT2_SIN'] = np.sin(train_df['BILL_AMT2'])
train_df['PAYL_AMT3_SIN'] = np.sin(train_df['BILL_AMT3'])
train_df['PAYL_AMT4_SIN'] = np.sin(train_df['BILL_AMT4'])
train_df['PAYL_AMT5_SIN'] = np.sin(train_df['BILL_AMT5'])
train_df['PAYL_AMT6_SIN'] = np.sin(train_df['BILL_AMT6'])
train_df['PAYL_AMT1_COS'] = np.cos(train_df['BILL_AMT1'])
train_df['PAYL_AMT2_COS'] = np.cos(train_df['BILL_AMT2'])
train_df['PAYL_AMT3_COS'] = np.cos(train_df['BILL_AMT3'])
train_df['PAYL_AMT4_COS'] = np.cos(train_df['BILL_AMT4'])
train_df['PAYL_AMT5_COS'] = np.cos(train_df['BILL_AMT5'])
train_df['PAYL_AMT6_COS'] = np.cos(train_df['BILL_AMT6'])

In [44]:
train_df.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,target,PAY_AMT_SLOPE,...,TOTAL_PAY,BILL_AMT1_SIN,BILL_AMT2_SIN,BILL_AMT3_SIN,BILL_AMT4_SIN,BILL_AMT5_SIN,BILL_AMT6_SIN,BILL_AMT1_COS,BILL_AMT2_COS,BILL_AMT3_COS,BILL_AMT4_COS,BILL_AMT5_COS,BILL_AMT6_COS,PAYL_AMT1_SIN,PAYL_AMT2_SIN,PAYL_AMT3_SIN,PAYL_AMT4_SIN,PAYL_AMT5_SIN,PAYL_AMT6_SIN,PAYL_AMT1_COS,PAYL_AMT2_COS,PAYL_AMT3_COS,PAYL_AMT4_COS,PAYL_AMT5_COS,PAYL_AMT6_COS
28835,220000.0,2.0,3,2,36.0,0.0,0.0,0.0,0.0,0.0,0.0,222598.0,222168.0,217900.0,221193.0,181859.0,184605.0,10000.0,8018.0,10121.0,6006.0,10987.0,143779.0,1.0,-19248.2,...,0.0,-0.437242,0.751758,-0.762037,-0.252781,-0.998473,-0.95436,-0.899344,0.659439,0.647534,0.967523,0.055237,0.298658,-0.437242,0.751758,-0.762037,-0.252781,-0.998473,-0.95436,-0.899344,0.659439,0.647534,0.967523,0.055237,0.298658
25329,200000.0,2.0,1,2,29.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,0.0,0.0,...,-6.0,-0.663611,-0.663611,-0.663611,-0.663611,-0.663611,-0.663611,0.748078,0.748078,0.748078,0.748078,0.748078,0.748078,-0.663611,-0.663611,-0.663611,-0.663611,-0.663611,-0.663611,0.748078,0.748078,0.748078,0.748078,0.748078,0.748078
18894,180000.0,2.0,3,2,27.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-12.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
690,80000.0,1.0,2,2,32.0,0.0,0.0,0.0,0.0,0.0,0.0,51372.0,51872.0,47593.0,43882.0,42256.0,42527.0,1853.0,1700.0,1522.0,1548.0,1488.0,1500.0,0.0,67.857143,...,0.0,0.626402,-0.918273,-0.848366,0.23169,0.999968,0.674104,0.7795,-0.395948,-0.529411,0.97279,-0.008013,-0.738636,0.626402,-0.918273,-0.848366,0.23169,0.999968,0.674104,0.7795,-0.395948,-0.529411,0.97279,-0.008013,-0.738636
6239,10000.0,1.0,2,2,27.0,0.0,0.0,0.0,0.0,0.0,0.0,8257.0,7995.0,4878.0,5444.0,2639.0,2697.0,2000.0,1100.0,600.0,300.0,300.0,1000.0,1.0,220.0,...,0.0,0.7799,0.345999,0.779202,0.370984,0.062131,0.998359,0.625904,-0.938235,-0.626773,-0.928639,0.998068,0.057262,0.7799,0.345999,0.779202,0.370984,0.062131,0.998359,0.625904,-0.938235,-0.626773,-0.928639,0.998068,0.057262


In [55]:
X = train_df.drop(columns=['target'])
y = train_df['target']

In [56]:
for i in X.columns:
    X[i+'_PAY_0'] = X[i] * X['PAY_0']
#     X[i+'_TOTAL_PAY'] = X[i] * X['TOTAL_PAY']

In [57]:
X

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,LIMIT_BAL_PAY_0,SEX_PAY_0,EDUCATION_PAY_0,MARRIAGE_PAY_0,AGE_PAY_0,PAY_0_PAY_0,PAY_2_PAY_0,PAY_3_PAY_0,PAY_4_PAY_0,PAY_5_PAY_0,PAY_6_PAY_0,BILL_AMT1_PAY_0,BILL_AMT2_PAY_0,BILL_AMT3_PAY_0,BILL_AMT4_PAY_0,BILL_AMT5_PAY_0,BILL_AMT6_PAY_0,PAY_AMT1_PAY_0,PAY_AMT2_PAY_0,PAY_AMT3_PAY_0,PAY_AMT4_PAY_0,PAY_AMT5_PAY_0,PAY_AMT6_PAY_0
28835,220000.0,2.0,3,2,36.0,0.0,0.0,0.0,0.0,0.0,0.0,222598.0,222168.0,217900.0,221193.0,181859.0,184605.0,10000.0,8018.0,10121.0,6006.0,10987.0,143779.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25329,200000.0,2.0,1,2,29.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,-200000.0,-2.0,-1.0,-2.0,-29.0,1.0,1.0,1.0,1.0,1.0,1.0,-326.0,-326.0,-326.0,-326.0,-326.0,-326.0,-326.0,-326.0,-326.0,-326.0,-326.0,-326.0
18894,180000.0,2.0,3,2,27.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-360000.0,-4.0,-6.0,-4.0,-54.0,4.0,4.0,4.0,4.0,4.0,4.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0
690,80000.0,1.0,2,2,32.0,0.0,0.0,0.0,0.0,0.0,0.0,51372.0,51872.0,47593.0,43882.0,42256.0,42527.0,1853.0,1700.0,1522.0,1548.0,1488.0,1500.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6239,10000.0,1.0,2,2,27.0,0.0,0.0,0.0,0.0,0.0,0.0,8257.0,7995.0,4878.0,5444.0,2639.0,2697.0,2000.0,1100.0,600.0,300.0,300.0,1000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16247,40000.0,2.0,2,1,38.0,0.0,0.0,3.0,2.0,2.0,2.0,35183.0,39197.0,39477.0,39924.0,39004.0,41462.0,4600.0,1200.0,1400.0,0.0,3069.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2693,350000.0,1.0,3,1,42.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,3800.0,3138.0,4150.0,3750.0,1362.0,8210.0,3138.0,4160.0,3750.0,2272.0,8210.0,9731.0,-350000.0,-1.0,-3.0,-1.0,-42.0,1.0,1.0,1.0,1.0,1.0,1.0,-3800.0,-3138.0,-4150.0,-3750.0,-1362.0,-8210.0,-3138.0,-4160.0,-3750.0,-2272.0,-8210.0,-9731.0
8076,100000.0,2.0,1,2,46.0,1.0,-1.0,2.0,2.0,-1.0,0.0,0.0,203.0,203.0,0.0,7856.0,16544.0,203.0,0.0,0.0,7856.0,10000.0,865.0,100000.0,2.0,1.0,2.0,46.0,1.0,-1.0,2.0,2.0,-1.0,0.0,0.0,203.0,203.0,0.0,7856.0,16544.0,203.0,0.0,0.0,7856.0,10000.0,865.0
20213,20000.0,2.0,1,1,50.0,-1.0,-1.0,-1.0,-1.0,-2.0,-2.0,5141.0,3455.0,6906.0,0.0,0.0,0.0,3754.0,6906.0,290.0,0.0,0.0,0.0,-20000.0,-2.0,-1.0,-1.0,-50.0,1.0,1.0,1.0,1.0,2.0,2.0,-5141.0,-3455.0,-6906.0,-0.0,-0.0,-0.0,-3754.0,-6906.0,-290.0,-0.0,-0.0,-0.0


In [58]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size=0.2)


from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_train = pd.DataFrame(data = X_train,
                      columns = X.columns)
X_test = scaler.transform(X_test)
X_test = pd.DataFrame(data = X_test,
                     columns = X.columns)

In [59]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(sampling_strategy='minority', random_state=42)
X_train, y_train = sm.fit_sample(X_train, y_train)

In [12]:
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size=0.2)
from sklearn.linear_model import LogisticRegression




from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rfc = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=19, max_features=0.5,
                       max_leaf_nodes=74, max_samples=0.66,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=13, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)



# parameters = {'max_leaf_nodes': range(10,100,1)}
#     'min_samples_split': range(2,10,1)}
#     'n_estimators': range(100, 500, 50)}
#     'max_depth': range(10,25,1)}
#              'min_samples_leaf': range(1,10,1),
#              'class_weight': ['balanced',None,'balanced_subsample']}
# grid_tree = GridSearchCV(rfc, parameters, cv=5, scoring='f1', n_jobs=-1,verbose=1)

rfc.fit(X_train, y_train)
y_train_pred = rfc.predict(X_train)
y_test_pred = rfc.predict(X_test)


from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

print('Training Precision: ', precision_score(y_train, y_train_pred))
print('Testing Precision: ', precision_score(y_test, y_test_pred))
print('\n\n')

print('Training Recall: ', recall_score(y_train, y_train_pred))
print('Testing Recall: ', recall_score(y_test, y_test_pred))
print('\n\n')

print('Training Accuracy: ', accuracy_score(y_train, y_train_pred))
print('Testing Accuracy: ', accuracy_score(y_test, y_test_pred))
print('\n\n')

print('Training F1-Score: ', f1_score(y_train, y_train_pred))
print('Testing F1-Score: ', f1_score(y_test, y_test_pred))

Training Precision:  0.8289547682172411
Testing Precision:  0.5069565217391304



Training Recall:  0.7376322562196168
Testing Recall:  0.5732546705998034



Training Accuracy:  0.792715184443809
Testing Accuracy:  0.7775555555555556



Training F1-Score:  0.7806317382258369
Testing F1-Score:  0.5380710659898477


In [24]:
import optuna

def objective(trial):
    rf_max_depth = trial.suggest_int('rf_max_depth', 2, 32)
    rf_max_leaf_nodes = trial.suggest_int('rf_max_leaf_nodes', 10, 64)
    rf_max_features = trial.suggest_loguniform('rf_max_features', .1, .9)
    rf_min_samples_leaf = trial.suggest_int('rf_min_samples_leaf', 2, 20)
    rf_max_samples = trial.suggest_loguniform('rf_max_samples', .1, .9)
    rf_criterion = trial.suggest_categorical('rf_criterion', ['gini', 'entropy'])
    
    regressor_obj = RandomForestClassifier(max_depth = rf_max_depth,
                                          max_leaf_nodes = rf_max_leaf_nodes,
                                          max_features = rf_max_features,
                                          min_samples_leaf = rf_min_samples_leaf,
                                          max_samples = rf_max_samples,
                                          n_estimators=300,
                                          random_state=42,
                                          criterion = rf_criterion)
    
    regressor_obj.fit(X_train, y_train)
    y_pred = regressor_obj.predict(X_test)
    
    return (1 - f1_score(y_test, y_pred))

In [25]:
study = optuna.create_study()
study.optimize(objective, n_trials=100)

[I 2020-08-12 16:16:34,924] Finished trial#0 with value: 0.4557019536574285 with parameters: {'rf_max_depth': 13, 'rf_max_leaf_nodes': 63, 'rf_max_features': 0.2440939167975394, 'rf_min_samples_leaf': 6, 'rf_max_samples': 0.11831099836488107, 'rf_criterion': 'gini'}. Best is trial#0 with value: 0.4557019536574285.
[I 2020-08-12 16:16:53,759] Finished trial#1 with value: 0.4588665447897624 with parameters: {'rf_max_depth': 19, 'rf_max_leaf_nodes': 62, 'rf_max_features': 0.2058985718792555, 'rf_min_samples_leaf': 6, 'rf_max_samples': 0.34024558480495526, 'rf_criterion': 'entropy'}. Best is trial#0 with value: 0.4557019536574285.
[I 2020-08-12 16:17:01,153] Finished trial#2 with value: 0.459016393442623 with parameters: {'rf_max_depth': 28, 'rf_max_leaf_nodes': 32, 'rf_max_features': 0.12618076969354372, 'rf_min_samples_leaf': 16, 'rf_max_samples': 0.23164372785422654, 'rf_criterion': 'entropy'}. Best is trial#0 with value: 0.4557019536574285.
[I 2020-08-12 16:17:04,953] Finished trial#3 

In [60]:
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size=0.2)
from sklearn.linear_model import LogisticRegression




from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rfc = RandomForestClassifier(max_depth=5,
                            max_leaf_nodes=39,
                            max_features = .2479,
                            min_samples_leaf=8,
                            max_samples = 0.2647,
                            criterion='entropy')


# {'rf_max_depth': 5, 'rf_max_leaf_nodes': 39, 'rf_max_features': 0.24795749696390704, 'rf_min_samples_leaf': 8, 'rf_max_samples': 0.26471291186578816, 'rf_criterion': 'entropy'}
# parameters = {'max_leaf_nodes': range(10,100,1)}
#     'min_samples_split': range(2,10,1)}
#     'n_estimators': range(100, 500, 50)}
#     'max_depth': range(10,25,1)}
#              'min_samples_leaf': range(1,10,1),
#              'class_weight': ['balanced',None,'balanced_subsample']}
# grid_tree = GridSearchCV(rfc, parameters, cv=5, scoring='f1', n_jobs=-1,verbose=1)

rfc.fit(X_train, y_train)
y_train_pred = rfc.predict(X_train)
y_test_pred = rfc.predict(X_test)


from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

print('Training Precision: ', precision_score(y_train, y_train_pred))
print('Testing Precision: ', precision_score(y_test, y_test_pred))
print('\n\n')

print('Training Recall: ', recall_score(y_train, y_train_pred))
print('Testing Recall: ', recall_score(y_test, y_test_pred))
print('\n\n')

print('Training Accuracy: ', accuracy_score(y_train, y_train_pred))
print('Testing Accuracy: ', accuracy_score(y_test, y_test_pred))
print('\n\n')

print('Training F1-Score: ', f1_score(y_train, y_train_pred))
print('Testing F1-Score: ', f1_score(y_test, y_test_pred))

SyntaxError: invalid syntax (<ipython-input-60-ea676e26d436>, line 14)

In [61]:
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size=0.2)
from sklearn.linear_model import LogisticRegression




from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rfc = RandomForestClassifier(max_depth=5,
                            max_leaf_nodes=39,
                            max_features = .2479,
                            min_samples_leaf=8,
                            max_samples = 0.2647,
                            criterion='entropy')


# {'rf_max_depth': 5, 'rf_max_leaf_nodes': 39, 'rf_max_features': 0.24795749696390704, 'rf_min_samples_leaf': 8, 'rf_max_samples': 0.26471291186578816, 'rf_criterion': 'entropy'}
# parameters = {'max_leaf_nodes': range(10,100,1)}
#     'min_samples_split': range(2,10,1)}
#     'n_estimators': range(100, 500, 50)}
#     'max_depth': range(10,25,1)}
#              'min_samples_leaf': range(1,10,1),
#              'class_weight': ['balanced',None,'balanced_subsample']}
# grid_tree = GridSearchCV(rfc, parameters, cv=5, scoring='f1', n_jobs=-1,verbose=1)

rfc.fit(X_train, y_train)
y_train_pred = rfc.predict(X_train)
y_test_pred = rfc.predict(X_test)


from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

print('Training Precision: ', precision_score(y_train, y_train_pred))
print('Testing Precision: ', precision_score(y_test, y_test_pred))
print('\n\n')

print('Training Recall: ', recall_score(y_train, y_train_pred))
print('Testing Recall: ', recall_score(y_test, y_test_pred))
print('\n\n')

print('Training Accuracy: ', accuracy_score(y_train, y_train_pred))
print('Testing Accuracy: ', accuracy_score(y_test, y_test_pred))
print('\n\n')

print('Training F1-Score: ', f1_score(y_train, y_train_pred))
print('Testing F1-Score: ', f1_score(y_test, y_test_pred))

Training Precision:  0.806497175141243
Testing Precision:  0.5404896421845574



Training Recall:  0.5714898484415213
Testing Recall:  0.5644051130776795



Training Accuracy:  0.7171861595653417
Testing Accuracy:  0.7931111111111111



Training F1-Score:  0.6689539748953974
Testing F1-Score:  0.5521885521885521


In [28]:
X_train

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,LIMIT_BAL_PAY_0,SEX_PAY_0,EDUCATION_PAY_0,MARRIAGE_PAY_0,AGE_PAY_0,PAY_0_PAY_0,PAY_2_PAY_0,PAY_3_PAY_0,PAY_4_PAY_0,PAY_5_PAY_0,PAY_6_PAY_0,BILL_AMT1_PAY_0,BILL_AMT2_PAY_0,BILL_AMT3_PAY_0,BILL_AMT4_PAY_0,BILL_AMT5_PAY_0,BILL_AMT6_PAY_0,PAY_AMT1_PAY_0,PAY_AMT2_PAY_0,PAY_AMT3_PAY_0,PAY_AMT4_PAY_0,PAY_AMT5_PAY_0,PAY_AMT6_PAY_0
0,0.486361,0.811629,-0.300988,0.852130,-0.805526,0.018902,0.110420,0.135825,0.184195,0.225981,0.248073,-0.525727,-0.439569,-0.379756,-0.054180,0.015515,-0.467170,0.024117,-0.096754,0.770012,-0.173162,-0.216717,0.143194,0.179607,0.035107,0.061382,0.014950,0.025530,-0.467093,-0.361213,-0.340515,-0.332498,-0.329965,-0.331437,-0.162774,-0.161853,-0.148473,-0.151015,-0.151374,-0.150270,0.085032,0.059574,0.077854,0.068343,0.056677,0.054135
1,-0.901232,-1.232091,-1.671809,2.775084,2.435316,0.018902,0.110420,0.135825,0.184195,0.225981,0.248073,-0.070327,-0.030784,0.005687,-0.392657,-0.363018,-0.341661,-0.215464,-0.180760,-0.239690,-0.255754,-0.267770,-0.254844,0.179607,0.035107,0.061382,0.014950,0.025530,-0.467093,-0.361213,-0.340515,-0.332498,-0.329965,-0.331437,-0.162774,-0.161853,-0.148473,-0.151015,-0.151374,-0.150270,0.085032,0.059574,0.077854,0.068343,0.056677,0.054135
2,-0.515789,0.811629,-1.671809,-1.070825,0.598839,0.018902,0.110420,0.135825,0.184195,0.225981,0.248073,0.251352,0.298867,0.338065,0.404934,0.452444,0.393796,-0.183837,-0.127644,-0.123641,-0.164109,-0.161674,-0.157128,0.179607,0.035107,0.061382,0.014950,0.025530,-0.467093,-0.361213,-0.340515,-0.332498,-0.329965,-0.331437,-0.162774,-0.161853,-0.148473,-0.151015,-0.151374,-0.150270,0.085032,0.059574,0.077854,0.068343,0.056677,0.054135
3,1.103069,-1.232091,-1.671809,-1.070825,0.922923,0.018902,0.110420,0.135825,0.184195,0.225981,0.248073,2.510020,2.704192,2.890568,3.420123,3.768343,3.976015,0.265084,0.150212,0.770012,0.319554,0.385022,0.305064,0.179607,0.035107,0.061382,0.014950,0.025530,-0.467093,-0.361213,-0.340515,-0.332498,-0.329965,-0.331437,-0.162774,-0.161853,-0.148473,-0.151015,-0.151374,-0.150270,0.085032,0.059574,0.077854,0.068343,0.056677,0.054135
4,-0.824143,-1.232091,-0.300988,0.852130,-1.021582,0.018902,0.110420,0.135825,0.184195,0.225981,0.248073,-0.169285,-0.127856,-0.138975,-0.204064,-0.156602,-0.095380,-0.215464,-0.173085,-0.176042,-0.229086,-0.121706,-0.215456,0.179607,0.035107,0.061382,0.014950,0.025530,-0.467093,-0.361213,-0.340515,-0.332498,-0.329965,-0.331437,-0.162774,-0.161853,-0.148473,-0.151015,-0.151374,-0.150270,0.085032,0.059574,0.077854,0.068343,0.056677,0.054135
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27971,-0.811700,-1.232091,-0.300988,0.852130,-1.328230,0.018902,0.110420,0.135825,0.184195,0.225981,0.248073,-0.045167,-0.223302,-0.180655,-0.379532,-0.341750,-0.301532,-0.218511,-0.153746,-0.242816,-0.235625,-0.186069,-0.249195,0.179607,0.035107,0.061382,0.014950,0.025530,-0.467093,-0.361213,-0.340515,-0.332498,-0.329965,-0.331437,-0.162774,-0.161853,-0.148473,-0.151015,-0.151374,-0.150270,0.085032,0.059574,0.077854,0.068343,0.056677,0.054135
27972,-1.055409,0.811629,-1.671809,-1.070825,1.945120,1.800127,1.779141,1.800592,1.884800,0.225981,0.248073,-0.358787,-0.350573,-0.286983,-0.267684,-0.227413,-0.181679,-0.330855,-0.097244,-0.281159,-0.222725,-0.163883,-0.282077,0.436274,2.164456,0.824371,1.113255,2.582117,1.014352,1.232187,1.407986,1.513043,-0.329965,-0.331437,0.376162,0.376069,0.451860,0.489233,0.521412,0.591657,0.096301,0.255944,0.077854,0.212478,0.294450,0.064083
27973,-0.975875,0.811629,-0.300988,-1.070825,0.062125,1.800127,1.779141,1.800592,1.884800,1.978054,2.420891,-0.441634,-0.412407,-0.374261,-0.351558,-0.304000,-0.297299,-0.232388,-0.164546,-0.254046,-0.185089,-0.311729,-0.231211,0.524543,2.164456,1.587360,1.113255,1.748579,1.014352,1.232187,1.407986,1.513043,1.659999,2.313924,0.241139,0.276396,0.314748,0.355337,0.402179,0.408945,0.267590,0.156195,0.130793,0.286166,0.056677,0.143367
27974,0.775097,0.811629,1.069833,-1.070825,1.962354,-1.762323,-1.558302,-1.528942,-1.516410,-1.526092,-0.939055,-0.685051,-0.672330,-0.669123,-0.666212,-0.659566,-0.645280,-0.285346,-0.229735,-0.272497,-0.296341,-0.289351,-0.271720,-2.108618,-2.094243,-2.227586,-1.083355,-2.538686,1.014352,1.232187,1.407986,1.513043,1.659999,1.113867,-0.169973,-0.181121,-0.148473,-0.155062,-0.151374,-0.159582,-0.005401,0.059574,0.060941,0.068343,0.020688,0.028044


In [29]:
import statsmodels.api as sm
X_train_log = sm.tools.add_constant(X_train)
logit_model = sm.Logit(y_train, X_train_log)
result = logit_model.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.588276
         Iterations 7


0,1,2,3
Dep. Variable:,target,No. Observations:,27976.0
Model:,Logit,Df Residuals:,27929.0
Method:,MLE,Df Model:,46.0
Date:,"Wed, 12 Aug 2020",Pseudo R-squ.:,0.1513
Time:,16:58:37,Log-Likelihood:,-16458.0
converged:,True,LL-Null:,-19391.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.1994,0.014,-13.862,0.000,-0.228,-0.171
LIMIT_BAL,-0.2322,0.019,-12.286,0.000,-0.269,-0.195
SEX,-0.0638,0.013,-4.737,0.000,-0.090,-0.037
EDUCATION,0.0105,0.015,0.706,0.480,-0.019,0.040
MARRIAGE,-0.1145,0.015,-7.573,0.000,-0.144,-0.085
AGE,0.0135,0.015,0.879,0.380,-0.017,0.044
PAY_0,0.3103,0.129,2.408,0.016,0.058,0.563
PAY_2,0.1099,0.024,4.598,0.000,0.063,0.157
PAY_3,0.1520,0.025,6.136,0.000,0.103,0.201


In [62]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

In [65]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

bc_dtc = BaggingClassifier(
            base_estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight='balanced', criterion='gini',
                       max_depth=10, max_features=0.1461, max_leaf_nodes=14,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=8, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best'), 
            n_estimators= 300,
            max_samples= 0.66,
            max_features= .8,
            oob_score= True, n_jobs=-1
                )
# {'rf_max_depth': 10, 'rf_max_leaf_nodes': 14, 'rf_max_features': 0.14612189898099304, 'rf_min_samples_leaf': 8, 'rf_criterion': 'gini'}

bc_dtc.fit(X_train, y_train)
y_train_pred = bc_dtc.predict(X_train)
y_test_pred = bc_dtc.predict(X_test)


from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

print('Training Precision: ', precision_score(y_train, y_train_pred))
print('Testing Precision: ', precision_score(y_test, y_test_pred))
print('\n\n')

print('Training Recall: ', recall_score(y_train, y_train_pred))
print('Testing Recall: ', recall_score(y_test, y_test_pred))
print('\n\n')

print('Training Accuracy: ', accuracy_score(y_train, y_train_pred))
print('Testing Accuracy: ', accuracy_score(y_test, y_test_pred))
print('\n\n')

print('Training F1-Score: ', f1_score(y_train, y_train_pred))
print('Testing F1-Score: ', f1_score(y_test, y_test_pred))

Training Precision:  0.7876455125797972
Testing Precision:  0.5051993067590987



Training Recall:  0.5997998284243637
Testing Recall:  0.5732546705998034



Training Accuracy:  0.7190448956248213
Testing Accuracy:  0.7766666666666666



Training F1-Score:  0.6810064935064936
Testing F1-Score:  0.5370796867802856


In [63]:
import optuna

def objective(trial):
    rf_max_depth = trial.suggest_int('rf_max_depth', 2, 32)
    rf_max_leaf_nodes = trial.suggest_int('rf_max_leaf_nodes', 10, 64)
    rf_max_features = trial.suggest_loguniform('rf_max_features', .1, .9)
    rf_min_samples_leaf = trial.suggest_int('rf_min_samples_leaf', 2, 20)
    rf_criterion = trial.suggest_categorical('rf_criterion', ['gini', 'entropy'])
    
    regressor_obj = DecisionTreeClassifier(max_depth = rf_max_depth,
                                          max_leaf_nodes = rf_max_leaf_nodes,
                                          max_features = rf_max_features,
                                          min_samples_leaf = rf_min_samples_leaf,
                                          random_state=42,
                                          criterion = rf_criterion)
    
    regressor_obj.fit(X_train, y_train)
    y_pred = regressor_obj.predict(X_test)
    
    return (1 - f1_score(y_test, y_pred))

In [64]:
study = optuna.create_study()
study.optimize(objective, n_trials=100)

[I 2020-08-12 17:06:30,646] Finished trial#0 with value: 0.4710424710424711 with parameters: {'rf_max_depth': 17, 'rf_max_leaf_nodes': 13, 'rf_max_features': 0.16648840070282345, 'rf_min_samples_leaf': 9, 'rf_criterion': 'gini'}. Best is trial#0 with value: 0.4710424710424711.
[I 2020-08-12 17:06:31,066] Finished trial#1 with value: 0.48395604395604397 with parameters: {'rf_max_depth': 19, 'rf_max_leaf_nodes': 25, 'rf_max_features': 0.3787485023593908, 'rf_min_samples_leaf': 20, 'rf_criterion': 'entropy'}. Best is trial#0 with value: 0.4710424710424711.
[I 2020-08-12 17:06:31,239] Finished trial#2 with value: 0.4722624216111915 with parameters: {'rf_max_depth': 2, 'rf_max_leaf_nodes': 26, 'rf_max_features': 0.10946825056636159, 'rf_min_samples_leaf': 3, 'rf_criterion': 'entropy'}. Best is trial#0 with value: 0.4710424710424711.
[I 2020-08-12 17:06:31,701] Finished trial#3 with value: 0.47303271441202477 with parameters: {'rf_max_depth': 25, 'rf_max_leaf_nodes': 45, 'rf_max_features': 0

In [66]:
from sklearn.linear_model import LogisticRegression

In [71]:
from sklearn.linear_model import LogisticRegression
bc_lr1 = BaggingClassifier(
            base_estimator=LogisticRegression(solver='saga', penalty='l2', C=.04108295837286748, max_iter = 1000, tol = 1e-3), 
            n_estimators= 300,
            max_samples= 0.66,
            max_features= .8,
            oob_score= True, n_jobs=-1
                )

# {'lr_C': 0.04108295837286748, 'lr_solver': 'saga'}
bc_lr1.fit(X_train, y_train)
y_train_pred = bc_lr1.predict(X_train)
y_test_pred = bc_lr1.predict(X_test)


from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

print('Training Precision: ', precision_score(y_train, y_train_pred))
print('Testing Precision: ', precision_score(y_test, y_test_pred))
print('\n\n')

print('Training Recall: ', recall_score(y_train, y_train_pred))
print('Testing Recall: ', recall_score(y_test, y_test_pred))
print('\n\n')

print('Training Accuracy: ', accuracy_score(y_train, y_train_pred))
print('Testing Accuracy: ', accuracy_score(y_test, y_test_pred))
print('\n\n')

print('Training F1-Score: ', f1_score(y_train, y_train_pred))
print('Testing F1-Score: ', f1_score(y_test, y_test_pred))

Training Precision:  0.7697157726180944
Testing Precision:  0.5149911816578483



Training Recall:  0.5498284243637404
Testing Recall:  0.5742379547689282



Training Accuracy:  0.6926651415498999
Testing Accuracy:  0.7815555555555556



Training F1-Score:  0.6414512093411175
Testing F1-Score:  0.5430032543003254


In [69]:
import optuna

def objective(trial):

    lr_C = trial.suggest_loguniform('lr_C', 1e-8, 1)
    lr_solver = trial.suggest_categorical('lr_solver', ['newton-cg','lbfgs','sag','saga','liblinear'])
    if lr_solver == 'liblinear':
        lr_penalty = trial.suggest_categorical('lr_penalty', ['l1', 'l2'])
    else:
        lr_penalty = 'l2'
    
    regressor_obj = LogisticRegression(penalty = lr_penalty,
                                          C = lr_C,
                                          solver = lr_solver,
                                          max_iter = 1000,
                                          tol = 1e-3)
    
    regressor_obj.fit(X_train, y_train)
    y_pred = regressor_obj.predict(X_test)
    
    return (1 - f1_score(y_test, y_pred))

In [70]:
study = optuna.create_study()
study.optimize(objective, n_trials=100)

[I 2020-08-12 17:18:10,842] Finished trial#0 with value: 0.47122133832475444 with parameters: {'lr_C': 0.00019853880704122752, 'lr_solver': 'newton-cg'}. Best is trial#0 with value: 0.47122133832475444.
[I 2020-08-12 17:18:11,212] Finished trial#1 with value: 0.5079059005013498 with parameters: {'lr_C': 2.3348048709984127e-07, 'lr_solver': 'liblinear', 'lr_penalty': 'l2'}. Best is trial#0 with value: 0.47122133832475444.
[I 2020-08-12 17:18:11,659] Finished trial#2 with value: 0.4668470906630582 with parameters: {'lr_C': 0.0024028218065910013, 'lr_solver': 'lbfgs'}. Best is trial#2 with value: 0.4668470906630582.
[I 2020-08-12 17:18:15,810] Finished trial#3 with value: 0.45627548708654275 with parameters: {'lr_C': 0.11661600526778318, 'lr_solver': 'saga'}. Best is trial#3 with value: 0.45627548708654275.
[I 2020-08-12 17:18:16,185] Finished trial#4 with value: 0.4658385093167702 with parameters: {'lr_C': 9.722217407215602e-06, 'lr_solver': 'lbfgs'}. Best is trial#3 with value: 0.456275

In [83]:
from sklearn.linear_model import LogisticRegression
knn_clf = KNeighborsClassifier(n_neighbors = 82,
                                          algorithm = 'ball_tree',
                                          p = 2)
            

# {'lr_C': 0.04108295837286748, 'lr_solver': 'saga'} {'knn_n_neighbors': 82, 'knn_algorithm': 'ball_tree', 'knn_p': 2}
knn_clf.fit(X_train, y_train)
y_train_pred = knn_clf.predict(X_train)
y_test_pred = knn_clf.predict(X_test)


from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

print('Training Precision: ', precision_score(y_train, y_train_pred))
print('Testing Precision: ', precision_score(y_test, y_test_pred))
print('\n\n')

print('Training Recall: ', recall_score(y_train, y_train_pred))
print('Testing Recall: ', recall_score(y_test, y_test_pred))
print('\n\n')

print('Training Accuracy: ', accuracy_score(y_train, y_train_pred))
print('Testing Accuracy: ', accuracy_score(y_test, y_test_pred))
print('\n\n')

print('Training F1-Score: ', f1_score(y_train, y_train_pred))
print('Testing F1-Score: ', f1_score(y_test, y_test_pred))

Training Precision:  0.7283851113716295
Testing Precision:  0.42341220423412207



Training Recall:  0.7106805833571633
Testing Recall:  0.6686332350049164



Training Accuracy:  0.7228338575922219
Testing Accuracy:  0.7193333333333334



Training F1-Score:  0.7194239397886815
Testing F1-Score:  0.5184902783072817


In [72]:
from sklearn.neighbors import KNeighborsClassifier

In [81]:
import optuna

def objective(trial):

    knn_n_neighbors = trial.suggest_int('knn_n_neighbors', 40, 100)
    knn_algorithm = trial.suggest_categorical('knn_algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute'])
    knn_p = trial.suggest_categorical('knn_p', [1,2])
    
    regressor_obj = KNeighborsClassifier(n_neighbors = knn_n_neighbors,
                                          algorithm = knn_algorithm,
                                          p = knn_p,
                                          )
    
    regressor_obj.fit(X_train, y_train)
    y_pred = regressor_obj.predict(X_test)
    
    return (1 - f1_score(y_test, y_pred))

In [82]:
study = optuna.create_study()
study.optimize(objective, n_trials=100)

[I 2020-08-12 17:39:07,652] Finished trial#0 with value: 0.49327354260089684 with parameters: {'knn_n_neighbors': 51, 'knn_algorithm': 'brute', 'knn_p': 1}. Best is trial#0 with value: 0.49327354260089684.
[I 2020-08-12 17:39:19,463] Finished trial#1 with value: 0.48928024502297096 with parameters: {'knn_n_neighbors': 50, 'knn_algorithm': 'kd_tree', 'knn_p': 1}. Best is trial#1 with value: 0.48928024502297096.
[I 2020-08-12 17:39:33,418] Finished trial#2 with value: 0.48888051262721444 with parameters: {'knn_n_neighbors': 64, 'knn_algorithm': 'kd_tree', 'knn_p': 2}. Best is trial#2 with value: 0.48888051262721444.
[I 2020-08-12 17:39:37,107] Finished trial#3 with value: 0.4834537847090148 with parameters: {'knn_n_neighbors': 72, 'knn_algorithm': 'brute', 'knn_p': 2}. Best is trial#3 with value: 0.4834537847090148.
[I 2020-08-12 17:39:40,758] Finished trial#4 with value: 0.5031101353823637 with parameters: {'knn_n_neighbors': 40, 'knn_algorithm': 'brute', 'knn_p': 2}. Best is trial#3 wi

In [None]:
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size=0.2)
from sklearn.linear_model import LogisticRegression




from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rfc_overfit = RandomForestClassifier(max_depth=5,
                            max_leaf_nodes=39,
                            max_features = .2479,
                            min_samples_leaf=8,
                            max_samples = 0.2647,
                            criterion='entropy')


# {'rf_max_depth': 5, 'rf_max_leaf_nodes': 39, 'rf_max_features': 0.24795749696390704, 'rf_min_samples_leaf': 8, 'rf_max_samples': 0.26471291186578816, 'rf_criterion': 'entropy'}
# parameters = {'max_leaf_nodes': range(10,100,1)}
#     'min_samples_split': range(2,10,1)}
#     'n_estimators': range(100, 500, 50)}
#     'max_depth': range(10,25,1)}
#              'min_samples_leaf': range(1,10,1),
#              'class_weight': ['balanced',None,'balanced_subsample']}
# grid_tree = GridSearchCV(rfc, parameters, cv=5, scoring='f1', n_jobs=-1,verbose=1)

rfc_overfit.fit(X_train, y_train)
y_train_pred = rfc_overfit.predict(X_train)
y_test_pred = rfc_overfit.predict(X_test)


from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

print('Training Precision: ', precision_score(y_train, y_train_pred))
print('Testing Precision: ', precision_score(y_test, y_test_pred))
print('\n\n')

print('Training Recall: ', recall_score(y_train, y_train_pred))
print('Testing Recall: ', recall_score(y_test, y_test_pred))
print('\n\n')

print('Training Accuracy: ', accuracy_score(y_train, y_train_pred))
print('Testing Accuracy: ', accuracy_score(y_test, y_test_pred))
print('\n\n')

print('Training F1-Score: ', f1_score(y_train, y_train_pred))
print('Testing F1-Score: ', f1_score(y_test, y_test_pred))

In [92]:
import optuna

def objective(trial):
    rf_max_depth = trial.suggest_int('rf_max_depth', 50, 120)
    rf_max_leaf_nodes = trial.suggest_int('rf_max_leaf_nodes', 70, 240)
    rf_max_features = trial.suggest_loguniform('rf_max_features', .1, .9)
    rf_min_samples_leaf = trial.suggest_int('rf_min_samples_leaf', 2, 20)
    rf_max_samples = trial.suggest_loguniform('rf_max_samples', .1, .9)
    rf_criterion = trial.suggest_categorical('rf_criterion', ['gini', 'entropy'])
    
    regressor_obj = RandomForestClassifier(max_depth = rf_max_depth,
                                          max_leaf_nodes = rf_max_leaf_nodes,
                                          max_features = rf_max_features,
                                          min_samples_leaf = rf_min_samples_leaf,
                                          max_samples = rf_max_samples,
                                          n_estimators=300,
                                          random_state=42,
                                          criterion = rf_criterion)
    
    regressor_obj.fit(X_train, y_train)
    y_pred = regressor_obj.predict(X_test)
    
    return (1 - f1_score(y_test, y_pred))

In [93]:
study = optuna.create_study()
study.optimize(objective, n_trials=100)

[I 2020-08-12 18:28:19,026] Finished trial#0 with value: 0.46220009053870537 with parameters: {'rf_max_depth': 51, 'rf_max_leaf_nodes': 103, 'rf_max_features': 0.5688882165344097, 'rf_min_samples_leaf': 19, 'rf_max_samples': 0.12774545996747017, 'rf_criterion': 'gini'}. Best is trial#0 with value: 0.46220009053870537.
[I 2020-08-12 18:28:25,409] Finished trial#1 with value: 0.4589941096511101 with parameters: {'rf_max_depth': 91, 'rf_max_leaf_nodes': 154, 'rf_max_features': 0.12630569012206003, 'rf_min_samples_leaf': 17, 'rf_max_samples': 0.13388884320727892, 'rf_criterion': 'entropy'}. Best is trial#1 with value: 0.4589941096511101.
[I 2020-08-12 18:28:29,345] Finished trial#2 with value: 0.45884492951341516 with parameters: {'rf_max_depth': 92, 'rf_max_leaf_nodes': 73, 'rf_max_features': 0.10241779538408154, 'rf_min_samples_leaf': 17, 'rf_max_samples': 0.14333395362889853, 'rf_criterion': 'gini'}. Best is trial#2 with value: 0.45884492951341516.
[I 2020-08-12 18:28:33,639] Finished t

In [67]:
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size=0.2)
from sklearn.linear_model import LogisticRegression




from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rfc = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=19, max_features=0.5,
                       max_leaf_nodes=74, max_samples=0.66,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=13, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)



# parameters = {'max_leaf_nodes': range(10,100,1)}
#     'min_samples_split': range(2,10,1)}
#     'n_estimators': range(100, 500, 50)}
#     'max_depth': range(10,25,1)}
#              'min_samples_leaf': range(1,10,1),
#              'class_weight': ['balanced',None,'balanced_subsample']}
# grid_tree = GridSearchCV(rfc, parameters, cv=5, scoring='f1', n_jobs=-1,verbose=1)

rfc.fit(X_train, y_train)
y_train_pred = rfc.predict(X_train)
y_test_pred = rfc.predict(X_test)


from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

print('Training Precision: ', precision_score(y_train, y_train_pred))
print('Testing Precision: ', precision_score(y_test, y_test_pred))
print('\n\n')

print('Training Recall: ', recall_score(y_train, y_train_pred))
print('Testing Recall: ', recall_score(y_test, y_test_pred))
print('\n\n')

print('Training Accuracy: ', accuracy_score(y_train, y_train_pred))
print('Testing Accuracy: ', accuracy_score(y_test, y_test_pred))
print('\n\n')

print('Training F1-Score: ', f1_score(y_train, y_train_pred))
print('Testing F1-Score: ', f1_score(y_test, y_test_pred))

Training Precision:  0.5416055334311465
Testing Precision:  0.5195996663886572



Training Recall:  0.6442283719770631
Testing Recall:  0.6125860373647984



Training Accuracy:  0.7992110672815156
Testing Accuracy:  0.7844444444444445



Training F1-Score:  0.5884764290594398
Testing F1-Score:  0.5622743682310468


In [35]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

bc_dtc = BaggingClassifier(
            base_estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight='balanced', criterion='gini',
                       max_depth=6, max_features=0.4, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=8, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best'), 
            n_estimators= 300,
            max_samples= 0.66,
            max_features= .8,
            oob_score= True, n_jobs=-1
                )


bc_dtc.fit(X_train, y_train)
y_train_pred = bc_dtc.predict(X_train)
y_test_pred = bc_dtc.predict(X_test)


from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

print('Training Precision: ', precision_score(y_train, y_train_pred))
print('Testing Precision: ', precision_score(y_test, y_test_pred))
print('\n\n')

print('Training Recall: ', recall_score(y_train, y_train_pred))
print('Testing Recall: ', recall_score(y_test, y_test_pred))
print('\n\n')

print('Training Accuracy: ', accuracy_score(y_train, y_train_pred))
print('Testing Accuracy: ', accuracy_score(y_test, y_test_pred))
print('\n\n')

print('Training F1-Score: ', f1_score(y_train, y_train_pred))
print('Testing F1-Score: ', f1_score(y_test, y_test_pred))

Training Precision:  0.5098631239935588
Testing Precision:  0.5035913806863528



Training Recall:  0.6315133383196211
Testing Recall:  0.6204523107177975



Training Accuracy:  0.7825990332796267
Testing Accuracy:  0.776



Training F1-Score:  0.5642053680810781
Testing F1-Score:  0.5559471365638767


In [36]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier


adaboost_clf = AdaBoostClassifier(random_state=42, n_estimators=300)


adaboost_clf.fit(X_train, y_train)
y_train_pred = adaboost_clf.predict(X_train)
y_test_pred = adaboost_clf.predict(X_test)


from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

print('Training Precision: ', precision_score(y_train, y_train_pred))
print('Testing Precision: ', precision_score(y_test, y_test_pred))
print('\n\n')

print('Training Recall: ', recall_score(y_train, y_train_pred))
print('Testing Recall: ', recall_score(y_test, y_test_pred))
print('\n\n')

print('Training Accuracy: ', accuracy_score(y_train, y_train_pred))
print('Testing Accuracy: ', accuracy_score(y_test, y_test_pred))
print('\n\n')

print('Training F1-Score: ', f1_score(y_train, y_train_pred))
print('Testing F1-Score: ', f1_score(y_test, y_test_pred))

Training Precision:  0.7302631578947368
Testing Precision:  0.6858846918489065



Training Recall:  0.387434554973822
Testing Recall:  0.3392330383480826



Training Accuracy:  0.8316017556530918
Testing Accuracy:  0.8155555555555556



Training F1-Score:  0.5062713797035349
Testing F1-Score:  0.45394736842105265


In [15]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier


gbt_clf  = GradientBoostingClassifier(random_state=42)


gbt_clf.fit(X_train, y_train)
y_train_pred = gbt_clf.predict(X_train)
y_test_pred = gbt_clf.predict(X_test)


from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

print('Training Precision: ', precision_score(y_train, y_train_pred))
print('Testing Precision: ', precision_score(y_test, y_test_pred))
print('\n\n')

print('Training Recall: ', recall_score(y_train, y_train_pred))
print('Testing Recall: ', recall_score(y_test, y_test_pred))
print('\n\n')

print('Training Accuracy: ', accuracy_score(y_train, y_train_pred))
print('Testing Accuracy: ', accuracy_score(y_test, y_test_pred))
print('\n\n')

print('Training F1-Score: ', f1_score(y_train, y_train_pred))
print('Testing F1-Score: ', f1_score(y_test, y_test_pred))

Training Precision:  0.7211174242424242
Testing Precision:  0.6948529411764706



Training Recall:  0.37970580902518075
Testing Recall:  0.37168141592920356



Training Accuracy:  0.8290460581143397
Testing Accuracy:  0.8211111111111111



Training F1-Score:  0.4974685611628287
Testing F1-Score:  0.48430493273542596


In [37]:
from sklearn.linear_model import LogisticRegression
bc_lr1 = BaggingClassifier(
            base_estimator=LogisticRegression(solver='liblinear', class_weight='balanced', penalty='l1', C=.01), 
            n_estimators= 300,
            max_samples= 0.66,
            max_features= .8,
            oob_score= True, n_jobs=-1
                )


bc_lr1.fit(X_train, y_train)
y_train_pred = bc_lr1.predict(X_train)
y_test_pred = bc_lr1.predict(X_test)


from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

print('Training Precision: ', precision_score(y_train, y_train_pred))
print('Testing Precision: ', precision_score(y_test, y_test_pred))
print('\n\n')

print('Training Recall: ', recall_score(y_train, y_train_pred))
print('Testing Recall: ', recall_score(y_test, y_test_pred))
print('\n\n')

print('Training Accuracy: ', accuracy_score(y_train, y_train_pred))
print('Testing Accuracy: ', accuracy_score(y_test, y_test_pred))
print('\n\n')

print('Training F1-Score: ', f1_score(y_train, y_train_pred))
print('Testing F1-Score: ', f1_score(y_test, y_test_pred))

Training Precision:  0.5153196622436671
Testing Precision:  0.5377990430622009



Training Recall:  0.5325355272999253
Testing Recall:  0.5526057030481809



Training Accuracy:  0.7842102339018835
Testing Accuracy:  0.7915555555555556



Training F1-Score:  0.523786169691025
Testing F1-Score:  0.545101842870999


In [38]:
from sklearn.ensemble import VotingClassifier

voting_clf = VotingClassifier(
                estimators=[('rfc', rfc), ('bc_dtc', bc_dtc),  ('bc_lr1', bc_lr1)],
                voting='hard', n_jobs=-1)

In [39]:
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size=0.2)
# from sklearn.linear_model import LogisticRegression

# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_train = pd.DataFrame(data = X_train,
#                       columns = X.columns)
# X_test = scaler.transform(X_test)
# X_test = pd.DataFrame(data = X_test,
#                      columns = X.columns)

voting_clf.fit(X_train, y_train)
y_train_pred = voting_clf.predict(X_train)
y_test_pred = voting_clf.predict(X_test)


from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

print('Training Precision: ', precision_score(y_train, y_train_pred))
print('Testing Precision: ', precision_score(y_test, y_test_pred))
print('\n\n')

print('Training Recall: ', recall_score(y_train, y_train_pred))
print('Testing Recall: ', recall_score(y_test, y_test_pred))
print('\n\n')

print('Training Accuracy: ', accuracy_score(y_train, y_train_pred))
print('Testing Accuracy: ', accuracy_score(y_test, y_test_pred))
print('\n\n')

print('Training F1-Score: ', f1_score(y_train, y_train_pred))
print('Testing F1-Score: ', f1_score(y_test, y_test_pred))

Training Precision:  0.532349841938883
Testing Precision:  0.5188916876574308



Training Recall:  0.6297681376215407
Testing Recall:  0.6076696165191741



Training Accuracy:  0.7942107894883049
Testing Accuracy:  0.784



Training F1-Score:  0.5769757880310644
Testing F1-Score:  0.5597826086956521


In [40]:
import xgboost as xgb
import pandas.util.testing as tm


xgb_clf = xgb.XGBClassifier()



param_grid = {
    "learning_rate": [0.1],
    'max_depth': [6],
    'min_child_weight': [10],
    'subsample': [ 0.7],
    'n_estimators': [5, 30, 100, 250],
}
grid_clf = GridSearchCV(xgb_clf, param_grid, scoring='accuracy', cv=None, n_jobs=1)

grid_clf.fit(X_train, y_train)
y_train_pred = grid_clf.predict(X_train)
y_test_pred = grid_clf.predict(X_test)


from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

print('Training Precision: ', precision_score(y_train, y_train_pred))
print('Testing Precision: ', precision_score(y_test, y_test_pred))
print('\n\n')

print('Training Recall: ', recall_score(y_train, y_train_pred))
print('Testing Recall: ', recall_score(y_test, y_test_pred))
print('\n\n')

print('Training Accuracy: ', accuracy_score(y_train, y_train_pred))
print('Testing Accuracy: ', accuracy_score(y_test, y_test_pred))
print('\n\n')

print('Training F1-Score: ', f1_score(y_train, y_train_pred))
print('Testing F1-Score: ', f1_score(y_test, y_test_pred))

  import pandas.util.testing as tm


Training Precision:  0.9835447377442578
Testing Precision:  0.6342281879194631



Training Recall:  0.7152829718274745
Testing Recall:  0.37168141592920356



Training Accuracy:  0.9338852158453247
Testing Accuracy:  0.8095555555555556



Training F1-Score:  0.8282332563510393
Testing F1-Score:  0.4686918784872908


In [41]:
import xgboost as xgb


xgb_clf = xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=0.9, colsample_bytree=0.8, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.02, max_delta_step=0, max_depth=18,
              min_child_weight=5, monotone_constraints=None,
              n_estimators=230, n_jobs=-1, num_parallel_tree=1, 
              objective='binary:logistic', random_state=42, reg_alpha=0.1,
              reg_lambda=0.1, scale_pos_weight=67, subsample=0.8, tree_method=None,
              validate_parameters=False, verbosity=None)



# param_grid = {
#     "learning_rate": [0.1],
#     'max_depth': [6],
#     'min_child_weight': [10],
#     'subsample': [ 0.7],
#     'n_estimators': [5, 30, 100, 250],
# }
# grid_clf = GridSearchCV(xgb_clf, param_grid, scoring='accuracy', cv=None, n_jobs=1)

xgb_clf.fit(X_train, y_train)
y_train_pred = xgb_clf.predict(X_train)
y_test_pred = xgb_clf.predict(X_test)


from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

print('Training Precision: ', precision_score(y_train, y_train_pred))
print('Testing Precision: ', precision_score(y_test, y_test_pred))
print('\n\n')

print('Training Recall: ', recall_score(y_train, y_train_pred))
print('Testing Recall: ', recall_score(y_test, y_test_pred))
print('\n\n')

print('Training Accuracy: ', accuracy_score(y_train, y_train_pred))
print('Testing Accuracy: ', accuracy_score(y_test, y_test_pred))
print('\n\n')

print('Training F1-Score: ', f1_score(y_train, y_train_pred))
print('Testing F1-Score: ', f1_score(y_test, y_test_pred))

Training Precision:  0.5332358415315076
Testing Precision:  0.35747021081576535



Training Recall:  1.0
Testing Recall:  0.7669616519174042



Training Accuracy:  0.8049336074226346
Testing Accuracy:  0.6357777777777778



Training F1-Score:  0.6955692361050898
Testing F1-Score:  0.4876523913723039


In [None]:
import xgboost as xgb
import pandas.util.testing as tm


xgb_clf = xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.1, max_delta_step=0, max_depth=18,
              min_child_weight=7, monotone_constraints=None,
              n_estimators=130, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=42, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=67, subsample=0.9, tree_method=None,
              validate_parameters=False, verbosity=None)



# param_grid = {
#     "learning_rate": [0.1],
#     'max_depth': [6],
#     'min_child_weight': [10],
#     'subsample': [ 0.7],
#     'n_estimators': [5, 30, 100, 250],
# }
# grid_clf = GridSearchCV(xgb_clf, param_grid, scoring='accuracy', cv=None, n_jobs=1)

xgb_clf.fit(X_train, y_train)
y_train_pred = xgb_clf.predict(X_train)
y_test_pred = xgb_clf.predict(X_test)


from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

print('Training Precision: ', precision_score(y_train, y_train_pred))
print('Testing Precision: ', precision_score(y_test, y_test_pred))
print('\n\n')

print('Training Recall: ', recall_score(y_train, y_train_pred))
print('Testing Recall: ', recall_score(y_test, y_test_pred))
print('\n\n')

print('Training Accuracy: ', accuracy_score(y_train, y_train_pred))
print('Testing Accuracy: ', accuracy_score(y_test, y_test_pred))
print('\n\n')

print('Training F1-Score: ', f1_score(y_train, y_train_pred))
print('Testing F1-Score: ', f1_score(y_test, y_test_pred))

In [42]:
grid_clf.best_estimator_

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.1, max_delta_step=0, max_depth=6,
              min_child_weight=10, missing=nan, monotone_constraints=None,
              n_estimators=30, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=0.7, tree_method=None,
              validate_parameters=False, verbosity=None)

In [None]:
{'n_estimators': 130, 'max_depth': 18, 'min_child_weight': 7, 'scale_pos_weight': 67, 'subsample': 0.9, 'colsample_bytree': 0.8}

In [45]:
import xgboost as xgb
import pandas.util.testing as tm


xgb_clf = xgb.XGBClassifier(gpu_id=-1)



param_grid = {
    "learning_rate": [0.1, 0.05, 0.01],
    'max_depth': [6, 7, 8, 9],
    'min_child_weight': [8,10,12],
    'subsample': [ 0.5,0.7],
    'n_estimators': [5, 30, 100, 250],
}
grid_clf = GridSearchCV(xgb_clf, param_grid, scoring='f1', cv=5, n_jobs=-1, verbose=1)

grid_clf.fit(X_train, y_train)
y_train_pred = grid_clf.predict(X_train)
y_test_pred = grid_clf.predict(X_test)


from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

print('Training Precision: ', precision_score(y_train, y_train_pred))
print('Testing Precision: ', precision_score(y_test, y_test_pred))
print('\n\n')

print('Training Recall: ', recall_score(y_train, y_train_pred))
print('Testing Recall: ', recall_score(y_test, y_test_pred))
print('\n\n')

print('Training Accuracy: ', accuracy_score(y_train, y_train_pred))
print('Testing Accuracy: ', accuracy_score(y_test, y_test_pred))
print('\n\n')

print('Training F1-Score: ', f1_score(y_train, y_train_pred))
print('Testing F1-Score: ', f1_score(y_test, y_test_pred))

Fitting 5 folds for each of 288 candidates, totalling 1440 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  6.6min


KeyboardInterrupt: 

In [20]:
import optuna

In [115]:
def objective(trial):
    

    
    param = {
        'silent':1,
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'booster': trial.suggest_categorical('booster', {'gbtree', 'gblinear', 'dart'}),
        'lambda': trial.suggest_loguniform('lambda', 1e-8, 1.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-8, 1.0),
    }
    
    if param['booster'] == 'gbtree' or param['booster'] == 'dart':
        param['max_depth'] = trial.suggest_int('max_depth', 1, 9)
        param['eta'] = trial.suggest_loguniform('eta', 1e-8, 1.0)
        param['gamma'] = trial.suggest_loguniform('gamma', 1e-8, 1.0)
        param['grow_policy'] = trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide'])
    
    if param['booster'] == 'dart':
        param['sample_type'] = trial.suggest_categorical('sample_type', ['uniform', 'weighted'])
        param['normalize_type'] = trial.suggest_categorical('normalize_type', ['tree','forest'])
        param['rate_drop'] = trial.suggest_loguniform('rate_drop', 1e-8, 1.0)
        param['skip_drop'] = trial.suggest_loguniform('skip_drop', 1e-8, 1.0)
        
        

    xgboost_tuna = xgb.XGBClassifier()
    
    xgboost_tuna.fit(X_train, y_train)
    tuna_pred_test = xgboost_tuna.predict(X_test)
    
    return (1-f1_score(y_test, tuna_pred_test))

In [21]:
def opt(X_train, y_train, X_test, y_test, trial):
    #param_list
    booster = 'gbtree'
#     lambda_ = trial.suggest_loguniform('lambda', 1e-8, 1.0)
    alpha_ = trial.suggest_loguniform('alpha', 1e-8, 1.0)
    max_depth = trial.suggest_int('max_depth', 1, 9)
    eta = trial.suggest_loguniform('eta', 1e-8, 1.0)
    gamma =trial.suggest_loguniform('gamma', 1e-8, 1.0)
    grow_policy = trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide'])
    

    xgboost_tuna = xgb.XGBClassifier(
        random_state=42, 
        booster = booster,
        alpha = alpha_,
        max_depth = max_depth,
        eta = eta,
        gamma = gamma,
        grow_policy = grow_policy
    )
    xgboost_tuna.fit(X_train, y_train)
    tuna_pred_test = xgboost_tuna.predict(X_test)
    
    return (1-f1_score(y_test, tuna_pred_test))

In [22]:
def opt(X_train, y_train, X_test, y_test, trial):
    #param_list

    
    n_estimators = trial.suggest_int('n_estimators', 0, 1000)
    max_depth = trial.suggest_int('max_depth', 1, 40)
    min_child_weight = trial.suggest_int('min_child_weight', 1, 30)
    learning_rate = trial.suggest_discrete_uniform('learning_rate', 0.01, 0.1, 0.01)
    scale_pos_weight = trial.suggest_int('scale_pos_weight', 1, 100)
    subsample = trial.suggest_discrete_uniform('subsample', 0.4, 0.9, 0.1)
    colsample_bytree = trial.suggest_discrete_uniform('colsample_bytree', 0.4, 0.9, 0.1)

    xgboost_tuna = xgb.XGBClassifier(
        random_state=42, 
        tree_method='gpu_hist',
        n_estimators = n_estimators,
        max_depth = max_depth,
        min_child_weight = min_child_weight,
        learning_rate = learning_rate,
        scale_pos_weight = scale_pos_weight,
        subsample = subsample,
        colsample_bytree = colsample_bytree,
    )
    xgboost_tuna.fit(X_train, y_train)
    tuna_pred_test = xgboost_tuna.predict(X_test)
    
    return (1-f1_score(y_test, tuna_pred_test))

In [23]:
import xgboost as xgb
import pandas.testing
import optuna


pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.



In [24]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

In [None]:
study = optuna.create_study()
import functools
study.optimize(functools.partial(opt, X_train, y_train, X_test, y_test), n_trials=100)

In [None]:
{'n_estimators': 234, 'max_depth': 40, 'min_child_weight': 12, 'learning_rate': 0.060000000000000005, 'scale_pos_weight': 7, 'subsample': 0.5, 'colsample_bytree': 0.9}

In [None]:
import xgboost as xgb
import pandas.util.testing as tm


xgb_clf = xgb.XGBClassifier(gpu_id=-1)



param_grid = {
    "learning_rate": [0.1, 0.05, 0.01],
    'max_depth': [6, 7, 8, 9],
    'min_child_weight': [8,10,12],
    'subsample': [ 0.5,0.7],
    'n_estimators': [5, 30, 100, 250],
}
grid_clf = GridSearchCV(xgb_clf, param_grid, scoring='f1', cv=5, n_jobs=-1, verbose=1)

grid_clf.fit(X_train, y_train)
y_train_pred = grid_clf.predict(X_train)
y_test_pred = grid_clf.predict(X_test)


from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

print('Training Precision: ', precision_score(y_train, y_train_pred))
print('Testing Precision: ', precision_score(y_test, y_test_pred))
print('\n\n')

print('Training Recall: ', recall_score(y_train, y_train_pred))
print('Testing Recall: ', recall_score(y_test, y_test_pred))
print('\n\n')

print('Training Accuracy: ', accuracy_score(y_train, y_train_pred))
print('Testing Accuracy: ', accuracy_score(y_test, y_test_pred))
print('\n\n')

print('Training F1-Score: ', f1_score(y_train, y_train_pred))
print('Testing F1-Score: ', f1_score(y_test, y_test_pred))

In [90]:
study = optuna.create_study()

In [91]:
study.optimize(objective, n_trials = 5)

[0]	validation-auc:0.74282
[1]	validation-auc:0.74897
[2]	validation-auc:0.75378
[3]	validation-auc:0.75323
[4]	validation-auc:0.75277
[5]	validation-auc:0.75307
[6]	validation-auc:0.75253
[7]	validation-auc:0.75219
[8]	validation-auc:0.75184
[9]	validation-auc:0.75199


[I 2020-08-12 08:49:38,997] Finished trial#0 with value: 0.8073333333333333 with parameters: {'booster': 'gblinear', 'lambda': 4.456886019648993e-07, 'alpha': 0.034263600573274974}. Best is trial#0 with value: 0.8073333333333333.


[0]	validation-auc:0.74358
[1]	validation-auc:0.74400
[2]	validation-auc:0.75995
[3]	validation-auc:0.77288
[4]	validation-auc:0.77311
[5]	validation-auc:0.77683
[6]	validation-auc:0.77753
[7]	validation-auc:0.77954
[8]	validation-auc:0.77997
[9]	validation-auc:0.78039


[I 2020-08-12 08:49:40,586] Finished trial#1 with value: 0.8226666666666667 with parameters: {'booster': 'dart', 'lambda': 0.28620214818397915, 'alpha': 0.005755769888038868, 'max_depth': 3, 'eta': 0.15782301315139421, 'gamma': 0.007067552755752852, 'grow_policy': 'depthwise', 'sample_type': 'uniform', 'normalize_type': 'forest', 'rate_drop': 8.635053410458381e-08, 'skip_drop': 1.8187042298764968e-08}. Best is trial#0 with value: 0.8073333333333333.


[0]	validation-auc:0.75804
[1]	validation-auc:0.76268
[2]	validation-auc:0.76491
[3]	validation-auc:0.76473
[4]	validation-auc:0.76746
[5]	validation-auc:0.76756
[6]	validation-auc:0.77028
[7]	validation-auc:0.77120
[8]	validation-auc:0.77198
[9]	validation-auc:0.77243


[I 2020-08-12 08:49:43,779] Finished trial#2 with value: 0.8166666666666667 with parameters: {'booster': 'gbtree', 'lambda': 1.2831527099149306e-05, 'alpha': 1.606872674693005e-05, 'max_depth': 7, 'eta': 0.004339937997300324, 'gamma': 3.089278839700702e-07, 'grow_policy': 'lossguide'}. Best is trial#0 with value: 0.8073333333333333.


[0]	validation-auc:0.50000
[1]	validation-auc:0.50000
[2]	validation-auc:0.50000
[3]	validation-auc:0.56266
[4]	validation-auc:0.56266
[5]	validation-auc:0.69845
[6]	validation-auc:0.70232
[7]	validation-auc:0.70232
[8]	validation-auc:0.70232
[9]	validation-auc:0.70232


[I 2020-08-12 08:49:45,176] Finished trial#3 with value: 0.8202222222222222 with parameters: {'booster': 'gbtree', 'lambda': 4.803349021360904e-06, 'alpha': 0.042726156481543265, 'max_depth': 2, 'eta': 2.318138409578163e-08, 'gamma': 2.5336928569809252e-08, 'grow_policy': 'depthwise'}. Best is trial#0 with value: 0.8073333333333333.


[0]	validation-auc:0.70510
[1]	validation-auc:0.75092
[2]	validation-auc:0.75113
[3]	validation-auc:0.75100
[4]	validation-auc:0.75097
[5]	validation-auc:0.75134
[6]	validation-auc:0.75138
[7]	validation-auc:0.75103
[8]	validation-auc:0.75099
[9]	validation-auc:0.75135


[I 2020-08-12 08:49:47,015] Finished trial#4 with value: 0.82 with parameters: {'booster': 'gbtree', 'lambda': 0.8126054411330562, 'alpha': 0.0008412211426558617, 'max_depth': 4, 'eta': 3.3744751949930707e-07, 'gamma': 0.002192157250346616, 'grow_policy': 'depthwise'}. Best is trial#0 with value: 0.8073333333333333.


In [95]:
import xgboost as xgb
import pandas.util.testing as tm

param = {'booster': 'gblinear', 
         'lambda': 0.0006425358326832809, 
         'alpha': 0.0003101918662407133}

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
dtrain = xgb.DMatrix(X_train, label= y_train)
dtest = xgb.DMatrix(X_test, label= y_test)

bst = xgb.train(param, dtrain, evals=[(dtest, 'validation')])



# param_grid = {
#     "learning_rate": [0.1, 0.05, 0.01],
#     'max_depth': [6, 7, 8, 9],
#     'min_child_weight': [8,10,12],
#     'subsample': [ 0.5,0.7],
#     'n_estimators': [5, 30, 100, 250],
# }
# grid_clf = GridSearchCV(xgb_clf, param_grid, scoring='f1', cv=5, n_jobs=-1, verbose=1)

# xgb_clf.fit(X_train, y_train)
y_train_pred = bst.predict(X_train)
y_test_pred = bst.predict(X_test)


from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

print('Training Precision: ', precision_score(y_train, y_train_pred))
print('Testing Precision: ', precision_score(y_test, y_test_pred))
print('\n\n')

print('Training Recall: ', recall_score(y_train, y_train_pred))
print('Testing Recall: ', recall_score(y_test, y_test_pred))
print('\n\n')

print('Training Accuracy: ', accuracy_score(y_train, y_train_pred))
print('Testing Accuracy: ', accuracy_score(y_test, y_test_pred))
print('\n\n')

print('Training F1-Score: ', f1_score(y_train, y_train_pred))
print('Testing F1-Score: ', f1_score(y_test, y_test_pred))

[0]	validation-rmse:0.38509
[1]	validation-rmse:0.38306
[2]	validation-rmse:0.38227
[3]	validation-rmse:0.38176
[4]	validation-rmse:0.38141
[5]	validation-rmse:0.38120
[6]	validation-rmse:0.38106
[7]	validation-rmse:0.38094
[8]	validation-rmse:0.38086
[9]	validation-rmse:0.38079


TypeError: ('Expecting data to be a DMatrix object, got: ', <class 'pandas.core.frame.DataFrame'>)

In [None]:
from sklearn.svm import LinearSVC, SVC
from sklearn.kernel_approximation import Nystroem

clf = SVC(kernel='poly', degree=8)
# feature_map_nystroem = Nystroem(gamma=.2,
#                                random_state=1,
#                                n_components=300)
# X_train_transformed = feature_map_nystroem.fit_transform(X_train)
# X_test_transformed = feature_map_nystroem.transform(X_test)

clf.fit(X_train, y_train)

# bc_dtc = BaggingClassifier(
#             base_estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight='balanced', criterion='gini',
#                        max_depth=6, max_features=0.4, max_leaf_nodes=None,
#                        min_impurity_decrease=0.0, min_impurity_split=None,
#                        min_samples_leaf=8, min_samples_split=2,
#                        min_weight_fraction_leaf=0.0, presort='deprecated',
#                        random_state=42, splitter='best'), 
#             n_estimators= 100,
#             max_samples= 0.66,
#             max_features= .8,
#             oob_score= True, n_jobs=-1
#                 )


# bc_dtc.fit(X_train, y_train)
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)


from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

print('Training Precision: ', precision_score(y_train, y_train_pred))
print('Testing Precision: ', precision_score(y_test, y_test_pred))
print('\n\n')

print('Training Recall: ', recall_score(y_train, y_train_pred))
print('Testing Recall: ', recall_score(y_test, y_test_pred))
print('\n\n')

print('Training Accuracy: ', accuracy_score(y_train, y_train_pred))
print('Testing Accuracy: ', accuracy_score(y_test, y_test_pred))
print('\n\n')

print('Training F1-Score: ', f1_score(y_train, y_train_pred))
print('Testing F1-Score: ', f1_score(y_test, y_test_pred))

In [None]:
from sklearn.svm import LinearSVC, SVC
from sklearn.kernel_approximation import Nystroem

clf = SVC(kernel='poly')
# feature_map_nystroem = Nystroem(gamma=.2,
#                                random_state=1,
#                                n_components=300)
# X_train_transformed = feature_map_nystroem.fit_transform(X_train)
# X_test_transformed = feature_map_nystroem.transform(X_test)

clf.fit(X_train, y_train)

# bc_dtc = BaggingClassifier(
#             base_estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight='balanced', criterion='gini',
#                        max_depth=6, max_features=0.4, max_leaf_nodes=None,
#                        min_impurity_decrease=0.0, min_impurity_split=None,
#                        min_samples_leaf=8, min_samples_split=2,
#                        min_weight_fraction_leaf=0.0, presort='deprecated',
#                        random_state=42, splitter='best'), 
#             n_estimators= 100,
#             max_samples= 0.66,
#             max_features= .8,
#             oob_score= True, n_jobs=-1
#                 )


# bc_dtc.fit(X_train, y_train)
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)


from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

print('Training Precision: ', precision_score(y_train, y_train_pred))
print('Testing Precision: ', precision_score(y_test, y_test_pred))
print('\n\n')

print('Training Recall: ', recall_score(y_train, y_train_pred))
print('Testing Recall: ', recall_score(y_test, y_test_pred))
print('\n\n')

print('Training Accuracy: ', accuracy_score(y_train, y_train_pred))
print('Testing Accuracy: ', accuracy_score(y_test, y_test_pred))
print('\n\n')

print('Training F1-Score: ', f1_score(y_train, y_train_pred))
print('Testing F1-Score: ', f1_score(y_test, y_test_pred))