In [1]:
import pandas as pd
import numpy as np

# Notebook display setting: let pandas render up to 50 columns of a DataFrame
# before it starts eliding columns in the displayed output.
pd.set_option('display.max_columns', 50)

In [2]:
# Names applied to the 24 data columns: 23 feature columns followed by the
# 'target' label column.
_columns = [
    'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE',
    'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
    'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
    'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
    'target',
]

# Load the raw training data, using the first CSV column as the row index,
# then replace whatever header the file carries with the names above.
train_df = pd.read_csv('training_data.csv', index_col=0)
train_df.columns = _columns

# Remove the row whose index label is 'ID'.
# NOTE(review): presumably a second header row embedded in the CSV body --
# confirm against the raw file.
train_df = train_df.drop(index='ID')

In [3]:
# Best-effort numeric conversion: cast every column to float64 where possible.
# A column whose values cannot be parsed as numbers is deliberately left
# unchanged.
for col in _columns:
    try:
        train_df[col] = train_df[col].astype('float64')
    except (ValueError, TypeError):
        # Only conversion failures are tolerated. Anything else (for example
        # a KeyboardInterrupt) now propagates instead of being hidden by a
        # bare `except`.
        continue

In [4]:
# Recode EDUCATION: swap codes 1 and 3, keep 2, and collapse every other
# value into 4.
#
# Rationale noted by the original author -- default rate per code:
#   1 = 23.966%, 2 = 31.482%, 3 = 34.334%, 4 = 9.322%
# The order was changed in the hope that it has an effect on the model.
# The column stays a single numeric feature; there is no plan to expand
# categorical columns.
#
# NOTE: this cell reads and overwrites the same column, so running it a
# second time swaps codes 1 and 3 back again.
education = train_df['EDUCATION']
recode = {1: 3, 2: 2, 3: 1}  # old code -> new code
conditions = [education.eq(old_code) for old_code in recode]
choices = list(recode.values())
train_df['EDUCATION'] = np.select(conditions, choices, default=4)
train_df['EDUCATION'].value_counts()

2    10516
3     7919
1     3713
4      351
Name: EDUCATION, dtype: int64

In [5]:
# Recode MARRIAGE: codes 1 and 2 are kept unchanged; every other value is
# folded into the single code 3.
marriage = train_df['MARRIAGE']
kept_codes = (1, 2)
conditions = [marriage.eq(code) for code in kept_codes]
choices = list(kept_codes)
train_df['MARRIAGE'] = np.select(conditions, choices, default=3)
train_df['MARRIAGE'].value_counts()

2    12026
1    10195
3      278
Name: MARRIAGE, dtype: int64

In [6]:
# Preview the first five rows after the type conversion and the
# EDUCATION / MARRIAGE recoding above.
train_df.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,target
28835,220000.0,2.0,3,2,36.0,0.0,0.0,0.0,0.0,0.0,0.0,222598.0,222168.0,217900.0,221193.0,181859.0,184605.0,10000.0,8018.0,10121.0,6006.0,10987.0,143779.0,1.0
25329,200000.0,2.0,1,2,29.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,0.0
18894,180000.0,2.0,3,2,27.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
690,80000.0,1.0,2,2,32.0,0.0,0.0,0.0,0.0,0.0,0.0,51372.0,51872.0,47593.0,43882.0,42256.0,42527.0,1853.0,1700.0,1522.0,1548.0,1488.0,1500.0,0.0
6239,10000.0,1.0,2,2,27.0,0.0,0.0,0.0,0.0,0.0,0.0,8257.0,7995.0,4878.0,5444.0,2639.0,2697.0,2000.0,1100.0,600.0,300.0,300.0,1000.0,1.0


In [7]:
# Add an interaction feature: the element-wise product of the SEX, EDUCATION
# and MARRIAGE code columns.
train_df['SEX_EDUCATION_MARRIAGE'] = train_df['SEX'] * train_df['EDUCATION'] * train_df['MARRIAGE']

In [8]:
# Separate the label vector y ('target') from the feature matrix X (every
# other column of train_df).
y = train_df['target']
X = train_df.drop('target', axis=1)

In [9]:
from sklearn.preprocessing import PolynomialFeatures

# Expand X with every degree-2 term (squares and pairwise products of the
# input columns). No constant bias column is added.
poly_2 = PolynomialFeatures(degree=2, include_bias=False)
poly2_data = poly_2.fit_transform(X)

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2
# in favour of `get_feature_names_out`. Prefer the new method and fall back
# to the old one so the cell runs on both old and new installations.
if hasattr(poly_2, 'get_feature_names_out'):
    poly2_columns = poly_2.get_feature_names_out(X.columns)
else:
    poly2_columns = poly_2.get_feature_names(X.columns)

# Wrap the expanded matrix in a DataFrame that keeps X's row index, so it
# stays aligned with y.
df_poly2_df = pd.DataFrame(data=poly2_data,
                           index=X.index,
                           columns=poly2_columns)
df_poly2_df

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,SEX_EDUCATION_MARRIAGE,LIMIT_BAL^2,...,PAY_AMT1 PAY_AMT4,PAY_AMT1 PAY_AMT5,PAY_AMT1 PAY_AMT6,PAY_AMT1 SEX_EDUCATION_MARRIAGE,PAY_AMT2^2,PAY_AMT2 PAY_AMT3,PAY_AMT2 PAY_AMT4,PAY_AMT2 PAY_AMT5,PAY_AMT2 PAY_AMT6,PAY_AMT2 SEX_EDUCATION_MARRIAGE,PAY_AMT3^2,PAY_AMT3 PAY_AMT4,PAY_AMT3 PAY_AMT5,PAY_AMT3 PAY_AMT6,PAY_AMT3 SEX_EDUCATION_MARRIAGE,PAY_AMT4^2,PAY_AMT4 PAY_AMT5,PAY_AMT4 PAY_AMT6,PAY_AMT4 SEX_EDUCATION_MARRIAGE,PAY_AMT5^2,PAY_AMT5 PAY_AMT6,PAY_AMT5 SEX_EDUCATION_MARRIAGE,PAY_AMT6^2,PAY_AMT6 SEX_EDUCATION_MARRIAGE,SEX_EDUCATION_MARRIAGE^2
28835,220000.0,2.0,3.0,2.0,36.0,0.0,0.0,0.0,0.0,0.0,0.0,222598.0,222168.0,217900.0,221193.0,181859.0,184605.0,10000.0,8018.0,10121.0,6006.0,10987.0,143779.0,12.0,4.840000e+10,...,60060000.0,109870000.0,1.437790e+09,120000.0,64288324.0,81150178.0,48156108.0,88093766.0,1.152820e+09,96216.0,102434641.0,60786726.0,111199427.0,1.455187e+09,121452.0,36072036.0,65987922.0,863536674.0,72072.0,120714169.0,1.579700e+09,131844.0,2.067240e+10,1725348.0,144.0
25329,200000.0,2.0,1.0,2.0,29.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,4.0,4.000000e+10,...,106276.0,106276.0,1.062760e+05,1304.0,106276.0,106276.0,106276.0,106276.0,1.062760e+05,1304.0,106276.0,106276.0,106276.0,1.062760e+05,1304.0,106276.0,106276.0,106276.0,1304.0,106276.0,1.062760e+05,1304.0,1.062760e+05,1304.0,16.0
18894,180000.0,2.0,3.0,2.0,27.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,3.240000e+10,...,0.0,0.0,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.0,0.000000e+00,0.0,144.0
690,80000.0,1.0,2.0,2.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,51372.0,51872.0,47593.0,43882.0,42256.0,42527.0,1853.0,1700.0,1522.0,1548.0,1488.0,1500.0,4.0,6.400000e+09,...,2868444.0,2757264.0,2.779500e+06,7412.0,2890000.0,2587400.0,2631600.0,2529600.0,2.550000e+06,6800.0,2316484.0,2356056.0,2264736.0,2.283000e+06,6088.0,2396304.0,2303424.0,2322000.0,6192.0,2214144.0,2.232000e+06,5952.0,2.250000e+06,6000.0,16.0
6239,10000.0,1.0,2.0,2.0,27.0,0.0,0.0,0.0,0.0,0.0,0.0,8257.0,7995.0,4878.0,5444.0,2639.0,2697.0,2000.0,1100.0,600.0,300.0,300.0,1000.0,4.0,1.000000e+08,...,600000.0,600000.0,2.000000e+06,8000.0,1210000.0,660000.0,330000.0,330000.0,1.100000e+06,4400.0,360000.0,180000.0,180000.0,6.000000e+05,2400.0,90000.0,90000.0,300000.0,1200.0,90000.0,3.000000e+05,1200.0,1.000000e+06,4000.0,16.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16247,40000.0,2.0,2.0,1.0,38.0,0.0,0.0,3.0,2.0,2.0,2.0,35183.0,39197.0,39477.0,39924.0,39004.0,41462.0,4600.0,1200.0,1400.0,0.0,3069.0,0.0,4.0,1.600000e+09,...,0.0,14117400.0,0.000000e+00,18400.0,1440000.0,1680000.0,0.0,3682800.0,0.000000e+00,4800.0,1960000.0,0.0,4296600.0,0.000000e+00,5600.0,0.0,0.0,0.0,0.0,9418761.0,0.000000e+00,12276.0,0.000000e+00,0.0,16.0
2693,350000.0,1.0,3.0,1.0,42.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,3800.0,3138.0,4150.0,3750.0,1362.0,8210.0,3138.0,4160.0,3750.0,2272.0,8210.0,9731.0,3.0,1.225000e+11,...,7129536.0,25762980.0,3.053588e+07,9414.0,17305600.0,15600000.0,9451520.0,34153600.0,4.048096e+07,12480.0,14062500.0,8520000.0,30787500.0,3.649125e+07,11250.0,5161984.0,18653120.0,22108832.0,6816.0,67404100.0,7.989151e+07,24630.0,9.469236e+07,29193.0,9.0
8076,100000.0,2.0,1.0,2.0,46.0,1.0,-1.0,2.0,2.0,-1.0,0.0,0.0,203.0,203.0,0.0,7856.0,16544.0,203.0,0.0,0.0,7856.0,10000.0,865.0,4.0,1.000000e+10,...,1594768.0,2030000.0,1.755950e+05,812.0,0.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0,0.0,0.000000e+00,0.0,61716736.0,78560000.0,6795440.0,31424.0,100000000.0,8.650000e+06,40000.0,7.482250e+05,3460.0,16.0
20213,20000.0,2.0,1.0,1.0,50.0,-1.0,-1.0,-1.0,-1.0,-2.0,-2.0,5141.0,3455.0,6906.0,0.0,0.0,0.0,3754.0,6906.0,290.0,0.0,0.0,0.0,2.0,4.000000e+08,...,0.0,0.0,0.000000e+00,7508.0,47692836.0,2002740.0,0.0,0.0,0.000000e+00,13812.0,84100.0,0.0,0.0,0.000000e+00,580.0,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.0,0.000000e+00,0.0,4.0


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the degree-2 polynomial feature set for testing; the split
# is seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(df_poly2_df, y, random_state=42, test_size=0.2)

# Standardise the features. The scaler is fitted on the training split only
# and then applied to the test split. The original row index is kept on both
# frames so they stay aligned with y_train / y_test (previously they were
# rebuilt with a fresh 0..n-1 index that no longer matched the labels).
scaler = StandardScaler()
X_train = pd.DataFrame(data=scaler.fit_transform(X_train),
                       index=X_train.index,
                       columns=df_poly2_df.columns)
X_test = pd.DataFrame(data=scaler.transform(X_test),
                      index=X_test.index,
                      columns=df_poly2_df.columns)

# Optional (disabled): oversample the minority class with SMOTE.
# from imblearn.over_sampling import SMOTE
# sm = SMOTE(sampling_strategy='minority', random_state=23)
# X_train, y_train = sm.fit_sample(X_train, y_train)

# L1-penalised logistic regression with balanced class weights.
logreg = LogisticRegression(solver='liblinear', class_weight='balanced', penalty='l1', C=21)
logreg.fit(X_train, y_train)
y_train_pred = logreg.predict(X_train)
y_test_pred = logreg.predict(X_test)

# Report each metric on the training and test splits, with a blank-line
# separator between metric groups.
metrics_to_report = [
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('Accuracy', accuracy_score),
    ('F1-Score', f1_score),
]
for position, (metric_name, metric_fn) in enumerate(metrics_to_report):
    if position:
        print('\n\n')
    print(f'Training {metric_name}: ', metric_fn(y_train, y_train_pred))
    print(f'Testing {metric_name}: ', metric_fn(y_test, y_test_pred))

Training Precision:  0.4778761061946903
Testing Precision:  0.4760501105379514



Training Recall:  0.6327599102468212
Testing Recall:  0.6352015732546706



Training Accuracy:  0.7640980054447469
Testing Accuracy:  0.7595555555555555



Training F1-Score:  0.544518343703068
Testing F1-Score:  0.544229149115417


In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
from sklearn.model_selection import train_test_split

# Re-split the original (non-polynomial, unscaled) features with the same
# seed and test fraction as the logistic-regression cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Only the non-default hyperparameters are passed. The previous call was a
# pasted estimator repr that also spelled out every default, including
# `min_impurity_split` (removed in scikit-learn 1.0) and `max_features='auto'`
# (removed in 1.3), which makes the constructor fail on current releases.
# For a classifier 'auto' meant sqrt(n_features), so 'sqrt' gives the same
# forest on every version.
rfc = RandomForestClassifier(
    n_estimators=200,
    class_weight='balanced',
    max_depth=19,
    max_leaf_nodes=74,
    min_samples_leaf=8,
    max_features='sqrt',
    n_jobs=-1,
    random_state=42,
)

rfc.fit(X_train, y_train)
y_train_pred = rfc.predict(X_train)
y_test_pred = rfc.predict(X_test)

# Report each metric on the training and test splits, with a blank-line
# separator between metric groups.
metrics_to_report = [
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('Accuracy', accuracy_score),
    ('F1-Score', f1_score),
]
for position, (metric_name, metric_fn) in enumerate(metrics_to_report):
    if position:
        print('\n\n')
    print(f'Training {metric_name}: ', metric_fn(y_train, y_train_pred))
    print(f'Testing {metric_name}: ', metric_fn(y_test, y_test_pred))

Training Precision:  0.5057515337423313
Testing Precision:  0.5023076923076923



Training Recall:  0.6576913487908252
Testing Recall:  0.6420845624385447



Training Accuracy:  0.7804878048780488
Testing Accuracy:  0.7753333333333333



Training F1-Score:  0.5718001517286225
Testing F1-Score:  0.5636599050496331
