In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline 
from sklearn.ensemble import RandomForestClassifier
import numpy as np 

In [2]:
# Load the bankruptcy data set. Column 'D' is the target; 'NO' is dropped
# from the features as well (presumably a row identifier - TODO confirm).
brupt = pd.read_csv("Bankruptcy.csv")
y = brupt['D']
X = brupt.drop(columns=['NO', 'D'])

# Hold out 30% of the rows for testing, keeping the class proportions of y
# the same in both partitions; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=24,
)

# Default logistic regression; used as the final step of the first pipeline.
lr = LogisticRegression()

In [3]:
# Shared preprocessing steps. Both are configured to return pandas
# DataFrames from transform() instead of bare NumPy arrays.
scaler = StandardScaler()
scaler.set_output(transform='pandas')

# Start with 11 principal components; the grid searches further down
# override this value through the 'PCA__n_components' parameter.
pca = PCA(n_components=11)
pca.set_output(transform='pandas')

In [5]:
# Chain standardisation -> PCA -> logistic regression and fit the whole
# chain on the training partition only.
pipe = Pipeline(steps=[
    ('SCL', scaler),
    ('PCA', pca),
    ('LR', lr),
])
pipe.fit(X_train, y_train)

# Cumulative percentage of variance explained by the leading components.
# `pca` is the very object held by the pipeline, so it is fitted by now.
print((pca.explained_variance_ratio_ * 100).cumsum())

[36.89727764 49.26805297 60.18881995 68.68873469 75.68772544 80.484899
 84.66449768 88.67298039 91.81964019 94.50927463 96.43097012]


In [6]:
# Evaluate the fitted pipeline on the held-out test partition.
y_pred_prob = pipe.predict_proba(X_test)   # class probabilities, for log loss
y_pred = pipe.predict(X_test)              # hard labels, for accuracy

# First line: accuracy; second line: log loss (lower is better).
print(
    accuracy_score(y_test, y_pred),
    log_loss(y_test, y_pred_prob),
    sep='\n',
)

0.75
0.8442593798231378


In [7]:
# List every tunable parameter of the pipeline; the '<step>__<param>' keys
# shown here are the names the grid below has to use.
print(pipe.get_params())

# Candidate values: 6..11 principal components, and five values of the
# logistic-regression parameter C evenly spaced over [0.001, 3].
params = {
    'PCA__n_components': np.arange(6, 12),
    'LR__C': np.linspace(0.001, 3, num=5),
}

# Shuffled, stratified 5-fold split with a fixed seed for repeatability.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)

# Exhaustive search scored by negative log loss (closer to 0 is better).
# Note that the search is run on the full X, y rather than the train split.
gcv = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring='neg_log_loss',
    cv=kfold,
)
gcv.fit(X, y)

# Best parameter combination, then its mean cross-validated score.
print(gcv.best_params_, gcv.best_score_, sep='\n')

{'memory': None, 'steps': [('SCL', StandardScaler()), ('PCA', PCA(n_components=11)), ('LR', LogisticRegression())], 'transform_input': None, 'verbose': False, 'SCL': StandardScaler(), 'PCA': PCA(n_components=11), 'LR': LogisticRegression(), 'SCL__copy': True, 'SCL__with_mean': True, 'SCL__with_std': True, 'PCA__copy': True, 'PCA__iterated_power': 'auto', 'PCA__n_components': 11, 'PCA__n_oversamples': 10, 'PCA__power_iteration_normalizer': 'auto', 'PCA__random_state': None, 'PCA__svd_solver': 'auto', 'PCA__tol': 0.0, 'PCA__whiten': False, 'LR__C': 1.0, 'LR__class_weight': None, 'LR__dual': False, 'LR__fit_intercept': True, 'LR__intercept_scaling': 1, 'LR__l1_ratio': None, 'LR__max_iter': 100, 'LR__multi_class': 'deprecated', 'LR__n_jobs': None, 'LR__penalty': 'l2', 'LR__random_state': None, 'LR__solver': 'lbfgs', 'LR__tol': 0.0001, 'LR__verbose': 0, 'LR__warm_start': False}
{'LR__C': np.float64(0.75075), 'PCA__n_components': np.int64(10)}
-0.478659773514722


In [9]:
# Same preprocessing steps, with Gaussian naive Bayes as the classifier.
nb = GaussianNB()
pipe = Pipeline(steps=[
    ('SCL', scaler),
    ('PCA', pca),
    ('NB', nb),
])

# Only the number of principal components is tuned here: 5 through 9.
params = {'PCA__n_components': list(range(5, 10))}

# Shuffled, stratified 5-fold split with a fixed seed for repeatability.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)

# Grid search on the full X, y, scored by negative log loss.
gcv = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring='neg_log_loss',
    cv=kfold,
)
gcv.fit(X, y)

# Best number of components, then its mean cross-validated score.
print(gcv.best_params_, gcv.best_score_, sep='\n')

{'PCA__n_components': 6}
-1.261673131953675


In [10]:
# Same preprocessing steps, with a seeded random forest as the classifier.
rf = RandomForestClassifier(random_state=24)
pipe = Pipeline(steps=[
    ('SCL', scaler),
    ('PCA', pca),
    ('RF', rf),
])

# Only the number of principal components is tuned here: 5 through 9.
params = {'PCA__n_components': list(range(5, 10))}

# Shuffled, stratified 5-fold split with a fixed seed for repeatability.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)

# Grid search on the full X, y, scored by negative log loss.
gcv = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring='neg_log_loss',
    cv=kfold,
)
gcv.fit(X, y)

# Best number of components, then its mean cross-validated score.
print(gcv.best_params_, gcv.best_score_, sep='\n')

{'PCA__n_components': 7}
-0.48505233458041663
