In [None]:
# !pip3 install missingpy

In [None]:
#!pip3 install scikit-learn==0.20.1  # dep for missingpy
#!pip3 install scikit-learn==0.24.1  # required for getting tree diagram

In [None]:
# Folder containing the `<N>year.arff` input files (relative to the notebook).
data_path: str = "../data/"

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.io.arff import loadarff
from sklearn.model_selection import train_test_split

%matplotlib inline

In [None]:
# Seed passed to the split, the samplers and the seeded estimators below.
random_state: int = 2021

## Year $N$
Set `N` in the next cell, then run every cell in this section in order.

In [None]:
# Year identifier; it is spliced into the input and cache file names.
N = "5"

# Attributes removed from the frame before modelling.
# NOTE(review): presumably chosen from the missing-value counts shown further down.
drop_cols = [
    "Attr37", "Attr7", "Attr43", "Attr62", "Attr32",
    "Attr44", "Attr15", "Attr19", "Attr3", "Attr51",
    "Attr4", "Attr49", "Attr38", "Attr60", "Attr6",
]

In [None]:
# Read the selected year's ARFF file; `loadarff` returns a (records, metadata) pair.
arff = loadarff(f"{data_path}{N}year.arff")

# Build the frame, cast the label column to integers and remove duplicated rows.
df = (
    pd.DataFrame(arff[0])
    .assign(**{"class": lambda frame: frame["class"].astype("int")})
    .drop_duplicates()
)

In [None]:
# Columns with the most missing entries -- adjust `drop_cols` accordingly.
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False).head()

In [None]:
# Remove the attributes listed in `drop_cols` from the frame.
df = df.drop(columns=drop_cols)

### Retain NaN in test set also

In [None]:
# Target column and feature matrix.
Y = df["class"]
X = df.drop(columns="class")

# Shuffled 75/25 split, stratified on the label so both parts keep the class ratio.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    shuffle=True,
    stratify=Y,
    random_state=random_state,
)

# Class counts of each partition.
display(Y_train.value_counts(), Y_test.value_counts())

In [None]:
# Convert the split to plain NumPy arrays for the imputers/estimators below.
# np.asarray is used instead of `.to_numpy()` so that re-running this cell is
# harmless: it accepts arrays that were already converted.
X_train, X_test = np.asarray(X_train), np.asarray(X_test)
Y_train, Y_test = np.asarray(Y_train), np.asarray(Y_test)

### Standardise

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the standardiser on the training rows only (the test rows are never seen here).
scaler = StandardScaler()
scaler = scaler.fit(X_train)

# Alternatives that were tried:
#scaler = None

#idx = np.isnan(X_train).any(axis=1)
#scaler = StandardScaler().fit(X_train[~idx])

## Missing values

In [None]:
def build_imputer(imputer_estimator,X_train,Y_train,transform_x=True,reset_index = True,verbose=True,max_iter=10,tol=1,imputer=None,scaler=None):
    '''
    Fit an imputer on the training data and optionally impute that data.

    Parameters
    ----------
    imputer_estimator : estimator handed to IterativeImputer; only used when
        `imputer` is None.
    X_train, Y_train : training features and labels; both are passed to
        `imputer.fit`.
    transform_x : when True, the imputed `X_train` is returned as well.
    reset_index, scaler : accepted for backward compatibility; not used.
    verbose, max_iter, tol : forwarded to IterativeImputer when one is built here.
    imputer : optional ready-made imputer (any object exposing `fit` and
        `transform`) to fit instead of building an IterativeImputer.

    Returns
    -------
    `(imputer, X_train_imputed)` when `transform_x` is True, otherwise only
    the fitted `imputer`.
    '''
    if imputer is None:
        # Imported here so the function does not depend on a later cell having
        # been executed first; the experimental flag must be imported before
        # IterativeImputer.
        from sklearn.experimental import enable_iterative_imputer  # noqa: F401
        from sklearn.impute import IterativeImputer

        imputer = IterativeImputer(
            estimator=imputer_estimator,
            n_nearest_features=None,
            imputation_order='descending',
            verbose=verbose,
            max_iter=max_iter,
            tol=tol,
        )

    # Same fit call whether the imputer was supplied or created above.
    imputer = imputer.fit(X_train, Y_train)

    if transform_x:
        return imputer, imputer.transform(X_train)

    return imputer


In [None]:
from missingpy import MissForest

# Fit MissForest on the training rows and impute both partitions with it.
missf_imp, X_train_imp = build_imputer(
    None,
    X_train,
    Y_train,
    transform_x=True,
    imputer=MissForest(random_state=random_state, verbose=1, n_jobs=4),
)
X_test_imp = missf_imp.transform(X_test)

# Cache the imputed matrices on disk. Note: they are stored unscaled.
np.save(f"y{N}_realmissforest_train.npy", X_train_imp)
np.save(f"y{N}_realmissforest_test.npy", X_test_imp)


In [None]:
# To use this experimental feature, we need to explicitly ask for it:
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.neighbors import KNeighborsRegressor

# The KNN-based imputer is fitted on standardised features; anything passed to
# `knn_imp.transform` later has to be scaled the same way.
X_train_scaled = scaler.transform(X_train)
knn_imp = build_imputer(
    KNeighborsRegressor(n_jobs=4),
    X_train_scaled,
    Y_train,
    transform_x=False,
    reset_index=True,
    verbose=True,
    max_iter=64,
    tol=0.009,
    scaler=scaler,
)

    

In [None]:
from sklearn.impute import SimpleImputer
# Baseline imputer: NaNs are replaced by the column mean learned from the training rows.
simple_imp = SimpleImputer(strategy='mean', missing_values=np.nan).fit(X_train)

## Various Classification Models

In [None]:
# Choose which imputed data set the classifiers below are trained on.
# `scaled_already` tells the later cells whether X_*_imp are already standardised.
imputer_choice = "simple"  # one of: "simple", "knn", "missforest"

if imputer_choice == "simple":
    scaled_already = False
    X_train_imp = simple_imp.transform(X_train)
    X_test_imp = simple_imp.transform(X_test)
elif imputer_choice == "knn":
    # knn_imp was fitted on standardised data, so its inputs are scaled first.
    scaled_already = True
    X_train_imp = knn_imp.transform(scaler.transform(X_train))
    X_test_imp = knn_imp.transform(scaler.transform(X_test))
elif imputer_choice == "missforest":
    # Re-use the matrices cached by the MissForest cell.
    scaled_already = False
    X_train_imp = np.load("y"+N+"_realmissforest_train.npy")
    X_test_imp = np.load("y"+N+"_realmissforest_test.npy")
else:
    raise ValueError("unknown imputer_choice: " + repr(imputer_choice))

In [None]:
# !pip3 install imblearn

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier # Voting Ensemble for Classification

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN

In [None]:
def try_all_classifiers(X_train, X_test, y_train, y_test, classifiers, sampling  = None, scaler=None):
    '''
    Fit every classifier on the (optionally resampled) training data and score
    it on the test data.

    All imputation must be done before calling this function.

    Parameters
    ----------
    X_train, X_test, y_train, y_test : imputed features and labels.
    classifiers : sequence of tuples `(classifier, scaling_required)`.
    sampling : None, "SMOTE", "RUS" or "SMOTEENN"; the resampling applied to
        the training data only. Any other value leaves the data untouched.
    scaler : fitted scaler, applied when a classifier requires scaling and the
        global `scaled_already` is False.

    Also reads the globals `classifiers_names` (labels printed per classifier),
    `scaled_already` and `random_state`.

    Returns
    -------
    Four lists (accuracy, f1, precision, recall), one entry per classifier.
    '''
    accuracy = [0]*len(classifiers)
    f1 = [0]*len(classifiers)
    precision = [0]*len(classifiers)
    recall = [0]*len(classifiers)

    if sampling == "SMOTE":
        smote = SMOTE(sampling_strategy=0.6,random_state=random_state,n_jobs=4)
        X_train, y_train = smote.fit_resample(X_train, y_train)
        print("SMOTE")
    elif sampling == "RUS":
        rus = RandomUnderSampler(sampling_strategy=0.6,random_state=random_state)
        X_train, y_train = rus.fit_resample(X_train, y_train)
        print("RUS")
    elif sampling == "SMOTEENN":
        smoteenn = SMOTEENN(sampling_strategy=0.6,random_state=random_state,n_jobs=4)
        X_train, y_train = smoteenn.fit_resample(X_train, y_train)
        print("SMOTEENN")

    # Scaled copies are computed at most once and shared by all classifiers.
    scaled_pair = None

    for i in range(len(classifiers)):
        classif = classifiers[i][0]
        needs_scaling = classifiers[i][1]
        print(classifiers_names[i])

        if needs_scaling and not scaled_already:
            print("\t- Requires scaling and not scaled. Doing it now...")
            if scaler is None:
                raise ValueError("a fitted scaler is required for classifiers that need scaling")
            if scaled_pair is None:
                scaled_pair = (scaler.transform(X_train), scaler.transform(X_test))
            fit_X, eval_X = scaled_pair
        else:
            fit_X, eval_X = X_train, X_test

        classif.fit(fit_X, y_train)
        y_pred = classif.predict(eval_X)

        # Accuracy is derived from the same predictions as the other metrics.
        # Calling classif.score(X_test, ...) here would score a model trained
        # on scaled data against unscaled inputs.
        accuracy[i] = metrics.accuracy_score(y_test, y_pred)
        f1[i] = metrics.f1_score(y_test, y_pred, labels=np.unique(y_pred))
        precision[i] = metrics.precision_score(y_test, y_pred)
        recall[i] = metrics.recall_score(y_test, y_pred)
    print("Done")
    return accuracy,f1,precision,recall

In [None]:
# Members of the hard-voting ensemble. Every estimator that accepts a seed is
# given `random_state` so repeated runs produce the same numbers.
classifiers_voting = [
    ('log', LogisticRegression(max_iter=2048)),
    ("knn10", KNeighborsClassifier(n_neighbors=10)),
    ("dtc", DecisionTreeClassifier(random_state=random_state)),
    ("svm_linear", SVC(kernel='linear',random_state=random_state)),
    ("rf", RandomForestClassifier(n_estimators=16, n_jobs=8, random_state=random_state)),
    ("xbg", XGBClassifier(use_label_encoder=False, random_state=random_state)),
]
# classifiers_voting = [("dtc",DecisionTreeClassifier()),("rf",RandomForestClassifier(n_estimators=64, n_jobs=8, random_state=random_state)),("xbg",XGBClassifier(use_label_encoder=False))]

# create the ensemble model (the last member, XGB, counts twice)
ensemble = VotingClassifier(classifiers_voting,weights=[1,1,1,1,1,2],n_jobs=4,voting="hard")


# Display names and (estimator, scaling_required) pairs, in matching order.
classifiers_names = ["Logistic Regression", "LDA", "KNN 5", "KNN 10", "GNB", "DT", "SVM", "RFC", "XGB","Voting"]
classifiers = [(LogisticRegression(max_iter=2048),True),
                (LinearDiscriminantAnalysis(),True),
                (KNeighborsClassifier(n_neighbors=5),True),
                (KNeighborsClassifier(n_neighbors=10),True),
                (GaussianNB(),True),
                (DecisionTreeClassifier(random_state=random_state),False),
                (SVC(kernel='linear',random_state=random_state),True),
                (RandomForestClassifier(random_state=random_state),False),
                (XGBClassifier(use_label_encoder=False, random_state=random_state),False),
                (ensemble,True)]

# The two lists are indexed in parallel by try_all_classifiers.
assert len(classifiers_names) == len(classifiers)



In [None]:
# Baseline run: no resampling of the training set.
accuracy, f1, precision, recall = try_all_classifiers(
    X_train_imp,
    X_test_imp,
    Y_train,
    Y_test,
    classifiers,
    sampling=None,
    scaler=scaler,
)


#### Sampling

In [None]:
# Same evaluation with SMOTE applied to the training set.
accuracy_sm, f1_sm, precision_sm, recall_sm = try_all_classifiers(
    X_train_imp,
    X_test_imp,
    Y_train,
    Y_test,
    classifiers,
    sampling="SMOTE",
    scaler=scaler,
)


In [None]:
# Same evaluation with random under-sampling applied to the training set.
accuracy_rus, f1_rus, precision_rus, recall_rus = try_all_classifiers(
    X_train_imp,
    X_test_imp,
    Y_train,
    Y_test,
    classifiers,
    sampling="RUS",
    scaler=scaler,
)


In [None]:
# Same evaluation with SMOTEENN applied to the training set.
accuracy_smoteenn, f1_smoteenn, precision_smoteenn, recall_smoteenn = try_all_classifiers(
    X_train_imp,
    X_test_imp,
    Y_train,
    Y_test,
    classifiers,
    sampling="SMOTEENN",
    scaler=scaler,
)


In [None]:
# Print one table per sampling strategy: baseline, SMOTE, RUS, SMOTEENN.
print('''
| classifier          | Accuracy | Precision | Recall | F1 score |
| =================== | ======== | ========= | ====== | ======== |''')

result_sets = [
    (accuracy, precision, recall, f1),
    (accuracy_sm, precision_sm, recall_sm, f1_sm),
    (accuracy_rus, precision_rus, recall_rus, f1_rus),
    (accuracy_smoteenn, precision_smoteenn, recall_smoteenn, f1_smoteenn),
]

for table_no, (acc_list, prec_list, rec_list, f1_list) in enumerate(result_sets):
    if table_no > 0:
        # separator between consecutive tables
        print("\n===============================================================\n")
    for row in zip(classifiers_names, acc_list, prec_list, rec_list, f1_list):
        # columns: name, accuracy, precision, recall, f1
        print(*row, sep="   |   ", end="   |  \n")
    

# ------
# EOF


## sample train

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay

In [None]:
def build_confusion_matrix(y_true,y_pred):
    '''
    Plot the confusion matrix of `y_pred` against `y_true`.

    The two classes are labelled "Not Bankrupt" and "Bankrupt". Nothing is
    returned; the figure is shown with `plt.show()`.
    '''
    cm = confusion_matrix(y_true, y_pred)

    # Draw onto an explicit axes: `disp.plot()` creates its own figure when no
    # axes is given, so a preceding bare `plt.figure()` leaves an empty figure.
    fig, ax = plt.subplots()
    disp = ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=["Not Bankrupt","Bankrupt"])
    disp.plot(ax=ax)
    plt.show()

    # plt.savefig(dir_+"/cf_test-epoch"+str(e+1)+".png",transparent=True)

In [None]:
def build_model(model,X_train,Y_train,X_val,Y_val,X_test,Y_test,report_train_scores=True,impute_test=False,imputer=None,scaler=None):
    '''
    Fit `model` on the already-imputed training set and report test metrics.

    Parameters
    ----------
    model : estimator exposing `fit` and `predict`.
    X_train, Y_train : imputed training features and labels.
    X_val, Y_val : accepted for backward compatibility; not used.
    X_test, Y_test : test features and labels.
    report_train_scores : when True, also print the classification report on
        the training data.
    impute_test : when True, `X_test` is passed through `imputer.transform`
        before scaling and prediction.
    imputer : fitted imputer; required when `impute_test` is True.
    scaler : optional fitted scaler applied to both the train and test features.

    Returns
    -------
    The fitted model.

    Raises
    ------
    ValueError : if `impute_test` is True but no `imputer` is given.
    '''
    if impute_test and imputer is None:
        raise ValueError("impute_test=True requires a fitted imputer")

    if scaler is not None:
        X_train = scaler.transform(X_train)

    model = model.fit(X_train, Y_train)

    # test pipeline: impute -> standardise -> predict
    if impute_test:
        X_test = imputer.transform(X_test)
    if scaler is not None:
        X_test = scaler.transform(X_test)

    y_pred = model.predict(X_test)

    print(classification_report(Y_test, y_pred))
    build_confusion_matrix(Y_test,y_pred)

    if report_train_scores:
        print("For train...")
        train_pred = model.predict(X_train)
        print(classification_report(Y_train, train_pred))

    return model

In [None]:
# KNN
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=2)
# `imputer` is not defined at this point in the notebook. Use the test matrix
# that was imputed together with X_train_imp, and skip scaling when the chosen
# imputer already produced standardised data.
build_model(knn,X_train_imp,Y_train,None,None,X_test_imp,Y_test,impute_test=False,scaler=None if scaled_already else scaler)

In [None]:
# DecTrees
from sklearn.tree import DecisionTreeClassifier
decTree = DecisionTreeClassifier()
# `missf` is not defined anywhere (the fitted MissForest is `missf_imp`). Use
# the test matrix imputed together with X_train_imp so both partitions come
# from the same imputer; skip scaling when the data is already standardised.
build_model(decTree,X_train_imp,Y_train,None,None,X_test_imp,Y_test,impute_test=False,scaler=None if scaled_already else scaler)

In [None]:
# from sklearn.tree import plot_tree
# plot_tree(decTree) 

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

ada = AdaBoostClassifier()
# `missf` is not defined anywhere (the fitted MissForest is `missf_imp`). Use
# the test matrix imputed together with X_train_imp so both partitions come
# from the same imputer; skip scaling when the data is already standardised.
build_model(ada,X_train_imp,Y_train,None,None,X_test_imp,Y_test,impute_test=False,scaler=None if scaled_already else scaler)

In [None]:
from xgboost import XGBClassifier

# scale_pos_weight up-weights the positive class.
xgb = XGBClassifier(scale_pos_weight=16)
# `missf` is not defined anywhere (the fitted MissForest is `missf_imp`). Use
# the test matrix imputed together with X_train_imp so both partitions come
# from the same imputer; skip scaling when the data is already standardised.
xgb = build_model(xgb,X_train_imp,Y_train,None,None,X_test_imp,Y_test,impute_test=False,scaler=None if scaled_already else scaler)



In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

logistic = LogisticRegression(max_iter=1024)
# `missf` is not defined anywhere (the fitted MissForest is `missf_imp`). Use
# the test matrix imputed together with X_train_imp so both partitions come
# from the same imputer; skip scaling when the data is already standardised.
build_model(logistic,X_train_imp,Y_train,None,None,X_test_imp,Y_test,impute_test=False,scaler=None if scaled_already else scaler)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# Named `rf` so it no longer overwrites the AdaBoost model stored in `ada`.
rf = RandomForestClassifier(n_estimators=16, n_jobs=4, random_state=random_state, verbose=1)
# `missf` is not defined anywhere (the fitted MissForest is `missf_imp`). Use
# the test matrix imputed together with X_train_imp so both partitions come
# from the same imputer; skip scaling when the data is already standardised.
build_model(rf,X_train_imp,Y_train,None,None,X_test_imp,Y_test,impute_test=False,scaler=None if scaled_already else scaler)

In [None]:
from sklearn.decomposition import PCA
# Learn a 30-component projection from the imputed training matrix.
pca = PCA(n_components=30).fit(X_train_imp)

In [None]:
# Project the imputed training matrix onto the fitted principal components.
X_train_imp_pca = pca.transform(X_train_imp)

In [None]:
from xgboost import XGBClassifier

xgb = XGBClassifier(scale_pos_weight=16)
# Project the test matrix that was imputed alongside X_train_imp. Passing raw
# X_test through knn_imp (which was fitted on standardised data) would not
# match the matrix the PCA and the model were fitted on.
xgb = build_model(xgb,X_train_imp_pca,Y_train,None,None,pca.transform(X_test_imp),Y_test,impute_test=False)



## Combining...

In [None]:
# To use this experimental feature, we need to explicitly ask for it:
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, AdaBoostClassifier
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from xgboost import XGBClassifier


# Regressors to try as the estimator inside IterativeImputer.
all_imputers = [ExtraTreesRegressor(n_estimators=16, n_jobs=4, random_state=random_state, verbose=0)]
# all_imputers = [KNeighborsRegressor(n_jobs=4),ExtraTreesRegressor(n_estimators=16, n_jobs=4, random_state=random_state, verbose=0),RandomForestRegressor(n_estimators=16, n_jobs=4, random_state=random_state, verbose=0),BayesianRidge(verbose=0)]

imputer=None
X_train_imp=None
X_test_imp=None

# After the loop the variables hold the result of the LAST estimator in the list.
for imputer_estimator in all_imputers:
    print(imputer_estimator)
    imputer,X_train_imp = build_imputer(imputer_estimator,X_train,Y_train,transform_x=True,reset_index = True,verbose=True,max_iter=32,tol=0.01,scaler=scaler)
    # Impute the test rows with the same fitted imputer, so the evaluation
    # below does not reuse a test matrix produced by a different imputer.
    X_test_imp = imputer.transform(X_test)


### Various Classification Models

In [None]:
# !pip3 install imblearn

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN

In [None]:
def try_all_classifiers(X_train, X_test, y_train, y_test, classifiers, sampling  = None):
    '''
    Resample (optionally), standardise, then fit and score every classifier.

    NOTE: this redefines the `try_all_classifiers` declared earlier in the
    notebook. Here `classifiers` is a flat sequence of estimators (not tuples),
    and the global `scaler` is always applied to both partitions.

    Parameters
    ----------
    X_train, X_test, y_train, y_test : imputed, unscaled features and labels.
    classifiers : sequence of estimators exposing `fit` and `predict`.
    sampling : None, "SMOTE", "RUS" or "SMOTEENN"; the resampling applied to
        the training data before scaling. Other values leave it untouched.

    Also reads the globals `scaler` and `random_state`.

    Returns
    -------
    Four lists (accuracy, f1, precision, recall), one entry per classifier.
    '''
    if sampling == "SMOTE":
        resampler = SMOTE(sampling_strategy=0.6,random_state=random_state,n_jobs=4)
    elif sampling == "RUS":
        # RandomUnderSampler has no `n_jobs` parameter; passing it raises TypeError.
        resampler = RandomUnderSampler(random_state=random_state)
    elif sampling == "SMOTEENN":
        resampler = SMOTEENN(sampling_strategy=0.6,random_state=random_state,n_jobs=4)
    else:
        resampler = None

    if resampler is not None:
        X_train, y_train = resampler.fit_resample(X_train, y_train)
        print(sampling)

    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    accuracy = [0]*len(classifiers)
    f1 = [0]*len(classifiers)
    precision = [0]*len(classifiers)
    recall = [0]*len(classifiers)

    for i, classif in enumerate(classifiers):
        classif.fit(X_train, y_train)
        y_pred = classif.predict(X_test)

        accuracy[i] = metrics.accuracy_score(y_test, y_pred)
        f1[i] = metrics.f1_score(y_test, y_pred, labels=np.unique(y_pred))
        precision[i] = metrics.precision_score(y_test, y_pred)
        recall[i] = metrics.recall_score(y_test, y_pred)
    print("Done")
    return accuracy,f1,precision,recall

In [None]:
# Display names and estimators, in matching order. Every estimator that
# accepts a seed is given `random_state` so repeated runs produce the same numbers.
classifiers_names = ["Logistic Regression", "LDA", "KNN 5", "KNN 10", "GNB", "DT", "SVM", "RFC", "XGB"]
classifiers = [
    LogisticRegression(max_iter=2048),
    LinearDiscriminantAnalysis(),
    KNeighborsClassifier(n_neighbors=5),
    KNeighborsClassifier(n_neighbors=10),
    GaussianNB(),
    DecisionTreeClassifier(random_state=random_state),
    SVC(kernel='linear',random_state=random_state),
    RandomForestClassifier(random_state=random_state),
    XGBClassifier(use_label_encoder=False, random_state=random_state),
]

# The two lists are zipped together when the results are printed.
assert len(classifiers_names) == len(classifiers)



In [None]:
#X_test_imp = imputer.transform(X_test)

In [None]:
# Baseline run on the IterativeImputer output: no resampling.
accuracy, f1, precision, recall = try_all_classifiers(
    X_train_imp,
    X_test_imp,
    Y_train,
    Y_test,
    classifiers,
    sampling=None,
)


### Sampling

In [None]:
# The stray `scaler.transform(` wrapper made this line a syntax error. The
# function standardises both partitions itself, so the imputed matrices are
# passed unscaled.
accuracy_sm,f1_sm,precision_sm,recall_sm = try_all_classifiers(X_train_imp,X_test_imp,Y_train,Y_test, classifiers, sampling = "SMOTE")



In [None]:
## scaler.transform(X_train_imp),scaler.transform(imputer.transform(X_test)),Y_train,Y_test,

In [None]:
# NOTE(review): the results are stored under the `*_rus` names (which the summary
# table below reads), but the sampler requested here is "SMOTEENN", not "RUS".
accuracy_rus,f1_rus,precision_rus,recall_rus = try_all_classifiers(X_train_imp,X_test_imp,Y_train,Y_test, classifiers, sampling = "SMOTEENN")


In [None]:

# Print one table per run: baseline, SMOTE, and the run stored under `*_rus`.
print('''
| classifier          | Accuracy | Precision | Recall | F1 score |
| =================== | ======== | ========= | ====== | ======== |''')

result_sets = [
    (accuracy, precision, recall, f1),
    (accuracy_sm, precision_sm, recall_sm, f1_sm),
    (accuracy_rus, precision_rus, recall_rus, f1_rus),
]

for table_no, (acc_list, prec_list, rec_list, f1_list) in enumerate(result_sets):
    if table_no > 0:
        # separator between consecutive tables
        print("\n===============================================================\n")
    for row in zip(classifiers_names, acc_list, prec_list, rec_list, f1_list):
        # columns: name, accuracy, precision, recall, f1
        print(*row, sep="   |   ", end="   |  \n")
  


In [None]:
# Cache the imputed matrices. The year `N` is part of the file name so the
# files match the `y<N>_incomp_realmiss_*.npy` names read back in the next cell
# (the previous names had no year and were never the ones loaded).
np.save("y"+N+"_incomp_realmiss_train.npy",X_train_imp)
np.save("y"+N+"_incomp_realmiss_test.npy",X_test_imp)

In [None]:
# Reload the cached imputed matrices for the selected year `N`
# (previously hard-coded to year 5).
X_train_imp = np.load("y"+N+"_incomp_realmiss_train.npy")
X_test_imp = np.load("y"+N+"_incomp_realmiss_test.npy")