In [1]:
########### import all necessary functions ###########
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix, silhouette_score
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.feature_selection import VarianceThreshold, chi2, mutual_info_classif
from sklearn.decomposition import PCA
from imblearn.under_sampling import RandomUnderSampler, NearMiss
from imblearn.over_sampling import RandomOverSampler, SMOTE, SVMSMOTE, ADASYN, BorderlineSMOTE
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import VotingClassifier, BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, StackingClassifier
from xgboost import XGBClassifier

In [2]:
#### Warning Remove
import sys

if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

In [3]:
###### Import the dataset #######
df =pd.read_csv('cirrhosis.csv')
df.head()

Unnamed: 0,ID,N_Days,Status,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,1,400,D,D-penicillamine,21464,F,Y,Y,Y,Y,14.5,261.0,2.6,156.0,1718.0,137.95,172.0,190.0,12.2,4.0
1,2,4500,C,D-penicillamine,20617,F,N,Y,Y,N,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,3.0
2,3,1012,D,D-penicillamine,25594,M,N,N,N,S,1.4,176.0,3.48,210.0,516.0,96.1,55.0,151.0,12.0,4.0
3,4,1925,D,D-penicillamine,19994,F,N,Y,Y,S,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3,4.0
4,5,1504,CL,Placebo,13918,F,N,Y,Y,N,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9,3.0


In [4]:
####### print the shape of this dataset #######
print("Number of rows of this dataset is = ", df.shape[0],'\n')
print("Number of columns of this dataset = ", df.shape[1],'\n')
####### Drop the ID column ########
df.drop(['ID'], axis = 1, inplace = True)
print("ID column is drooped from the dataset.")

Number of rows of this dataset is =  418 

Number of columns of this dataset =  20 

ID column is drooped from the dataset.


In [5]:
###### Checking missing value is presence or not #######
for column_, value_ in zip(df.isnull().sum().index, df.isnull().sum().values):
    print(f"{column_} has missing value = {value_}", end = " | ")

N_Days has missing value = 0 | Status has missing value = 0 | Drug has missing value = 106 | Age has missing value = 0 | Sex has missing value = 0 | Ascites has missing value = 106 | Hepatomegaly has missing value = 106 | Spiders has missing value = 106 | Edema has missing value = 0 | Bilirubin has missing value = 0 | Cholesterol has missing value = 134 | Albumin has missing value = 0 | Copper has missing value = 108 | Alk_Phos has missing value = 106 | SGOT has missing value = 106 | Tryglicerides has missing value = 136 | Platelets has missing value = 11 | Prothrombin has missing value = 2 | Stage has missing value = 6 | 

In [6]:
###### Missing value columns #######
nan_feature_ = df.isnull().sum()[df.isnull().sum() > 0]
nan_feature_

Drug             106
Ascites          106
Hepatomegaly     106
Spiders          106
Cholesterol      134
Copper           108
Alk_Phos         106
SGOT             106
Tryglicerides    136
Platelets         11
Prothrombin        2
Stage              6
dtype: int64

In [7]:
####### Status is our Target column #######
df.loc[:, 'Status'] = df.loc[:, 'Status'].map({'C': 0, 'D': 1, 'CL': 2})

In [8]:
####### 'Cholesterol', 'Copper', 'Alk_Phos', 'SGOT', 'Tryglicerides' = for capturing NaN with new features ########
for feature_ in ['Cholesterol', 'Copper', 'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin']:
    df[feature_+"_nan"] = np.where(df.loc[:, feature_].isnull(), 1, 0)
    df.loc[:, feature_].fillna(df.loc[:, feature_].median(), inplace = True)
    print(feature_, " is done using capturing NaN values with new features - median")

Cholesterol  is done using capturing NaN values with new features - median
Copper  is done using capturing NaN values with new features - median
Alk_Phos  is done using capturing NaN values with new features - median
SGOT  is done using capturing NaN values with new features - median
Tryglicerides  is done using capturing NaN values with new features - median
Platelets  is done using capturing NaN values with new features - median
Prothrombin  is done using capturing NaN values with new features - median


In [9]:
##### For Stage - Most Frequent value ######
df.loc[:, 'Stage'].fillna(df.loc[:, 'Stage'].value_counts(ascending = False).index[0], inplace = True)
print("Stage is done.")

Stage is done.


In [10]:
##### Drug, Ascites, Hepatomegaly, Spiders - with new categories = missing #####
for feature_ in ['Drug', 'Ascites', 'Hepatomegaly', 'Spiders']:
    df.loc[:, feature_].fillna('missing', inplace = True)
    print(feature_, " is done.")

Drug  is done.
Ascites  is done.
Hepatomegaly  is done.
Spiders  is done.


In [11]:
if df.isnull().sum().sum() == 0:
    print("No NaN value is exits.")
else:
    print("NaN value exists.")

No NaN value is exits.


In [12]:
######## print the shape of this dataset ######
print("Number of rows    = ", df.shape[0])
print("Number od columns = ", df.shape[1])

Number of rows    =  418
Number od columns =  26


In [13]:
###### Check how many unique value in each categorical features are peresence in this dataset #######
for column_ in ['Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema']:
    print(f"Number of unique value {column_:12} has = {df.loc[:, column_].nunique()}")
###### Here, it's not too much Unique value in the features - so we can use the One Hot Encoding #######

Number of unique value Drug         has = 3
Number of unique value Sex          has = 2
Number of unique value Ascites      has = 3
Number of unique value Hepatomegaly has = 3
Number of unique value Spiders      has = 3
Number of unique value Edema        has = 3


In [14]:
df.head()

Unnamed: 0,N_Days,Status,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,...,Platelets,Prothrombin,Stage,Cholesterol_nan,Copper_nan,Alk_Phos_nan,SGOT_nan,Tryglicerides_nan,Platelets_nan,Prothrombin_nan
0,400,1,D-penicillamine,21464,F,Y,Y,Y,Y,14.5,...,190.0,12.2,4.0,0,0,0,0,0,0,0
1,4500,0,D-penicillamine,20617,F,N,Y,Y,N,1.1,...,221.0,10.6,3.0,0,0,0,0,0,0,0
2,1012,1,D-penicillamine,25594,M,N,N,N,S,1.4,...,151.0,12.0,4.0,0,0,0,0,0,0,0
3,1925,1,D-penicillamine,19994,F,N,Y,Y,S,1.8,...,183.0,10.3,4.0,0,0,0,0,0,0,0
4,1504,2,Placebo,13918,F,N,Y,Y,N,3.4,...,136.0,10.9,3.0,0,0,0,0,0,0,0


In [15]:
####### One Hot Encoding with get_dummies ########
df = pd.get_dummies(df, columns = ['Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema'], drop_first = True)
###### save this csv file ########
df.to_csv('cirrhosis_new_.csv')
y = df.loc[:, 'Status']
df.drop(['Status'], axis = 1, inplace = True)
df.head()

Unnamed: 0,N_Days,Age,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,...,Drug_missing,Sex_M,Ascites_Y,Ascites_missing,Hepatomegaly_Y,Hepatomegaly_missing,Spiders_Y,Spiders_missing,Edema_S,Edema_Y
0,400,21464,14.5,261.0,2.6,156.0,1718.0,137.95,172.0,190.0,...,0,0,1,0,1,0,1,0,0,1
1,4500,20617,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,...,0,0,0,0,1,0,1,0,0,0
2,1012,25594,1.4,176.0,3.48,210.0,516.0,96.1,55.0,151.0,...,0,1,0,0,0,0,0,0,1,0
3,1925,19994,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,...,0,0,0,0,1,0,1,0,1,0
4,1504,13918,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,...,0,0,0,0,1,0,1,0,0,0


In [16]:
###### Split the dataset into X, y #######
X = df.iloc[:, :].values
y = y.astype('int')
y.value_counts()

0    232
1    161
2     25
Name: Status, dtype: int64

In [17]:
####### Scaling the dataset using Standard scaler #########
standard_scaler_ = StandardScaler()
X = standard_scaler_.fit_transform(X)
####### split the dataset into train and test ########
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

In [18]:
###### Decision Tree Classifier #######
def DecisionTree(X_train, X_test, y_train, y_test):
    decision_tree_ = DecisionTreeClassifier()
    decision_tree_.fit(X_train, y_train)
    predicted_ = decision_tree_.predict(X_test)
    return accuracy_score(predicted_, y_test), precision_score(predicted_, y_test, average = 'macro'), recall_score(predicted_, y_test, average = 'macro'), f1_score(predicted_, y_test, average = 'macro')

###### Random Tree Classifier #######
def RandomForest(X_train, X_test, y_train, y_test):
    decision_tree_ = RandomForestClassifier()
    decision_tree_.fit(X_train, y_train)
    predicted_ = decision_tree_.predict(X_test)
    return accuracy_score(predicted_, y_test), precision_score(predicted_, y_test, average = 'macro'), recall_score(predicted_, y_test, average = 'macro'), f1_score(predicted_, y_test, average = 'macro')

###### Extra Tree Classifier #######
def ExtraTree(X_train, X_test, y_train, y_test):
    extra_tree_ = ExtraTreeClassifier()
    extra_tree_.fit(X_train, y_train)
    predicted_ = extra_tree_.predict(X_test)
    return accuracy_score(predicted_, y_test), precision_score(predicted_, y_test, average = 'macro'), recall_score(predicted_, y_test, average = 'macro'), f1_score(predicted_, y_test, average = 'macro')

###### Adaboost Classifier #######
def Adaboost(X_train, X_test, y_train, y_test):
    adaboost_ = AdaBoostClassifier()
    adaboost_.fit(X_train, y_train)
    predicted_ = adaboost_.predict(X_test)
    return accuracy_score(predicted_, y_test), precision_score(predicted_, y_test, average = 'macro'), recall_score(predicted_, y_test, average = 'macro'), f1_score(predicted_, y_test, average = 'macro')

###### Gradient Boosting Classifier #######
def GradientBoost(X_train, X_test, y_train, y_test):
    gradient_boost_ = GradientBoostingClassifier()
    gradient_boost_.fit(X_train, y_train)
    predicted_ = gradient_boost_.predict(X_test)
    return accuracy_score(predicted_, y_test), precision_score(predicted_, y_test, average = 'macro'), recall_score(predicted_, y_test, average = 'macro'), f1_score(predicted_, y_test, average = 'macro')

##### XGBOOST Classifier #######
def xgboost(X_train, X_test, y_train, y_test):
    xgboost_ = XGBClassifier()
    xgboost_.fit(X_train, y_train)
    predicted_ = xgboost_.predict(X_test)
    return accuracy_score(predicted_, y_test), precision_score(predicted_, y_test, average = 'macro'), recall_score(predicted_, y_test, average = 'macro'), f1_score(predicted_, y_test, average = 'macro')

###### KNN Classifier #######
def KNN(X_train, X_test, y_train, y_test):
    knn_ = KNeighborsClassifier()
    knn_.fit(X_train, y_train)
    predicted_ = knn_.predict(X_test)
    return accuracy_score(predicted_, y_test), precision_score(predicted_, y_test, average = 'macro'), recall_score(predicted_, y_test, average = 'macro'), f1_score(predicted_, y_test, average = 'macro')

###### logistic Regression Classifier #######
def Logistic(X_train, X_test, y_train, y_test):
    logistic_ = LogisticRegression()
    logistic_.fit(X_train, y_train)
    predicted_ = logistic_.predict(X_test)
    return accuracy_score(predicted_, y_test), precision_score(predicted_, y_test, average = 'macro'), recall_score(predicted_, y_test, average = 'macro'), f1_score(predicted_, y_test, average = 'macro')

###### SVC Classifier #######
def SVC(X_train, X_test, y_train, y_test):
    svc_ = SVC()
    svc_.fit(X_train, y_train)
    predicted_ = svc_.predict(X_test)
    return accuracy_score(predicted_, y_test), precision_score(predicted_, y_test, average = 'macro'), recall_score(predicted_, y_test, average = 'macro'), f1_score(predicted_, y_test, average = 'macro')


###### SGD Classifier #######
def SGD(X_train, X_test, y_train, y_test):
    sgd_= SGDClassifier()
    sgd_.fit(X_train, y_train)
    predicted_ = sgd_.predict(X_test)
    return accuracy_score(predicted_, y_test), precision_score(predicted_, y_test, average = 'macro'), recall_score(predicted_, y_test, average = 'macro'), f1_score(predicted_, y_test, average = 'macro')



In [19]:
###### printing all accuracy ########
print("Decision Tree  :\n", DecisionTree(X_train, X_test, y_train, y_test))
print("The Extra Tree :\n", ExtraTree(X_train, X_test, y_train, y_test))
print("Adaboosting    :\n", Adaboost(X_train, X_test, y_train, y_test))
print("XgBoosting     :\n", xgboost(X_train, X_test, y_train, y_test))
print("Logistic Regre :\n", Logistic(X_train, X_test, y_train, y_test))
print("KNN Classifier :\n", KNN(X_train, X_test, y_train, y_test))
print("SGD Classifier :\n", SGD(X_train, X_test, y_train, y_test))

Decision Tree  :
 (0.6571428571428571, 0.5641414141414142, 0.5795118047218888, 0.5698494377739661)
The Extra Tree :
 (0.6, 0.4212121212121212, 0.4485616010006254, 0.43445116364302727)
Adaboosting    :
 (0.6952380952380952, 0.48484848484848486, 0.48125172318720705, 0.48130985480383076)
XgBoosting     :
 (0.7619047619047619, 0.5828282828282828, 0.6332926332926333, 0.596638909687715)
Logistic Regre :
 (0.8, 0.5621212121212121, 0.5412186379928315, 0.5505201086596435)
KNN Classifier :
 (0.7714285714285715, 0.5363636363636363, 0.5335542667771334, 0.5262515262515263)
SGD Classifier :
 (0.5619047619047619, 0.40151515151515155, 0.38095238095238093, 0.3875080906148867)


In [20]:
###### It's an imbalanced dataset #######
y.value_counts()

0    232
1    161
2     25
Name: Status, dtype: int64

In [21]:
###### import the dataset again #######
df = pd.read_csv('cirrhosis_new_.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,N_Days,Status,Age,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,...,Drug_missing,Sex_M,Ascites_Y,Ascites_missing,Hepatomegaly_Y,Hepatomegaly_missing,Spiders_Y,Spiders_missing,Edema_S,Edema_Y
0,0,400,1,21464,14.5,261.0,2.6,156.0,1718.0,137.95,...,0,0,1,0,1,0,1,0,0,1
1,1,4500,0,20617,1.1,302.0,4.14,54.0,7394.8,113.52,...,0,0,0,0,1,0,1,0,0,0
2,2,1012,1,25594,1.4,176.0,3.48,210.0,516.0,96.1,...,0,1,0,0,0,0,0,0,1,0
3,3,1925,1,19994,1.8,244.0,2.54,64.0,6121.8,60.63,...,0,0,0,0,1,0,1,0,1,0
4,4,1504,2,13918,3.4,279.0,3.53,143.0,671.0,113.15,...,0,0,0,0,1,0,1,0,0,0


In [22]:
####### Split the dataset into X and y ########
y = df.loc[:, 'Status']
df.drop(['Status'], axis = 1, inplace = True)
X = df.iloc[:, :].values

In [23]:
###### Scaling the dataset with MinMax #######
minmax_scaler_ = StandardScaler()
X = minmax_scaler_.fit_transform(X)

In [24]:
####### Convert imbalanced dataset to balanced ########
adasyn_ = ADASYN(random_state = 42, n_neighbors = 3)
X, y = adasyn_.fit_resample(X, y)
X.shape, y.shape

((691, 31), (691,))

In [25]:
###### train and test split #######
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 42)

In [26]:
###### printing all accuracy ########
print("Decision Tree  :\n", DecisionTree(X_train, X_test, y_train, y_test))
print("The Extra Tree :\n", ExtraTree(X_train, X_test, y_train, y_test))
print("Adaboosting    :\n", Adaboost(X_train, X_test, y_train, y_test))
print("XgBoosting     :\n", xgboost(X_train, X_test, y_train, y_test))
print("Random Forest  :\n", RandomForest(X_train, X_test, y_train, y_test))
print("Logistic Regre :\n", Logistic(X_train, X_test, y_train, y_test))
print("KNN Classifier :\n", KNN(X_train, X_test, y_train, y_test))
print("SGD Classifier :\n", SGD(X_train, X_test, y_train, y_test))

####### Here, Xgboost working well ########

Decision Tree  :
 (0.7788461538461539, 0.7737406190219257, 0.7751336898395721, 0.7729385160520647)
The Extra Tree :
 (0.625, 0.6130295449714688, 0.6093002881275411, 0.6106693657758889)
Adaboosting    :
 (0.7740384615384616, 0.7683491113618155, 0.7702986581722412, 0.7686447803319131)
XgBoosting     :
 (0.8894230769230769, 0.8832017135102435, 0.887964212525616, 0.8850250626566417)
Random Forest  :
 (0.8942307692307693, 0.8898358431026309, 0.8893219089736503, 0.8884517038095332)
Logistic Regre :
 (0.7836538461538461, 0.7877732541979366, 0.7860727228057245, 0.7859686814186643)
KNN Classifier :
 (0.7451923076923077, 0.7364088226157192, 0.7510567333729609, 0.7274984481688392)
SGD Classifier :
 (0.6971153846153846, 0.7052737855823156, 0.7026697912385206, 0.6973952970809728)


In [27]:
###### Stratified with 10 Fold #######
Kfold_ = StratifiedKFold(n_splits = 10, random_state = 42, shuffle = True)
count_, precision_score_, recall_score_, f1_score_, accuracy_score_ = 1, [], [], [], []
###### run a loop ######
for train_index, test_index in Kfold_.split(X, y):
    print("Cross Validation # ", count_)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    ###### Classifier called Xgboost #######
    xgboost_ = XGBClassifier()
    xgboost_.fit(X_train, y_train)
    predicted_ = xgboost_.predict(X_test)
    accuracy_score_.append(accuracy_score(predicted_, y_test))
    precision_score_.append(precision_score(predicted_, y_test, average = 'macro'))
    recall_score_.append(recall_score(predicted_, y_test, average = 'macro'))
    f1_score_.append(f1_score(predicted_, y_test, average = 'macro'))
    count_ = count_ + 1
    
print("\n")
print("accuracy is  = ", np.array(accuracy_score_).mean())
print("recall   is  = ", np.array(recall_score_).mean())
print("precision is = ", np.array(precision_score_).mean())
print("f1_score is  = ", np.array(f1_score_).mean())

Cross Validation #  1
Cross Validation #  2
Cross Validation #  3
Cross Validation #  4
Cross Validation #  5
Cross Validation #  6
Cross Validation #  7
Cross Validation #  8
Cross Validation #  9
Cross Validation #  10


accuracy is  =  0.9001449275362319
recall   is  =  0.902305443567648
precision is =  0.8981145617667357
f1_score is  =  0.8972987315184066


In [28]:
###### Stratified with 10 Fold #######
Kfold_ = StratifiedKFold(n_splits = 10, random_state = 42, shuffle = True)
count_, precision_score_, recall_score_, f1_score_, accuracy_score_ = 1, [], [], [], []
###### run a loop ######
for train_index, test_index in Kfold_.split(X, y):
    print("Cross Validation # ", count_)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    ###### Classifier called Xgboost #######
    random_forest_ = RandomForestClassifier(n_estimators=500)
    random_forest_.fit(X_train, y_train)
    predicted_ = random_forest_.predict(X_test)
    accuracy_score_.append(accuracy_score(predicted_, y_test))
    precision_score_.append(precision_score(predicted_, y_test, average = 'macro'))
    recall_score_.append(recall_score(predicted_, y_test, average = 'macro'))
    f1_score_.append(f1_score(predicted_, y_test, average = 'macro'))
    count_ = count_ + 1
    
print("\n")
print("accuracy is  = ", np.array(accuracy_score_).mean())
print("recall   is  = ", np.array(recall_score_).mean())
print("precision is = ", np.array(precision_score_).mean())
print("f1_score is  = ", np.array(f1_score_).mean())

Cross Validation #  1
Cross Validation #  2
Cross Validation #  3
Cross Validation #  4
Cross Validation #  5
Cross Validation #  6
Cross Validation #  7
Cross Validation #  8
Cross Validation #  9
Cross Validation #  10


accuracy is  =  0.9001242236024843
recall   is  =  0.9021940498765776
precision is =  0.8984824957651044
f1_score is  =  0.8972592735062139


In [29]:
###### Stacking and Blendong ######

estimators_ = [
    ('random_forest', RandomForestClassifier()),
    ('XgBoost_Classif', XGBClassifier())
]
stacking_classifier_ = StackingClassifier(estimators = estimators_, final_estimator = XGBClassifier(), n_jobs = -1, cv = 10)
stacking_classifier_.fit(X_train, y_train)

In [30]:
predicted_ = stacking_classifier_.predict(X_test)
print("accuracy is  = ", accuracy_score(predicted_, y_test))
print("recall   is  = ", recall_score(predicted_, y_test, average = 'macro'))
print("precision is = ", precision_score(predicted_, y_test, average = 'macro'))
print("f1_score is  = ", f1_score(predicted_, y_test, average = 'macro'))

accuracy is  =  0.8695652173913043
recall   is  =  0.8652035886818495
precision is =  0.8670634920634921
f1_score is  =  0.8658754281854586


In [31]:
####### Voting Ensemble #########
voting_ensemble_ = VotingClassifier(estimators = estimators_)
voting_ensemble_.fit(X_train, y_train)

In [32]:
predicted_ = voting_ensemble_.predict(X_test)
print("accuracy is  = ", accuracy_score(predicted_, y_test))
print("recall   is  = ", recall_score(predicted_, y_test, average = 'macro'))
print("precision is = ", precision_score(predicted_, y_test, average = 'macro'))
print("f1_score is  = ", f1_score(predicted_, y_test, average = 'macro'))

accuracy is  =  0.9420289855072463
recall   is  =  0.9404761904761904
precision is =  0.9404761904761904
f1_score is  =  0.9404761904761904


In [33]:
###### Stratified with 10 Fold #######
Kfold_ = StratifiedKFold(n_splits = 10, random_state = 42, shuffle = True)
count_, precision_score_, recall_score_, f1_score_, accuracy_score_ = 1, [], [], [], []
estimators_ = [
    ('random_forest', RandomForestClassifier()),
    ('XgBoost_Classif', XGBClassifier()),
    ('Decision Tree', DecisionTreeClassifier())
]
###### run a loop ######
for train_index, test_index in Kfold_.split(X, y):
    print("Cross Validation # ", count_)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    ###### Classifier called Xgboost #######
    voting_ensemble_ = VotingClassifier(estimators = estimators_)
    voting_ensemble_.fit(X_train, y_train)
    predicted_ = voting_ensemble_.predict(X_test)
    accuracy_score_.append(accuracy_score(predicted_, y_test))
    precision_score_.append(precision_score(predicted_, y_test, average = 'macro'))
    recall_score_.append(recall_score(predicted_, y_test, average = 'macro'))
    f1_score_.append(f1_score(predicted_, y_test, average = 'macro'))
    count_ = count_ + 1
    
print("\n")
print("accuracy is  = ", np.array(accuracy_score_).mean())
print("recall   is  = ", np.array(recall_score_).mean())
print("precision is = ", np.array(precision_score_).mean())
print("f1_score is  = ", np.array(f1_score_).mean())


Cross Validation #  1
Cross Validation #  2
Cross Validation #  3
Cross Validation #  4
Cross Validation #  5
Cross Validation #  6
Cross Validation #  7
Cross Validation #  8
Cross Validation #  9
Cross Validation #  10


accuracy is  =  0.9087991718426502
recall   is  =  0.9098469030699254
precision is =  0.9065561202082941
f1_score is  =  0.9058268627647225


In [34]:
###### Tuning the XgBOOST ######

##### parameters for XGBOOST #####
# param_grid_ = {
#     "learning_rate"     : [0.01, 0.1],
#      "max_depth"        : [6, 10, 20, 50, None],
#      "min_child_weight" : [3, 5, 7, None ],
#      "gamma"            : [0.1, 0.2 , 0.3],
#      "colsample_bytree" : [0.3, 0.4, 0.5 , 0.7]}

# ##### fit into grid #####
# grid_search_cv_ = GridSearchCV(
#     estimator = XGBClassifier(),
#     param_grid = param_grid_,
#     scoring = 'accuracy',
#     n_jobs = -1,
#     cv = 3,
#     verbose = 2)

# ##### call the grid #####
# grid_search_cv_.fit(X_train, y_train)

In [35]:
# grid_search_cv_.best_score_

In [36]:
##### PCA Test #####
pca_ = PCA(n_components = 8)
X = pca_.fit_transform(X)

In [37]:
###### train and test split #######
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

In [38]:
###### Stacking and Blendong ######

estimators_ = [
    ('random_forest', RandomForestClassifier()),
    ('XgBoost_Classif', XGBClassifier())
]
stacking_classifier_ = StackingClassifier(estimators = estimators_, final_estimator = XGBClassifier(), n_jobs = -1, cv = 10)
stacking_classifier_.fit(X_train, y_train)

In [39]:
predicted_ = stacking_classifier_.predict(X_test)
print("accuracy is  = ", accuracy_score(predicted_, y_test))
print("recall   is  = ", recall_score(predicted_, y_test, average = 'macro'))
print("precision is = ", precision_score(predicted_, y_test, average = 'macro'))
print("f1_score is  = ", f1_score(predicted_, y_test, average = 'macro'))

accuracy is  =  0.8092485549132948
recall   is  =  0.8016483516483516
precision is =  0.7930097573627396
f1_score is  =  0.7947309790410566


In [40]:
####### Voting Ensemble #########
voting_ensemble_ = VotingClassifier(estimators = estimators_)
voting_ensemble_.fit(X_train, y_train)

In [41]:
predicted_ = voting_ensemble_.predict(X_test)
print("accuracy is  = ", accuracy_score(predicted_, y_test))
print("recall   is  = ", recall_score(predicted_, y_test, average = 'macro'))
print("precision is = ", precision_score(predicted_, y_test, average = 'macro'))
print("f1_score is  = ", f1_score(predicted_, y_test, average = 'macro'))

accuracy is  =  0.8497109826589595
recall   is  =  0.8507186290768379
precision is =  0.8325525654440896
f1_score is  =  0.8361487617135207
