In [1]:
# import the libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report ,confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

#Import the datasets
liverCSV = pd.read_csv('E:\\Thesis\\Indian_Liver_Patient.csv')
# Handling the missing values
df = pd.DataFrame(liverCSV)
dataset = df.fillna(method='ffill', axis = 0)

# process  of checking the missing columns values
dataset.columns[dataset.isnull().any()]


# start oversampling process

target = 'Liver_patients'

# dataset counts 1 or 0 
print(dataset[target].value_counts())

# minority class length i.e 1
minority_class_len = len(dataset[dataset[target] == 0])
print(minority_class_len)

# display the indices of majority class i.e 0
majority_class_indices = dataset[dataset[target] == 1].index
print(majority_class_indices)

# randomly picking up the majority class indicies i.e. 0 
random_majority_indices = np.random.choice(majority_class_indices, minority_class_len, replace=False)
print(len(random_majority_indices))

# displaing the minority class indices i.e. 1
minority_class_indices = dataset[dataset[target] == 0].index
print(minority_class_indices)

# concatenate the minority indices and random majority incices
under_sample_indices = np.concatenate([minority_class_indices, random_majority_indices])

# locate the under_sample_indices to datasets
under_sample = dataset.loc[under_sample_indices]

# Bar Garph
sns.countplot(x=target, data=under_sample)

# displaing the values of 0 and 1
print(under_sample[target].value_counts())

# end oversampling process


tData = np.asarray(under_sample.drop('Liver_patients', 1))
tTarget = np.asarray(under_sample['Liver_patients'])

# Normalize Data
means = np.mean(tData, axis=0)
stds = np.std(tData, axis=0)
tData = (tData - means)/stds

X_train, X_test, y_train, y_test = train_test_split(tData,tTarget,test_size=0.20)

#applying Kernel PCA
#from sklearn.decomposition import KernelPCA
#kpca = KernelPCA(n_components = 3, kernel = 'rbf')
#X_train = kpca.fit_transform(X_train)
#X_test = kpca.transform(X_test)

#create a random forest classifier

rf = RandomForestClassifier(n_estimators=20, random_state=0)
rf.fit(X_train, y_train)
rf_predicted = rf.predict(X_test)

accuracy = rf.score(X_test, y_test)

print("accuracy = ", accuracy * 100, "%")

random_forest_score      = round(rf.score(X_train, y_train) * 100, 2)
random_forest_score_test = round(rf.score(X_test, y_test) * 100, 2)

print('Random Forest Score: ', random_forest_score)
print('Random Forest Test Score: ', random_forest_score_test)
print('Accuracy: ', accuracy_score(y_test,rf_predicted))
print('\nClassification report: \n', classification_report(y_test,rf_predicted))

1    416
0    167
Name: Liver_patients, dtype: int64
167
Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   9,  10,
            ...
            571, 572, 573, 574, 575, 576, 577, 579, 580, 581],
           dtype='int64', length=416)
167
Int64Index([  8,  12,  15,  17,  24,  28,  29,  32,  33,  34,
            ...
            539, 540, 541, 542, 545, 551, 564, 566, 578, 582],
           dtype='int64', length=167)
1    167
0    167
Name: Liver_patients, dtype: int64
accuracy =  67.16417910447761 %
Random Forest Score:  99.25
Random Forest Test Score:  67.16
Accuracy:  0.6716417910447762

Classification report: 
               precision    recall  f1-score   support

           0       0.64      0.76      0.69        33
           1       0.71      0.59      0.65        34

   micro avg       0.67      0.67      0.67        67
   macro avg       0.68      0.67      0.67        67
weighted avg       0.68      0.67      0.67        67

