## Data Preprocessing and Classification on the ILPD (Indian Liver Patient Dataset)

#### Dataset link : https://archive.ics.uci.edu/dataset/225/ilpd+indian+liver+patient+dataset

### Importing the libraries

In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf

### Reading the dataset

In [4]:
df=pd.read_csv('dataset.csv',header=None)
df.columns=['Age','Gender','TB','DB','Alkphos','Sgpt','Sgot','TP','ALB','A/G Ratio','Target']
df.head()

Unnamed: 0,Age,Gender,TB,DB,Alkphos,Sgpt,Sgot,TP,ALB,A/G Ratio,Target
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


##### Target : 1->diagnosed with liver disease , 2->not diagnosed

## Exploratory Data Analysis

In [5]:
df.describe()

Unnamed: 0,Age,TB,DB,Alkphos,Sgpt,Sgot,TP,ALB,A/G Ratio,Target
count,583.0,583.0,583.0,583.0,583.0,583.0,583.0,583.0,579.0,583.0
mean,44.746141,3.298799,1.486106,290.576329,80.713551,109.910806,6.48319,3.141852,0.947064,1.286449
std,16.189833,6.209522,2.808498,242.937989,182.620356,288.918529,1.085451,0.795519,0.319592,0.45249
min,4.0,0.4,0.1,63.0,10.0,10.0,2.7,0.9,0.3,1.0
25%,33.0,0.8,0.2,175.5,23.0,25.0,5.8,2.6,0.7,1.0
50%,45.0,1.0,0.3,208.0,35.0,42.0,6.6,3.1,0.93,1.0
75%,58.0,2.6,1.3,298.0,60.5,87.0,7.2,3.8,1.1,2.0
max,90.0,75.0,19.7,2110.0,2000.0,4929.0,9.6,5.5,2.8,2.0


In [6]:
df.info() #checking all the attributes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583 entries, 0 to 582
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Age        583 non-null    int64  
 1   Gender     583 non-null    object 
 2   TB         583 non-null    float64
 3   DB         583 non-null    float64
 4   Alkphos    583 non-null    int64  
 5   Sgpt       583 non-null    int64  
 6   Sgot       583 non-null    int64  
 7   TP         583 non-null    float64
 8   ALB        583 non-null    float64
 9   A/G Ratio  579 non-null    float64
 10  Target     583 non-null    int64  
dtypes: float64(5), int64(5), object(1)
memory usage: 50.2+ KB


### Checking for Null values : 

In [7]:
df.isnull().sum()

Age          0
Gender       0
TB           0
DB           0
Alkphos      0
Sgpt         0
Sgot         0
TP           0
ALB          0
A/G Ratio    4
Target       0
dtype: int64

#### Clearly the A/G Ratio Column has 4 NULL Values. Hence we need to drop these rows.

In [8]:
df=df.dropna()

In [9]:
df.info()  #re-checking for missing values

<class 'pandas.core.frame.DataFrame'>
Int64Index: 579 entries, 0 to 582
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Age        579 non-null    int64  
 1   Gender     579 non-null    object 
 2   TB         579 non-null    float64
 3   DB         579 non-null    float64
 4   Alkphos    579 non-null    int64  
 5   Sgpt       579 non-null    int64  
 6   Sgot       579 non-null    int64  
 7   TP         579 non-null    float64
 8   ALB        579 non-null    float64
 9   A/G Ratio  579 non-null    float64
 10  Target     579 non-null    int64  
dtypes: float64(5), int64(5), object(1)
memory usage: 54.3+ KB


### Encoding Categorical Variables

In [10]:
df['Gender']=df['Gender'].map({"Female":1,"Male":0})  #encoding categorical variable gender as female:1 male:0
df.head()

Unnamed: 0,Age,Gender,TB,DB,Alkphos,Sgpt,Sgot,TP,ALB,A/G Ratio,Target
0,65,1,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,0,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,0,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,0,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,0,3.9,2.0,195,27,59,7.3,2.4,0.4,1


## Balancing the data

In [11]:
df['Target'].value_counts()

1    414
2    165
Name: Target, dtype: int64

### The Data is imbalanced. 
##### Now we have 2 options. 
##### 1) Oversample the data using SMOTE
##### 2) Undersample the majority data
#### Since it is a medical dataset, I choose undersampling.

### So now undersampling the data to reduce the number of 1's to 190

In [12]:
ones_indices = df[df.iloc[:, -1] == 1].index[:190]  # Get the first 190 indices for 1's
zeros_indices = df[df.iloc[:, -1] == 2].index       # Get all indices for 2's

balanced_indices = ones_indices.union(zeros_indices)
df_balanced = df.loc[balanced_indices].reset_index(drop=True)

num_ones = df_balanced[df_balanced.iloc[:, -1] == 1].shape[0]
num_twos = df_balanced[df_balanced.iloc[:, -1] == 2].shape[0]

print("Number of 1's:", num_ones)  # Should be 190
print("Number of 2's:", num_twos)  # Should be 165

Number of 1's: 190
Number of 2's: 165


In [13]:
df=df_balanced
len(df)

355

In [14]:
df['Target']=df['Target'].map({1:1,2:0})  #Mapping all 2's to 0's for the classification.

##### Target : 1->diagnosed with liver disease , 0->not diagnosed

### Declaring the inputs and targets

In [15]:
inputs=df.iloc[:,0:-1]
targets=df.iloc[:,-1]

inputs.head()

Unnamed: 0,Age,Gender,TB,DB,Alkphos,Sgpt,Sgot,TP,ALB,A/G Ratio
0,65,1,0.7,0.1,187,16,18,6.8,3.3,0.9
1,62,0,10.9,5.5,699,64,100,7.5,3.2,0.74
2,62,0,7.3,4.1,490,60,68,7.0,3.3,0.89
3,58,0,1.0,0.4,182,14,20,6.8,3.4,1.0
4,72,0,3.9,2.0,195,27,59,7.3,2.4,0.4


### Scaling the inputs

In [16]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

inputs = sc.fit_transform(inputs)

inputs

array([[ 1.22528982,  1.79615993, -0.34518268, ...,  0.42104908,
         0.21345894, -0.21644398],
       [ 1.04470116, -0.5567433 ,  1.59055604, ...,  1.08876253,
         0.0871624 , -0.75424006],
       [ 1.04470116, -0.5567433 ,  0.90735414, ...,  0.61182436,
         0.21345894, -0.25005624],
       ...,
       [ 0.32234652,  1.79615993, -0.28824919, ...,  1.37492544,
         1.0975347 ,  0.11967856],
       [ 0.92430872, -0.5567433 , -0.38313834, ..., -0.43743964,
        -1.9335822 , -1.99789348],
       [-0.40000812, -0.5567433 , -0.28824919, ...,  0.89798726,
         1.60272085,  1.8002913 ]])

### Train test Split the data

In [17]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(inputs, targets, test_size = 0.2, random_state = 365)
y_train=y_train.to_numpy()  #Converting from pandas series to numpy array.
y_test=y_test.to_numpy()

# Classification

## Artificial Neural Network (ANN)

### Creating the Model

In [18]:
tf.random.set_seed(42)  #setting the weights

In [19]:
output_size = 1
hidden_layer_size = 15

model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'), 
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'), 
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'), 
    tf.keras.layers.Dense(output_size, activation='sigmoid') 
])

In [20]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [21]:
model.fit(x_train, y_train,epochs=90)

Train on 284 samples
Epoch 1/90
Epoch 2/90
Epoch 3/90
Epoch 4/90
Epoch 5/90
Epoch 6/90
Epoch 7/90
Epoch 8/90
Epoch 9/90
Epoch 10/90
Epoch 11/90
Epoch 12/90
Epoch 13/90
Epoch 14/90
Epoch 15/90
Epoch 16/90
Epoch 17/90
Epoch 18/90
Epoch 19/90
Epoch 20/90
Epoch 21/90
Epoch 22/90
Epoch 23/90
Epoch 24/90
Epoch 25/90
Epoch 26/90
Epoch 27/90
Epoch 28/90
Epoch 29/90
Epoch 30/90
Epoch 31/90
Epoch 32/90
Epoch 33/90
Epoch 34/90
Epoch 35/90
Epoch 36/90
Epoch 37/90
Epoch 38/90
Epoch 39/90
Epoch 40/90
Epoch 41/90
Epoch 42/90
Epoch 43/90
Epoch 44/90
Epoch 45/90
Epoch 46/90
Epoch 47/90
Epoch 48/90
Epoch 49/90
Epoch 50/90
Epoch 51/90
Epoch 52/90
Epoch 53/90
Epoch 54/90
Epoch 55/90
Epoch 56/90
Epoch 57/90
Epoch 58/90
Epoch 59/90
Epoch 60/90
Epoch 61/90
Epoch 62/90
Epoch 63/90
Epoch 64/90
Epoch 65/90
Epoch 66/90
Epoch 67/90
Epoch 68/90
Epoch 69/90
Epoch 70/90
Epoch 71/90
Epoch 72/90
Epoch 73/90
Epoch 74/90
Epoch 75/90
Epoch 76/90
Epoch 77/90
Epoch 78/90
Epoch 79/90
Epoch 80/90
Epoch 81/90
Epoch 82/90
Epoc

<tensorflow.python.keras.callbacks.History at 0x1be5a151ec8>

#### Testing the Model

In [22]:
test_loss, test_accuracy = model.evaluate(x_test, y_test)
print('\nTest loss: {0:.2f}. Test accuracy: {1:.2f}%'.format(test_loss, test_accuracy*100.))


Test loss: 0.63. Test accuracy: 76.06%


In [23]:
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = model.predict(x_test)
y_pred = (y_pred > 0.5)
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[26 10]
 [ 7 28]]


## Logistic Regression

In [24]:
from sklearn.linear_model import LogisticRegression

In [25]:
logreg = LogisticRegression()
logreg.fit(x_train,y_train)
print("Logistic Regression Classifier on scaled test data:")
print("Accuracy:", logreg.score(x_test, y_test))

Logistic Regression Classifier on scaled test data:
Accuracy: 0.7746478873239436


## SVM

In [26]:
from sklearn.svm import SVC

In [27]:
svc_clf = SVC(kernel = 'rbf')
svc_clf.fit(x_train,y_train)
print("SVM Classifier on scaled test data:")
print("Accuracy:", svc_clf.score(x_test, y_test))

SVM Classifier on scaled test data:
Accuracy: 0.7746478873239436


## Random Forest

In [28]:
from sklearn.ensemble import RandomForestClassifier
clf3=RandomForestClassifier()
clf3.fit(x_train,y_train)
print('Accuracy:',accuracy_score(clf3.predict(x_test),y_test))

Accuracy: 0.7323943661971831


#### The accuracy in all the 4 models is about 73-79%.

### We will use the ANN.

### Saving the model.

In [29]:
model.save('model.h5') #Saving the model

In [30]:
import pickle  #saving the StandardScaler

with open('scaler.pkl', 'wb') as f:
    pickle.dump(sc, f)