## Dataset 2: Indian Liver Patient Records: KNN


In [1]:
import numpy as np
import pandas as pd

In [4]:
df = pd.read_csv("C:\\Users\\Pushkar\\OneDrive - The University of Kansas\\ML\\H1\\indian-liver-patient-records\\indian_liver_patient.csv")

In [5]:
df.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


### Data Analysis


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583 entries, 0 to 582
Data columns (total 11 columns):
Age                           583 non-null int64
Gender                        583 non-null object
Total_Bilirubin               583 non-null float64
Direct_Bilirubin              583 non-null float64
Alkaline_Phosphotase          583 non-null int64
Alamine_Aminotransferase      583 non-null int64
Aspartate_Aminotransferase    583 non-null int64
Total_Protiens                583 non-null float64
Albumin                       583 non-null float64
Albumin_and_Globulin_Ratio    579 non-null float64
Dataset                       583 non-null int64
dtypes: float64(5), int64(5), object(1)
memory usage: 50.2+ KB


In [7]:
df.isnull().sum()

Age                           0
Gender                        0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    4
Dataset                       0
dtype: int64

#### so from above we can se that the column named 'Albumin_and_Globulin_Ratio' has 4 NAN values, i.e. we need to handle the missing values

In [9]:
# printing the missing value records.
df.loc[df['Albumin_and_Globulin_Ratio'].isnull()]

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
209,45,Female,0.9,0.3,189,23,33,6.6,3.9,,1
241,51,Male,0.8,0.2,230,24,46,6.5,3.1,,1
253,35,Female,0.6,0.2,180,12,15,5.2,2.7,,2
312,27,Male,1.3,0.6,106,25,54,8.5,4.8,,2


In [10]:
# replacing the NAN values with the mean of the column
df["Albumin_and_Globulin_Ratio"].fillna(df["Albumin_and_Globulin_Ratio"].mean(), inplace=True)


In [11]:
df.isnull().sum()

Age                           0
Gender                        0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    0
Dataset                       0
dtype: int64

#### Checking the datatypes of the column


In [13]:
print(df.dtypes)

Age                             int64
Gender                         object
Total_Bilirubin               float64
Direct_Bilirubin              float64
Alkaline_Phosphotase            int64
Alamine_Aminotransferase        int64
Aspartate_Aminotransferase      int64
Total_Protiens                float64
Albumin                       float64
Albumin_and_Globulin_Ratio    float64
Dataset                         int64
dtype: object


#### so here we can see that the datatype for Gender column is object, so will encode the values of Gender column and will convert it to int type

In [14]:
# converting the datatype of Gender column to int and encoding Male as 1 and Female as 2.
df.Gender[df.Gender == 'Male'] = 1
df.Gender[df.Gender == 'Female'] = 2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [15]:
df.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,2,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,1,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,1,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,1,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,1,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [16]:
# converting the Gender dataype from object to int
df["Gender"]=df["Gender"].astype(int)

In [17]:
print(df.dtypes)

Age                             int64
Gender                          int32
Total_Bilirubin               float64
Direct_Bilirubin              float64
Alkaline_Phosphotase            int64
Alamine_Aminotransferase        int64
Aspartate_Aminotransferase      int64
Total_Protiens                float64
Albumin                       float64
Albumin_and_Globulin_Ratio    float64
Dataset                         int64
dtype: object


#### Now, we have our dataset ready for KNN algoirthm.
#### Implementing KNN Alogorithm code start

In [19]:
import math
import operator

#### Function Name: functionToCalEuclideanDistance 
#### Purpose: To find the euclidean distance between the the two records/data (i.e. between each training data and test data).
#### Input : Training Dataset, Testing Dataset and number of columns of testing dataset
#### Returns: The euclidean distance between each training dataset and testing dataset

In [24]:
def functionToCalEuclideanDistance(dataPointTrain, dataPointTest, sizeOfData):
     varEuclideanDist = 0
     for col in range(sizeOfData):
        varEuclideanDist += pow((dataPointTest[col] - dataPointTrain[col]), 2)
        ## so here varEuclideanDist stores the summation of --> square{(x1-x2)}
     return np.sqrt(varEuclideanDist)

#### Function Name: functionForKNNAlgo 
#### Purpose: Function to implement KNN algorithm and prints the k nearest neighbors and predicted class of the test data.
#### Input : Value of K, Training Dataset & Testing Dataset
#### Output: Prints the k nearest neighbors and predicted class of the test data.

In [106]:
# Function for the KNN algorithm

def functionForKNNAlgo(k, trainingDataSet, testDataSet):
    
    listDistances = {}  # list created to store the distances of each of the test and the training dataset
    varNumberOfColumnsTestDS = testDataSet.shape[1]  # to store the number of columns of the test dataset
    
    for tempCol in range(len(trainingDataSet)):
        localDistance = functionToCalEuclideanDistance(trainingDataSet.iloc[tempCol],testDataSet,  varNumberOfColumnsTestDS)
        listDistances[tempCol] = localDistance[0] # storing for each record the euclidean distance calculated
        
    # print(listDistances)
    # Sorting the list listDistances 
    listSortedDistance = sorted(listDistances.items(), key=operator.itemgetter(1))
    #print(listSortedDistance)
    listOfNeighbors = []
    for temp in range(k):
        listOfNeighbors.append(listSortedDistance[temp][0])
   
    print("The value of K-->", k," and the nearest neighbours are---> ", listOfNeighbors)
    
    classificationOutput= functionToFindTheClass(listOfNeighbors, trainingDataSet)
    
    print("The class or output is--->", classificationOutput)
    

#### Function to parse through all the neigbors and to find the class of all the neighbors and then sort in to find the one the class that has maximum occurances.

#### Function Name: functionToFindTheClass 
#### Input : listOfNeighbors and trainingDataSet
#### Returns: The first element of the sorted list i.e., the predicted class of the test dataset

In [111]:
def functionToFindTheClass(listOfNeighbors, trainingDataSet):
     listClass ={}
     for tempLengthOfNeighbors in range(len(listOfNeighbors)):
        tempClass = trainingDataSet.iloc[listOfNeighbors[tempLengthOfNeighbors]][-1]
 
        if tempClass in listClass:
            listClass[tempClass] += 1
        else:
            listClass[tempClass] = 1
     listClassSorted = sorted(listClass.items(), key=operator.itemgetter(1), reverse=True)
     return listClassSorted[0][0]

In [115]:
testDataSet = [[62,1,10.9,5.5,699,64,100,7.5,3.2,0.74]]

In [116]:
testDF = pd.DataFrame(testDataSet)

In [118]:
# taking the value of k as 3
k = 3 
# Passing the values to the KNN function
functionForKNNAlgo(k,df, testDF )


The value of K--> 3  and the nearest neighbours are--->  [1, 196, 79]
The class or output is---> 1.0


In [119]:
# taking the value of k as 5
k = 5 
# Passing the values to the KNN function
functionForKNNAlgo(k,df, testDF )


The value of K--> 5  and the nearest neighbours are--->  [1, 196, 79, 341, 415]
The class or output is---> 1.0
