In [66]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix

In [67]:
# Load the maternal health risk records from the project-relative CSV file.
df = pd.read_csv(filepath_or_buffer='data/maternalhealthrisk.csv')

In [68]:
# Preview the first five rows to check the columns and value formats.
df.head(n=5)

Unnamed: 0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate,RiskLevel
0,25,130,80,15.0,98.0,86,high risk
1,35,140,90,13.0,98.0,70,high risk
2,29,90,70,8.0,100.0,80,high risk
3,30,140,85,7.0,98.0,70,high risk
4,35,120,60,6.1,98.0,76,low risk


### Converting column names to lowercase

In [69]:
# Normalise every column label to lowercase so later cells can refer to
# columns without worrying about the CSV's original capitalisation.
df.columns = df.columns.map(str.lower)

In [70]:
# Show the frame to confirm the column names are now lowercase.
df

Unnamed: 0,age,systolicbp,diastolicbp,bs,bodytemp,heartrate,risklevel
0,25,130,80,15.0,98.0,86,high risk
1,35,140,90,13.0,98.0,70,high risk
2,29,90,70,8.0,100.0,80,high risk
3,30,140,85,7.0,98.0,70,high risk
4,35,120,60,6.1,98.0,76,low risk
...,...,...,...,...,...,...,...
1009,22,120,60,15.0,98.0,80,high risk
1010,55,120,90,18.0,98.0,60,high risk
1011,35,85,60,19.0,98.0,86,high risk
1012,43,120,90,18.0,98.0,70,high risk


### No missing values in the dataset

In [71]:
# Count missing entries per column (isna is the canonical name for isnull).
df.isna().sum()

age            0
systolicbp     0
diastolicbp    0
bs             0
bodytemp       0
heartrate      0
risklevel      0
dtype: int64

### Separating dataset into 'X' and 'y'

In [72]:
# Target: the 'risklevel' label column. Features: every remaining column.
y = df['risklevel']
X = df.drop(columns='risklevel')

##  Implementation of kNN

In [73]:
# Hold out 30% of the rows for testing and train on the remaining 70%.
# The fixed random_state keeps the split reproducible across runs.
(X_train, X_test,
 y_train, y_test) = train_test_split(
    X, y,
    test_size=0.3,
    random_state=1,
)


In [74]:
# Baseline k-nearest-neighbours classifier fitted on the training split.
# n_neighbors=5 is scikit-learn's default, spelled out so the baseline k is visible.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

In [75]:
# Predict labels for the held-out rows; the predictions are reused by the
# confusion-matrix cell below.
y_predict = knn.predict(X_test)

# Mean accuracy on each split; a large train/test gap points to overfitting.
baseline_train_accuracy = knn.score(X_train, y_train)
baseline_test_accuracy = knn.score(X_test, y_test)
print("Training Accuracy", baseline_train_accuracy)
print("Test Accuracy", baseline_test_accuracy)

Training Accuracy 0.8039492242595204
Test Accuracy 0.6950819672131148


In [76]:
# Confusion matrix with a fixed label order; in scikit-learn's convention
# rows are the true classes and columns are the predicted classes.
risk_labels = ['low risk', 'mid risk', 'high risk']
cm = confusion_matrix(y_test, y_predict, labels=risk_labels)

print("CM", cm)
print()

# Correct predictions per true class divided by that class's row total,
# i.e. the per-class recall, in the same order as risk_labels.
class_accuracy = np.diag(cm) / cm.sum(axis=1)
print(class_accuracy)

CM [[88 23  6]
 [37 57  7]
 [ 9 11 67]]

[0.75213675 0.56435644 0.77011494]


# Cross validation section

### Finding the best 'k' value to minimize overfitting

In [77]:
# Model selection: estimate accuracy for k = 1..19 with 5-fold
# cross-validation on the training split only, so the test split stays unseen.
#
# Each candidate is bound to its own name (`candidate`) rather than `knn`, so
# the fitted baseline model is not replaced by an unfitted estimator; that
# would break re-running the evaluation cells above.
#
# NOTE(review): the features are unscaled here, while the final model below is
# trained on MinMax-scaled features. Consider cross-validating a scaler + kNN
# pipeline so k is selected under the same preprocessing.
cv_scores = {}  # k -> mean cross-validated accuracy, kept for later inspection
for k in range(1, 20):
    candidate = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(candidate, X_train, y_train, cv=5)
    cv_scores[k] = scores.mean()
    print("n: ", k, "accuracy", cv_scores[k])

n:  1 accuracy 0.7841574268304864
n:  2 accuracy 0.7291379482569174
n:  3 accuracy 0.703785835580861
n:  4 accuracy 0.6840775147337927
n:  5 accuracy 0.6756068324842673
n:  6 accuracy 0.6685745679752273
n:  7 accuracy 0.647447807411847
n:  8 accuracy 0.6347617620617322
n:  9 accuracy 0.6262810908001198
n:  10 accuracy 0.6192288482669064
n:  11 accuracy 0.6234542003795824
n:  12 accuracy 0.6417540705224254
n:  13 accuracy 0.6403955648786335
n:  14 accuracy 0.6431924882629108
n:  15 accuracy 0.6502347417840375
n:  16 accuracy 0.6488362800918989
n:  17 accuracy 0.6502347417840375
n:  18 accuracy 0.6375187293976625
n:  19 accuracy 0.6318549595445011


# Train and Test again after cross-validation

In [78]:
#### Transforming data
# Rescale every feature to [0, 1]. The scaler learns its min/max from the
# training split only and then applies that same mapping to the test split,
# so no test-set statistics leak into training.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [79]:
# Refit kNN with k=8 on the current training split and score it on the
# held-out test split.
# NOTE(review): k=8 is not the best-scoring k in the cross-validation table
# above (k=1 scores highest there) — confirm the criterion used to pick it.
knn = KNeighborsClassifier(n_neighbors=8).fit(X_train, y_train)
k8_test_accuracy = knn.score(X_test, y_test)
print("Test Accuracy", k8_test_accuracy)

Test Accuracy 0.6819672131147541


### Test accuracy did not improve with 'k' fixed at 8 (0.682 vs. 0.695 for the earlier default-k model)