### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

In [2]:
data = pd.read_csv("../Maternal Health Risk Data Set.csv")
data

Unnamed: 0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate,RiskLevel
0,25,130,80,15.0,98.0,86,high risk
1,35,140,90,13.0,98.0,70,high risk
2,29,90,70,8.0,100.0,80,high risk
3,30,140,85,7.0,98.0,70,high risk
4,35,120,60,6.1,98.0,76,low risk
...,...,...,...,...,...,...,...
1009,22,120,60,15.0,98.0,80,high risk
1010,55,120,90,18.0,98.0,60,high risk
1011,35,85,60,19.0,98.0,86,high risk
1012,43,120,90,18.0,98.0,70,high risk


In [3]:
data.head()


Unnamed: 0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate,RiskLevel
0,25,130,80,15.0,98.0,86,high risk
1,35,140,90,13.0,98.0,70,high risk
2,29,90,70,8.0,100.0,80,high risk
3,30,140,85,7.0,98.0,70,high risk
4,35,120,60,6.1,98.0,76,low risk


In [4]:
data.describe()

Unnamed: 0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate
count,1014.0,1014.0,1014.0,1014.0,1014.0,1014.0
mean,29.871795,113.198225,76.460552,8.725986,98.665089,74.301775
std,13.474386,18.403913,13.885796,3.293532,1.371384,8.088702
min,10.0,70.0,49.0,6.0,98.0,7.0
25%,19.0,100.0,65.0,6.9,98.0,70.0
50%,26.0,120.0,80.0,7.5,98.0,76.0
75%,39.0,120.0,90.0,8.0,98.0,80.0
max,70.0,160.0,100.0,19.0,103.0,90.0


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1014 entries, 0 to 1013
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          1014 non-null   int64  
 1   SystolicBP   1014 non-null   int64  
 2   DiastolicBP  1014 non-null   int64  
 3   BS           1014 non-null   float64
 4   BodyTemp     1014 non-null   float64
 5   HeartRate    1014 non-null   int64  
 6   RiskLevel    1014 non-null   object 
dtypes: float64(2), int64(4), object(1)
memory usage: 55.6+ KB


In [6]:
data.isnull()

Unnamed: 0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate,RiskLevel
0,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...
1009,False,False,False,False,False,False,False
1010,False,False,False,False,False,False,False
1011,False,False,False,False,False,False,False
1012,False,False,False,False,False,False,False


### Data Processing
#### Converting categorical variables of the dataset into numerical variables - using ONE HOT ENCODING technique


In [7]:
data.RiskLevel.unique()

array(['high risk', 'low risk', 'mid risk'], dtype=object)

In [8]:
data['RiskLevel']= data['RiskLevel'].apply({'low risk':0, 'mid risk':1, 'high risk':2}.get)

In [9]:
data

Unnamed: 0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate,RiskLevel
0,25,130,80,15.0,98.0,86,2
1,35,140,90,13.0,98.0,70,2
2,29,90,70,8.0,100.0,80,2
3,30,140,85,7.0,98.0,70,2
4,35,120,60,6.1,98.0,76,0
...,...,...,...,...,...,...,...
1009,22,120,60,15.0,98.0,80,2
1010,55,120,90,18.0,98.0,60,2
1011,35,85,60,19.0,98.0,86,2
1012,43,120,90,18.0,98.0,70,2


### Dividing the dataset into dependent and independent columns¶

In [10]:
X= data.drop('RiskLevel',axis=1)
y=data['RiskLevel']
X

Unnamed: 0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate
0,25,130,80,15.0,98.0,86
1,35,140,90,13.0,98.0,70
2,29,90,70,8.0,100.0,80
3,30,140,85,7.0,98.0,70
4,35,120,60,6.1,98.0,76
...,...,...,...,...,...,...
1009,22,120,60,15.0,98.0,80
1010,55,120,90,18.0,98.0,60
1011,35,85,60,19.0,98.0,86
1012,43,120,90,18.0,98.0,70


In [11]:
y

0       2
1       2
2       2
3       2
4       0
       ..
1009    2
1010    2
1011    2
1012    2
1013    1
Name: RiskLevel, Length: 1014, dtype: int64

### Splitting the dataset into training and testing set
#### 20% of the dataset will be used for testing(evaluation) and 80% of the data will be used for training purposes

In [12]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

### Logistic Refression

In [13]:
logmodel=LogisticRegression()
logmodel.fit(X_train,y_train)
y_pred=logmodel.predict(X_test)
#calculating accuracy of the Classification model
log_cm=confusion_matrix(y_test,y_pred)
log_ac=accuracy_score(y_test,y_pred)
print(log_cm)
print(log_ac)

[[61  9  4]
 [39 26  6]
 [12  5 41]]
0.6305418719211823


### Decision Tree Algorithm

In [14]:
tree_classifier=DecisionTreeClassifier()
tree_classifier.fit(X_train,y_train)
predictions=tree_classifier.predict(X_test)
#calculating accuracy of the Classification model
dc_cm=confusion_matrix(y_test,predictions)
dc_ac=accuracy_score(y_test,predictions)
print(dc_cm)
print(dc_ac)

[[61 11  2]
 [ 5 63  3]
 [ 0  4 54]]
0.8768472906403941


### KNN Algorithm

In [15]:
knn=KNeighborsClassifier(n_neighbors=11)
knn.fit(X_train,y_train)
predictions=knn.predict(X_test)
#calculating accuracy of the Classification model
knn_cm=confusion_matrix(y_test,predictions)
knn_ac=accuracy_score(y_test,predictions)
print(knn_cm)
print(knn_ac)

[[52 17  5]
 [19 49  3]
 [ 8  9 41]]
0.6995073891625616


### Random Forest Algorithm

In [16]:
classifier=RandomForestClassifier()
classifier.fit(X_train,y_train)
y_pred=classifier.predict(X_test)
#calculating accuracy of the Classification model
rf_cm=confusion_matrix(y_test,y_pred)
rf_ac=accuracy_score(y_test,y_pred)
print(rf_cm)
print(rf_ac)

[[58 13  3]
 [ 5 63  3]
 [ 0  5 53]]
0.8571428571428571


### Support Vector Machine

In [17]:
svc_model=SVC()
svc_model.fit(X_train,y_train)
predictions=svc_model.predict(X_test)
#calculating accuracy of the Classification model
svm_cm=confusion_matrix(y_test,predictions)
svm_ac=accuracy_score(y_test,predictions)
print(svm_cm)
print(svm_ac)

[[67  7  0]
 [44 25  2]
 [15 12 31]]
0.6059113300492611


In [18]:
print(f"logistic regression Accuracy {log_ac*100}")
print(f"Decision Tree Accuracy {dc_ac*100}")
print(f"KNN Accuracy {knn_ac*100}")
print(f"Random Forest Accuracy {rf_ac*100}")
print(f"Support Vector Machine Accuracy {svm_ac*100}")

logistic regression Accuracy 63.05418719211823
Decision Tree Accuracy 87.68472906403942
KNN Accuracy 69.95073891625616
Random Forest Accuracy 85.71428571428571
Support Vector Machine Accuracy 60.591133004926114


### To find the Minimum and Maximum value of the dataset


In [19]:
# For Multiple columns to exclude use,
# cols = [x for x in df.columns if x not in ('mpg', 'name')]
# cols

cols = [X for X in data.columns if X not in ('RiskLevel')]
cols

print("{")
for i,name in enumerate(cols):
    print(f'"{name}":{{"min":{data[name].min()},"max":{data[name].max()}}}{"," if i<(len(cols)-1) else ""}')
print("}")

{
"Age":{"min":10,"max":70},
"SystolicBP":{"min":70,"max":160},
"DiastolicBP":{"min":49,"max":100},
"BS":{"min":6.0,"max":19.0},
"BodyTemp":{"min":98.0,"max":103.0},
"HeartRate":{"min":7,"max":90}
}


In [20]:
X


Unnamed: 0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate
0,25,130,80,15.0,98.0,86
1,35,140,90,13.0,98.0,70
2,29,90,70,8.0,100.0,80
3,30,140,85,7.0,98.0,70
4,35,120,60,6.1,98.0,76
...,...,...,...,...,...,...
1009,22,120,60,15.0,98.0,80
1010,55,120,90,18.0,98.0,60
1011,35,85,60,19.0,98.0,86
1012,43,120,90,18.0,98.0,70


In [28]:
data_new = {'Age': 25, 'SystolicBP': 130, 'DiastolicBP': 80, 'BS': 15.0, 'BodyTemp': 98.0,
            'HeartRate': 86}
index = [1]  # serial number

In [29]:
my_data = pd.DataFrame(data_new, index)

In [30]:
print(my_data)

   Age  SystolicBP  DiastolicBP    BS  BodyTemp  HeartRate
1   25         130           80  15.0      98.0         86


In [31]:
medical_details = svc_model.predict(my_data)
print(f" The Risk Level is :{medical_details}")

 The Risk Level is :[1]
