In [105]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [100]:
# Raw CSV of the Maternal Health Risk data set, hosted on GitHub.
dataset_url = (
    "https://raw.githubusercontent.com/ZachLegros/CSI4106-A2-Classification/"
    "master/Maternal%20Health%20Risk%20Data%20Set.csv"
)
# Download the CSV and parse it into a DataFrame.
df = pd.read_csv(filepath_or_buffer=dataset_url)
# Preview the first five rows to sanity-check the columns and values.
print(df.head(n=5))

   Age  SystolicBP  DiastolicBP    BS  BodyTemp  HeartRate  RiskLevel
0   25         130           80  15.0      98.0         86  high risk
1   35         140           90  13.0      98.0         70  high risk
2   29          90           70   8.0     100.0         80  high risk
3   30         140           85   7.0      98.0         70  high risk
4   35         120           60   6.1      98.0         76   low risk


In [119]:
# Class labels, in the order they should appear in every report.
RISK_LABELS = ["low risk", "mid risk", "high risk"]


def fit_and_report(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training fold and evaluate it on the held-out fold.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train : features and labels used to fit the model.
    X_test, y_test : features and labels of the held-out fold.

    Returns
    -------
    tuple
        ``(y_pred, report)``: the predictions for ``X_test`` and the text
        returned by ``classification_report`` restricted to ``RISK_LABELS``.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred, labels=RISK_LABELS)
    return y_pred, report


# Features: every column except the class label, as a 2D array.
# Selecting by name (rather than by position) keeps this correct even if the
# column order of the CSV changes.
X = df.drop(columns="RiskLevel").to_numpy()
# Target: the "RiskLevel" class column, as a 1D array.
y = df["RiskLevel"].to_numpy()

# NOTE(review): without shuffle=True, KFold cuts the rows into 4 consecutive
# chunks in file order. If the CSV turns out to be ordered in any meaningful
# way, consider KFold(n_splits=4, shuffle=True, random_state=0) -- TODO confirm.
kf = KFold(n_splits=4)

for i, (train_index, test_index) in enumerate(kf.split(X)):
    print("Fold", i + 1)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Each report lists per-class precision / recall / f1 plus the macro and
    # weighted averages. Because all three classes are listed in RISK_LABELS,
    # the micro average is reported as the single "accuracy" row.
    clf = LogisticRegression(random_state=0, max_iter=1500)
    clf_ypred, clf_report = fit_and_report(clf, X_train, y_train, X_test, y_test)
    print("LogisticRegression:")
    print(clf_report)

    gnb = GaussianNB()
    gnb_ypred, gnb_report = fit_and_report(gnb, X_train, y_train, X_test, y_test)
    print("GaussianNB:")
    print(gnb_report)
    print("------------------------------------------------------")


Fold 1
LogisticRegression:
              precision    recall  f1-score   support

    low risk       0.73      0.93      0.82       105
    mid risk       0.45      0.41      0.43        64
   high risk       0.89      0.65      0.75        85

    accuracy                           0.70       254
   macro avg       0.69      0.66      0.66       254
weighted avg       0.71      0.70      0.70       254

GaussianNB:
              precision    recall  f1-score   support

    low risk       0.61      0.88      0.72       105
    mid risk       0.35      0.22      0.27        64
   high risk       0.92      0.67      0.78        85

    accuracy                           0.64       254
   macro avg       0.62      0.59      0.59       254
weighted avg       0.65      0.64      0.62       254

------------------------------------------------------
Fold 2
LogisticRegression:
              precision    recall  f1-score   support

    low risk       0.67      0.66      0.66       122
    mid 