In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [6]:
df = pd.read_csv("/content/drive/MyDrive/Maternal Health Risk Data Set.csv")

In [7]:
df.head()

Unnamed: 0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate,RiskLevel
0,25,130,80,15.0,98.0,86,high risk
1,35,140,90,13.0,98.0,70,high risk
2,29,90,70,8.0,100.0,80,high risk
3,30,140,85,7.0,98.0,70,high risk
4,35,120,60,6.1,98.0,76,low risk


In [8]:
X = df.iloc[:,:-1].values
y = df.iloc[:,-1].values

In [9]:
print(X)

[[ 25. 130.  80.  15.  98.  86.]
 [ 35. 140.  90.  13.  98.  70.]
 [ 29.  90.  70.   8. 100.  80.]
 ...
 [ 35.  85.  60.  19.  98.  86.]
 [ 43. 120.  90.  18.  98.  70.]
 [ 32. 120.  65.   6. 101.  76.]]


In [10]:
print(y)

['high risk' 'high risk' 'high risk' ... 'high risk' 'high risk'
 'mid risk']


In [11]:
label_map = {'low risk': 0, 'mid risk': 1, 'high risk': 2}
y_numeric = np.array([label_map[val] for val in y])
print(y_numeric)

[2 2 2 ... 2 2 1]


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
column_means = np.nanmean(X_train, axis=0)
X_train_filled = np.where(np.isnan(X_train), column_means, X_train)
X_test_filled = np.where(np.isnan(X_test), column_means, X_test)
print (X_test)

[[ 23.   120.    90.     7.5   98.    70.  ]
 [ 23.   130.    70.     6.9   98.    70.  ]
 [ 35.   100.    70.     7.5   98.    66.  ]
 [ 22.   100.    65.    12.    98.    80.  ]
 [ 13.    90.    65.     7.8  101.    80.  ]
 [ 35.   140.    90.    13.    98.    70.  ]
 [ 25.   120.    90.    12.   101.    80.  ]
 [ 25.   140.   100.     7.01  98.    80.  ]
 [ 22.   100.    65.     7.2   98.    70.  ]
 [ 60.    90.    65.     7.9   98.    77.  ]
 [ 15.    76.    49.     7.8   98.    77.  ]
 [ 19.   120.    80.     7.    98.    70.  ]
 [ 32.   120.    90.     7.5   98.    70.  ]
 [ 18.    90.    60.     6.9   98.    70.  ]
 [ 17.    85.    60.     7.9  102.    86.  ]
 [ 32.   120.    90.     6.8   98.    70.  ]
 [ 60.    90.    65.     6.8   98.    77.  ]
 [ 25.   140.   100.     6.8   98.    80.  ]
 [ 25.   140.   100.     7.5   98.    80.  ]
 [ 42.   120.    80.     7.5   98.    70.  ]
 [ 20.   120.    75.     7.01 100.    70.  ]
 [ 20.   100.    90.     7.5   98.    88.  ]
 [ 35.   1

In [None]:
classifier = DecisionTreeClassifier()
classifier.fit(X_train_filled, y_train)

# Make predictions
y_pred_numeric = classifier.predict(X_test_filled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred_numeric)
print("Accuracy:", accuracy)

Accuracy: 0.8627450980392157


In [13]:
class DecisionTree:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def fit(self, X, y):
        self.n_classes = len(np.unique(y))
        self.tree_ = self._grow_tree(X, y)

    def _grow_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        n_labels = len(np.unique(y))

        # Stopping criteria
        if n_labels == 1 or n_samples < 2 or depth == self.max_depth:
            return {'class': np.argmax(np.bincount(y)), 'count': n_samples}

        # Find best split
        best_split = self._find_best_split(X, y)

        if best_split is None:
            return {'class': np.argmax(np.bincount(y)), 'count': n_samples}

        left_idxs, right_idxs, split_feature, split_value = best_split

        # Grow left and right branches
        left_tree = self._grow_tree(X[left_idxs], y[left_idxs], depth + 1)
        right_tree = self._grow_tree(X[right_idxs], y[right_idxs], depth + 1)

        return {'feature': split_feature,
                'value': split_value,
                'left': left_tree,
                'right': right_tree}

    def _find_best_split(self, X, y):
        n_samples, n_features = X.shape

        best_gini = 1.0
        best_split = None

        for feature_idx in range(n_features):
            values = np.unique(X[:, feature_idx])
            for value in values:
                left_idxs = np.where(X[:, feature_idx] <= value)[0]
                right_idxs = np.where(X[:, feature_idx] > value)[0]

                gini = self._gini_impurity(y[left_idxs], y[right_idxs])
                if gini < best_gini:
                    best_gini = gini
                    best_split = (left_idxs, right_idxs, feature_idx, value)

        return best_split

    def _gini_impurity(self, left_labels, right_labels):
        n_left = len(left_labels)
        n_right = len(right_labels)
        n_total = n_left + n_right

        if n_left == 0 or n_right == 0:
            return 0.0

        p_left = np.sum(left_labels == np.argmax(np.bincount(left_labels))) / n_left
        p_right = np.sum(right_labels == np.argmax(np.bincount(right_labels))) / n_right

        return p_left * (1 - p_left) + p_right * (1 - p_right)

    def predict(self, X):
        return np.array([self._predict(x, self.tree_) for x in X])

    def _predict(self, x, tree):
        if 'class' in tree:
            return tree['class']
        else:
            if x[tree['feature']] <= tree['value']:
                return self._predict(x, tree['left'])
            else:
                return self._predict(x, tree['right'])

# Usage
dt = DecisionTree(max_depth=3)
dt.fit(X_train_filled, y_train)
y_pred_numeric = dt.predict(X_test_filled)
accuracy = accuracy_score(y_test, y_pred_numeric)

TypeError: Cannot cast array data from dtype('O') to dtype('int64') according to the rule 'safe'

In [None]:
#y_pred = pd.Series(y_pred_numeric).map({val: key for key, val in label_map.items()})
print(y_pred_numeric)

['low risk' 'mid risk' 'low risk' 'high risk' 'mid risk' 'high risk'
 'high risk' 'high risk' 'low risk' 'low risk' 'low risk' 'mid risk'
 'low risk' 'mid risk' 'low risk' 'low risk' 'mid risk' 'high risk'
 'high risk' 'low risk' 'mid risk' 'mid risk' 'low risk' 'high risk'
 'low risk' 'mid risk' 'mid risk' 'mid risk' 'low risk' 'mid risk'
 'high risk' 'high risk' 'mid risk' 'high risk' 'mid risk' 'mid risk'
 'high risk' 'mid risk' 'mid risk' 'high risk' 'low risk' 'mid risk'
 'low risk' 'mid risk' 'mid risk' 'low risk' 'high risk' 'low risk'
 'low risk' 'low risk' 'mid risk' 'low risk' 'high risk' 'high risk'
 'mid risk' 'mid risk' 'high risk' 'high risk' 'low risk' 'high risk'
 'low risk' 'high risk' 'high risk' 'low risk' 'high risk' 'low risk'
 'low risk' 'mid risk' 'high risk' 'low risk' 'mid risk' 'high risk'
 'mid risk' 'mid risk' 'mid risk' 'low risk' 'mid risk' 'low risk'
 'mid risk' 'low risk' 'mid risk' 'high risk' 'high risk' 'mid risk'
 'low risk' 'high risk' 'low risk' 'h