In [1]:
# RF_Example
# Referenced from https://towardsdatascience.com/master-machine-learning-random-forest-from-scratch-with-python-3efdd51b6d7a

import pandas as pd # used to load and manipulate data
import tensorflow.compat.v1 as tf
import numpy as np # data manipulation
from sklearn.utils import resample # downsampling the data
from sklearn.model_selection import train_test_split # used to split data into training and testing sets
from sklearn.metrics import confusion_matrix # used to create confusion matrix
from sklearn.metrics import classification_report # creates a classifaction report that measure the models performance
import seaborn as sns
from collections import Counter

# Read the pneumonia records from the CSV file (path is relative to the working directory)
PNEUMONIA_CSV = 'database_Pneumonia_2.csv'
df = pd.read_csv(PNEUMONIA_CSV)
df.head()

Unnamed: 0,Serial_Number,date,Age,Gender,Wt,Pneumococcus_vaccine,Streptococcus_vaccine,Difficulty_breathing,Tachypnea,fever,cough,difficulty_feeding,looks_sick,Suspected_Pneumonia
0,8016,17-Mar,1,f,3.4,n,n,y,y,y,,y,y,S
1,8043,17-Feb,1,m,3.7,n,n,y,y,,,y,,Sus
2,8047,17-Jun,2,f,4.2,n,n,,,,,y,y,UL
3,8048,17-Jan,2,f,4.0,n,n,y,y,y,,y,,S
4,8067,17-Jun,2,f,4.4,n,n,,y,,,y,y,Sus


In [2]:
# Drop the Serial_Number and date columns since they are not necessary for the model.
# A new frame is bound to `df` instead of mutating in place, and errors='ignore' skips
# columns that are already gone, so this cell can be re-run without raising KeyError.
df = df.drop(columns=['Serial_Number', 'date'], errors='ignore')
df.head()

Unnamed: 0,Age,Gender,Wt,Pneumococcus_vaccine,Streptococcus_vaccine,Difficulty_breathing,Tachypnea,fever,cough,difficulty_feeding,looks_sick,Suspected_Pneumonia
0,1,f,3.4,n,n,y,y,y,,y,y,S
1,1,m,3.7,n,n,y,y,,,y,,Sus
2,2,f,4.2,n,n,,,,,y,y,UL
3,2,f,4.0,n,n,y,y,y,,y,,S
4,2,f,4.4,n,n,,y,,,y,y,Sus


In [3]:
# Fill every missing entry with the string 'n'; the result is a new frame and `df` is left as-is.
# NOTE(review): fillna is applied to all columns, so gaps in Age / Wt (if any) would also
# become 'n' - confirm that is intended.
df_blankReplaced = df.fillna(value='n')
df_blankReplaced.head()

Unnamed: 0,Age,Gender,Wt,Pneumococcus_vaccine,Streptococcus_vaccine,Difficulty_breathing,Tachypnea,fever,cough,difficulty_feeding,looks_sick,Suspected_Pneumonia
0,1,f,3.4,n,n,y,y,y,n,y,y,S
1,1,m,3.7,n,n,y,y,n,n,y,n,Sus
2,2,f,4.2,n,n,n,n,n,n,y,y,UL
3,2,f,4.0,n,n,y,y,y,n,y,n,S
4,2,f,4.4,n,n,n,y,n,n,y,y,Sus


In [4]:
# X holds the independent variables that will be used to predict suspected pneumonia.
# The target column and the Age, Gender and Wt columns are excluded from the features.
excluded_columns = ['Suspected_Pneumonia', 'Age', 'Gender', 'Wt']
X = df_blankReplaced.drop(columns=excluded_columns).copy()

# Encode the answers as integers so they can be treated as booleans:
# y / Y -> 1 and n -> 0 (m -> 1 and f -> 0 are mapped as well)
answer_codes = {'y': 1, 'Y': 1, 'n': 0, 'm': 1, 'f': 0}
X = X.replace(answer_codes)

X.head()

Unnamed: 0,Pneumococcus_vaccine,Streptococcus_vaccine,Difficulty_breathing,Tachypnea,fever,cough,difficulty_feeding,looks_sick
0,0,0,1,1,1,0,1,1
1,0,0,1,1,0,0,1,0
2,0,0,0,0,0,0,1,1
3,0,0,1,1,1,0,1,0
4,0,0,0,1,0,0,1,1


In [5]:
# y holds the dependent variable (Suspected_Pneumonia) that is going to be predicted.
# Labels are encoded as integers in a single pass: S -> 1, Sus -> 0, UL -> 0.
label_codes = {'S': 1, 'Sus': 0, 'UL': 0}
y = df_blankReplaced['Suspected_Pneumonia'].replace(label_codes)
y.head()

0    1
1    0
2    0
3    1
4    0
Name: Suspected_Pneumonia, dtype: int64

In [6]:
class Node:
    '''
    Container for one node of a decision tree.

    A node simply stores whatever it is given: the split description
    (feature, threshold, gain), the two child links (data_left, data_right)
    and a value. Every field defaults to None.
    '''
    def __init__(self, feature=None, threshold=None, data_left=None, data_right=None, gain=None, value=None):
        # Split description
        self.feature, self.threshold = feature, threshold
        # Links to the two children
        self.data_left, self.data_right = data_left, data_right
        # Gain recorded for the split and the value carried by the node
        self.gain, self.value = gain, value

In [7]:
class DecisionTree:
    '''
    Class which implements a decision tree classifier algorithm.

    The tree is grown greedily: at every node each (feature, unique value) pair is
    tried as a "<= threshold" split and the one with the highest information gain
    (entropy reduction) is kept. Growth stops when a node has fewer than
    min_samples_split rows, when max_depth levels of splits have been made, or
    when no split yields a positive gain.

    :param min_samples_split: int, minimum number of rows a node needs to be split
    :param max_depth: int, maximum number of split levels (0 means a single leaf)
    '''
    def __init__(self, min_samples_split=2, max_depth=5):
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        # Root Node of the fitted tree; None until fit() is called
        self.root = None

    @staticmethod
    def _entropy(s):
        '''
        Helper function, calculates entropy from an array of integer values.

        :param s: list or np.array of class labels (cast to non-negative integers)
        :return: float, entropy value in bits
        '''
        # Convert to integers because np.bincount only accepts integer input
        counts = np.bincount(np.array(s, dtype=np.int64))
        # Probabilities of the classes that actually occur; absent classes
        # contribute nothing and are filtered out to avoid log2(0)
        percentages = counts[counts > 0] / len(s)
        return -np.sum(percentages * np.log2(percentages))

    def _information_gain(self, parent, left_child, right_child):
        '''
        Helper function, calculates information gain from a parent and two child nodes.

        :param parent: list, labels of the parent node
        :param left_child: list, labels of the left child
        :param right_child: list, labels of the right child
        :return: float, parent entropy minus the size-weighted entropy of the children
        '''
        num_left = len(left_child) / len(parent)
        num_right = len(right_child) / len(parent)

        weighted_children = num_left * self._entropy(left_child) + num_right * self._entropy(right_child)
        return self._entropy(parent) - weighted_children

    def _best_split(self, X, y):
        '''
        Helper function, calculates the best split for given features and target.

        :param X: np.array, features (2-D)
        :param y: np.array or list, target
        :return: dict with keys 'feature_index', 'threshold', 'df_left', 'df_right'
                 and 'gain'; an EMPTY dict when no threshold separates the rows
                 into two non-empty parts (e.g. all rows have identical features)
        '''
        X = np.asarray(X)
        y = np.asarray(y)
        best_split = {}
        best_info_gain = -1
        n_cols = X.shape[1]

        # Features with the target appended as the last column. This matrix does
        # not depend on the feature or threshold, so it is built once up front.
        df = np.concatenate((X, y.reshape(-1, 1)), axis=1)
        y_all = df[:, -1]

        # For every dataset feature
        for f_idx in range(n_cols):
            column = df[:, f_idx]
            # For every unique value of that feature
            for threshold in np.unique(X[:, f_idx]):
                # Left part: rows lower or equal to the threshold
                # Right part: rows higher than the threshold
                # (two explicit masks, so a NaN row lands in neither part)
                df_left = df[column <= threshold]
                df_right = df[column > threshold]

                # Do the calculation only if there's data in both subsets
                if len(df_left) > 0 and len(df_right) > 0:
                    gain = self._information_gain(y_all, df_left[:, -1], df_right[:, -1])
                    # Strict comparison keeps the first of equally good splits
                    if gain > best_info_gain:
                        best_split = {
                            'feature_index': f_idx,
                            'threshold': threshold,
                            'df_left': df_left,
                            'df_right': df_right,
                            'gain': gain
                        }
                        best_info_gain = gain
        return best_split

    def _build(self, X, y, depth=0):
        '''
        Helper recursive function, used to build a decision tree from the input data.

        :param X: np.array, features
        :param y: np.array or list, target
        :param depth: depth of the node being built (the root is at depth 0)
        :return: Node
        '''
        n_rows, n_cols = X.shape

        # A node may only be split while it sits above max_depth; using "<" (not "<=")
        # keeps the deepest leaves at depth max_depth instead of max_depth + 1
        if n_rows >= self.min_samples_split and depth < self.max_depth:
            # Get the best split
            best = self._best_split(X, y)
            # `best` is empty when the rows cannot be separated at all; treat that
            # like a zero-gain split and fall through to a leaf instead of failing
            if best and best['gain'] > 0:
                left = self._build(
                    X=best['df_left'][:, :-1],
                    y=best['df_left'][:, -1],
                    depth=depth + 1
                )
                right = self._build(
                    X=best['df_right'][:, :-1],
                    y=best['df_right'][:, -1],
                    depth=depth + 1
                )
                return Node(
                    feature=best['feature_index'],
                    threshold=best['threshold'],
                    data_left=left,
                    data_right=right,
                    gain=best['gain']
                )
        # Leaf node - value is the most common target value
        return Node(
            value=Counter(y).most_common(1)[0][0]
        )

    def fit(self, X, y):
        '''
        Function used to train a decision tree classifier model.

        :param X: np.array (or array-like), 2-D features
        :param y: np.array or list, target with one label per row of X
        :return: None
        :raises ValueError: if X is not 2-D, is empty, or does not match y in length
        '''
        X = np.asarray(X)
        y = np.asarray(y).reshape(-1)
        if X.ndim != 2:
            raise ValueError('X must be a 2-D array of shape (n_samples, n_features)')
        if X.shape[0] == 0:
            raise ValueError('cannot fit a decision tree on an empty dataset')
        if X.shape[0] != y.shape[0]:
            raise ValueError('X and y must contain the same number of samples')

        # Call a recursive function to build the tree
        self.root = self._build(X, y)

    def _predict(self, x, tree):
        '''
        Helper recursive function, used to predict a single instance (tree traversal).

        :param x: single observation
        :param tree: Node to start the traversal from
        :return: predicted class (the value stored in the leaf that is reached)
        '''
        # Leaf node; identity check because a class label of 0 is a valid value
        if tree.value is not None:
            return tree.value

        # Go to the left when the feature is within the threshold...
        if x[tree.feature] <= tree.threshold:
            return self._predict(x=x, tree=tree.data_left)

        # ...otherwise (including values that cannot be compared, such as NaN)
        # go to the right, so a class is always returned
        return self._predict(x=x, tree=tree.data_right)

    def predict(self, X):
        '''
        Function used to classify new instances.

        :param X: np.array, features
        :return: list, predicted class for every row of X
        '''
        # Call the _predict() function for every observation
        return [self._predict(x, self.root) for x in X]

In [8]:

class RandomForest:
    '''
    A class that implements Random Forest algorithm from scratch.

    Each tree is a DecisionTree trained on a bootstrap sample (rows drawn with
    replacement) of the training data; predictions are made by majority vote.

    :param num_trees: int, number of trees in the forest
    :param min_samples_split: int, forwarded to every DecisionTree
    :param max_depth: int, forwarded to every DecisionTree
    :param random_state: optional seed for the bootstrap sampling. When None
                         (default) NumPy's global random state is used, so
                         np.random.seed() still controls the result.
    '''
    # Give up after this many failed tree fits in a row instead of retrying forever
    _MAX_CONSECUTIVE_FAILURES = 100

    def __init__(self, num_trees=25, min_samples_split=2, max_depth=5, random_state=None):
        self.num_trees = num_trees
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.random_state = random_state
        # Will store individually trained decision trees
        self.decision_trees = []

    @staticmethod
    def _sample(X, y, rng=None):
        '''
        Helper function used for bootstrap sampling.

        :param X: np.array, features
        :param y: np.array, target
        :param rng: optional np.random.Generator; the global np.random state is
                    used when omitted
        :return: tuple (sample of features, sample of target), same row count as X
        '''
        source = np.random if rng is None else rng
        n_rows = X.shape[0]
        # Sample row indices with replacement
        samples = source.choice(a=n_rows, size=n_rows, replace=True)
        return X[samples], y[samples]

    def fit(self, X, y):
        '''
        Trains a Random Forest classifier.

        :param X: np.array, features
        :param y: np.array, target
        :return: None
        :raises ValueError: if X is empty
        :raises RuntimeError: if trees repeatedly fail to train
        '''
        X = np.asarray(X)
        y = np.asarray(y)
        if X.shape[0] == 0:
            raise ValueError('cannot fit a random forest on an empty dataset')

        # A fresh generator per fit() makes repeated fits with the same seed identical
        rng = None if self.random_state is None else np.random.default_rng(self.random_state)

        # Reset any previously trained forest
        self.decision_trees = []

        consecutive_failures = 0
        while len(self.decision_trees) < self.num_trees:
            clf = DecisionTree(
                min_samples_split=self.min_samples_split,
                max_depth=self.max_depth
            )
            # Obtain data sample
            _X, _y = self._sample(X, y, rng)
            try:
                clf.fit(_X, _y)
            except Exception as exc:
                # A tree may fail on an unlucky bootstrap sample; draw a new sample
                # and retry, but surface the error if it keeps happening rather
                # than looping forever
                consecutive_failures += 1
                if consecutive_failures >= self._MAX_CONSECUTIVE_FAILURES:
                    raise RuntimeError(
                        'decision tree training failed %d times in a row' % consecutive_failures
                    ) from exc
                continue
            consecutive_failures = 0
            # Save the classifier
            self.decision_trees.append(clf)

    def predict(self, X):
        '''
        Predicts class labels for new data instances.

        :param X: np.array, new instances to predict
        :return: list, majority-vote class for every row of X
        :raises ValueError: if the forest has not been fitted
        '''
        if not self.decision_trees:
            raise ValueError('RandomForest has no trained trees; call fit() before predict()')

        # One row of predictions per tree ...
        votes = [tree.predict(X) for tree in self.decision_trees]
        # ... swapped so that every row holds all votes for one observation
        votes = np.swapaxes(a=votes, axis1=0, axis2=1)

        # Use majority voting for the final prediction
        return [Counter(preds).most_common(1)[0][0] for preds in votes]

In [9]:
from sklearn.datasets import load_iris

# Load the iris dataset and take its feature matrix and class labels.
# NOTE: X and y are rebound here, so from this point on they no longer refer to
# the pneumonia features / labels prepared in the earlier cells.
iris = load_iris()
X, y = iris['data'], iris['target']

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Print the training feature matrix returned by train_test_split (whole array, for inspection)
print(X_train)

[[4.6 3.6 1.  0.2]
 [5.7 4.4 1.5 0.4]
 [6.7 3.1 4.4 1.4]
 [4.8 3.4 1.6 0.2]
 [4.4 3.2 1.3 0.2]
 [6.3 2.5 5.  1.9]
 [6.4 3.2 4.5 1.5]
 [5.2 3.5 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.2 4.1 1.5 0.1]
 [5.8 2.7 5.1 1.9]
 [6.  3.4 4.5 1.6]
 [6.7 3.1 4.7 1.5]
 [5.4 3.9 1.3 0.4]
 [5.4 3.7 1.5 0.2]
 [5.5 2.4 3.7 1. ]
 [6.3 2.8 5.1 1.5]
 [6.4 3.1 5.5 1.8]
 [6.6 3.  4.4 1.4]
 [7.2 3.6 6.1 2.5]
 [5.7 2.9 4.2 1.3]
 [7.6 3.  6.6 2.1]
 [5.6 3.  4.5 1.5]
 [5.1 3.5 1.4 0.2]
 [7.7 2.8 6.7 2. ]
 [5.8 2.7 4.1 1. ]
 [5.2 3.4 1.4 0.2]
 [5.  3.5 1.3 0.3]
 [5.1 3.8 1.9 0.4]
 [5.  2.  3.5 1. ]
 [6.3 2.7 4.9 1.8]
 [4.8 3.4 1.9 0.2]
 [5.  3.  1.6 0.2]
 [5.1 3.3 1.7 0.5]
 [5.6 2.7 4.2 1.3]
 [5.1 3.4 1.5 0.2]
 [5.7 3.  4.2 1.2]
 [7.7 3.8 6.7 2.2]
 [4.6 3.2 1.4 0.2]
 [6.2 2.9 4.3 1.3]
 [5.7 2.5 5.  2. ]
 [5.5 4.2 1.4 0.2]
 [6.  3.  4.8 1.8]
 [5.8 2.7 5.1 1.9]
 [6.  2.2 4.  1. ]
 [5.4 3.  4.5 1.5]
 [6.2 3.4 5.4 2.3]
 [5.5 2.3 4.  1.3]
 [5.4 3.9 1.7 0.4]
 [5.  2.3 3.3 1. ]
 [6.4 2.7 5.3 1.9]
 [5.  3.3 1.4 0.2]
 [5.  3.2 1.

In [17]:
# Print the training labels returned by train_test_split (whole array, for inspection)
print(y_train)

[0 0 1 0 0 2 1 0 0 0 2 1 1 0 0 1 2 2 1 2 1 2 1 0 2 1 0 0 0 1 2 0 0 0 1 0 1
 2 0 1 2 0 2 2 1 1 2 1 0 1 2 0 0 1 1 0 2 0 0 1 1 2 1 2 2 1 0 0 2 2 0 0 0 1
 2 0 2 2 0 1 1 2 1 2 0 2 1 2 1 1 1 0 1 1 0 1 2 2 0 1 2 2 0 2 0 1 2 2 1 2 1
 1 2 2 0 1 2 0 1 2]


In [11]:
# The forest draws its bootstrap samples from NumPy's global random state, so seed it
# first; otherwise every run trains different trees and the score below is not repeatable.
np.random.seed(42)

# Train a forest with the default settings and classify the held-out rows
model = RandomForest()
model.fit(X_train, y_train)
preds = model.predict(X_test)

In [12]:
from sklearn.metrics import accuracy_score

# Share of held-out samples whose predicted class matches the true class
accuracy_score(y_true=y_test, y_pred=preds)

1.0