In [16]:
import pandas as pd
import numpy as np

# Reading data 
### transfusion.csv
*data source:= https://www.kaggle.com/datasets/ninalabiba/blood-transfusion-dataset*

In [17]:
df=pd.read_csv("/content/transfusion.csv")
df

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0
...,...,...,...,...,...
743,23,2,500,38,0
744,21,2,500,52,0
745,23,3,750,62,0
746,39,1,250,39,0


# changing the columns name

In [18]:

df= df.rename(columns = {'whether he/she donated blood in March 2007' : 'donated'})
X = df.drop('donated',axis =1)
Y = df['donated']

df = df.reset_index(drop=True)
df.describe()

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),donated
count,748.0,748.0,748.0,748.0,748.0
mean,9.506684,5.514706,1378.676471,34.282086,0.237968
std,8.095396,5.839307,1459.826781,24.376714,0.426124
min,0.0,1.0,250.0,2.0,0.0
25%,2.75,2.0,500.0,16.0,0.0
50%,7.0,4.0,1000.0,28.0,0.0
75%,14.0,7.0,1750.0,50.0,0.0
max,74.0,50.0,12500.0,98.0,1.0


# Decision Tree Algorithm

In [23]:
class Dtree:
    def __init__(self, depth=0, mdepth=10):
        self.left_child = None
        self.right_child = None
        self.mdepth = mdepth
        self.depth = depth
        self.key = None
        self.val = None
        self.target = None

    def divide_data(self, Xd, fkey, fval):

        XR = pd.DataFrame([], columns=Xd.columns)
        XL = pd.DataFrame([], columns=Xd.columns)
        
        
        for ix in range(Xd.shape[0]):
            val = Xd.iloc[:, fkey].loc[ix]
            if val > fval:
                XR = XR.append(Xd.loc[ix])
            else:
                XL = XL.append(Xd.loc[ix])
        
        return XL, XR
    
    def calc_entropy(self, col):
        temp = []
        temp.append(col.mean())
        temp.append(1-temp[0])
        
        entropy = 0.0
        for px in temp:
            entropy += (-1.0 * px * np.log2(px))
        return entropy

    def information_gain(self, xdata, fkey, fval):
        left, right = self.divide_data(xdata, fkey, fval)
        
        if left.shape[0] == 0 or right.shape[0] == 0:
            return -10000
        
        return 2*self.calc_entropy(xdata.y) - (self.calc_entropy(left.y) + self.calc_entropy(right.y))
    
    
    def data_training(self, x,y):
        df=x
        df['y']=y

        gains = []
        for col_i in range(len(df.columns)-1):
            gains.append(self.information_gain(df, col_i, df.iloc[:,col_i].mean()))

        self.key = np.argmax(gains)
        self.val = df.iloc[:,self.key].mean()

        Ldata, Rdata = self.divide_data(df, self.key, self.val)
        Ldata = Ldata.reset_index(drop=True)
        Rdata = Rdata.reset_index(drop=True)

        if Ldata.shape[0] == 0 or Rdata.shape[0] == 0:
            if df.y.mean() >= 0.5:
                self.target = 1
            else:
                self.target = 0
            return
        
        if self.depth >= self.mdepth:
            if df.y.mean() >= 0.5:
                self.target = 1
            else:
                self.target = 0
            return
        
        self.right_child = Dtree(depth=self.depth+1, mdepth=self.mdepth)
        self.right_child.data_training(Rdata.drop("y",axis=1),Rdata["y"])


        self.left_child = Dtree(depth=self.depth+1, mdepth=self.mdepth)
        self.left_child.data_training(Ldata.drop("y",axis=1),Ldata["y"])
        
        if df.y.mean() >= 0.8:
            self.target = 1
        else:
            self.target = 0
        
        return
    
    def data_prediction(self, test):
        if test[self.key] > self.val:
            if self.right_child is None:
                return self.target
            return self.right_child.data_prediction(test)
        else:
            if self.left_child is None:
                return self.target
            return self.left_child.data_prediction(test)

### splitting data into train, test data

In [24]:
df=df.sample(frac=1).reset_index(drop=1)
split = int(0.8 * df.shape[0])
data_train = df[:split]
testing_data = df[split:]




### Training the model


In [None]:
dt = Dtree()
dt.data_training(data_train.drop("donated",axis=1), data_train["donated"])

### Testing the model



In [25]:
y_pred=[]
testing_data_x= testing_data.drop("donated", axis=1)
for ix in testing_data_x.index:       
    y_pred.append(dt.data_prediction(testing_data_x.loc[ix,:]))

m, t= 0,0
for i,j in zip(testing_data.donated, y_pred):
  if i==j:
    t+=1
  m+=1
    
print("Accuracy of the decision tree is -->:",t/m)

Accuracy of the decision tree is -->: 0.7133333333333334
