In [77]:
import numpy as np

In [78]:
#Defining node of a tree
class Node:
    def __init__(self,X,y,num_classes):
        self.left = None #Left Child
        self.right = None #RIght Child
        self.X = X #Examples of the current node
        self.y = y #Lables of the examples of the current node
        self.of_ind = None #optimal feature index
        self.ot = None #optimal threshold
        self.m = X.shape[0] #number of examples in the current node
        self.probs = [0]*num_classes
        for i in range(self.m):
            self.probs[int(y[i])] = (self.probs[int(y[i])]) + (1/self.m) #probability of each class in the node
        self.prediction = self.probs.index(max(self.probs)) # prediction of class on the basis of maximum probability
        self.confidence = max(self.probs) * 100 # confidence which is the maximum probability

In [79]:
class DecisionTree:
    def __init__(self,max_depth=20):
        self.max_depth = max_depth # Max depht of tree is a hyper_parameter to regulate overfitting
        self.num_classes = None # Number of classes in the data set
        self.root = None
        
    
    def sort_according_to_feature(self,xi,X,y): # Sorting accoring to ith feature
        p = X[np.argsort(X[:,xi-1])]
        q = y[np.argsort(X[:,xi-1])]
        return p,q
        
    
    def find_optimal_threshold_and_feature(self,X,y): #Finding optimal threshold by minimizing Gini impurtiy
        m,n = X.shape 
        self.classes = [0]*self.num_classes
        of_ind = None
        ot = None
        current_gini = 2
        for i in range(m):
            self.classes[int(y[i])] = self.classes[int(y[i])] +1
        
        for ind in range(n):
            X,y=self.sort_according_to_feature(ind,X,y)
            nl=[0]*self.num_classes
            nr=self.classes
            for i in range(m-1):
                nl[int(y[i])] = nl[int(y[i])] + 1
                nr[int(y[i])] = nr[int(y[i])] - 1
                ct = (X[i][ind] + X[i+1][ind])/2
                
                gini_left = 1 - sum([(k/m)**2 for k in nl])
                gini_right = 1 - sum([(k/m)**2 for k in nr])
                gini_node = (i/m)*gini_left + ((m-i)/m)*gini_right
                
                if gini_node < current_gini:
                    
                    current_gini = gini_node
                    of_ind = ind
                    ot = ct
        
        return [of_ind,ot]
    
    def split(self, root, depth=0): # Splitting the data recursively on basis of optimal feature and threshold to generate the tree
        if root.X.shape[0]<=1 or depth >= self.max_depth:
            return None
        of_ind, ot = self.find_optimal_threshold_and_feature(root.X,root.y)
        r_ind = root.X[:,of_ind] > ot
        root.of_ind = of_ind
        root.ot = ot
        right_X = root.X[r_ind]
        right_y = root.y[r_ind]
        left_X = root.X[~r_ind]
        left_y = root.y[~r_ind]
        
        rn = Node(right_X,right_y,self.num_classes)
        ln = Node(left_X,left_y,self.num_classes)

        root.left = self.split(ln,depth+1)
        root.right = self.split(rn,depth+1)
        return root
        
    def fit(self,X,y,num_classes): # Fitting the data by training the model
        self.root = Node(X,y,num_classes)
        self.num_classes = num_classes
        self.split(self.root)
        
    def predict(self,X): #Predicting the given data
        m = X.shape[0]
        preds = np.array([0]*m)
        confidence = np.array([0]*m)
        for i in range(m):
            c = X[i,:]
            temp = self.root
            while temp != None:
                preds[i] = temp.prediction
                confidence[i] = temp.confidence
                if c[temp.of_ind] <= temp.ot:
                    temp = temp.left
                else:
                    temp = temp.right
                    
        return preds,confidence
            
        
        
        
        
        
        
                    
        
    
    

In [80]:
def data_prep():
    #Loading the data of pulsar stars as numpy array
    data = np.genfromtxt(r'C:\Users\91888\Desktop\Kaggle\pulsar star\predicting-a-pulsar-star\logistic_regression_predict\pulsar_stars.csv', delimiter = ',', skip_header=1)
    
    #m is the number of examples(number of rows)
    m = data.shape[0]
    
    #n is 20% of m, as 80% of the data is used for training and 20% for testing(testing data used in predict.py)
    n = int(m  * 0.1)
    
    #Initializing training data
    train_data = data[0:m-2*n,:]
    cross_val = data[m-2*n:m-n,:]
    test_data = data[m-n:m,:]
    
    
    return train_data,cross_val,test_data
        
        
        

In [81]:
if __name__ == "__main__":
 train_data,cross_val,test_data = data_prep()
 a = train_data.shape[1]
 X = train_data[:,0:a-1]
 y = train_data[:,a-1:a]
 # fitting the training data
 model = DecisionTree()
 model.fit(X,y,2)



In [76]:
 # Testing
 X_test = test_data[:,0:a-1]
 y_test = np.reshape(test_data[:,a-1:a],(test_data.shape[0],))
 preds,confidence = model.predict(X_test)
 X_cross = cross_val[:,0:a-1]
 y_cross = np.reshape(cross_val[:,a-1:a],(cross_val.shape[0],))
 preds_c,confidence_c = model.predict(X_cross)
 tp = sum(y_test == 1)
 fp = sum(np.logical_and(y_test == 0,preds == 1))
 fn = sum(np.logical_and(y_test == 1,preds == 0))
 tpc = sum(y_cross == 1)
 fpc = sum(np.logical_and(y_cross == 0,preds_c == 1))
 fnc = sum(np.logical_and(y_cross == 1,preds_c == 0))
  
# calculating precision, recall and f1 score on testing and cross_validation data set
   
 p = tp/(tp+fp)
 r = tp/(tp+fn)
 pc = tpc/(tpc+fpc)
 rc = tpc/(tpc+fnc)
print("test---")
print(2*(p*r)/(p+r))
print("cross---")
print(2*(pc*rc)/(pc+rc))

test---
0.6067415730337079
cross---
0.5977011494252874
