In [1]:
import numpy as np
import pandas as pd
class Node:
    """One node of the binary decision tree.

    A node built with ``value`` acts as a leaf (``node_value`` is the
    prediction). A node built with ``spliter``/``col_index``/``left``/``right``
    acts as an internal decision node: rows are routed by comparing the
    feature at position ``feature_index`` with ``split_value``.
    """

    def __init__(self,value=None,spliter=None,left=None,right=None,col_index=None):
        # Child links (both None for a leaf).
        self.left, self.right = left, right
        # Split description: positional feature index and its threshold.
        self.feature_index, self.split_value = col_index, spliter
        # Leaf payload; None marks an internal node.
        self.node_value = value


        
class Tree:
    """Binary decision-tree classifier grown by greedy information-gain splits.

    The training frame holds the feature columns first and the class label in
    its LAST column. Entropy is computed from the counts of the labels ``1``
    and ``0`` only. Features are addressed by column position, so frames
    passed to :meth:`predict` must have their feature columns in the same
    order as the training frame.
    """

    def __init__(self,depth=4):
        self.depth = depth
        # Populated by fit(); stays None until the tree has been trained.
        self.root = None

    def fit(self,dataset):
        """Grow the tree from ``dataset`` (features + label in the last column)."""
        self.root = self.build_tree(dataset,self.depth)

    def entropy(self,dataset):
        """Return the base-2 entropy of the 0/1 labels in the last column.

        Returns 0 for an empty frame or when one of the two classes is absent.
        """
        row = dataset.shape[0]
        if row == 0:
            return 0
        labels = dataset.iloc[:,-1].values
        yes = int((labels == 1).sum())
        no = int((labels == 0).sum())
        if yes == 0 or no == 0:
            return 0
        return -(yes/row)*np.log2(yes/row)-(no/row)*np.log2(no/row)

    def best_ig(self,dataset,column_no):
        """Find the best threshold for the feature at position ``column_no``.

        Candidate thresholds are the midpoints between consecutive sorted
        feature values. Returns ``(best_information_gain, best_threshold)``;
        ``(-inf, None)`` when there is no candidate (fewer than two rows).
        """
        column_values = dataset.iloc[:,column_no].values
        parent_row = dataset.shape[0]
        parent_entropy = self.entropy(dataset)

        ordered = np.sort(column_values)
        # Midpoints of a sorted array are already non-decreasing, so np.unique
        # only drops repeats: the first-best candidate is unchanged while
        # duplicate thresholds are no longer re-evaluated.
        candidates = np.unique((ordered[1:]+ordered[:-1])/2)

        best_spliter = None
        best_ig = -np.inf
        for current_split in candidates:
            left_dataset = dataset.iloc[column_values <= current_split]
            right_dataset = dataset.iloc[column_values > current_split]

            left_weight = left_dataset.shape[0]/parent_row
            right_weight = right_dataset.shape[0]/parent_row
            ig = (parent_entropy
                  - left_weight*self.entropy(left_dataset)
                  - right_weight*self.entropy(right_dataset))
            # Strict comparison: on ties the smallest threshold wins.
            if ig > best_ig:
                best_ig = ig
                best_spliter = current_split

        return best_ig, best_spliter

    def best_column(self,dataset):
        """Return ``(column_position, threshold)`` of the best split overall.

        Every column except the last (the label) is tried. Returns
        ``(None, None)`` when no column offers a candidate split.
        """
        best_column = None
        best_ig = -np.inf
        best_spliter = None
        for col in range(dataset.shape[1]-1):
            col_ig, col_split = self.best_ig(dataset,col)
            # Strict comparison: on ties the left-most column wins.
            if col_ig > best_ig:
                best_ig = col_ig
                best_spliter = col_split
                best_column = col

        return best_column, best_spliter

    def build_tree(self,dataset,depth):
        """Recursively build the subtree for ``dataset`` with ``depth`` levels left.

        A leaf predicting the majority label is produced when the depth budget
        is exhausted, the labels are pure, or no split separates the rows.

        Raises:
            ValueError: if ``dataset`` has no rows (there is no label to predict).
        """
        if dataset.shape[0] == 0:
            raise ValueError("cannot build a decision tree from an empty dataset")

        labels = dataset.iloc[:,-1]
        majority = labels.mode()[0]
        if depth == 0 or labels.nunique() == 1:
            return Node(value=majority)

        column_index, split_value = self.best_column(dataset)
        if column_index is None:
            # No feature column / no candidate threshold: nothing to split on.
            return Node(value=majority)

        feature = dataset.iloc[:,column_index]
        left_dataset = dataset[feature <= split_value]
        right_dataset = dataset[feature > split_value]
        if left_dataset.shape[0] == 0 or right_dataset.shape[0] == 0:
            # Degenerate split (e.g. a constant feature): recursing would hand
            # an empty frame to a child, so stop with a majority leaf instead.
            return Node(value=majority)

        left_node = self.build_tree(left_dataset,depth=depth-1)
        right_node = self.build_tree(right_dataset,depth=depth-1)

        return Node(
            col_index=column_index,
            spliter=split_value,
            left=left_node,
            right=right_node)

    def predict(self,test_dataset):
        """Predict a label for every row of ``test_dataset``; returns a numpy array."""
        prediction = [
            self.make_prediction(test_dataset.iloc[i],self.root)
            for i in range(test_dataset.shape[0])
        ]
        return np.array(prediction)

    def make_prediction(self,row,node):
        """Route a single ``row`` from ``node`` down to a leaf and return its label."""
        # feature_index is a column POSITION. On a pandas Series an integer key
        # passed to [] is looked up by label, so use .iloc there; plain
        # sequences / arrays keep using ordinary indexing.
        positional = row.iloc if hasattr(row,'iloc') else row
        while node.node_value is None:
            if positional[node.feature_index] <= node.split_value:
                node = node.left
            else:
                node = node.right
        return node.node_value

def f1_score(y_pre,y_test):
    """Return the F1 score of binary predictions (positive class is ``1``).

    Args:
        y_pre: predicted labels (list, numpy array or pandas Series).
        y_test: true labels, same length as ``y_pre``.

    Returns:
        ``2 * precision * recall / (precision + recall)`` as a float, or
        ``0.0`` when there is no true positive (precision and/or recall would
        otherwise be undefined or zero).

    Raises:
        ValueError: if the two inputs have different lengths.
    """
    # Compare by position: converting to plain arrays avoids pandas aligning
    # two Series on their (possibly shuffled) index labels.
    predicted = np.asarray(y_pre).ravel()
    actual = np.asarray(y_test).ravel()
    if predicted.shape[0] != actual.shape[0]:
        raise ValueError("y_pre and y_test must have the same length")

    true_pos = int(np.count_nonzero((predicted == 1) & (actual == 1)))
    false_pos = int(np.count_nonzero((predicted == 1) & (actual == 0)))
    false_neg = int(np.count_nonzero((predicted == 0) & (actual == 1)))

    if true_pos == 0:
        # Every denominator below is positive once true_pos > 0; without any
        # true positive the score is 0 by convention instead of a 0/0 crash.
        return 0.0

    precision = true_pos/(true_pos+false_pos)
    recall = true_pos/(true_pos+false_neg)
    return 2*(precision*recall)/(precision+recall)


class Train_test_split:
    """Random hold-out split of a DataFrame into train/test features and labels.

    Args:
        dataset: frame containing the feature columns and the ``target`` column.
        test_size: fraction of rows placed in the test set.
        random: seed for the row selection.
        target: name of the label column.
    """

    def __init__(self,dataset,test_size=0.2,random=1,target='target'):
        self.target = target
        self.random_state = random
        self.test_size = test_size
        self.dataset = dataset

        self.row = self.dataset.shape[0]

    def split(self):
        """Return ``(x_train, x_test, y_train, y_test)``.

        Test rows are drawn without replacement and keep the drawn order;
        training rows are all remaining rows in their original order.
        """
        # A private RandomState seeded with the same value draws the same rows
        # as seeding the global generator, without reseeding numpy's global
        # RNG as a side effect of splitting.
        rng = np.random.RandomState(self.random_state)
        row_count = int(self.row*self.test_size)
        row_no = rng.choice(np.arange(0,self.row),replace=False,size=row_count)

        # Boolean mask of the rows that were NOT drawn for the test set
        # (replaces a per-row membership scan of the drawn positions).
        in_train = np.ones(self.row,dtype=bool)
        in_train[row_no] = False

        test_dataset = self.dataset.iloc[row_no]
        x_test = test_dataset.drop(columns=[self.target])
        y_test = test_dataset[self.target]

        train_dataset = self.dataset.iloc[in_train]
        x_train = train_dataset.drop(columns=[self.target])
        y_train = train_dataset[self.target]

        return x_train,x_test,y_train,y_test
        


class Random_Forest:
    """Majority-vote ensemble of ``Tree`` classifiers, trained on construction.

    Each tree is fitted on its own random sample of ``dataset`` (drawn without
    replacement with ``random_state`` equal to the tree number); part of that
    sample is held out to print a per-tree F1 score.

    Args:
        dataset: training frame with the label in the last column.
        percent: fraction of ``dataset`` rows sampled for each tree.
        tree: number of trees to train.
        depth: maximum depth passed to every ``Tree``.
    """

    def __init__(self,dataset,percent=0.7,tree=5,depth=4):
        self.dataset = dataset
        self.tree_count = tree
        self.depth = depth
        self.size = int(dataset.shape[0]*percent)
        self.trees = []
        for i in range(1,self.tree_count+1):
            t_dataset = self.dataset.sample(n=self.size,random_state=i)
            target_col_name = t_dataset.columns[-1]
            test_train = Train_test_split(t_dataset,random=i,target=target_col_name)

            x_train,x_test,y_train,y_test = test_train.split()
            # Re-attach the label as the last column under its ORIGINAL name
            # (Tree expects the label last).
            train_frame = x_train.copy()
            train_frame[target_col_name] = y_train

            tree_obj = Tree(self.depth)
            tree_obj.fit(train_frame)
            tree_predicted = tree_obj.predict(x_test)
            try:
                tree_score = f1_score(tree_predicted,y_test)
            except ZeroDivisionError:
                # The score is only a progress diagnostic; an undefined F1 on
                # a small hold-out set must not abort training.
                tree_score = float('nan')
            print(f"Tree {i} f1_score is :{tree_score}")

            self.trees.append(tree_obj)
        # Backslash is escaped so the banner no longer triggers an
        # "invalid escape sequence" warning; the printed text is unchanged.
        print("""
        (\\__/)
        (^ _^)
        (>Training complete""")

    def predict(self,test_dataset):
        """Return a list of 0/1 labels decided by majority vote of the trees.

        A row is labelled 1 when strictly more than ``tree_count // 2`` trees
        predict 1, otherwise 0.
        """
        per_tree = [tree.predict(test_dataset) for tree in self.trees]
        # Sum of 0/1 predictions per row = number of trees voting for class 1.
        votes = np.sum(per_tree,axis=0)
        threshold = self.tree_count//2
        return [1 if count > threshold else 0 for count in votes]
            
            
        
            

  (\__/)


In [2]:
class Scaler:
    """Standardise data to zero mean and unit standard deviation.

    ``scale`` learns the statistics; ``transform`` applies them. ``mean`` and
    ``std`` hold exactly what ``dataset.mean()`` / ``dataset.std()`` returned.
    """

    def __init__(self):
        self.mean = None
        self.std = None

    def scale(self,dataset):
        """Learn the per-column mean and standard deviation of ``dataset``."""
        self.mean = dataset.mean()
        self.std = dataset.std()

    def transform(self,dataset):
        """Return ``(dataset - mean) / std`` using the statistics from ``scale``.

        Columns whose learned standard deviation is 0 (constant columns) are
        divided by 1 instead, so they come out as 0 rather than NaN/inf.
        """
        std = self.std
        if np.ndim(std) == 0:
            # Scalar statistics (e.g. fitted on a single Series).
            safe_std = 1 if std == 0 else std
        else:
            safe_std = std.mask(std == 0, 1)
        new_dataset = (dataset-self.mean)/safe_std
        return new_dataset
    

In [14]:
# Load the first 5000 training rows and drop the leading column.
data = pd.read_csv('train_binary.csv').iloc[:5000,1:]
# The last column is the label; everything before it is a feature.
target = data.iloc[:,-1]
data = data.iloc[:,:-1]
# Standardise the features only, then put the (unscaled) label back as 'target'.
s = Scaler()
s.scale(data)
data = s.transform(data).assign(target=target)
# Hold-out split with the Train_test_split defaults (20% test, seed 1).
train = Train_test_split(data)
x_train,x_test,y_train,y_test = train.split()
# Tree.fit expects the label as the last column of the training frame.
x_train = x_train.assign(target=y_train)

In [15]:
# Train a single decision tree (default depth) on the training frame,
# whose last column is the label.
tree = Tree()
tree.fit(x_train)

In [16]:
# Predict labels for the held-out test features.
y_pre = tree.predict(x_test)

  col_idx_value = row[node.feature_index]


In [18]:
# F1 score of the single tree on the held-out split.
score=f1_score(y_pre,y_test)

In [19]:
# Display the hold-out F1 score as the cell output.
score

0.9852216748768472

In [21]:
test_data = pd.read_csv('test_binary.csv')
# The tree addresses features by column POSITION, so the test frame must hold
# exactly the columns the scaler was fitted on, in the same order. Reindexing
# drops any extra column and restores the training order before scaling;
# otherwise the label-aligned subtraction in transform() could add or reorder
# columns and shift every feature index.
# NOTE(review): assumes test_binary.csv uses the same feature column names as
# train_binary.csv - confirm against the file.
test_data = s.transform(test_data.reindex(columns=s.mean.index))

In [22]:
# Predict labels for the scaled competition test set with the trained tree.
pre_value = tree.predict(test_data)

  col_idx_value = row[node.feature_index]


In [25]:
# Sequential row ids 0..n-1 for the submission file.
# NOTE(review): this name shadows the built-in id() for the rest of the session.
id = np.arange(test_data.shape[0])

In [26]:
# Submission frame: one row per test sample with its id and predicted label.
sub = pd.DataFrame({
    'id':id,
    'predicted_value':pre_value
})

In [27]:
# The frame already carries an explicit 'id' column equal to its default
# 0..n-1 index, so writing the index would only add a duplicate unnamed column.
sub.to_csv('tree_submit.csv',index=False)