In [93]:
import pandas as pd
import numpy as np
import random as r

In [94]:
def tr_te_split(dtf, t_size):
    t_size = int(t_size * dtf.shape[0]) # calculating the testsize according to the length of data
    indexes = dtf.index.tolist() # we will take the index of the data-frame to randomize
    t_index = r.sample(population=indexes, k=t_size) #using the random randomizing the indexes
    test_df = dtf.loc[t_index]   #seprating the random test_data based on the random indexes obtained from t_index
    train_df = dtf.drop(t_index)  # in train data we need to remove the t_index as it is in the test data
    return train_df, test_df

In [95]:
data=pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data',sep='\t')
data=([np.array(i.split()) for i in data.iloc[:,0]])
data=pd.DataFrame(data)
data.columns=['mpg','cylinders','displacement','horsepower','weight','acceleration','model_year','origin']
data['horsepower']=data.replace('?',np.mean(list(map(float,data[data['horsepower']!='?']['horsepower']))))
for c in data.columns:
    data[c]=data[c].astype(dtype='f')
train, test = tr_te_split(data, t_size=0.2)

In [96]:
train.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,15.0,8.0,350.0,15.0,3693.0,11.5,70.0,1.0
1,18.0,8.0,318.0,18.0,3436.0,11.0,70.0,1.0
3,17.0,8.0,302.0,17.0,3449.0,10.5,70.0,1.0
4,15.0,8.0,429.0,15.0,4341.0,10.0,70.0,1.0
5,14.0,8.0,454.0,14.0,4354.0,9.0,70.0,1.0


In [97]:
class Treeregressor:
    def deviation(sf,count, l, r): 
        return (r/count) - (l/count)**2

    def __init__(sf, train_x, train_y, indexes, depth=5):
        sf.dx = train_x
        sf.dy = train_y
        sf.indexes = indexes
        sf.depth = depth
        sf.length = len(indexes)
        sf.c = train_x.shape[1]
        sf.val = np.mean(train_y[indexes])
        sf.scr = float('inf')
        sf.v_split()
    
    def find_split(sf, column_indexes):
        column = sf.dx.values[sf.indexes,column_indexes]
        target= sf.dy[sf.indexes]
        sorted_indexes = np.argsort(column)
        sorted_target = target[sorted_indexes]
        sorted_column = column[sorted_indexes]
        target_count = sf.length 
        target_sum = sorted_target.sum()
        target_sqr_sum= (sorted_target**2).sum()
        column_count = 0
        column_sum = 0.
        column_sqr_sum = 0.

        for i in range(sf.length-sf.depth-1):
            indivisual_x = sorted_column[i]
            indivisual_y = sorted_target[i]
            column_count += 1
            target_count -= 1
            column_sum += indivisual_y
            target_sum -= indivisual_y
            column_sqr_sum += indivisual_y**2
            target_sqr_sum -= indivisual_y**2
            if i<sf.depth or indivisual_x==sorted_column[i+1]:
                continue

            column_deviation = sf.deviation(column_count, column_sum, column_sqr_sum)
            target_deviation = sf.deviation(target_count, target_sum, target_sqr_sum)
            current_score = column_deviation*column_count/sf.length + target_deviation*target_count/sf.length
            if current_score<sf.scr: 
                sf.column_indexes = column_indexes
                sf.scr = current_score
                sf.best_split = indivisual_x  
    def v_split(sf):
        for i in range(sf.c): 
            sf.find_split(i)

        if sf.scr == float('inf'): 
            return
        column = sf.dx.values[sf.indexes,sf.column_indexes]
        left = np.nonzero(column<=sf.best_split)[0]
        right = np.nonzero(column>sf.best_split)[0]
        sf.lhs = Treeregressor(sf.dx, sf.dy, sf.indexes[left])
        sf.rhs = Treeregressor(sf.dx, sf.dy, sf.indexes[right])

    def prd(sf, prd_x):
        if sf.scr == float('inf'): 
            return sf.val
        if prd_x[sf.column_indexes]<=sf.best_split:
            node = sf.lhs 
        else:
            node=sf.rhs
        return node.prd(prd_x) 

    def predict(sf, test_x):
        return np.array([sf.prd(row) for row in test_x])
    def coeff_determination(sf,y,regy): #r2_score by hand
        sery=sum((y-regy)**2)
        semy=sum((y-np.mean(y))**2)
        return 1-(sery/semy)
    def score(sf,x_test,y_test):
        return sf.coeff_determination(y_test,sf.predict(x_test)),np.mean( (y_test-sf.predict(x_test))**2)


In [98]:
algo=Treeregressor(train.drop('mpg',axis=1),np.array(train['mpg']),np.arange(len(train['mpg'])))

In [99]:
algo.predict(test.drop('mpg',axis=1).values)

array([23.916666, 15.      , 26.9     , 26.9     , 13.      , 33.239998,
       26.116667, 13.      , 21.857143, 15.8125  , 21.014286, 11.      ,
       15.      , 21.014286, 31.136364, 13.      , 19.975   , 21.857143,
       18.766666, 24.05    , 11.      , 27.15    , 14.      , 15.8125  ,
       29.91    , 26.      , 18.766666, 19.975   , 13.      , 15.8125  ,
       21.014286, 23.077778, 28.027271, 26.9     , 22.171429, 11.      ,
       21.014286, 17.949999, 26.9     , 13.      , 23.916666, 23.077778,
       28.975   , 15.8125  , 20.366667, 36.0375  , 21.014286, 32.10909 ,
       32.10909 , 23.077778, 20.366667, 15.8125  , 26.116667, 31.136364,
       13.      , 11.      , 25.      , 29.91    , 19.975   , 15.      ,
       16.983334, 14.      , 23.916666, 14.      , 29.91    , 26.      ,
       15.8125  , 11.      , 13.      , 16.983334, 15.      , 28.027271,
       15.      , 18.766666, 29.91    , 19.      , 34.133335, 36.0375  ,
       16.1     ], dtype=float32)

In [100]:
algo.score(test.drop('mpg',axis=1).values,test['mpg'])

(0.9978269914784383, 0.09743164479732513)

In [101]:
import math
import numpy as np

class DReg():
    def __init__(sf, train_x, train_y,cols, col_indexes,indexes,depth=10, mleaf=5):
        sf.dx=train_x
        sf.dy=train_y
        sf.indexes = indexes
        sf.depth = depth
        sf.min_leaf=mleaf
        sf.col_indexes = col_indexes
        sf.cols = cols
        sf.length = len(indexes)
        sf.c = train_x.shape[1]
        sf.val = np.mean(train_y[indexes])
        sf.scr = float('inf')
        sf.v_split()
    def deviation(sf,count, l, r): 
        return (r/count) - (l/count)**2   
    def v_split(sf):
        for i in range(sf.c): 
            sf.find_split(i)

        if sf.scr == float('inf') or sf.depth <= 0: 
            return
        column = sf.dx.values[sf.indexes,sf.column_indexes]
        left = np.nonzero(column<=sf.best_split)[0]
        right = np.nonzero(column>sf.best_split)[0]
        lf_idxs = np.random.permutation(sf.dx.shape[1])[:sf.cols]
        rf_idxs = np.random.permutation(sf.dx.shape[1])[:sf.cols]
        sf.lhs = DReg(sf.dx, sf.dy,sf.cols, lf_idxs, sf.indexes[left],sf.depth-1, sf.min_leaf)
        sf.rhs = DReg(sf.dx, sf.dy,sf.cols, lf_idxs, sf.indexes[right],sf.depth-1,sf.min_leaf)

    def find_split(sf, column_indexes):
        column = sf.dx.values[sf.indexes,column_indexes]
        target= sf.dy[sf.indexes]
        sorted_indexes = np.argsort(column)
        sorted_target = target[sorted_indexes]
        sorted_column = column[sorted_indexes]
        target_count = sf.length 
        target_sum = sorted_target.sum()
        target_sqr_sum= (sorted_target**2).sum()
        column_count = 0
        column_sum = 0.
        column_sqr_sum = 0.

        for i in range(sf.length-sf.depth-1):
            indivisual_x = sorted_column[i]
            indivisual_y = sorted_target[i]
            column_count += 1
            target_count -= 1
            column_sum += indivisual_y
            target_sum -= indivisual_y
            column_sqr_sum += indivisual_y**2
            target_sqr_sum -= indivisual_y**2
            if i<sf.depth or indivisual_x==sorted_column[i+1]:
                continue

            column_deviation = sf.deviation(column_count, column_sum, column_sqr_sum)
            target_deviation = sf.deviation(target_count, target_sum, target_sqr_sum)
            current_score = column_deviation*column_count/sf.length + target_deviation*target_count/sf.length
            if current_score<sf.scr: 
                sf.column_indexes = column_indexes
                sf.scr = current_score
                sf.best_split = indivisual_x  
    
    def prd(sf, prd_x):
        if sf.scr == float('inf'): 
            return sf.val
        if prd_x[sf.column_indexes]<=sf.best_split:
            node = sf.lhs 
        else:
            node=sf.rhs
        return node.prd(prd_x)    

In [102]:
class RFregressor():
    def __init__(sf, x_tr, y_tr, number_of_trees, cols, sm_size, depth=10, mleaf=5):
        np.random.seed(12)
        if cols == 'sq':
            sf.sl_col = int(np.sqrt(x_tr.shape[1]))
        elif select_type == 'log':
            sf.cols = int(np.log2(x_tr.shape[1]))
        else:
            sf.sl_type = cols
        sf.dx=x_tr
        sf.dy=y_tr
        sf.sm_size=sm_size
        sf.depth=depth
        sf.mleaf=mleaf 
        sf.trees = [sf.single_tree() for i in range(number_of_trees)]

    def single_tree(sf):
        indexes = np.random.permutation(len(sf.dy))[:sf.sm_size]
        selected_cols = np.random.permutation(sf.dx.shape[1])[:sf.sl_col]
        return DReg(sf.dx.iloc[indexes], sf.dy[indexes], sf.sl_col,selected_cols,np.arange(sf.sm_size),sf.depth,sf.mleaf)
        
    def predict(sf, test_x):
        return np.array([np.mean([t.prd(np.array(row)) for t in sf.trees], axis=0) for row in test_x])
    
    def coeff_determination(sf,y,regy): #r2_score by hand
        sery=sum((y-regy)**2)
        semy=sum((y-np.mean(y))**2)
        return 1-(sery/semy)
    def score(sf,x_ts,y_ts):
        return sf.coeff_determination(y_ts,sf.predict(x_ts)),np.mean( (y_ts-sf.predict(x_ts))**2)


In [103]:
reg = RFregressor( train.drop('mpg',axis=1),np.array(train['mpg']),10, 'sq',train['mpg'].shape[0] ,10,5)

In [104]:
reg.predict(test.drop('mpg',axis=1).values)

array([23.916666, 15.      , 26.9     , 26.9     , 13.      , 33.239998,
       26.116669, 13.      , 21.857143, 15.8125  , 21.014286, 11.      ,
       15.      , 21.014286, 31.136362, 13.      , 19.975002, 21.857143,
       18.833332, 24.100002, 11.      , 27.15    , 14.      , 15.8125  ,
       29.91    , 26.      , 19.157145, 19.975002, 13.      , 15.8125  ,
       21.014286, 23.077778, 28.027271, 26.975   , 22.171429, 11.      ,
       21.014286, 17.949999, 26.9     , 13.      , 23.916666, 23.077778,
       28.975   , 15.8125  , 20.366667, 36.0375  , 21.014286, 32.109093,
       32.109093, 23.077778, 20.366667, 15.8125  , 26.116669, 31.136362,
       13.      , 11.      , 25.      , 29.91    , 19.975002, 15.      ,
       16.983334, 14.      , 23.916666, 14.      , 29.91    , 26.      ,
       15.8125  , 11.      , 13.      , 16.983334, 15.      , 28.027271,
       15.      , 19.157145, 29.91    , 19.      , 34.13333 , 36.0375  ,
       16.100002], dtype=float32)

In [105]:
reg.score(test.drop('mpg',axis=1).values,test['mpg'])

(0.9975652272228884, 0.1091684028506279)