In [1]:
####################
## load libraries ##
####################
import numpy as np
import pandas as pd
## Seed NumPy's global RNG once at import time.
## NOTE: simulate_df() calls np.random.seed(seed) itself on every call, so the
## simulated datasets are governed by its `seed` argument, not by this value.
np.random.seed(123456789)
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier

In [2]:
############################
## Toy Dataset Simulation ##
############################
def simulate_df(n=100, seed=123456, binary_flag=False):
    """Simulate a toy dataset with six covariates (L1-L6) and an outcome Y.

    L1, L2 and L3 are binary {0, 1} draws; L4, L5 and L6 are zero-mean normal
    draws with standard deviations 1, 0.75 and 2. The outcome is driven by a
    fixed non-linear signal containing main effects, two interactions
    (L2*L4 and L3*L6), a squared L5 term and sin(L5).

    Parameters
    ----------
    n : int
        Number of rows to simulate.
    seed : int
        Passed to ``np.random.seed``. Side effect: this reseeds NumPy's
        *global* random state on every call.
    binary_flag : bool
        If False (default), Y is the signal plus N(0, 0.1) noise.
        If True, Y is drawn as Bernoulli(sigmoid(signal)) and recoded so that
        the values are -1 and 1 instead of 0 and 1.

    Returns
    -------
    pandas.DataFrame
        Columns 'L1'..'L6' and 'Y', with ``n`` rows.
    """
    np.random.seed(seed)

    ## specify dataframe
    df = pd.DataFrame()

    ## binary covariates: each tuple is (column, probability of drawing 0)
    for name, p_zero in (('L1', 0.52), ('L2', 0.23), ('L3', 0.38)):
        df[name] = np.random.choice([0, 1], size=n, replace=True, p=[p_zero, (1-p_zero)])

    ## continuous covariates: each tuple is (column, standard deviation)
    for name, sd in (('L4', 1), ('L5', 0.75), ('L6', 2)):
        df[name] = np.random.normal(0, sd, df.shape[0])

    ## coefficients theta_0 .. theta_10 of the outcome signal
    theta = (5.5, 1.28, 0.42, 2.32, -3.15, 3.12, -4.29, -1.23, -10.18, 2.21, 10.3)

    ## the signal is shared by both outcome types, so it is built exactly once;
    ## it involves no random draws, which keeps the RNG sequence unchanged
    signal = (
        theta[0]
        + (theta[1]*df['L1'])
        + (theta[2]*df['L2'])
        + (theta[3]*df['L3'])
        + (theta[4]*df['L4'])
        + (theta[5]*df['L5'])
        + (theta[6]*df['L6'])
        + (theta[7]*df['L2']*df['L4'])
        + (theta[8]*df['L3']*df['L6'])
        + (theta[9]*df['L5']*df['L5'])
        + (theta[10]*np.sin(df['L5']))
    )

    if(binary_flag):
        ## logistic link, then recode the 0/1 draw to -1/1
        p = 1 / (1 + np.exp(-signal))
        df['Y'] = np.random.binomial(1, p)
        df.loc[df['Y']==0, 'Y'] = -1
    else:
        df['Y'] = signal + np.random.normal(0, 0.1, df.shape[0])

    return(df)

In [3]:
###############################################
## Gradient Boosting with Continuous Outcome ##
###############################################

## simulate the dataset with continuous outcome Y
df = simulate_df(n=1000, seed=123456)
df['Y_original'] = df['Y']

## fit Gradient Boosting Model with depth-3 Regression Trees as Weak Learners
## continue fitting until the squared-error loss is no longer > 1
## NOTE: df['Y'] is overwritten with the current residuals on every accepted
## step; the untouched outcome stays available in df['Y_original']
feature_cols = ['L1', 'L2', 'L3', 'L4', 'L5', 'L6']
alpha = 100000     ## step size; it is only ever shrunk (by 1% per rejected step), never reset
min_alpha = 1e-12  ## safety floor: stop backing off here instead of looping forever
current_loss = sum((df['Y'])**2)
while(current_loss > 1):
    ## weak learner: regression tree fitted to the current residuals
    model = DecisionTreeRegressor(random_state=0, max_depth=3)
    model.fit(df[feature_cols], df['Y'])
    df['Y_hat'] = model.predict(df[feature_cols])
    del model

    ## rescale the tree's prediction vector to unit Euclidean length
    ## (sqrt(y^2 / sum(y^2)) * sign(y) is y / ||y||)
    df['Y_hat_squared'] = df['Y_hat']**2
    df['Y_hat_scaled'] = np.sqrt(df['Y_hat_squared'] / df['Y_hat_squared'].sum()) * np.sign(df['Y_hat'])

    ## backtracking line search: shrink alpha until the step lowers the loss.
    ## A NaN loss (e.g. an all-zero prediction gives 0/0 above) compares False
    ## against current_loss, so it is treated as a rejected step and also
    ## ends at the floor rather than spinning indefinitely.
    loss_not_lowered_flag = True
    while(loss_not_lowered_flag and alpha >= min_alpha):
        new_loss = sum((df['Y'] - (alpha*df['Y_hat_scaled']))**2)
        if(new_loss < current_loss):
            loss_not_lowered_flag = False
            current_loss = new_loss
            print('Current Loss: ' + str(current_loss))
            df['Y'] = df['Y'] - (alpha*df['Y_hat_scaled'])
        else:
            alpha = 0.99*alpha

    ## no acceptable step was found for this tree: further rounds cannot progress
    if(loss_not_lowered_flag):
        break

if(current_loss > 1):
    print('model stalled before convergence, final loss: ' + str(current_loss))
else:
    print('model converged')

Current Loss: 710652.9617096438
Current Loss: 704697.9593626335
Current Loss: 687852.7483979535
Current Loss: 681537.2980095611
Current Loss: 665478.7445945848
Current Loss: 659586.5364438506
Current Loss: 640014.5760520245
Current Loss: 638497.2551319165
Current Loss: 618245.6441627132
Current Loss: 618203.3564164565
Current Loss: 616505.1593726956
Current Loss: 598494.8103541064
Current Loss: 587837.59235525
Current Loss: 579409.3972249589
Current Loss: 574747.8100380445
Current Loss: 561192.49506229
Current Loss: 543560.0029280054
Current Loss: 543207.4725473339
Current Loss: 536458.6003377549
Current Loss: 526373.448554167
Current Loss: 506577.88892479986
Current Loss: 493605.0976276407
Current Loss: 479834.35924008564
Current Loss: 478361.1843650935
Current Loss: 464437.1331300254
Current Loss: 463945.9471018534
Current Loss: 453269.89872302505
Current Loss: 449970.73316805396
Current Loss: 446826.9311030527
Current Loss: 436211.3148086326
Current Loss: 436021.4632674543
Current L

Current Loss: 34424.97350993459
Current Loss: 33582.415921678315
Current Loss: 33483.847406923465
Current Loss: 33474.053710730936
Current Loss: 32444.304678982004
Current Loss: 32211.06997158518
Current Loss: 32188.34952098816
Current Loss: 31874.02188830543
Current Loss: 31536.536546463736
Current Loss: 31282.083505735696
Current Loss: 31104.52653276269
Current Loss: 30841.35230974275
Current Loss: 30648.834421287087
Current Loss: 30577.39050725543
Current Loss: 30276.08752336593
Current Loss: 30187.735729412958
Current Loss: 29972.717394429645
Current Loss: 29793.340421400233
Current Loss: 29638.53875756256
Current Loss: 29510.269755945115
Current Loss: 29379.547656950876
Current Loss: 29226.9979969405
Current Loss: 29207.379156794406
Current Loss: 28989.572158810457
Current Loss: 28978.054291158565
Current Loss: 28766.830405711382
Current Loss: 28751.72197492873
Current Loss: 28552.825122293332
Current Loss: 28539.860716628336
Current Loss: 28347.267987849664
Current Loss: 28330.87

Current Loss: 875.1473875758797
Current Loss: 862.2151120386825
Current Loss: 861.9187834415728
Current Loss: 846.0317224109516
Current Loss: 830.4360781294096
Current Loss: 811.1718902752698
Current Loss: 795.0431537753853
Current Loss: 780.8245888048266
Current Loss: 763.323661082025
Current Loss: 736.5326487554436
Current Loss: 734.0988246381211
Current Loss: 702.4028892503886
Current Loss: 701.3515339670636
Current Loss: 701.1634781191071
Current Loss: 694.4217652014555
Current Loss: 691.8657951267226
Current Loss: 682.7614075280623
Current Loss: 671.5924989116075
Current Loss: 669.0691637400205
Current Loss: 648.7002254709896
Current Loss: 641.2230791145174
Current Loss: 621.1235092204985
Current Loss: 600.9957704077246
Current Loss: 597.2913172845314
Current Loss: 593.3311729603283
Current Loss: 570.9518521194597
Current Loss: 551.7980460767684
Current Loss: 551.6542750008251
Current Loss: 541.2681006778669
Current Loss: 527.951580816983
Current Loss: 506.4348832920625
Current Lo

Current Loss: 8.242284711941187
Current Loss: 8.24215594319912
Current Loss: 8.239628350987497
Current Loss: 8.239444987393718
Current Loss: 8.2370760645657
Current Loss: 8.236709367119886
Current Loss: 8.194228834532975
Current Loss: 8.141858100569834
Current Loss: 7.945139908658383
Current Loss: 7.5749740893925255
Current Loss: 7.553219430945573
Current Loss: 7.363170008864563
Current Loss: 7.1464644024222395
Current Loss: 6.99384925714579
Current Loss: 6.836158204425765
Current Loss: 6.7031606378622905
Current Loss: 6.620879427302571
Current Loss: 6.507187318387684
Current Loss: 6.425745092540402
Current Loss: 6.26560561711109
Current Loss: 6.145804547708139
Current Loss: 6.120287805429225
Current Loss: 5.944613467583897
Current Loss: 5.731580061563935
Current Loss: 5.540359058724032
Current Loss: 5.3753315398235655
Current Loss: 5.145743665385163
Current Loss: 5.145445351284552
Current Loss: 5.1049612772408235
Current Loss: 5.029171902267406
Current Loss: 4.961575705660979
Current 

In [4]:
######################################################
## AdaBoost with Binary Categorical Outcome {-1, 1} ##
######################################################

## simulate the dataset with binary outcome Y in {-1, 1}
df = simulate_df(n=1000, seed=123456, binary_flag=True)
df = df.rename(columns={'Y':'Y_original'})
df['w'] = df.shape[0]*[1/df.shape[0]]  ## uniform starting weights, summing to 1
df['Y'] = 0.0                          ## running weighted vote of the weak learners

## fit AdaBoost Model with depth-2 Decision Trees as Weak Learners
## continue fitting model until all n=1000 observations are predicted correctly
feature_cols = ['L1', 'L2', 'L3', 'L4', 'L5', 'L6']
epsilon_floor = 1e-10  ## keeps epsilon away from 0 and 1 so alpha stays finite
while(True):
    model = DecisionTreeClassifier(random_state=0, max_depth=2)
    model.fit(df[feature_cols], df['Y_original'], sample_weight=df['w'])
    df['Y_hat'] = model.predict(df[feature_cols])

    ## weighted error: total weight sitting on misclassified rows
    ## (computed without destroying the weight column)
    misclassified = df['Y_hat'] != df['Y_original']
    epsilon = sum(df['w'].where(misclassified, 0))
    ## epsilon == 0 would give log(1/0) = inf and then NaN weights
    epsilon = min(max(epsilon, epsilon_floor), 1 - epsilon_floor)
    alpha = 0.5*(np.log((1-epsilon)/epsilon))

    df['Y'] = df['Y'] + (alpha*df['Y_hat'])

    ## exponential loss per row; its normalised values are the next weights
    psi = np.exp(-df['Y_original']*df['Y'])
    current_loss = sum(psi)
    df['w'] = psi / current_loss

    print('Current Loss: ' + str(current_loss))

    ## final classification is the sign of the running vote (0 counts as +1)
    df['Y_final'] = 1
    df.loc[df['Y']<0, 'Y_final'] = -1

    if(df.loc[df['Y_original']!=df['Y_final'], :].shape[0] == 0):
        print('results converged, all datapoints correctly classified')
        break
    if(alpha == 0):
        ## a zero-weight learner leaves Y and w unchanged, so the identical
        ## tree (random_state is fixed) would be refit forever
        print('boosting stalled: weak learner is no better than chance')
        break

Current Loss: 659.1631057636724
Current Loss: 487.7655215433816
Current Loss: 385.43015602209204
Current Loss: 309.6683993321115
Current Loss: 255.5542037237089
Current Loss: 218.3147055733867
Current Loss: 198.66608204748306
Current Loss: 171.9175494217112
Current Loss: 151.6839290289567
Current Loss: 138.20003547506386
Current Loss: 131.45019790958239
Current Loss: 126.25445993343435
Current Loss: 122.78411083637391
Current Loss: 116.59354372531038
Current Loss: 108.28050371146264
Current Loss: 104.39126436345835
Current Loss: 101.02340988337336
Current Loss: 97.78481006804952
Current Loss: 95.31392819906642
Current Loss: 92.54905329566287
Current Loss: 87.00935770165633
Current Loss: 85.1447829237424
Current Loss: 83.34377736632854
Current Loss: 81.53898117389382
Current Loss: 77.92521488496628
Current Loss: 74.76450253594427
Current Loss: 71.78607286865689
Current Loss: 69.08618323068401
Current Loss: 67.14548022606846
Current Loss: 66.48569498215035
Current Loss: 65.60058832576867

In [5]:
## confusion table: true labels (rows) against the final boosted predictions (columns)
pd.crosstab(index=df['Y_original'], columns=df['Y_final'])

Y_final,-1,1
Y_original,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,309,0
1,0,691
