In [1]:
import pandas as pd
from train_val_test import train_val_test_split
import numpy as np

In [2]:
# Load the dataset from a CSV file located in the notebook's working directory.
df_data = pd.read_csv('BitcoinHeistData.csv')

In [3]:
# Shuffle every row with a fixed seed (reproducible order) and renumber the index from 0.
# NOTE(review): this rebinds df_data, so re-running the cell shuffles the already-shuffled
# frame again; re-run from the load cell to get the same row order.
df_data = df_data.sample(frac=1, random_state=42).reset_index(drop=True)

In [4]:
# Count missing values per column.
df_data.isna().sum()

address      0
year         0
day          0
length       0
weight       0
count        0
looped       0
neighbors    0
income       0
label        0
dtype: int64

In [5]:
# Summarise the frame: index range, column dtypes and memory usage.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2916697 entries, 0 to 2916696
Data columns (total 10 columns):
 #   Column     Dtype  
---  ------     -----  
 0   address    object 
 1   year       int64  
 2   day        int64  
 3   length     int64  
 4   weight     float64
 5   count      int64  
 6   looped     int64  
 7   neighbors  int64  
 8   income     float64
 9   label      object 
dtypes: float64(2), int64(6), object(2)
memory usage: 222.5+ MB


In [6]:
# Remove the string-valued 'address' column so that it is not fed to the models.
# NOTE(review): rebinding df_data makes this cell non-idempotent; a second run raises
# KeyError because the column is already gone.
df_data = df_data.drop(columns=['address'])

In [7]:
# Partition the frame into train / validation / test sets (70% train, 15% validation).
# shuffle=False keeps the current row order; the rows were already shuffled with a fixed seed.
# NOTE(review): train_val_test_split is a project-local helper - presumably the remaining 15%
# becomes the test set and `seed` has no effect when shuffle=False; confirm against the helper.
train, val, test = train_val_test_split(df_data, shuffle=False, train_size=0.7, val_size=0.15, seed=12)

In [8]:
# Separate the feature columns from the 'label' target for each partition.
trainX, trainY = train.drop(columns=['label']), train['label']
valX, valY = val.drop(columns=['label']), val['label']
testX, testY = test.drop(columns=['label']), test['label']

# A — Decision trees: split criterion and depth sweep

In [9]:
from sklearn import tree

In [10]:
# Hyperparameter grid for the decision-tree sweep.
# Values passed as max_depth:
depths = [4, 8, 10, 15, 20]
# Values passed as the split-quality criterion:
criterion = ['gini', 'entropy']

In [11]:
# Accuracy Function
import numpy as np
def accuracy(y_true, y_pred):
    """Return the fraction of positions at which two label sequences agree.

    Both arguments are converted to NumPy arrays first, so plain Python lists
    are compared element by element (comparing two lists with ``==`` yields a
    single bool, which would make the result 0 or 1/len instead of the true
    fraction).

    Parameters
    ----------
    y_true : array-like
        Reference labels.
    y_pred : array-like
        Predicted labels, same shape as ``y_true``.

    Returns
    -------
    float
        Number of matching positions divided by the number of labels;
        ``nan`` when the inputs are empty.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # Refuse mismatched shapes rather than letting NumPy broadcast them silently.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"shape mismatch: y_true has shape {y_true.shape}, y_pred has shape {y_pred.shape}"
        )
    if y_true.size == 0:
        # Accuracy is undefined without any labels.
        return float('nan')
    return np.count_nonzero(y_true == y_pred) / y_true.size

In [12]:
# Fit one decision tree per (criterion, max_depth) pair on the training split and
# print its accuracy on the test split.
# NOTE(review): the sweep is scored on the test split while valX / valY are not used
# here - confirm that selecting hyperparameters on test data is intended.
param_grid = [(c, d) for c in criterion for d in depths]
for criteria, depth in param_grid:
    clf = tree.DecisionTreeClassifier(criterion=criteria, max_depth=depth).fit(trainX.values, trainY.values)
    preds = clf.predict(testX.values)
    print("Accuracy for criterion = ", criteria, " and depth = ", depth, " is ", accuracy(preds, testY.values)*100 , "%")

Accuracy for criterion =  gini  and depth =  4  is  98.58858755899934 %
Accuracy for criterion =  gini  and depth =  8  is  98.65510108455904 %
Accuracy for criterion =  gini  and depth =  10  is  98.70950046285185 %
Accuracy for criterion =  gini  and depth =  15  is  98.793613787271 %
Accuracy for criterion =  gini  and depth =  20  is  98.66927235117313 %
Accuracy for criterion =  entropy  and depth =  4  is  98.58858755899934 %
Accuracy for criterion =  entropy  and depth =  8  is  98.62173003737101 %
Accuracy for criterion =  entropy  and depth =  10  is  98.74401435412167 %
Accuracy for criterion =  entropy  and depth =  15  is  98.81898492588657 %
Accuracy for criterion =  entropy  and depth =  20  is  98.61578724814574 %


# B — Ensemble of 100 depth-3 trees, each trained on a random 50% sample

In [13]:
# Depth and split criterion of the ensemble's trees; both values are also embedded in
# the pickle filenames under trained_trees/.
stump_depth = 3
stump_criterion = 'entropy'

In [14]:
import pickle
from tqdm import tqdm, trange

In [15]:
# ls_trees = []
# for i in trange(100):
#     clf = tree.DecisionTreeClassifier(max_depth=stump_depth, criterion=stump_criterion)
#     # Randomly sample 50% of the data
#     random_train = train.sample(frac=0.5, random_state=i)
#     random_trainX = random_train.drop(columns=['label'])
#     random_trainY = random_train['label']
#     clf = clf.fit(random_trainX.values, random_trainY.values)
#     # Save the tree
#     with open(f'trained_trees/stump_{i}_criterion_{stump_criterion}_depth_{stump_depth}.pkl', 'wb') as f:
#         pickle.dump(clf, f)
#     ls_trees.append(clf)

In [16]:
# Load the 100 ensemble trees from trained_trees/, training and caching any tree whose
# pickle is missing so the notebook also runs on a fresh checkout.
# A missing tree is fit on a 50% sample of `train` drawn with random_state=i, using the
# depth and criterion configured in stump_depth / stump_criterion.
# SECURITY: pickle.load can execute arbitrary code - only load files written by this notebook.
import os

os.makedirs('trained_trees', exist_ok=True)
ls_trees = []
for i in trange(100):
    tree_path = f'trained_trees/stump_{i}_criterion_{stump_criterion}_depth_{stump_depth}.pkl'
    try:
        with open(tree_path, 'rb') as f:
            stump = pickle.load(f)
    except FileNotFoundError:
        # Cache miss: train this tree, then persist it for later runs.
        random_train = train.sample(frac=0.5, random_state=i)
        stump = tree.DecisionTreeClassifier(max_depth=stump_depth, criterion=stump_criterion)
        stump = stump.fit(random_train.drop(columns=['label']).values, random_train['label'].values)
        with open(tree_path, 'wb') as f:
            pickle.dump(stump, f)
    ls_trees.append(stump)

100%|██████████| 100/100 [00:00<00:00, 206.05it/s]


In [17]:
def ensemble_classification_trees(ls_trees, testX, testY):
    """Score a majority-vote ensemble of fitted classifiers.

    Each classifier in ``ls_trees`` predicts one label per row of ``testX``.
    The ensemble label for a row is the label predicted by the largest number
    of classifiers; ties go to the label that sorts first.

    Parameters
    ----------
    ls_trees : sequence
        Fitted classifiers, each exposing ``predict(X)``.
    testX : array-like
        Feature matrix, passed unchanged to every ``predict`` call.
    testY : array-like
        True labels, one per row of ``testX``.

    Returns
    -------
    float
        Fraction of rows whose majority label equals the true label
        (``nan`` if there are no rows).

    Raises
    ------
    ValueError
        If ``ls_trees`` is empty.
    """
    if len(ls_trees) == 0:
        raise ValueError("ls_trees must contain at least one fitted classifier")

    y_true = np.asarray(testY)
    if y_true.size == 0:
        return float('nan')

    # votes[t, s] is the label classifier t assigns to sample s.
    votes = np.array([clf.predict(testX) for clf in ls_trees])

    # Encode labels as integer codes so votes can be counted per class. Picking
    # pred[np.argmax(pred)] would return the largest label, not the most frequent one.
    classes, codes = np.unique(votes, return_inverse=True)
    codes = np.reshape(codes, votes.shape)
    counts = np.stack([(codes == k).sum(axis=0) for k in range(len(classes))])

    # argmax returns the first maximum, so ties resolve to the lowest-sorting label.
    majority_preds = classes[counts.argmax(axis=0)]
    return np.count_nonzero(majority_preds == y_true) / len(y_true)

In [23]:
# Evaluate the loaded tree ensemble on the test split and report its accuracy in percent.
# NOTE(review): this cell's execution count (23) is higher than the following cell's (22),
# so it was re-run out of order; re-run the notebook top to bottom to confirm the number.
ensemble_result = ensemble_classification_trees(ls_trees, testX.values, testY.values)
print("Accuracy for ensemble of 100 trees is ", ensemble_result*100, "%")



Accuracy for ensemble of 100 trees is  98.58858755899934 %


# C — AdaBoost with depth-15 decision trees

In [22]:
# AdaBoost: boost depth-15 decision trees and compare test accuracy across ensemble sizes.
from sklearn.ensemble import AdaBoostClassifier

# Numbers of boosting rounds to try.
n_estimators = [4, 8, 10, 15, 20]

for estimator in n_estimators:
    # The base learner is passed positionally: its keyword was renamed from
    # `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was removed
    # in 1.4, whereas the first positional parameter works on both.
    clf = AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=15), n_estimators=estimator)
    clf = clf.fit(trainX.values, trainY.values)
    preds = clf.predict(testX.values)
    # accuracy() takes (y_true, y_pred); the result is symmetric, the order is for clarity.
    print("Accuracy for AdaBoost Classifier = ", estimator, " is ", accuracy(testY.values, preds)*100 , "%")

Accuracy for AdaBoost Classifier =  4  is  98.48961726151701 %
Accuracy for AdaBoost Classifier =  8  is  97.82196774894001 %
Accuracy for AdaBoost Classifier =  10  is  97.76208271905465 %
Accuracy for AdaBoost Classifier =  15  is  98.24550576564839 %
Accuracy for AdaBoost Classifier =  20  is  98.48687443572074 %
