In [22]:
import pandas as pd
from train_val_test import train_val_test_split
import numpy as np

In [23]:
df_data = pd.read_csv('BitcoinHeistData.csv')

In [24]:
# Shuffle with seed
df_data = df_data.sample(frac=1, random_state=42).reset_index(drop=True)

In [25]:
df_data.isna().sum()

address      0
year         0
day          0
length       0
weight       0
count        0
looped       0
neighbors    0
income       0
label        0
dtype: int64

In [26]:
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2916697 entries, 0 to 2916696
Data columns (total 10 columns):
 #   Column     Dtype  
---  ------     -----  
 0   address    object 
 1   year       int64  
 2   day        int64  
 3   length     int64  
 4   weight     float64
 5   count      int64  
 6   looped     int64  
 7   neighbors  int64  
 8   income     float64
 9   label      object 
dtypes: float64(2), int64(6), object(2)
memory usage: 222.5+ MB


In [27]:
df_data = df_data.drop(['address'], axis=1)

In [28]:
train, val, test = train_val_test_split(df_data, shuffle=False, train_size=0.7, val_size=0.15, seed=12)

In [29]:
trainX = train.drop(columns=['label'])
trainY = train['label']

valX = val.drop(columns=['label'])
valY = val['label']

testX = test.drop(columns=['label'])
testY = test['label']

# A

In [30]:
from sklearn import tree

In [31]:
depths = [4, 8, 10, 15, 20]
criterion = ['gini', 'entropy']

In [32]:
# Accuracy Function
import numpy as np
def accuracy(y_true, y_pred):
    return np.sum(y_true == y_pred) / len(y_true)

In [33]:
for criteria in criterion:
    for depth in depths:
        clf = tree.DecisionTreeClassifier(max_depth=depth, criterion=criteria)
        clf = clf.fit(trainX.values, trainY.values)
        val_preds = clf.predict(valX.values)
        test_preds = clf.predict(testX.values)
        print("For Validation Set - Accuracy for criterion = ", criteria, " and depth = ", depth, " is ", accuracy(val_preds, valY.values)*100 , "%")
        print("For Test Set - Accuracy for criterion = ", criteria, " and depth = ", depth, " is ", accuracy(test_preds, testY.values)*100 , "%")

For Validation Set - Accuracy for criterion =  gini  and depth =  4  is  98.5609307322202 %
For Test Set - Accuracy for criterion =  gini  and depth =  4  is  98.58858755899934 %
For Validation Set - Accuracy for criterion =  gini  and depth =  8  is  98.62401572553456 %
For Test Set - Accuracy for criterion =  gini  and depth =  8  is  98.65510108455904 %
For Validation Set - Accuracy for criterion =  gini  and depth =  10  is  98.66698666300957 %
For Test Set - Accuracy for criterion =  gini  and depth =  10  is  98.70904332521914 %
For Validation Set - Accuracy for criterion =  gini  and depth =  15  is  98.7773854013097 %
For Test Set - Accuracy for criterion =  gini  and depth =  15  is  98.79475663135278 %
For Validation Set - Accuracy for criterion =  gini  and depth =  20  is  98.64618690072113 %
For Test Set - Accuracy for criterion =  gini  and depth =  20  is  98.67270088341847 %
For Validation Set - Accuracy for criterion =  entropy  and depth =  4  is  98.5609307322202 %
F

# B

In [34]:
stump_depth = 3
stump_criterion = 'entropy'

In [35]:
import pickle
from tqdm import tqdm, trange

In [36]:
# ls_trees = []
# for i in trange(100):
#     clf = tree.DecisionTreeClassifier(max_depth=stump_depth, criterion=stump_criterion)
#     # Randomly sample 50% of the data
#     random_train = train.sample(frac=0.5, random_state=i)
#     random_trainX = random_train.drop(columns=['label'])
#     random_trainY = random_train['label']
#     clf = clf.fit(random_trainX.values, random_trainY.values)
#     # Save the tree
#     with open(f'trained_trees/stump_{i}_criterion_{stump_criterion}_depth_{stump_depth}.pkl', 'wb') as f:
#         pickle.dump(clf, f)
#     ls_trees.append(clf)

In [37]:
# load the trees
ls_trees = []
for i in trange(100):
    with open(f'trained_trees/stump_{i}_criterion_{stump_criterion}_depth_{stump_depth}.pkl', 'rb') as f:
        ls_trees.append(pickle.load(f))

100%|██████████| 100/100 [00:00<00:00, 4348.82it/s]


In [38]:
def ensemble_classification_trees(ls_trees, testX, testY):
    preds = []
    for tree in ls_trees:
        preds.append(tree.predict(testX))
    preds = np.array(preds)
    preds = np.transpose(preds)
    majority_preds = []
    for pred in preds:
        majority_preds.append(pred[np.argmax(pred)])
    return accuracy(majority_preds, testY)

In [39]:
ensemble_result_val = ensemble_classification_trees(ls_trees, valX.values, valY.values)
ensemble_result_test = ensemble_classification_trees(ls_trees, testX.values, testY.values)
print("For Validation Set - Accuracy for ensemble of 100 trees is ", ensemble_result_val*100, "%")
print("For Test Set - Accuracy for ensemble of 100 trees is ", ensemble_result_test*100, "%")



For Validation Set - Accuracy for ensemble of 100 trees is  98.5609307322202 %
For Test Set - Accuracy for ensemble of 100 trees is  98.58858755899934 %


# C

In [40]:
# AdaBoost
from sklearn.ensemble import AdaBoostClassifier

n_estimators = [4, 8, 10, 15, 20]

for estimator in n_estimators:
    clf = AdaBoostClassifier(n_estimators=estimator, base_estimator=tree.DecisionTreeClassifier(max_depth=15))
    clf = clf.fit(trainX.values, trainY.values)
    val_preds = clf.predict(valX.values)
    test_preds = clf.predict(testX.values)
    # Save the Tree
    with open(f'adaboost_saves/adaBoost_{estimator}_depth_{15}.pkl', 'wb') as f:
        pickle.dump(clf, f)
    print("For Validation Set - Accuracy for AdaBoost Classifier = ", estimator, " is ", accuracy(val_preds, testY.values)*100 , "%")
    print("For Test Set - Accuracy for AdaBoost Classifier = ", estimator, " is ", accuracy(test_preds, testY.values)*100 , "%")

For Validation Set - Accuracy for AdaBoost Classifier =  4  is  97.71865464394692 %
For Test Set - Accuracy for AdaBoost Classifier =  4  is  98.48436017874081 %
For Validation Set - Accuracy for AdaBoost Classifier =  8  is  96.79797945166341 %
For Test Set - Accuracy for AdaBoost Classifier =  8  is  97.77579684803602 %
For Validation Set - Accuracy for AdaBoost Classifier =  10  is  97.04734803030823 %
For Test Set - Accuracy for AdaBoost Classifier =  10  is  97.95888046993748 %
For Validation Set - Accuracy for AdaBoost Classifier =  15  is  97.63591273242591 %
For Test Set - Accuracy for AdaBoost Classifier =  15  is  98.45990331539069 %
For Validation Set - Accuracy for AdaBoost Classifier =  20  is  97.69922629455662 %
For Test Set - Accuracy for AdaBoost Classifier =  20  is  98.56824493434361 %
