#### Importing libraries

In [38]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns



from sklearn import preprocessing
from sklearn import metrics

from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.model_selection import train_test_split

from collections import Counter

In [3]:
# Load the BitcoinHeist data set (the path is relative to the notebook's folder).
df = pd.read_csv('../data/BitcoinHeistData.csv')
# Shuffle the rows once.  The fixed seed keeps the row order -- and anything
# derived from it -- identical across "Restart Kernel -> Run All".
df = df.sample(frac=1, random_state=42)
df.head()


Unnamed: 0,address,year,day,length,weight,count,looped,neighbors,income,label
2862349,38X6c1A7GzCs9kkHPVTDAEymC8wAJMW7sF,2018,276,18,0.03125,1,0,2,903400991.0,white
2747075,3Pk7GfW2jzsHWtjrXAfA5qbzEu2SpRXtTd,2018,161,142,0.123702,5486,0,2,50048836.0,white
2800251,1MURTYfh8HXjCMrLRGuvG7qB55A9Jrx4yT,2018,214,0,1.0,1,0,1,30745186.0,white
1556079,1FYkBsCwkqRXfdRVHHQfmfcK4NPB4qBTfA,2015,65,144,0.051402,3140,2855,2,289883354.0,white
878562,18iijfHK8RaBgDgg9MSGEosB428SzXF4GL,2013,117,8,0.25,1,0,2,87543216.0,white


In [4]:
# Number of missing values in each column.
df.isna().sum()


address      0
year         0
day          0
length       0
weight       0
count        0
looped       0
neighbors    0
income       0
label        0
dtype: int64

In [7]:
def split(df, random_state=None):
    """Encode ``address`` and split ``df`` into train / validation / test sets.

    ``label`` is the target and every other column is a feature.  The
    ``address`` column is replaced by the integer codes of a ``LabelEncoder``
    fitted on all addresses in ``df``.  30% of the rows are held out and then
    split in half, giving roughly 70% / 15% / 15%.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``address`` and a ``label`` column.
    random_state : int, RandomState instance or None, default=None
        Forwarded to both ``train_test_split`` calls.  ``None`` keeps the
        unseeded behaviour; pass an int to get a repeatable partition.

    Returns
    -------
    tuple
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    # NOTE(review): ``address`` looks like a per-row identifier; consider
    # dropping it instead of feeding its integer code to the models.
    X = df.drop('label', axis=1)  # new frame, so the caller's df is untouched
    X['address'] = preprocessing.LabelEncoder().fit_transform(X['address'])
    y = df['label']

    # First cut: 70% train, 30% held out.
    X_train, X_held, y_train, y_held = train_test_split(
        X, y, test_size=0.3, random_state=random_state
    )
    # Second cut: the held-out rows are halved into validation and test.
    X_val, X_test, y_val, y_test = train_test_split(
        X_held, y_held, test_size=0.5, random_state=random_state
    )
    return X_train, X_val, X_test, y_train, y_val, y_test

# One shared train / validation / test partition, reused by the model cells below.
(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
) = split(df)

#### A. Decision tree

In [8]:
def dt(depths=(4, 8, 10, 15, 20), criteria=('gini', 'entropy'),
       X_eval=None, y_eval=None, random_state=None):
    """Sweep decision-tree depth for each split criterion and print the scores.

    Every tree is fitted on the notebook-level ``X_train`` / ``y_train``.
    One line is printed per (criterion, depth) pair, criterion by criterion.

    Parameters
    ----------
    depths : iterable of int
        ``max_depth`` values to try.
    criteria : iterable of str
        Split criteria to try, in print order.
    X_eval, y_eval : optional
        Data to score on.  When either is omitted the notebook-level
        ``X_test`` / ``y_test`` are used, as before.  Pass ``X_val`` /
        ``y_val`` to choose a depth without looking at the test set.
    random_state : int, RandomState instance or None, default=None
        Forwarded to ``DecisionTreeClassifier``.

    Returns
    -------
    None
        Results are only printed.
    """
    if X_eval is None or y_eval is None:
        X_eval, y_eval = X_test, y_test

    # Recorded run: depth 15 scored best for both criteria
    # (gini 0.9885, entropy 0.9888).
    for criterion in criteria:
        for d in depths:
            clf = DecisionTreeClassifier(
                criterion=criterion, max_depth=d, random_state=random_state
            )
            clf.fit(X_train, y_train)
            print(f'{criterion}, max-depth:{d}, score:{clf.score(X_eval, y_eval)}')


# Run the depth sweep; dt() prints one score line per (criterion, depth) pair.
dt()

gini, max-depth:4, score:0.9858744471491755
gini, max-depth:8, score:0.9866172958023337
gini, max-depth:10, score:0.9869578633387047
gini, max-depth:15, score:0.9885144169780917
gini, max-depth:20, score:0.9875475708849042
entropy, max-depth:4, score:0.985824162009577
entropy, max-depth:8, score:0.9861990148684016
entropy, max-depth:10, score:0.9875887132718483
entropy, max-depth:15, score:0.9888892698369162
entropy, max-depth:20, score:0.9876549982285917


#### B. Ensembling

In [36]:
def maxc(row):
    """Return the value that occurs most often in ``row``.

    Raises ``IndexError`` when ``row`` is empty.
    """
    tally = Counter(row)
    winner, _votes = tally.most_common(1)[0]
    return winner
def ensemble(X_train, y_train, n=100, max_depth=3, criterion='entropy',
             sample_frac=0.5, random_state=None):
    """Fit ``n`` decision trees, each on its own random subsample of the data.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    n : int, default=100
        Number of trees to fit.
    max_depth : int, default=3
        ``max_depth`` of every tree.
    criterion : str, default='entropy'
        Split criterion of every tree.
    sample_frac : float, default=0.5
        Fraction of the training rows each tree is fitted on (drawn by
        ``train_test_split``, i.e. without replacement).
    random_state : int or None, default=None
        Seed for the subsampling and the trees.  ``None`` leaves seeding to
        scikit-learn, as before.

    Returns
    -------
    list
        The fitted ``DecisionTreeClassifier`` objects, in fitting order.
    """
    # A single generator shared by all iterations: every tree gets a different
    # subsample, yet the whole sequence is repeatable for a given seed.
    rng = None if random_state is None else np.random.RandomState(random_state)

    trees = []
    print('creating trees...')
    for _ in range(n):
        parts = train_test_split(
            X_train, y_train, train_size=sample_frac, random_state=rng
        )
        X_sub, y_sub = parts[0], parts[2]  # keep the "train" half, drop the rest
        tree = DecisionTreeClassifier(
            criterion=criterion, max_depth=max_depth, random_state=rng
        )
        tree.fit(X_sub, y_sub)
        trees.append(tree)
    return trees

def ensemble_predict(trees, X_test, y_test=None):
    """Predict by majority vote over ``trees`` and optionally print the accuracy.

    Parameters
    ----------
    trees : iterable
        Fitted models exposing ``predict(X)``.
    X_test
        Samples to classify.
    y_test : optional
        True labels.  When given, the accuracy of the voted prediction is
        printed; when ``None`` nothing is printed.

    Returns
    -------
    numpy.ndarray
        One voted label per row of ``X_test``.

    Raises
    ------
    ValueError
        If ``trees`` is empty.
    """
    per_tree = [tree.predict(X_test) for tree in trees]
    if not per_tree:
        raise ValueError('ensemble_predict needs at least one fitted tree')

    # votes[i, j] is the label that tree j assigns to sample i.
    votes = np.transpose(np.array(per_tree))

    # Tally each row in plain Python.  np.apply_along_axis sizes its output
    # from the first result, so a short first winner (e.g. 'white') would
    # silently truncate every longer string label that follows.
    y_pred = np.array([Counter(row).most_common(1)[0][0] for row in votes])

    if y_test is not None:
        print(metrics.accuracy_score(y_test, y_pred))
    return y_pred


In [34]:
# Fit a 10-tree ensemble on the training split.
trees = ensemble(X_train=X_train, y_train=y_train, n=10)

creating trees...


In [37]:
# Majority-vote predictions for the test split; ensemble_predict prints the accuracy.
yp = ensemble_predict(trees=trees, X_test=X_test, y_test=y_test)

0.985824162009577


#### C. AdaBoost

In [42]:
# Sweep the number of boosting rounds, keeping every fitted model in ada_arr.
ada_arr = []
for i in [4, 8, 10, 15, 20]:
    weak_learner = DecisionTreeClassifier(criterion='entropy', max_depth=3)
    # The weak learner is passed positionally: its keyword was renamed from
    # `base_estimator` to `estimator` in newer scikit-learn releases, and the
    # first positional slot is accepted by both.
    clf = AdaBoostClassifier(weak_learner, n_estimators=i)
    clf.fit(X_train, y_train)
    print(i,':',clf.score(X_test, y_test))
    ada_arr.append(clf)


4 : 0.9480097370315768
8 : 0.7498177163689558
10 : 0.8332887624141438
15 : 0.6564907829624804
20 : 0.7646198329161953


In [41]:
# Show the configuration of whatever `clf` is currently bound to; in the
# recorded output that is the AdaBoost model with n_estimators=20.
print(clf)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(criterion='entropy',
                                                         max_depth=3),
                   n_estimators=20)
