#### Importing libraries

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns



from sklearn import preprocessing
from sklearn import metrics

from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.model_selection import train_test_split

from collections import Counter

In [4]:
df = pd.read_csv('../data/BitcoinHeistData.csv')
df = df.sample(frac=1)
df.head()


Unnamed: 0,address,year,day,length,weight,count,looped,neighbors,income,label
843739,1FgBNGghSWp4fNuAwTbZjKwQJBa5pXKvjy,2013,83,0,0.083333,1,0,1,50083880.0,white
1539050,1845URxXJU6SbDYKEPVr5kDkGZrhfmAvGC,2015,48,144,0.003736,2535,2411,2,51212260.0,white
462746,1Bx7Hf9mpzeiwpSk8XjsJPEuCwHiA5kq3E,2012,67,0,1.0,1,0,1,397000000.0,white
91109,1LP8gazyPPWxmejnMmmeV7K8hpmYmMskSq,2011,60,0,0.5,1,0,1,1000000000.0,white
2526694,17No8RHp4kaeCxQJJwKF3C13UL5KPFFggN,2017,305,4,1.0,1,0,2,60375620.0,white


In [5]:
df.isnull().sum().sum()


address      0
year         0
day          0
length       0
weight       0
count        0
looped       0
neighbors    0
income       0
label        0
dtype: int64

In [6]:
def split(df):
    # df = df.drop(['address'], axis=1)
    le = preprocessing.LabelEncoder()
    le.fit(df.address)
    X = df.drop('label', axis=1)
    X.address = le.transform(X.address)
    y = df['label']
    X_train, X_, y_train, y_ = train_test_split(X, y, test_size=0.3)
    X_val, X_test, y_val, y_test = train_test_split(X_, y_, test_size=0.5)
    return X_train, X_val, X_test, y_train, y_val, y_test

X_train, X_val, X_test, y_train, y_val, y_test = split(df)

#### A. Decision tree

In [7]:
def dt():
    for d in [4, 8, 10, 15, 20]: #15 : 0.9885
        clf = DecisionTreeClassifier(criterion='gini', max_depth=d)
        clf.fit(X_train, y_train)
        print(f'gini, max-depth:{d}, score:{clf.score(X_test, y_test)}')

    for d in [4, 8, 10, 15, 20]: #15 : 0.9888
        clf = DecisionTreeClassifier(criterion='entropy', max_depth=d)
        clf.fit(X_train, y_train)
        print(f'entropy, max-depth:{d}, score:{clf.score(X_test, y_test)}')


dt()

gini, max-depth:4, score:0.9863087279002526
gini, max-depth:8, score:0.9870538622415744
gini, max-depth:10, score:0.987449286293871
gini, max-depth:15, score:0.9885875589993257
gini, max-depth:20, score:0.9879315664963829
entropy, max-depth:4, score:0.9862515856961634
entropy, max-depth:8, score:0.9866127244260066
entropy, max-depth:10, score:0.9878789956686209
entropy, max-depth:15, score:0.9891224100295997
entropy, max-depth:20, score:0.988025279711089


#### B. Ensembling

In [8]:
def maxc(row):
    c = Counter(row)
    return c.most_common(1)[0][0]
def ensemble(X_train,y_train, n=100):
    trees = []
    print('creating trees...')
    for i in range(n):
        X_, X__, y_, y__ = train_test_split(X_train, y_train, test_size=0.5)
        tree = DecisionTreeClassifier(criterion='entropy', max_depth=3)
        tree.fit(X_, y_)
        trees.append(tree)
    return trees

def ensemble_predict(trees, X_test, y_test):
    y_pred = []
    for tree in trees:
        y_pred.append(tree.predict(X_test))
    y_pred = np.array(y_pred)
    y_pred = np.transpose(y_pred)
    
    y_pred = np.apply_along_axis(lambda x: Counter(x).most_common(1)[0][0], 1, y_pred)
    print(metrics.accuracy_score(y_test, y_pred))
    return y_pred


In [9]:
trees = ensemble(X_train,y_train, n=100)

creating trees...


In [10]:
yp = ensemble_predict(trees, X_test, y_test)

0.9862515856961634


#### C. Adaboost

In [11]:
ada_arr = []
for i in [4, 8, 10, 15, 20]:
    clf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(criterion='entropy'), n_estimators=i)
    clf.fit(X_train, y_train)
    print(i,':',clf.score(X_test, y_test))
    ada_arr.append(clf)


4 : 0.9851704551947977
8 : 0.9851773122592885
10 : 0.9852298830870504
15 : 0.9852755968503217
20 : 0.9851475983131621


In [12]:
print(clf)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(criterion='entropy'),
                   n_estimators=20)
