#### Importing libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns



from sklearn import preprocessing
from sklearn import metrics

from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.model_selection import train_test_split

from collections import Counter

In [2]:
# Load the BitcoinHeist dataset and shuffle the rows once. The fixed seed makes
# this cell produce the same row order every time it is executed.
df = pd.read_csv('../data/BitcoinHeistData.csv')
df = df.sample(frac=1, random_state=42)

# Replace every address string by an integer id so the column is numeric.
# `le` is kept at module level so the mapping can be inverted later
# (le.inverse_transform).
le = preprocessing.LabelEncoder()
df.address = le.fit_transform(df.address)

# Only the last expression of a cell is rendered, so the preview goes last.
df.head()


In [3]:
# Total number of missing cells in the whole frame (per-column counts, summed).
df.isna().sum().sum()


0

In [4]:
def split(df, train_frac=0.7, val_frac=0.15, random_state=None):
    """Shuffle ``df`` and cut it into train / validation / test partitions.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns plus a ``'label'`` target column.
    train_frac : float, default 0.7
        Fraction of the rows that goes to the training partition.
    val_frac : float, default 0.15
        Fraction of the rows that goes to the validation partition. Whatever
        remains after train + validation becomes the test partition.
    random_state : int or None, default None
        Seed forwarded to ``DataFrame.sample``. ``None`` keeps the original
        behaviour (a different shuffle on every call); pass an int to obtain
        the same partitions on every call.

    Returns
    -------
    tuple
        ``(X_train, X_val, X_test, y_train, y_val, y_test)`` where each ``X_*``
        is ``df`` without the ``'label'`` column and each ``y_*`` is the
        matching slice of ``df['label']``.

    Raises
    ------
    ValueError
        If the fractions are negative or add up to more than 1.
    """
    if train_frac < 0 or val_frac < 0 or train_frac + val_frac > 1:
        raise ValueError(
            'train_frac and val_frac must be non-negative and sum to at most 1'
        )

    df = df.sample(frac=1, random_state=random_state)

    # Cut points are truncated towards zero, so rounding leftovers fall into
    # the test partition.
    l1 = int(len(df) * train_frac)
    l2 = int(len(df) * (train_frac + val_frac))

    X = df.drop('label', axis=1)
    y = df['label']

    return (
        X.iloc[:l1], X.iloc[l1:l2], X.iloc[l2:],
        y.iloc[:l1], y.iloc[l1:l2], y.iloc[l2:],
    )

# Materialise the train / validation / test partitions used by the models below.
(X_train, X_val, X_test,
 y_train, y_val, y_test) = split(df=df)

#### A. Decision tree

In [5]:
def dt(depths=(4, 8, 10, 15, 20), criteria=('gini', 'entropy'),
       X_eval=None, y_eval=None):
    """Sweep decision-tree depth for each split criterion and print the accuracy.

    One ``DecisionTreeClassifier`` is fitted on the module-level
    ``X_train`` / ``y_train`` for every (criterion, depth) pair, and one line
    per model is printed in the form
    ``<criterion>, max-depth:<d>, score:<accuracy>``.

    Parameters
    ----------
    depths : iterable of int, default (4, 8, 10, 15, 20)
        Values tried for ``max_depth``.
    criteria : iterable of str, default ('gini', 'entropy')
        Split criteria tried; all depths are run for the first criterion
        before moving to the next one.
    X_eval, y_eval : optional
        Data the accuracy is measured on. When either is omitted the
        module-level ``X_test`` / ``y_test`` are used, which is what the
        original implementation did.
        NOTE(review): this sweep is used to pick a depth; passing
        ``X_val`` / ``y_val`` here would keep the test split untouched for the
        final estimate.
    """
    if X_eval is None or y_eval is None:
        X_eval, y_eval = X_test, y_test

    for criterion in criteria:
        for d in depths:
            clf = DecisionTreeClassifier(criterion=criterion, max_depth=d)
            clf.fit(X_train, y_train)
            print(f'{criterion}, max-depth:{d}, score:{clf.score(X_eval, y_eval)}')


dt()

gini, max-depth:4, score:0.9858630187083576
gini, max-depth:8, score:0.9865624392864082
gini, max-depth:10, score:0.9869418635215598
gini, max-depth:15, score:0.9883315619250066
gini, max-depth:20, score:0.9877029976800266
entropy, max-depth:4, score:0.9857944480634507
entropy, max-depth:8, score:0.9862493000079999
entropy, max-depth:10, score:0.9873578587673284
entropy, max-depth:15, score:0.9888549845144627
entropy, max-depth:20, score:0.9877167118090079


#### B. Ensembling

In [6]:
def maxc(row):
    """Return the value that occurs most often in ``row``.

    Ties go to the value that was seen first. An empty ``row`` raises
    ``IndexError``.
    """
    ranked = Counter(row).most_common(1)  # [(value, count)] or [] when empty
    winner = ranked[0][0]
    return winner
def ensemble(X_train, y_train, n=100, max_depth=3, random_state=None):
    """Fit ``n`` shallow entropy trees, each on a random half of the training data.

    Parameters
    ----------
    X_train, y_train
        Training features and labels; forwarded to ``train_test_split``.
    n : int, default 100
        Number of trees to fit.
    max_depth : int, default 3
        Depth limit of every tree.
    random_state : int or None, default None
        ``None`` keeps the original unseeded behaviour. With an int, tree
        ``i`` uses the seed ``random_state + i`` for both its subsample and
        its fit, so the whole ensemble is reproducible while the trees still
        differ from one another.

    Returns
    -------
    list
        The fitted ``DecisionTreeClassifier`` objects, in fitting order.
    """
    trees = []
    print('creating trees...')
    for i in range(n):
        seed = None if random_state is None else random_state + i
        # test_size=0.5 keeps half of the rows for fitting; the other half is
        # simply discarded.
        X_half, _, y_half, _ = train_test_split(
            X_train, y_train, test_size=0.5, random_state=seed
        )
        tree = DecisionTreeClassifier(
            criterion='entropy', max_depth=max_depth, random_state=seed
        )
        tree.fit(X_half, y_half)
        trees.append(tree)
    return trees

def ensemble_predict(trees, X_test, y_test):
    """Predict by majority vote over ``trees`` and report the accuracy.

    Parameters
    ----------
    trees : sequence
        Fitted estimators exposing ``predict(X)``.
    X_test
        Samples to classify; passed unchanged to every tree's ``predict``.
    y_test
        True labels for ``X_test``; used only to compute the accuracy.

    Returns
    -------
    (numpy.ndarray, float)
        The voted label for every sample and the accuracy against ``y_test``.
        The accuracy is also printed.

    Raises
    ------
    ValueError
        If ``trees`` is empty.
    """
    if len(trees) == 0:
        raise ValueError('ensemble_predict needs at least one fitted tree')

    # After the transpose: one row per sample, one column per tree.
    votes = np.array([tree.predict(X_test) for tree in trees]).T

    # The winners are collected in a plain list and converted once at the end.
    # np.apply_along_axis allocates its output from the first row's result, so
    # with string labels every later, longer label would be cut down to the
    # length of the first winner and counted as a misclassification.
    # Ties go to the label voted first in tree order (Counter.most_common).
    y_pred = np.array([Counter(row).most_common(1)[0][0] for row in votes])

    score = metrics.accuracy_score(y_test, y_pred)
    print(score)

    return y_pred, score


In [7]:
# Fit the 100-tree voting ensemble on the training partition.
trees = ensemble(X_train=X_train, y_train=y_train, n=100)

creating trees...


In [None]:
# Majority-vote predictions on the test partition, plus their accuracy.
yp, score = ensemble_predict(trees=trees, X_test=X_test, y_test=y_test)

0.9858881612781568


#### C. AdaBoost

In [None]:
# Fit one AdaBoost model per boosting-round budget, print its test accuracy
# and keep every fitted model in `ada_arr`.
# NOTE(review): newer scikit-learn releases name the `base_estimator` keyword
# `estimator` -- confirm against the installed version before upgrading.
ada_arr = []
for n_rounds in (4, 8, 10, 15, 20):
    weak_learner = DecisionTreeClassifier(criterion='entropy')
    clf = AdaBoostClassifier(base_estimator=weak_learner, n_estimators=n_rounds)
    clf.fit(X_train, y_train)
    test_accuracy = clf.score(X_test, y_test)
    print(n_rounds, ':', test_accuracy)
    ada_arr.append(clf)
    



4 : 0.985053885098456
8 : 0.9850630278511102
10 : 0.9851041702380544
15 : 0.9850241711523297
20 : 0.9851178843670358


In [None]:
# Show the configuration of the last AdaBoost model fitted by the sweep.
print(ada_arr[-1])


AdaBoostClassifier(base_estimator=DecisionTreeClassifier(criterion='entropy'),
                   n_estimators=20)
