#### Importing libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns



from sklearn import preprocessing
from sklearn import metrics

from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.model_selection import train_test_split

from collections import Counter

In [2]:
df = pd.read_csv('../data/BitcoinHeistData.csv')
df = df.sample(frac=1)
df.head()


Unnamed: 0,address,year,day,length,weight,count,looped,neighbors,income,label
1005873,18WzmBQp8F7E1z8ZcjRvivgKniMgVjjin1,2013,245,20,0.008333,1,0,2,2778529000.0,white
1185768,14a2bHQvm6xk137kLqkSVQDGQsTUzTxMy2,2014,60,30,1.3e-05,1,0,2,39419100.0,white
2816273,18EKhWapYi89mccCn6fjAEBKBScsk8kycw,2018,230,8,0.125,1,0,2,36136810.0,white
2258204,13W2ZyuFKrZyV6by2REcsAfhnwiR9txTtr,2017,37,8,1.0,1,0,2,259849700.0,white
2739242,3CCViqjqqvaLgB9L4t8kuznCLpimS4nuht,2018,153,0,1.0,1,0,2,42370490.0,white


In [3]:
df.isnull().sum().sum()


0

In [4]:
def split(df):
    # df = df.drop(['address'], axis=1)
    le = preprocessing.LabelEncoder()
    le.fit(df.address)
    X = df.drop('label', axis=1)
    X.address = le.transform(X.address)
    y = df['label']
    X_train, X_, y_train, y_ = train_test_split(X, y, test_size=0.3)
    X_val, X_test, y_val, y_test = train_test_split(X_, y_, test_size=0.5)
    return X_train, X_val, X_test, y_train, y_val, y_test

X_train, X_val, X_test, y_train, y_val, y_test = split(df)

#### A. Decision tree

In [5]:
def dt():
    for d in [4, 8, 10, 15, 20]: #15 : 0.9885
        clf = DecisionTreeClassifier(criterion='gini', max_depth=d)
        clf.fit(X_train, y_train)
        print(f'gini, max-depth:{d}, score:{clf.score(X_test, y_test)}')

    for d in [4, 8, 10, 15, 20]: #15 : 0.9888
        clf = DecisionTreeClassifier(criterion='entropy', max_depth=d)
        clf.fit(X_train, y_train)
        print(f'entropy, max-depth:{d}, score:{clf.score(X_test, y_test)}')


dt()

gini, max-depth:4, score:0.9859590176112273
gini, max-depth:8, score:0.9868138649844002
gini, max-depth:10, score:0.9873510017028376
gini, max-depth:15, score:0.9885121312899281
gini, max-depth:20, score:0.9879361378727101
entropy, max-depth:4, score:0.9858881612781568
entropy, max-depth:8, score:0.9862881567067805
entropy, max-depth:10, score:0.98750642849796
entropy, max-depth:15, score:0.9890218397504028
entropy, max-depth:20, score:0.9879384235608736


#### B. Ensembling

In [12]:
def maxc(row):
    c = Counter(row)
    return c.most_common(1)[0][0]
def ensemble(X_train,y_train, n=100):
    trees = []
    print('creating trees...')
    for i in range(n):
        X_, X__, y_, y__ = train_test_split(X_train, y_train, test_size=0.5)
        tree = DecisionTreeClassifier(criterion='entropy', max_depth=3)
        tree.fit(X_, y_)
        trees.append(tree)
    return trees

def ensemble_predict(trees, X_test, y_test):
    y_pred = []
    for tree in trees:
        y_pred.append(tree.predict(X_test))
    y_pred = np.array(y_pred)
    y_pred = np.transpose(y_pred)
    
    y_pred = np.apply_along_axis(lambda x: Counter(x).most_common(1)[0][0], 1, y_pred)
    score = metrics.accuracy_score(y_test, y_pred)
    print(score)
    
    return y_pred,score


In [7]:
trees = ensemble(X_train,y_train, n=100)

creating trees...


In [13]:
yp, score = ensemble_predict(trees, X_test, y_test)

0.9858881612781568


#### C. Adaboost

In [14]:
ada_arr = []
for i in [4, 8, 10, 15, 20]:
    clf = AdaBoostClassifier(
        base_estimator=DecisionTreeClassifier(criterion='entropy'), 
        n_estimators=i
        )
    clf.fit(X_train, y_train)
    print(i,':',clf.score(X_test, y_test))
    ada_arr.append(clf)
    



4 : 0.985053885098456
8 : 0.9850630278511102
10 : 0.9851041702380544
15 : 0.9850241711523297
20 : 0.9851178843670358


In [10]:
print(clf)


AdaBoostClassifier(base_estimator=DecisionTreeClassifier(criterion='entropy'),
                   n_estimators=20)
