## Decision Trees
* Objectives for this lesson

In [51]:
import pandas as pd
import plotly
import plotly.graph_objects as go

In [10]:
# UCI "banknote authentication" data set, fetched as a plain-text CSV.
datasource = (
    'https://archive.ics.uci.edu/ml/machine-learning-databases/'
    '00267/data_banknote_authentication.txt'
)
# Columns, in file order:
#   1. variance of the wavelet-transformed image (continuous)
#   2. skewness of the wavelet-transformed image (continuous)
#   3. kurtosis of the wavelet-transformed image (continuous)
#   4. entropy of the image (continuous)
#   5. class label (integer)

In [53]:
# Column names are supplied explicitly, so the first line of the file is read
# as data (header=None is what pandas already does when `names` is given).
df = pd.read_csv(
    datasource,
    header=None,
    names=['attribute_A', 'attribute_B', 'attribute_C', 'attribute_D', 'forgery'],
)
df.head()

Unnamed: 0,attribute_A,attribute_B,attribute_C,attribute_D,forgery
0,3.6216,8.6661,-2.8073,-0.44699,0
1,4.5459,8.1674,-2.4586,-1.4621,0
2,3.866,-2.6383,1.9242,0.10645,0
3,3.4566,9.5228,-4.0112,-3.5944,0
4,0.32924,-4.4552,4.5718,-0.9888,0


In [54]:
# Class balance: number of rows for each value of the forgery label.
df.loc[:, 'forgery'].value_counts()

0    762
1    610
Name: forgery, dtype: int64

In [106]:
# Correlation of every column with the forgery label, kept as a one-column
# frame; the label's own row (its self-correlation) is removed.
correlations = df.corr().loc[:, ['forgery']].drop(index='forgery')
correlations['forgery']

attribute_A   -0.724843
attribute_B   -0.444688
attribute_C    0.155883
attribute_D   -0.023424
Name: forgery, dtype: float64

In [107]:

# Bar chart: one bar per feature, height = its correlation with the label.
fig = go.Figure()
fig.add_trace(go.Bar(x=correlations.index, y=correlations['forgery']))
fig.update_layout(title='Correlation of Features with Banknote Forgery')
fig.show()

In [89]:
# Per-class mean of every column; row position 0 is plotted as "Authentic"
# and row position 1 as "Forgery", side by side for each column.
comparison = df.groupby('forgery').mean()
fig = go.Figure(data=[
    go.Bar(name=label, x=comparison.columns, y=comparison.iloc[position])
    for position, label in enumerate(['Authentic', 'Forgery'])
])
fig.update_layout(
    title='Banknote Image Comparison',
    barmode='group',
)
fig.show()

In [117]:
# One 3D marker trace per class, positioned by attributes A, B and C.
trace0 = go.Scatter3d(
    x=df.loc[df['forgery'] == 0, 'attribute_A'],
    y=df.loc[df['forgery'] == 0, 'attribute_B'],
    z=df.loc[df['forgery'] == 0, 'attribute_C'],
    mode='markers',
    name='Authentic',
    marker={'size': 4, 'opacity': 0.4, 'color': 'blue'},
)
trace1 = go.Scatter3d(
    x=df.loc[df['forgery'] == 1, 'attribute_A'],
    y=df.loc[df['forgery'] == 1, 'attribute_B'],
    z=df.loc[df['forgery'] == 1, 'attribute_C'],
    mode='markers',
    name='Forgery',
    marker={'size': 4, 'opacity': 0.4, 'color': 'red'},
)
fig = go.Figure(data=[trace0, trace1])
# `scene` carries only the axis titles; width and margin are layout-level.
fig.update_layout(
    scene={
        'xaxis_title': 'Attribute A',
        'yaxis_title': 'Attribute B',
        'zaxis_title': 'Attribute C',
    },
    width=700,
    margin={'r': 5, 'b': 5, 'l': 5, 't': 5},
)

fig.show()

In [123]:
# write some simple business rules for predicting forgeries
def biz_rules(banknote):
    """Predict whether a banknote is a forgery using hand-written threshold rules.

    Args:
        banknote: Mapping-like record (for example a DataFrame row) that
            provides 'attribute_A', 'attribute_B' and 'attribute_C' values.

    Returns:
        int: 1 if the rules flag the banknote as a forgery, otherwise 0.
    """
    if banknote['attribute_A'] < 0:
        # Negative attribute_A: flagged unless attribute_B is -7 or lower.
        return 1 if banknote['attribute_B'] > -7 else 0
    # Every other attribute_A value lands here, including exactly 0, so the
    # function always yields 0 or 1 instead of implicitly returning None.
    return 1 if banknote['attribute_C'] < -3 else 0

In [125]:
# apply our business rules
# axis='columns' hands each row to biz_rules; the result becomes a new column.
df['biz_rule_pred'] = df.apply(biz_rules, axis='columns')
df.head()

Unnamed: 0,attribute_A,attribute_B,attribute_C,attribute_D,forgery,biz_rule_pred
0,3.6216,8.6661,-2.8073,-0.44699,0,0
1,4.5459,8.1674,-2.4586,-1.4621,0,0
2,3.866,-2.6383,1.9242,0.10645,0,0
3,3.4566,9.5228,-4.0112,-3.5944,0,1
4,0.32924,-4.4552,4.5718,-0.9888,0,0
