In [None]:
import pandas as pd

# data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.core.pylabtools import figsize

%matplotlib inline
# to display visuals in the notebook

%config InlineBackend.figure_format='retina'
#to enable high resolution plots

from sklearn.preprocessing import StandardScaler

In [None]:
import os

# Directory holding the exported transfer-transaction CSV files. It defaults to
# the location the notebook was written against; set ANOMALY_DATA_DIR to run
# the notebook from another machine without editing this cell.
DATA_DIR = os.environ.get(
    "ANOMALY_DATA_DIR",
    "/Users/cereniyim/GitHub/anomaly-detection-model",
)

# File used to fit the model.
path = os.path.join(DATA_DIR, "transfer_tx_btw_18183000_18183050_v2.csv")
# path = os.path.join(DATA_DIR, "transfer_tx_btw_18370600_18370660_v2.csv")
# File the fitted model is applied to.
inference_path = os.path.join(DATA_DIR, "transfer_tx_btw_18370728_18370788_v2.csv")

txs = pd.read_csv(path)
inference_txs = pd.read_csv(inference_path)
txs.info()

In [None]:
# Preview the first rows of the training frame as loaded from CSV.
txs.head()

In [None]:
# Keep only rows that have both a token and a value, then remove exact
# duplicate rows. The index is rebuilt last so it stays contiguous (0..n-1);
# resetting it before drop_duplicates() would leave gaps wherever a duplicate
# row was removed.
txs = (
    txs
    .dropna(subset=["token", "value"], how="any")
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
# Re-check row counts, dtypes and null counts after cleaning.
txs.info()

In [None]:
# Discard inference rows missing either a token or a value, then renumber the
# index from zero (the old index is not kept as a column).
required_columns = ["token", "value"]
inference_txs = inference_txs.dropna(subset=required_columns, how="any")
inference_txs = inference_txs.reset_index(drop=True)
inference_txs.info()

In [None]:
# Remove exact duplicate rows and rebuild the index afterwards so that index
# labels equal row positions. Rows are later looked up by position with .iloc,
# so a gapped index would make displayed labels and positions disagree.
inference_txs = inference_txs.drop_duplicates().reset_index(drop=True)

In [None]:
# Summary statistics of the numeric columns of the cleaned inference frame.
inference_txs.describe()

In [None]:
# Summary statistics of the numeric columns of the inference frame.
# NOTE(review): this repeats the describe() call of the cell directly above;
# possibly meant to describe txs instead — confirm.
inference_txs.describe()

In [None]:
import numpy as np


def apply_log1p_transformation(dataframe, column):
    '''Add a log1p-transformed copy of ``column`` to ``dataframe``.

    The result of ``numpy.log1p`` applied to ``dataframe[column]`` is stored
    in a column named ``"log_" + column``. The dataframe is modified in
    place, and the newly stored column is also returned as a pandas Series.
    '''
    log_column = "log_" + column
    transformed = np.log1p(dataframe[column])
    dataframe[log_column] = transformed
    return dataframe[log_column]

In [None]:
# Add total gas-cost columns (gas_used * gas_price) in two denominations.
# The ETH column divides the product by 10**18, i.e. it treats the product as
# wei. Since 1 gwei = 10**9 wei, the gwei column must divide by 10**9; the
# previous 10**6 divisor made it 1000x too large relative to the ETH column.
# NOTE(review): assumes gas_price is denominated in wei, as the ETH divisor
# implies — confirm against the data export.
txs["gas_cost_in_gwei"] = (txs["gas_used"] * txs["gas_price"]) / (10 ** 9)
txs["gas_cost_in_eth"] = (txs["gas_used"] * txs["gas_price"]) / (10 ** 18)

In [None]:
# Add a log_<name> column to txs for each of these features, in this order.
for feature in ("value", "gas_used", "gas_price", "gas_cost_in_gwei", "gas_cost_in_eth"):
    apply_log1p_transformation(txs, feature)
txs.head()

In [None]:
# Add total gas-cost columns (gas_used * gas_price) to the inference frame.
# The ETH column divides the product by 10**18, i.e. it treats the product as
# wei. Since 1 gwei = 10**9 wei, the gwei column must divide by 10**9; the
# previous 10**6 divisor made it 1000x too large relative to the ETH column.
# NOTE(review): assumes gas_price is denominated in wei, as the ETH divisor
# implies — confirm against the data export.
inference_txs["gas_cost_in_gwei"] = (inference_txs["gas_used"] * inference_txs["gas_price"]) / (10 ** 9)
inference_txs["gas_cost_in_eth"] = (inference_txs["gas_used"] * inference_txs["gas_price"]) / (10 ** 18)

# Add a log_<name> column for each feature, in this order.
for feature in ("value", "gas_used", "gas_price", "gas_cost_in_gwei", "gas_cost_in_eth"):
    apply_log1p_transformation(inference_txs, feature)
inference_txs.head()

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# One histogram per log1p-transformed feature, stacked vertically.
# The subplot titles and the figure title describe log-transformed data, so the
# log_* columns are plotted (the raw columns were plotted before, which
# contradicted the titles).
fig = make_subplots(rows=3, cols=1,
                    subplot_titles=("Log Value",
                                    "Log Gas Gwei",
                                    "Log Gas ETH"))

log_features = ("log_value", "log_gas_cost_in_gwei", "log_gas_cost_in_eth")
for subplot_row, log_feature in enumerate(log_features, start=1):
    # add_trace with row/col replaces the deprecated append_trace.
    fig.add_trace(go.Histogram(x=txs[log_feature]),
                  row=subplot_row, col=1)

fig.update_layout(height=800, width=800,
                  title_text="Distribution of the Features after Logarithm Transformation")

fig.show()

In [None]:
import plotly.express as px

# 2D scatter of the training transactions: log-transformed value against
# log-transformed gas cost in ETH. Hovering a point shows its hash, raw value
# and token.
hover_columns = ["tx_hash", "value", "token"]
fig = px.scatter(
    txs,
    x="log_value",
    y="log_gas_cost_in_eth",
    hover_data=hover_columns,
)

# Remove the outer margins so the plot fills the output area.
fig.update_layout(margin={"l": 0, "r": 0, "b": 0, "t": 0})
fig.show()

In [None]:
from sklearn.ensemble import IsolationForest

# Build the (not yet fitted) isolation forest. Only contamination and the
# random seed are set explicitly; everything else is left at the library
# defaults (bootstrap and n_estimators were tried and left disabled).
model = IsolationForest(contamination=0.001, random_state=42)

In [None]:
# Inspect the three right-most columns of txs (all rows).
txs.iloc[:, -3:]

In [None]:
# Features used to fit the model: the raw (not log-transformed) transfer value
# and gas cost in ETH.
feature_columns = ["value", "gas_cost_in_eth"]
training_data = txs[feature_columns]

In [None]:
# Features the model is applied to: the raw (not log-transformed) transfer
# value and gas cost in ETH of the inference transactions.
feature_columns = ["value", "gas_cost_in_eth"]
inference_data = inference_txs[feature_columns]

In [None]:
# training_data.head()

In [None]:
# training_data = txs.iloc[:, -3:]

In [None]:
# scaler = StandardScaler()

# scaled_training_data = scaler.fit_transform(training_data)
# scaled_training_data

In [None]:
# Fit the isolation forest on the training features; whatever fit() returns is
# kept as fitted_model and used for all predictions below.
fitted_model = model.fit(training_data)

In [None]:
# Predict one label per inference row. Rows labelled -1 are the ones filtered
# out and inspected as anomalies further down.
labels = fitted_model.predict(inference_data)

In [None]:
# Display the per-row output of score_samples() for the inference features.
fitted_model.score_samples(inference_data)

In [None]:
# Attach the model output to the inference frame: predicted labels go into
# "clusters" and the score_samples() values into "anomaly_scores".
anomaly_scores = fitted_model.score_samples(inference_data)
inference_txs["clusters"] = labels
inference_txs["anomaly_scores"] = anomaly_scores
inference_txs.describe()

In [None]:
# Preview the inference frame, now including the clusters and anomaly_scores columns.
inference_txs.head()

In [None]:
import plotly.express as px

# visualize log_transformation customer segments with a 3D plot
#fig = px.scatter_3d(txs,
#                    x="log_value",
#                    y="log_gas_used",
#                    z="log_gas_price",
#                    color='clusters',
#                    hover_data=["tx_hash",
#                                "value",
#                                "token",
#                                "gas_price",
#                                "gas_used",
#                               ],
#                    # category_orders = {"cluster_name": 
#                    #                    ["0", "1", "2", "3"]},
#                    )

#fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))
#fig.show()

In [None]:
import plotly.express as px

# 2D scatter of the inference transactions: log-transformed value against
# log-transformed gas cost in ETH, coloured by the predicted label. Hovering a
# point shows its hash, raw value, token and anomaly score.
hover_columns = ["tx_hash", "value", "token", "anomaly_scores"]
fig = px.scatter(
    inference_txs,
    x="log_value",
    y="log_gas_cost_in_eth",
    color='clusters',
    hover_data=hover_columns,
)

# Remove the outer margins so the plot fills the output area.
fig.update_layout(margin={"l": 0, "r": 0, "b": 0, "t": 0})
fig.show()

In [None]:
# Display settings: render floats with thousands separators and set the
# display precision option to 5.
pd.set_option('display.float_format', '{:,}'.format)
pd.set_option('display.precision', 5)

# Show the rows labelled -1, restricted to the columns needed for review.
anomaly_columns = ["tx_hash", "value", "token", "gas_cost_in_eth", "clusters", "anomaly_scores"]
is_anomaly = inference_txs["clusters"] == -1
inference_txs[is_anomaly][anomaly_columns]

In [None]:
# Show every field of the row at integer position 11558.
# NOTE(review): the position is hard-coded and .iloc is positional, so it only
# matches an index label read off a displayed table when the index is
# contiguous; it raises IndexError if the frame has fewer rows — confirm.
inference_txs.iloc[11558]#["tx_hash"]

In [None]:
import sklearn.tree as tree
import pydotplus

from six import StringIO
from IPython.display import Image

import os

In [None]:
# Pick the last tree of the fitted forest for inspection. Indexing with -1
# instead of the hard-coded 99 selects the same tree for a 100-tree forest and
# keeps working (no IndexError) if n_estimators is changed.
single_tree = fitted_model.estimators_[-1]

In [None]:
# Draw the selected tree inline with sklearn.tree.plot_tree.
tree.plot_tree(single_tree)

In [None]:
# Depth of the selected tree (the Graphviz export below is capped at depth 4).
single_tree.tree_.max_depth

In [None]:
# Export the selected tree as Graphviz DOT text into an in-memory buffer.
# Only the top 4 levels are exported; the feature names match the two columns
# the model was fitted on.
dot_data = StringIO()
export_options = dict(
    feature_names=["value", "gas_cost_in_eth"],
    filled=True,
    rounded=True,
    special_characters=True,
    max_depth=4,
)
tree.export_graphviz(single_tree, out_file=dot_data, **export_options)

In [None]:
# Parse the exported DOT text, render it to PNG and display the image inline.
dot_source = dot_data.getvalue()
graph = pydotplus.graph_from_dot_data(dot_source)
png_bytes = graph.create_png()
Image(png_bytes)