## Data Drift

In [19]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import make_column_selector
import numpy as np


class borrow_times(BaseEstimator, TransformerMixin):
    """Add a ``borrow_times`` column with the number of borrows per key.

    ``fit`` groups the training frame by its object-dtype columns and counts
    the non-null ``borrow_timestamp`` values in each group. ``transform``
    left-joins those counts onto the incoming frame.
    """

    def __init__(self):
        # Lookup table learned in fit(): key columns plus 'borrow_times'.
        self.borrow_per_cli = None

    def fit(self, X, y=None):
        """Learn the per-key borrow counts from ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain ``borrow_timestamp`` and at least one object-dtype
            column to group by.
        y : ignored

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` has no object-dtype column to use as grouping key.
        """
        key_cols = X.select_dtypes(include='object').columns.to_list()
        if not key_cols:
            raise ValueError(
                "borrow_times needs at least one object-dtype column to group by"
            )

        # Pass the keys as a list: unpacking them (groupby(*keys)) only works
        # for a single key, because any further value would be bound to the
        # next positional parameter of groupby instead of being a key.
        self.borrow_per_cli = (
            X[key_cols + ['borrow_timestamp']]
            .groupby(key_cols)
            .count()
            .rename(columns={'borrow_timestamp': 'borrow_times'})
            .reset_index()
        )
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the learned ``borrow_times`` column joined in.

        Rows whose key was not seen in ``fit`` get ``borrow_times == 0``.
        Columns of the result are sorted alphabetically.
        """
        # Join on exactly the keys that fit() grouped by.
        key_cols = [
            col for col in self.borrow_per_cli.columns if col != 'borrow_times'
        ]
        # DataFrame.merge avoids relying on a module-level `pd` name here.
        # NOTE: fillna(0) is applied to the whole frame, so NaNs that were
        # already present in other columns of X are zero-filled as well.
        new_X = X.merge(self.borrow_per_cli, on=key_cols, how='left').fillna(0)
        return new_X.sort_index(axis=1)

    def set_output(self, transform='default'):
        # Do not modify this method.
        return self

class tx_diff(BaseEstimator, TransformerMixin):
    """Replace raw transaction timestamps with a first-to-last time span.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    only depends on the frame it receives.
    """

    def __init__(self):
        # Unused placeholder kept so existing attribute access keeps working.
        self.ts_diff_tx = None

    def fit(self, X, y=None):
        """No-op fit; returns ``self``.

        Nothing is stored on the estimator. Keeping a copy of the training
        frame here would only waste memory and inflate a pickled pipeline,
        since ``transform`` never reads it.
        """
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the timestamp columns condensed.

        * adds ``ts_diff_tx = last_tx_timestamp - first_tx_timestamp``
        * renames ``risky_first_last_tx_timestamp_diff`` to ``ts_diff_risky_tx``
        * drops the four raw first/last (risky) timestamp columns
        * sorts the columns alphabetically

        ``borrow_timestamp`` is not dropped.
        """
        data = X.copy()
        data['ts_diff_tx'] = data['last_tx_timestamp'] - data['first_tx_timestamp']
        data = data.rename(
            columns={'risky_first_last_tx_timestamp_diff': 'ts_diff_risky_tx'}
        )
        data = data.drop(columns=[
            'last_tx_timestamp',
            'first_tx_timestamp',
            'risky_last_tx_timestamp',
            'risky_first_tx_timestamp',
        ])
        return data.sort_index(axis=1)

    def set_output(self, transform='default'):
        return self

class search_binary(BaseEstimator, TransformerMixin):
    """Cast every column that has exactly two distinct values to ``category``."""

    def __init__(self):
        # Names of the columns found to be binary by the latest fit().
        self.binary_cols = []

    def fit(self, X, y=None):
        """Record which columns of ``X`` hold exactly two distinct non-null values.

        The list is rebuilt on every call, so re-fitting the same instance
        (for example the same pipeline on several datasets) does not carry
        over columns from an earlier fit.
        """
        self.binary_cols = [col for col in X.columns if X[col].nunique() == 2]
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with all recorded binary columns as ``category``.

        Every column found in ``fit`` is converted, not only the first one.
        A ``KeyError`` is raised if a recorded column is absent from ``X``.
        """
        return X.astype({col: 'category' for col in self.binary_cols})

    def set_output(self, transform='default'):
        return self

In [20]:
# Column-wise preprocessing: min-max scale numeric columns, drop the wallet
# identifier, and pass category columns (and anything left over) through.
# Output is requested as a pandas DataFrame with the original column names.
first_transformer = ColumnTransformer(
    transformers=[
        ('scale_data', MinMaxScaler(), make_column_selector(dtype_include='number')),
        ('object', 'drop', ['wallet_address']),
        ('categorical', 'passthrough', make_column_selector(dtype_include='category')),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
).set_output(transform='pandas')

# Full feature pipeline: custom feature steps first, then the column transformer.
transf_pipe = Pipeline(
    steps=[
        ('add_borrow', borrow_times()),
        ('diff_tranf', tx_diff()),
        ('binary_cols', search_binary()),
        ('cols_transf', first_transformer),
    ]
)

In [21]:
from scipy.stats import kstest
import pandas as pd


# First snapshot (t0); it is placed first in `datasets` below.
X_t0 = pd.read_parquet(path="X_t0.parquet")

In [22]:
# Load the remaining snapshots and keep all four in t0..t3 order.
X_t1, X_t2, X_t3 = (
    pd.read_parquet(parquet_file)
    for parquet_file in ("X_t1.parquet", "X_t2.parquet", "X_t3.parquet")
)
datasets = [X_t0, X_t1, X_t2, X_t3]

In [23]:
from collections import defaultdict

# Per-feature drift statistics, one entry per dataset (means, stds) and one
# two-sample KS test per pair of consecutive datasets (kstests).
means = defaultdict(list)
stds = defaultdict(list)
kstests = defaultdict(list)
prev_X = None
for X in datasets:
    # NOTE(review): the pipeline (including the MinMaxScaler) is re-fitted on
    # every dataset, so each period is scaled with its own min/max - confirm
    # this is intended for the comparison below.
    # `X` is rebound on purpose: later cells read the last transformed frame.
    X = transf_pipe.fit_transform(X)
    float_columns = X.select_dtypes(include=np.float64).columns
    for column in float_columns:
        values = X[column]
        means[column].append(values.mean())
        stds[column].append(values.std())
        if prev_X is None:
            # First dataset: nothing to compare against yet.
            continue
        kstests[column].append(kstest(values, prev_X[column]))
    prev_X = X

In [24]:
import plotly.graph_objects as go

# One line per feature: its mean in each dataset.
# Iterate over `means` itself instead of the columns of the leaked loop
# variable `X`, so the plot does not depend on what `X` currently points to
# and no empty entries get inserted into the defaultdict.
fig = go.Figure()
for column, column_means in means.items():
    fig.add_trace(go.Scatter(y=column_means, name=column))
fig.update_layout(
    width=2000,
    height=1000,
    title='Mean of each transformed float feature per dataset',
    xaxis_title='Dataset index (position in `datasets`)',
    yaxis_title='Mean',
)
fig

In [25]:
# One line per feature: its standard deviation in each dataset.
# Iterate over `stds` itself instead of the columns of the leaked loop
# variable `X`, so the plot does not depend on what `X` currently points to
# and no empty entries get inserted into the defaultdict.
fig = go.Figure()
for column, column_stds in stds.items():
    fig.add_trace(go.Scatter(y=column_stds, name=column))
fig.update_layout(
    width=2000,
    height=1000,
    title='Standard deviation of each transformed float feature per dataset',
    xaxis_title='Dataset index (position in `datasets`)',
    yaxis_title='Standard deviation',
)
fig

In [26]:
# Inspect the raw KS results of one feature: the list holds one KstestResult
# per pair of consecutive datasets.
kstests['avg_gas_paid_per_tx_eth']

[KstestResult(statistic=0.20358497381253388, pvalue=0.0, statistic_location=0.004328932705755147, statistic_sign=-1),
 KstestResult(statistic=0.29565310637529346, pvalue=0.0, statistic_location=0.007673845695984875, statistic_sign=-1),
 KstestResult(statistic=0.38717263193012036, pvalue=0.0, statistic_location=0.014837239493548465, statistic_sign=-1)]

In [27]:
# One line per feature: the KS statistic of each consecutive-dataset comparison.
# Each entry of kstests[column] is a whole KstestResult, so extract its
# `statistic` for every comparison instead of passing a single result object
# (picked with a hard-coded index) as the y-series.
fig = go.Figure()
for column, results in kstests.items():
    fig.add_trace(
        go.Scatter(y=[result.statistic for result in results], name=column)
    )
fig.update_layout(
    width=2000,
    height=1000,
    title='KS statistic between consecutive datasets, per feature',
    xaxis_title='Comparison index (0 = datasets[1] vs datasets[0])',
    yaxis_title='KS statistic',
)
fig

In [28]:
# Stack all snapshots into one frame, tagging each with a synthetic monthly
# date and a reference/analyze label.
REFERENCE_DATE = pd.Timestamp(2024, 11, 1)

labelled_periods = []
for i, X_period in enumerate(datasets):
    # assign() returns a new frame, so the frames in `datasets` are not
    # mutated and the cells above stay re-runnable.
    labelled_periods.append(
        X_period.assign(
            date=REFERENCE_DATE + pd.DateOffset(months=i),
            type='reference',
        )
    )

# Concatenate once and renumber the rows (ignore_index) so the index is
# unique across periods.
total = pd.concat(labelled_periods, axis=0, ignore_index=True)

# Every period after the first one is data to analyze.
total.loc[total['date'] > REFERENCE_DATE, 'type'] = 'analyze'
total

Unnamed: 0,borrow_block_number,borrow_timestamp,wallet_address,first_tx_timestamp,last_tx_timestamp,wallet_age,incoming_tx_count,outgoing_tx_count,net_incoming_tx_count,total_gas_paid_eth,...,market_natr,market_plus_di,market_plus_dm,market_ppo,market_rocp,market_rocr,unique_borrow_protocol_count,unique_lending_protocol_count,date,type
0,7711117,1.557197e+09,0x502cb8985b2c92a8d4bf309cdaa89de9be442708,1.537224e+09,1.557197e+09,19973049.0,199,438,-239,0.397391,...,4.479356,33.216622,33.415526,-2.370346,0.104294,1.104294,0,1,2024-11-01,reference
1,7711123,1.557197e+09,0x502cb8985b2c92a8d4bf309cdaa89de9be442708,1.537224e+09,1.557197e+09,19973188.0,200,439,-239,0.399063,...,4.479356,33.216622,33.415526,-2.370346,0.104294,1.104294,1,1,2024-11-01,reference
2,7711126,1.557197e+09,0x502cb8985b2c92a8d4bf309cdaa89de9be442708,1.537224e+09,1.557197e+09,19973238.0,201,440,-239,0.400895,...,4.479356,33.216622,33.415526,-2.370346,0.104294,1.104294,1,1,2024-11-01,reference
3,7711672,1.557205e+09,0xa7ff0d561cd15ed525e31bbe0af3fe34ac2059f6,1.557191e+09,1.557198e+09,13922.0,8,56,-48,0.604001,...,4.479356,33.216622,33.415526,-2.370346,0.104294,1.104294,0,1,2024-11-01,reference
4,7712572,1.557217e+09,0xbd9ed130a53cfafcf81502e4d35329a6c4d53410,1.557217e+09,1.557217e+09,264.0,2,3,-1,0.003008,...,4.479356,33.216622,33.415526,-2.370346,0.104294,1.104294,0,1,2024-11-01,reference
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88588,18250961,1.696106e+09,0x3dc6d0168838c40e26d105115908ee833b24e56a,1.677567e+09,1.696106e+09,18539076.0,68,91,-23,0.352223,...,3.327281,21.642831,116.954834,-0.430690,0.018477,1.018477,0,1,2025-02-01,analyze
88589,18251701,1.696115e+09,0x3af16178664dd4b2500c9d02c53347e86b3e7873,1.624471e+09,1.695934e+09,71643811.0,154,168,-14,1.382943,...,3.327281,21.642831,116.954834,-0.430690,0.018477,1.018477,1,1,2025-02-01,analyze
88590,18251742,1.696116e+09,0xba01430a43496df98956e42aab08eb85ca107bb1,1.613776e+09,1.696116e+09,82339560.0,163,310,-147,1.218006,...,3.327281,21.642831,116.954834,-0.430690,0.018477,1.018477,1,1,2025-02-01,analyze
88591,18251864,1.696117e+09,0x1f2b17bff4a0313ec6342eef464742ad1c2de83c,1.671973e+09,1.695933e+09,24144360.0,117,221,-104,0.738964,...,3.327281,21.642831,116.954834,-0.430690,0.018477,1.018477,1,1,2025-02-01,analyze


In [None]:
from nannyml import DataReconstructionDriftCalculator

# Fit the reconstruction-error drift calculator on the reference rows, using
# every float64 column of `total` as a feature.
float_features = total.select_dtypes(include=np.float64).columns
reference_rows = total[total['type']=='reference']
rcerror_calculator = DataReconstructionDriftCalculator(float_features).fit(
    reference_data=reference_rows
)

# NOTE(review): `total` also contains the reference rows, so they are scored
# again here - confirm whether only the 'analyze' rows should be passed.
rcerror_results = rcerror_calculator.calculate(data=total)

# Plot the results.
figure = rcerror_results.plot()
figure.show()


Degrees of freedom <= 0 for slice


invalid value encountered in scalar divide

