In [1]:
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from collections import defaultdict
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, FunctionTransformer
from sklearn.impute import SimpleImputer


maxcols = pd.get_option("display.max_columns")
pd.set_option("display.max_columns", None)

In [2]:
# para entrenar
X_t0 = pd.read_parquet('X_t0.parquet')
y_t0 = pd.read_parquet('y_t0.parquet')
# para predecir(?)
X_t1 = pd.read_parquet('X_t1.parquet')
X_t0 = X_t0[sorted(X_t0.columns)]
X_t1 = X_t1[sorted(X_t1.columns)]

X = X_t0.copy()
y = y_t0.copy()

# Entrega 1

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin

# Calcula el número de prestamos por wallet_address
class borrow_times(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.borrow_per_cli = None

    def fit(self, X, y=None):
        data = X.copy()
        object_var = data.select_dtypes(include='object').columns.to_list()
        vars = object_var + ['borrow_timestamp']

        self.borrow_per_cli = data[vars].groupby(*object_var).count()
        self.borrow_per_cli.rename(columns={'borrow_timestamp': 'borrow_times'}, inplace=True)
        self.borrow_per_cli.reset_index(inplace=True)
        return self

    def transform(self, X, y=None):
        data = X.copy()
        new_X = pd.merge(data, self.borrow_per_cli, on='wallet_address', how='left').fillna(0)
        new_X = new_X.sort_index(axis=1)
        return new_X

    def set_output(self,transform='default'):
        #No modificar este método
        return self

# Se calcula tiempo entre las_tx_timestamp y first_tx_timestamp = ts_diff_tx,
# Renombra 'risky_first_last_tx_timestamp_diff':'ts_diff_risky_tx'}
# Elimina las_tx_timestamp y first_tx_timestamp

class tx_diff(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.ts_diff_tx = None

    def fit(self, X, y=None):
        data = X.copy()
        self.data = data
        return self

    def transform(self, X, y=None):
        data = X.copy()
        data['ts_diff_tx'] = data['last_tx_timestamp'] - data['first_tx_timestamp']
        data.rename(columns={'risky_first_last_tx_timestamp_diff':'ts_diff_risky_tx'}, inplace=True)
        data.drop(columns=['last_tx_timestamp',
                           'first_tx_timestamp',
                           'risky_last_tx_timestamp',
                           'risky_first_tx_timestamp',
                           'borrow_timestamp'], inplace=True)

        new_data = data.sort_index(axis=1)
        return new_data

    def set_output(self,transform='default'):
        return self

class search_binary(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.binary_cols = []

    def fit(self, X, y=None):
        data = X.copy()
        for col in data.columns:
            diff_values = len(data[col].value_counts())
            is_binary = diff_values == 2
            if is_binary:
                self.binary_cols.append(col)
        return self

    def transform(self, X, y=None):
        data = X.copy()
        if self.binary_cols:
            binary_col = self.binary_cols[0] if isinstance(self.binary_cols, list) else self.binary_cols
            data[binary_col] = data[binary_col].astype('category')  # Modified line
        return data

    def set_output(self,transform='default'):
        return self

In [4]:
class time_tranf(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.time_cols = []

    def fit(self, X, y=None):
        data = X.copy()
        tcols = [col for col in data.columns if "timestamp" in col]
        self.time_cols = tcols
        return self

    def transform(self, X, y=None):
        data = X.copy()
        for col in self.time_cols:
            data[col] = data[col].apply(lambda x: datetime.datetime.fromtimestamp(x))
        return data

    def set_output(self,transform='default'):
        return self

In [5]:
from sklearn.compose import make_column_selector

first_transformer = ColumnTransformer([
                            # ('drop_times', 'drop', make_column_selector(dtype_include=datetime)),
                            ('scale_data', MinMaxScaler(), make_column_selector(dtype_include='number') ),
                            ('object', 'drop', make_column_selector(dtype_include=object)),
                            ('categorical', 'passthrough', make_column_selector(dtype_include='category'))
                            ],
                            remainder='passthrough',
                            verbose_feature_names_out=False)
first_transformer.set_output(transform='pandas')

In [6]:
transf_pipe = Pipeline([
                ('add_borrow', borrow_times()),
                ('diff_tranf', tx_diff()),
                ('binary_cols', search_binary()),
                ('time_cols', time_tranf()),
                ('cols_transf', first_transformer),
                ])

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.3, random_state=29, shuffle=True,)

# X_train.to_csv('X_train.csv')
# X_test.to_csv('X_test.csv')
# y_train.to_csv('y_train.csv')
# y_test.to_csv('y_test.csv')

In [8]:
from sklearn.dummy import DummyClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import roc_curve, roc_auc_score, auc
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

### Dummy Classifier

In [9]:
# X_train_dummy = pd.read_csv('X_train.csv')
# X_test_dummy = pd.read_csv('X_test.csv')

dummy_pipe = Pipeline([
                    ('preprocess', transf_pipe),
                    ('dummy_clf', DummyClassifier(strategy='most_frequent'))
                    ])
dummy_pipe.fit(X_train, y_train)
y_dummy = dummy_pipe.predict(X_test)
dummy_pipe.score(X_test, y_test)

0.5107231544886749

### Linear SVC

In [11]:
# Linear SVC
# X_train_lsvc = pd.read_csv('X_train.csv')
# X_test_lsvc = pd.read_csv('X_test.csv')

lsvc_pipe = Pipeline([
                    ('preprocess', transf_pipe),
                    ('lsvc_clf', LinearSVC(random_state=29))
                    ])
# lsvc_clf = LinearSVC(random_state=29)
lsvc_pipe.fit(X_train, y_train)
y_lsvc = lsvc_pipe.predict(X_test)
lsvc_pipe.score(X_test, y_test)

  y = column_or_1d(y, warn=True)


0.7543080743472045

### Decision Tree Classifier

In [12]:
from sklearn.tree import DecisionTreeClassifier

# X_train_tree = pd.read_csv('X_train.csv')
# X_test_tree = pd.read_csv('X_test.csv')

tree_pipe = Pipeline([
                    ('preprocess', transf_pipe),
                    ('tree_clf', DecisionTreeClassifier(random_state=29))
                    ])

tree_pipe.fit(X_train, y_train)
y_tree = tree_pipe.predict(X_test)
tree_pipe.score(X_test, y_test)

0.8370833019790804

### Random Forest Classifier

In [13]:
from sklearn.ensemble import RandomForestClassifier

# X_train_rf = pd.read_csv('X_train.csv')
# X_test_rf = pd.read_csv('X_test.csv')

rf_pipe = Pipeline([
                    ('preprocess', transf_pipe),
                    ('rf_clf', RandomForestClassifier(criterion='entropy',random_state=29))
                    ])

rf_pipe.fit(X_train, y_train)
y_tree = rf_pipe.predict(X_test)
rf_pipe.score(X_test, y_test)

  return fit_method(estimator, *args, **kwargs)


0.9014974791180675

# Entrega 2

In [14]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.7-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.7-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.9/78.9 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: M

In [15]:
import optuna
from optuna.visualization.matplotlib import (plot_optimization_history, plot_param_importances,
                                    plot_parallel_coordinate)

In [16]:
rf_pipe.get_params()

{'memory': None,
 'steps': [('preprocess',
   Pipeline(steps=[('add_borrow', borrow_times()), ('diff_tranf', tx_diff()),
                   ('binary_cols', search_binary()), ('time_cols', time_tranf()),
                   ('cols_transf',
                    ColumnTransformer(remainder='passthrough',
                                      transformers=[('scale_data', MinMaxScaler(),
                                                     <sklearn.compose._column_transformer.make_column_selector object at 0x7f634c1a2710>),
                                                    ('object', 'drop',
                                                     <sklearn.compose._column_transformer.make_column_selector object at 0x7f634c1a2320>),
                                                    ('categorical', 'passthrough',
                                                     <sklearn.compose._column_transformer.make_column_selector object at 0x7f634c1a2c50>)],
                                      verbos

In [18]:
# Optimización de parámetros del modelo seleccionado.
from sklearn.metrics import f1_score
from time import process_time


def objective_function(trial):
    # Comenzamos a tomar el tiempo que tarda en ejecutarse cada iteración
    inicial_time = process_time()

    # Split into train and validation sets

    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y , test_size=0.2, random_state=29)

    y_train, y_valid = np.ravel(y_train), np.ravel(y_valid)

    # Hyperparameters
    rf_params = {
        "rf_clf__n_estimators": trial.suggest_int("n_estimators", 10, 400),
        "rf_clf__criterion": trial.suggest_categorical('criterion', ['gini', 'entropy', 'log_loss']),
        "rf_clf__max_depth": trial.suggest_int("max_depth", 3, 50),
        'rf_clf__max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 2, 60),
        'rf_clf__max_features':  trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
        }

    # rf_pipe = Pipeline([
    #                 ('preprocess', transf_pipe),
    #                 ('rf_clf', RandomForestClassifier(random_state=29))
    #                 ])
    rf_pipe.fit(
        X_train,
        y_train,
    ).set_params(**rf_params)

    # Predict and evaluate the model
    yhat = rf_pipe.predict(X_valid)
    f1 = f1_score(y_valid, yhat, average='weighted')
    elapsed_time = process_time() - inicial_time

    print('elapsed_time', elapsed_time)
    print('f1 score', f1)
    print('')
    return f1


In [21]:
inicial_opti = process_time()

study = optuna.create_study(study_name='RandomForest optimization', direction="maximize")
study.optimize(objective_function, n_trials=10)
opti_time = process_time() - inicial_opti
print('Tiempo total de opti: ', opti_time)

[I 2024-12-04 13:00:38,025] A new study created in memory with name: RandomForest optimization
[I 2024-12-04 13:01:07,767] Trial 0 finished with value: 0.9085159147706858 and parameters: {'n_estimators': 218, 'criterion': 'gini', 'max_depth': 18, 'max_leaf_nodes': 11, 'max_features': 'log2'}. Best is trial 0 with value: 0.9085159147706858.


elapsed_time 29.620602392000002
f1 score 0.9085159147706858



[I 2024-12-04 13:01:23,103] Trial 1 finished with value: 0.766620605699887 and parameters: {'n_estimators': 78, 'criterion': 'log_loss', 'max_depth': 7, 'max_leaf_nodes': 4, 'max_features': 'log2'}. Best is trial 0 with value: 0.9085159147706858.


elapsed_time 15.018077551000005
f1 score 0.766620605699887



[I 2024-12-04 13:01:27,351] Trial 2 finished with value: 0.7401434558751568 and parameters: {'n_estimators': 353, 'criterion': 'gini', 'max_depth': 27, 'max_leaf_nodes': 15, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.9085159147706858.


elapsed_time 4.180544910999998
f1 score 0.7401434558751568



[I 2024-12-04 13:02:00,099] Trial 3 finished with value: 0.7729109169631426 and parameters: {'n_estimators': 326, 'criterion': 'log_loss', 'max_depth': 50, 'max_leaf_nodes': 17, 'max_features': 'log2'}. Best is trial 0 with value: 0.9085159147706858.


elapsed_time 32.30054624899999
f1 score 0.7729109169631426



[I 2024-12-04 13:02:32,684] Trial 4 finished with value: 0.7742818912680623 and parameters: {'n_estimators': 240, 'criterion': 'entropy', 'max_depth': 49, 'max_leaf_nodes': 57, 'max_features': 'log2'}. Best is trial 0 with value: 0.9085159147706858.


elapsed_time 30.046130515000016
f1 score 0.7742818912680623



[I 2024-12-04 13:03:05,454] Trial 5 finished with value: 0.8026176168685022 and parameters: {'n_estimators': 20, 'criterion': 'log_loss', 'max_depth': 41, 'max_leaf_nodes': 35, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.9085159147706858.


elapsed_time 30.517134473
f1 score 0.8026176168685022



[I 2024-12-04 13:03:08,705] Trial 6 finished with value: 0.7852953336213127 and parameters: {'n_estimators': 321, 'criterion': 'entropy', 'max_depth': 43, 'max_leaf_nodes': 8, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.9085159147706858.


elapsed_time 3.1523295929999904
f1 score 0.7852953336213127



[I 2024-12-04 13:03:35,934] Trial 7 finished with value: 0.7612333541664306 and parameters: {'n_estimators': 153, 'criterion': 'gini', 'max_depth': 16, 'max_leaf_nodes': 60, 'max_features': 'log2'}. Best is trial 0 with value: 0.9085159147706858.


elapsed_time 27.165157053
f1 score 0.7612333541664306



[I 2024-12-04 13:03:52,029] Trial 8 finished with value: 0.8068665117755756 and parameters: {'n_estimators': 194, 'criterion': 'entropy', 'max_depth': 39, 'max_leaf_nodes': 14, 'max_features': None}. Best is trial 0 with value: 0.9085159147706858.


elapsed_time 16.02485873499998
f1 score 0.8068665117755756



[I 2024-12-04 13:06:44,817] Trial 9 finished with value: 0.774534980593242 and parameters: {'n_estimators': 290, 'criterion': 'entropy', 'max_depth': 31, 'max_leaf_nodes': 7, 'max_features': 'log2'}. Best is trial 0 with value: 0.9085159147706858.


elapsed_time 161.36086601699998
f1 score 0.774534980593242



[I 2024-12-04 13:07:02,995] Trial 10 finished with value: 0.754514939285592 and parameters: {'n_estimators': 395, 'criterion': 'gini', 'max_depth': 15, 'max_leaf_nodes': 31, 'max_features': None}. Best is trial 0 with value: 0.9085159147706858.


elapsed_time 18.08797066400001
f1 score 0.754514939285592



[W 2024-12-04 13:08:12,474] Trial 11 failed with parameters: {'n_estimators': 172, 'criterion': 'entropy', 'max_depth': 35, 'max_leaf_nodes': 25, 'max_features': None} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "<ipython-input-18-14ec5d757cb3>", line 30, in objective_function
    rf_pipe.fit(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py", line 489, in fi

KeyboardInterrupt: 

### XGBoost

In [23]:
!pip install xgboost



In [25]:
from xgboost import XGBClassifier

xgb_pipe = Pipeline([
                    ('preprocess', transf_pipe),
                    ('xgb_clf', XGBClassifier(random_state=29,
                                              enable_categorical=True))
                    ])

xgb_pipe.fit(X_train, y_train)
y_xgb= xgb_pipe.predict(X_test)
xgb_pipe.score(X_test, y_test)

0.8858454360749493

In [31]:
def objective_function(trial):
    # Comenzamos a tomar el tiempo que tarda en ejecutarse cada iteración
    inicial_time = process_time()

    # Split into train and validation sets
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y , test_size=0.2, random_state=29)

    y_train, y_valid = np.ravel(y_train), np.ravel(y_valid)

    # Hyperparameters
    xgb_params = {
            "xgb_clf__objective": "binary:logistic",
            "xgb_clf__n_estimators": trial.suggest_int("n_estimators", 10, 500),
            "xgb_clf__max_depth": trial.suggest_int("max_depth", 3, 10),
            'xgb_clf__max_leaves': trial.suggest_int("max_leaves", 3, 30),
            "xgb_clf__grow_policy": trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide']),
            "xgb_clf__learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True),
            "xgb_clf__n_jobs": trial.suggest_int('n_jobs', 1, 3),
            "xgb_clf__gamma": trial.suggest_float("gamma", 0, 1),
            "xgb_clf__min_child_weight": trial.suggest_int("min_child_weight", 1, 7),
            "xgb_clf__subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "xgb_clf__colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        }


    xgb_pipe.fit(
        X_train,
        y_train,
      ).set_params(**xgb_params)

    # Predict and evaluate the model
    yhat = xgb_pipe.predict(X_valid)
    f1 = f1_score(y_valid, yhat, average='weighted')
    elapsed_time = process_time() - inicial_time

    print('elapsed_time', elapsed_time)
    print('f1 score', f1)
    print('')
    return f1


In [32]:
inicial_opti = process_time()

study = optuna.create_study(study_name='XGBoost Clf optimization', direction="maximize")
study.optimize(objective_function, n_trials=10)
opti_time = process_time() - inicial_opti
print('Tiempo total de opti: ', opti_time)

[I 2024-12-04 13:17:12,092] A new study created in memory with name: XGBoost Clf optimization
[I 2024-12-04 13:17:22,551] Trial 0 finished with value: 0.8910427663548696 and parameters: {'n_estimators': 303, 'max_depth': 3, 'max_leaves': 5, 'grow_policy': 'depthwise', 'learning_rate': 0.003313019329812414, 'n_jobs': 2, 'gamma': 0.3089944475612574, 'min_child_weight': 7, 'subsample': 0.8191434558476196, 'colsample_bytree': 0.6736108252904942}. Best is trial 0 with value: 0.8910427663548696.


elapsed_time 9.902537682999991
f1 score 0.8910427663548696



[I 2024-12-04 13:17:26,010] Trial 1 finished with value: 0.7643809973868376 and parameters: {'n_estimators': 428, 'max_depth': 4, 'max_leaves': 23, 'grow_policy': 'depthwise', 'learning_rate': 0.005851148196562649, 'n_jobs': 3, 'gamma': 0.6237160837698891, 'min_child_weight': 4, 'subsample': 0.9577516810900373, 'colsample_bytree': 0.5831571184910027}. Best is trial 0 with value: 0.8910427663548696.


elapsed_time 6.333542559999955
f1 score 0.7643809973868376



[I 2024-12-04 13:17:34,215] Trial 2 finished with value: 0.796449468897661 and parameters: {'n_estimators': 66, 'max_depth': 6, 'max_leaves': 13, 'grow_policy': 'depthwise', 'learning_rate': 0.07772427461413285, 'n_jobs': 3, 'gamma': 0.4370152400666991, 'min_child_weight': 1, 'subsample': 0.7577794391071899, 'colsample_bytree': 0.7089762039075714}. Best is trial 0 with value: 0.8910427663548696.


elapsed_time 12.879431273000023
f1 score 0.796449468897661



[I 2024-12-04 13:17:35,816] Trial 3 finished with value: 0.8092659315078963 and parameters: {'n_estimators': 496, 'max_depth': 10, 'max_leaves': 27, 'grow_policy': 'lossguide', 'learning_rate': 0.0015086483651120882, 'n_jobs': 2, 'gamma': 0.0955989326909874, 'min_child_weight': 2, 'subsample': 0.9514015176750305, 'colsample_bytree': 0.6135284114907607}. Best is trial 0 with value: 0.8910427663548696.


elapsed_time 2.7061360899999727
f1 score 0.8092659315078963



[I 2024-12-04 13:17:50,621] Trial 4 finished with value: 0.8066910642301962 and parameters: {'n_estimators': 54, 'max_depth': 8, 'max_leaves': 17, 'grow_policy': 'depthwise', 'learning_rate': 0.001740175520440753, 'n_jobs': 1, 'gamma': 0.8126530556006727, 'min_child_weight': 1, 'subsample': 0.9854355826325729, 'colsample_bytree': 0.8614218160341466}. Best is trial 0 with value: 0.8910427663548696.


elapsed_time 25.63919285099996
f1 score 0.8066910642301962



[I 2024-12-04 13:17:52,620] Trial 5 finished with value: 0.7516831155260532 and parameters: {'n_estimators': 436, 'max_depth': 9, 'max_leaves': 20, 'grow_policy': 'lossguide', 'learning_rate': 0.004206685112189822, 'n_jobs': 1, 'gamma': 0.6238782976498908, 'min_child_weight': 3, 'subsample': 0.9227043364915105, 'colsample_bytree': 0.5475174137101255}. Best is trial 0 with value: 0.8910427663548696.


elapsed_time 2.0341551669999944
f1 score 0.7516831155260532



[I 2024-12-04 13:18:02,403] Trial 6 finished with value: 0.8077092904684153 and parameters: {'n_estimators': 460, 'max_depth': 6, 'max_leaves': 22, 'grow_policy': 'lossguide', 'learning_rate': 0.0027667856580787112, 'n_jobs': 2, 'gamma': 0.5137068047462395, 'min_child_weight': 7, 'subsample': 0.8010941429551077, 'colsample_bytree': 0.9155781566111594}. Best is trial 0 with value: 0.8910427663548696.


elapsed_time 9.844075599000007
f1 score 0.8077092904684153



[I 2024-12-04 13:18:15,169] Trial 7 finished with value: 0.8054014570244794 and parameters: {'n_estimators': 90, 'max_depth': 10, 'max_leaves': 8, 'grow_policy': 'depthwise', 'learning_rate': 0.05081484632743441, 'n_jobs': 2, 'gamma': 0.10365399634821582, 'min_child_weight': 2, 'subsample': 0.9464415757588305, 'colsample_bytree': 0.6123402496059216}. Best is trial 0 with value: 0.8910427663548696.


elapsed_time 21.859632381999972
f1 score 0.8054014570244794



[I 2024-12-04 13:18:16,728] Trial 8 finished with value: 0.7900777211597091 and parameters: {'n_estimators': 359, 'max_depth': 10, 'max_leaves': 16, 'grow_policy': 'depthwise', 'learning_rate': 0.010672217814816677, 'n_jobs': 3, 'gamma': 0.8345317909666038, 'min_child_weight': 1, 'subsample': 0.5917008950601448, 'colsample_bytree': 0.8829444354681872}. Best is trial 0 with value: 0.8910427663548696.


elapsed_time 2.6297084469999845
f1 score 0.7900777211597091



[I 2024-12-04 13:18:25,134] Trial 9 finished with value: 0.8057831347240464 and parameters: {'n_estimators': 400, 'max_depth': 3, 'max_leaves': 22, 'grow_policy': 'depthwise', 'learning_rate': 0.008936571502548035, 'n_jobs': 1, 'gamma': 0.45895309103017434, 'min_child_weight': 7, 'subsample': 0.7092703305250805, 'colsample_bytree': 0.9171486704873568}. Best is trial 0 with value: 0.8910427663548696.


elapsed_time 13.265858170000001
f1 score 0.8057831347240464

Tiempo total de opti:  107.14080350799998


In [65]:
# Mejor modelo
print(study.study_name)
print('f1: ', study.best_trial.value)
best_params = study.best_trial.params
print('best params: ', best_params)

# Debemos agregar 'xgb_clf__' a cada key para poder entrenar el modelo con los parametros encontrados
xgb_best_params = {f"xgb_clf__{k}": v for k, v in best_params.items()}

XGBoost Clf optimization
f1:  0.8910427663548696
best params:  {'n_estimators': 303, 'max_depth': 3, 'max_leaves': 5, 'grow_policy': 'depthwise', 'learning_rate': 0.003313019329812414, 'n_jobs': 2, 'gamma': 0.3089944475612574, 'min_child_weight': 7, 'subsample': 0.8191434558476196, 'colsample_bytree': 0.6736108252904942}


In [61]:
best_params.items()

dict_items([('n_estimators', 303), ('max_depth', 3), ('max_leaves', 5), ('grow_policy', 'depthwise'), ('learning_rate', 0.003313019329812414), ('n_jobs', 2), ('gamma', 0.3089944475612574), ('min_child_weight', 7), ('subsample', 0.8191434558476196), ('colsample_bytree', 0.6736108252904942)])

In [57]:
# Entreno el modelo con los mejores parámetros
xgb_pipe.fit(X_train, y_train).set_params(**xgb_best_params)

## Interpretabilidad

# Re-entrenamiento de modelos

Con la variación y entrega de nuevos datos, un proyecto de data-science debe incluir este paso. Sin embargo, entrenar con todos los datos puede ser costoso. Es importante comprender que un re-entrenamiento puede ser caro y requiere herramientas adecuadas. Como primera aproximación a este paradigma, se les pide lo siguiente:

- Diseñar y ejecutar estrategias de re-entrenamiento para mantener la precisión y relevancia de los modelos, utilizando estrategias de ```partial_fit```
- Automatizar el proceso de actualización de modelos basados en nuevos datos y feedback recibido a través de una función.
- Acompañar el re-entrenamiento de una etapa de optimización.

Podría serles útil la inicialización de modelos en base a pesos pasados. Mayor información la pueden encontrar en el siguiente [link](https://stackoverflow.com/questions/38079853/how-can-i-implement-incremental-training-for-xgboost).

In [33]:
X_t1 = pd.read_parquet('X_t1.parquet')
y_t1 = pd.read_parquet('y_t1.parquet')
X_t2 = pd.read_parquet('X_t2.parquet')


In [67]:
X_train_t1, X_test_t1, y_train_t1, y_test_t1 = train_test_split(X_t1, y_t1,
                                                    test_size=0.3, random_state=29, shuffle=True,)

In [74]:
xgb_pipe.named_steps['xgb_clf'].get_params()

{'objective': 'binary:logistic',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': 0.6736108252904942,
 'device': None,
 'early_stopping_rounds': None,
 'enable_categorical': True,
 'eval_metric': None,
 'feature_types': None,
 'gamma': 0.3089944475612574,
 'grow_policy': 'depthwise',
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': 0.003313019329812414,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': 3,
 'max_leaves': 5,
 'min_child_weight': 7,
 'missing': nan,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_estimators': 303,
 'n_jobs': 2,
 'num_parallel_tree': None,
 'random_state': 29,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': 0.8191434558476196,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [None]:
batch_size = 50
iterations = 25
model = None
for i in range(iterations):
    for start in range(0, len(x_tr), batch_size):
        model = xgb.train({
            'learning_rate': 0.007,
            'update':'refresh',
            'process_type': 'update',
            'refresh_leaf': True,
            #'reg_lambda': 3,  # L2
            'reg_alpha': 3,  # L1
            'silent': False,
        }, dtrain=xgb.DMatrix(x_tr[start:start+batch_size], y_tr[start:start+batch_size]), xgb_model=model)

        y_pr = model.predict(xgb.DMatrix(x_te))
        #print('    MSE itr@{}: {}'.format(int(start/batch_size), sklearn.metrics.mean_squared_error(y_te, y_pr)))
    print('MSE itr@{}: {}'.format(i, sklearn.metrics.mean_squared_error(y_te, y_pr)))

y_pr = model.predict(xgb.DMatrix(x_te))
print('MSE at the end: {}'.format(sklearn.metrics.mean_squared_error(y_te, y_pr)))

In [73]:

# Initial training
# Entreno el modelo con los mejores parámetros
xgb_pipe.fit(X_train, y_train).set_params(**xgb_best_params)
print("Score t0:", xgb_pipe.score(X_test, y_test))

# Incremental training on next batch
# Suppose X_train_batch_2 and y_train_batch_2 represent the second batch of data
xgb_pipe.named_steps['xgb_clf'].fit(X_train_t1, y_train_t1, xgb_model=xgb_pipe.named_steps['xgb_clf'].get_booster())
y_xgb_2 = xgb_pipe.predict(X_test)
print("Incremental model score after batch 2:", xgb_pipe.score(X_test, y_test))

Score t0: 0.7633380991797727


ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:wallet_address: object

In [72]:
X_train_t1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 93021 entries, 97944 to 55045
Data columns (total 77 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   borrow_block_number                      93021 non-null  int64  
 1   borrow_timestamp                         93021 non-null  float64
 2   wallet_address                           93021 non-null  object 
 3   first_tx_timestamp                       93021 non-null  float64
 4   last_tx_timestamp                        93021 non-null  float64
 5   wallet_age                               93021 non-null  float64
 6   incoming_tx_count                        93021 non-null  int64  
 7   outgoing_tx_count                        93021 non-null  int64  
 8   net_incoming_tx_count                    93021 non-null  int64  
 9   total_gas_paid_eth                       93021 non-null  float64
 10  avg_gas_paid_per_tx_eth                  93021 

In [71]:
transf_pipe.fit_transform(X_train_t1).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93021 entries, 0 to 93020
Data columns (total 73 columns):
 #   Column                                   Non-Null Count  Dtype   
---  ------                                   --------------  -----   
 0   avg_gas_paid_per_tx_eth                  93021 non-null  float64 
 1   avg_risk_factor                          93021 non-null  float64 
 2   avg_weighted_risk_factor                 93021 non-null  float64 
 3   borrow_amount_avg_eth                    93021 non-null  float64 
 4   borrow_amount_sum_eth                    93021 non-null  float64 
 5   borrow_block_number                      93021 non-null  float64 
 6   borrow_count                             93021 non-null  float64 
 7   borrow_repay_diff_eth                    93021 non-null  float64 
 8   borrow_times                             93021 non-null  float64 
 9   deposit_amount_sum_eth                   93021 non-null  float64 
 10  deposit_count                     