In [1]:
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from collections import defaultdict
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, FunctionTransformer
from sklearn.impute import SimpleImputer


maxcols = pd.get_option("display.max_columns")
pd.set_option("display.max_columns", None)

In [2]:
# para entrenar
X_t0 = pd.read_parquet('X_t0.parquet')
y_t0 = pd.read_parquet('y_t0.parquet')
# para predecir(?)
X_t1 = pd.read_parquet('X_t1.parquet')
X_t0 = X_t0[sorted(X_t0.columns)]
X_t1 = X_t1[sorted(X_t1.columns)]

X = X_t0.copy()
y = y_t0.copy()

# Entrega 1

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin

# Calcula el número de prestamos por wallet_address
class borrow_times(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.borrow_per_cli = None

    def fit(self, X, y=None):
        data = X.copy()
        object_var = data.select_dtypes(include='object').columns.to_list()
        vars = object_var + ['borrow_timestamp']

        self.borrow_per_cli = data[vars].groupby(*object_var).count()
        self.borrow_per_cli.rename(columns={'borrow_timestamp': 'borrow_times'}, inplace=True)
        self.borrow_per_cli.reset_index(inplace=True)
        return self

    def transform(self, X, y=None):
        data = X.copy()
        new_X = pd.merge(data, self.borrow_per_cli, on='wallet_address', how='left').fillna(0)
        new_X = new_X.sort_index(axis=1)
        return new_X

    def set_output(self,transform='default'):
        #No modificar este método
        return self

# Se calcula tiempo entre las_tx_timestamp y first_tx_timestamp = ts_diff_tx,
# Renombra 'risky_first_last_tx_timestamp_diff':'ts_diff_risky_tx'}
# Elimina las_tx_timestamp y first_tx_timestamp

class tx_diff(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.ts_diff_tx = None

    def fit(self, X, y=None):
        data = X.copy()
        self.data = data
        return self

    def transform(self, X, y=None):
        data = X.copy()
        data['ts_diff_tx'] = data['last_tx_timestamp'] - data['first_tx_timestamp']
        data.rename(columns={'risky_first_last_tx_timestamp_diff':'ts_diff_risky_tx'}, inplace=True)
        data.drop(columns=['last_tx_timestamp',
                           'first_tx_timestamp',
                           'risky_last_tx_timestamp',
                           'risky_first_tx_timestamp',
                        #    'borrow_timestamp'
                           ], inplace=True)

        new_data = data.sort_index(axis=1)
        return new_data

    def set_output(self,transform='default'):
        return self

class search_binary(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.binary_cols = []

    def fit(self, X, y=None):
        data = X.copy()
        for col in data.columns:
            diff_values = len(data[col].value_counts())
            is_binary = diff_values == 2
            if is_binary:
                self.binary_cols.append(col)
        return self

    def transform(self, X, y=None):
        data = X.copy()
        if self.binary_cols:
            binary_col = self.binary_cols[0] if isinstance(self.binary_cols, list) else self.binary_cols
            data[binary_col] = data[binary_col].astype('category')
        return data

    def set_output(self, transform='default'):
        return self

In [4]:
class time_tranf(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.time_cols = []

    def fit(self, X, y=None):
        data = X.copy()
        tcols = [col for col in data.columns if "timestamp" in col]
        self.time_cols = tcols
        return self

    def transform(self, X, y=None):
        data = X.copy()
        for col in self.time_cols:
            min_ms = data[col].min()
            data[col] = data[col] - min_ms
        return data

    def set_output(self,transform='default'):
        return self

In [20]:
a = [1,3,4,"as"]
a.remove("as")
a

[1, 3, 4]

In [None]:
from sklearn.compose import make_column_selector
from sklearn.preprocessing import KBinsDiscretizer

# bins_transformer = ColumnTransformer([
#     ('to_binary', KBinsDiscretizer(n_bins=2, encode='ordinal', strategy='kmeans'), ['time_since_last_liquidated'])
#     ], 
#     remainder='passthrough',
#     verbose_feature_names_out=False) 
# bins_transformer.set_output(transform='pandas')

first_transformer = ColumnTransformer([
    # ('drop_times', 'drop', make_column_selector(dtype_include=datetime)),
    ('scale_data', MinMaxScaler(), make_column_selector(dtype_include='number',)),
    # ('object', 'drop', make_column_selector(dtype_include=object)),
    ('object', 'drop', ['wallet_address']),
    ('categorical', 'passthrough', make_column_selector(dtype_include='category'))
    ],
    remainder='passthrough',
    verbose_feature_names_out=False)
first_transformer.set_output(transform='pandas')

In [7]:
transf_pipe = Pipeline([
                ('add_borrow', borrow_times()),
                ('diff_tranf', tx_diff()),
                ('binary_cols', search_binary()),
                # ('time_cols', time_tranf()), # Dejamos la columna temporal como numero para aplicar MinMaxScaler y meter al modelo 
                ('cols_transf', first_transformer),
                ])

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.3, random_state=29, shuffle=True,)

# X_train.to_csv('X_train.csv')
# X_test.to_csv('X_test.csv')
# y_train.to_csv('y_train.csv')
# y_test.to_csv('y_test.csv')

In [9]:
from sklearn.dummy import DummyClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import roc_curve, roc_auc_score, auc
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

### Dummy Classifier

In [10]:
# X_train_dummy = pd.read_csv('X_train.csv')
# X_test_dummy = pd.read_csv('X_test.csv')

dummy_pipe = Pipeline([
                    ('preprocess', transf_pipe),
                    ('dummy_clf', DummyClassifier(strategy='most_frequent'))
                    ])
dummy_pipe.fit(X_train, y_train)
y_dummy = dummy_pipe.predict(X_test)
dummy_pipe.score(X_test, y_test)

ValueError: Duplicated feature names found before concatenating the outputs of the transformers: ['time_since_last_liquidated'].
Transformer to_binary has conflicting columns names: ['time_since_last_liquidated'].
Transformer scale_data has conflicting columns names: ['time_since_last_liquidated'].
Either make sure that the transformers named above do not generate columns with conflicting names or set verbose_feature_names_out=True to automatically prefix to the output feature names with the name of the transformer to prevent any conflicting names.

### Linear SVC

In [None]:
# Linear SVC
# X_train_lsvc = pd.read_csv('X_train.csv')
# X_test_lsvc = pd.read_csv('X_test.csv')

lsvc_pipe = Pipeline([
                    ('preprocess', transf_pipe),
                    ('lsvc_clf', LinearSVC(random_state=29))
                    ])
# lsvc_clf = LinearSVC(random_state=29)
lsvc_pipe.fit(X_train, y_train)
y_lsvc = lsvc_pipe.predict(X_test)
lsvc_pipe.score(X_test, y_test)

  y = column_or_1d(y, warn=True)


0.7543080743472045

### Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# X_train_tree = pd.read_csv('X_train.csv')
# X_test_tree = pd.read_csv('X_test.csv')

tree_pipe = Pipeline([
                    ('preprocess', transf_pipe),
                    ('tree_clf', DecisionTreeClassifier(random_state=29))
                    ])

tree_pipe.fit(X_train, y_train)
y_tree = tree_pipe.predict(X_test)
tree_pipe.score(X_test, y_test)

0.8370833019790804

### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# X_train_rf = pd.read_csv('X_train.csv')
# X_test_rf = pd.read_csv('X_test.csv')

rf_pipe = Pipeline([
                    ('preprocess', transf_pipe),
                    ('rf_clf', RandomForestClassifier(criterion='entropy',random_state=29))
                    ])

rf_pipe.fit(X_train, y_train)
y_tree = rf_pipe.predict(X_test)
rf_pipe.score(X_test, y_test)

  return fit_method(estimator, *args, **kwargs)


0.9014974791180675

# Entrega 2

In [None]:
!pip install optuna



In [None]:
import optuna
from optuna.visualization.matplotlib import (plot_optimization_history, plot_param_importances,
                                    plot_parallel_coordinate)

In [None]:
rf_pipe.get_params()

{'memory': None,
 'steps': [('preprocess',
   Pipeline(steps=[('add_borrow', borrow_times()), ('diff_tranf', tx_diff()),
                   ('binary_cols', search_binary()), ('time_cols', time_tranf()),
                   ('cols_transf',
                    ColumnTransformer(remainder='passthrough',
                                      transformers=[('scale_data', MinMaxScaler(),
                                                     <sklearn.compose._column_transformer.make_column_selector object at 0x79cbe5e658a0>),
                                                    ('object', 'drop',
                                                     ['wallet_address']),
                                                    ('categorical', 'passthrough',
                                                     <sklearn.compose._column_transformer.make_column_selector object at 0x79cbe5e66350>)],
                                      verbose_feature_names_out=False))])),
  ('rf_clf', RandomForestClassifi

In [None]:
from sklearn.metrics import f1_score
from time import process_time

In [None]:
# Optimización de parámetros del modelo seleccionado.

def objective_function(trial):
    # Comenzamos a tomar el tiempo que tarda en ejecutarse cada iteración
    inicial_time = process_time()

    # Split into train and validation sets

    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y , test_size=0.2, random_state=29)

    y_train, y_valid = np.ravel(y_train), np.ravel(y_valid)

    # Hyperparameters
    rf_params = {
        "rf_clf__n_estimators": trial.suggest_int("n_estimators", 10, 400),
        "rf_clf__criterion": trial.suggest_categorical('criterion', ['gini', 'entropy', 'log_loss']),
        "rf_clf__max_depth": trial.suggest_int("max_depth", 3, 50),
        'rf_clf__max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 2, 60),
        'rf_clf__max_features':  trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
        }

    # rf_pipe = Pipeline([
    #                 ('preprocess', transf_pipe),
    #                 ('rf_clf', RandomForestClassifier(random_state=29))
    #                 ])
    rf_pipe.fit(
        X_train,
        y_train,
    ).set_params(**rf_params)

    # Predict and evaluate the model
    yhat = rf_pipe.predict(X_valid)
    f1 = f1_score(y_valid, yhat, average='weighted')
    elapsed_time = process_time() - inicial_time

    print('elapsed_time', elapsed_time)
    print('f1 score', f1)
    print('')
    return f1


In [None]:
inicial_opti = process_time()

study = optuna.create_study(study_name='RandomForest optimization', direction="maximize")
study.optimize(objective_function, n_trials=10)
opti_time = process_time() - inicial_opti
print('Tiempo total de opti: ', opti_time)

### XGBoost

In [None]:
!pip install xgboost



In [None]:
from xgboost import XGBClassifier

xgb_pipe = Pipeline([
                    ('preprocess', transf_pipe),
                    ('xgb_clf', XGBClassifier(random_state=29,
                                              enable_categorical=True))
                    ])

xgb_pipe.fit(X_train, y_train)
y_xgb= xgb_pipe.predict(X_test)
xgb_pipe.score(X_test, y_test)

0.8858454360749493

In [None]:
def objective_function(trial):
    # Comenzamos a tomar el tiempo que tarda en ejecutarse cada iteración
    inicial_time = process_time()

    # Split into train and validation sets
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y , test_size=0.2, random_state=29)

    y_train, y_valid = np.ravel(y_train), np.ravel(y_valid)

    # Hyperparameters
    xgb_params = {
            "xgb_clf__n_estimators": trial.suggest_int("n_estimators", 10, 500),
            "xgb_clf__max_depth": trial.suggest_int("max_depth", 3, 10),
            'xgb_clf__max_leaves': trial.suggest_int("max_leaves", 3, 30),
            "xgb_clf__grow_policy": trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide']),
            "xgb_clf__learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True),
            "xgb_clf__n_jobs": trial.suggest_int('n_jobs', 1, 3),
            "xgb_clf__gamma": trial.suggest_float("gamma", 0, 1),
            "xgb_clf__min_child_weight": trial.suggest_int("min_child_weight", 1, 7),
            "xgb_clf__subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "xgb_clf__colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        }

    xgb_pipe.fit(
        X_train,
        y_train,
      ).set_params(**xgb_params)

    # Predict and evaluate the model
    yhat = xgb_pipe.predict(X_valid)
    f1 = f1_score(y_valid, yhat, average='weighted')
    elapsed_time = process_time() - inicial_time

    print('elapsed_time', elapsed_time)
    print('f1 score', f1)
    print('')
    return f1


In [None]:
pip install optuna

Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.7-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.7-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.9/78.9 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Ma

In [None]:
import optuna

inicial_opti = process_time()

study = optuna.create_study(study_name='XGBoost Clf optimization', direction="maximize")
study.optimize(objective_function, n_trials=10)
opti_time = process_time() - inicial_opti
print('Tiempo total de opti: ', opti_time)

[I 2024-12-04 20:26:22,990] A new study created in memory with name: XGBoost Clf optimization
[I 2024-12-04 20:26:39,596] Trial 0 finished with value: 0.8910427663548696 and parameters: {'n_estimators': 130, 'max_depth': 7, 'max_leaves': 27, 'grow_policy': 'depthwise', 'learning_rate': 0.00612164787608574, 'n_jobs': 1, 'gamma': 0.09002119746002757, 'min_child_weight': 2, 'subsample': 0.5548786194877064, 'colsample_bytree': 0.8901226152691828}. Best is trial 0 with value: 0.8910427663548696.


elapsed_time 13.163723465000004
f1 score 0.8910427663548696



[I 2024-12-04 20:26:44,374] Trial 1 finished with value: 0.7992617634637122 and parameters: {'n_estimators': 47, 'max_depth': 3, 'max_leaves': 25, 'grow_policy': 'lossguide', 'learning_rate': 0.01807576817824144, 'n_jobs': 1, 'gamma': 0.10749688994823847, 'min_child_weight': 6, 'subsample': 0.849687222441334, 'colsample_bytree': 0.565362222582908}. Best is trial 0 with value: 0.8910427663548696.


elapsed_time 4.7097025459999955
f1 score 0.7992617634637122



[I 2024-12-04 20:26:45,741] Trial 2 finished with value: 0.7706240552740429 and parameters: {'n_estimators': 376, 'max_depth': 7, 'max_leaves': 24, 'grow_policy': 'lossguide', 'learning_rate': 0.013834993169236573, 'n_jobs': 2, 'gamma': 0.3528878793736139, 'min_child_weight': 1, 'subsample': 0.677406451303477, 'colsample_bytree': 0.7437384055776143}. Best is trial 0 with value: 0.8910427663548696.


elapsed_time 1.4690820089999974
f1 score 0.7706240552740429



[I 2024-12-04 20:26:54,795] Trial 3 finished with value: 0.8350946783059382 and parameters: {'n_estimators': 143, 'max_depth': 7, 'max_leaves': 3, 'grow_policy': 'depthwise', 'learning_rate': 0.04219150808648471, 'n_jobs': 3, 'gamma': 0.65944843026627, 'min_child_weight': 3, 'subsample': 0.7139854424330117, 'colsample_bytree': 0.5234032275806103}. Best is trial 0 with value: 0.8910427663548696.


elapsed_time 16.284833629999994
f1 score 0.8350946783059382



[I 2024-12-04 20:26:57,306] Trial 4 finished with value: 0.7736662990839697 and parameters: {'n_estimators': 170, 'max_depth': 7, 'max_leaves': 28, 'grow_policy': 'lossguide', 'learning_rate': 0.027383732784841276, 'n_jobs': 3, 'gamma': 0.4338075275827312, 'min_child_weight': 6, 'subsample': 0.9378191725210808, 'colsample_bytree': 0.7578774859650863}. Best is trial 0 with value: 0.8910427663548696.


elapsed_time 3.3301307389999977
f1 score 0.7736662990839697



[I 2024-12-04 20:27:01,709] Trial 5 finished with value: 0.832251274461715 and parameters: {'n_estimators': 98, 'max_depth': 6, 'max_leaves': 28, 'grow_policy': 'depthwise', 'learning_rate': 0.01656075434427193, 'n_jobs': 1, 'gamma': 0.8081107396635475, 'min_child_weight': 5, 'subsample': 0.6715743140880799, 'colsample_bytree': 0.6955345488431928}. Best is trial 0 with value: 0.8910427663548696.


elapsed_time 8.00888912100001
f1 score 0.832251274461715



[I 2024-12-04 20:27:04,488] Trial 6 finished with value: 0.8068179150954266 and parameters: {'n_estimators': 481, 'max_depth': 7, 'max_leaves': 6, 'grow_policy': 'depthwise', 'learning_rate': 0.025739676277709202, 'n_jobs': 3, 'gamma': 0.7990891528054374, 'min_child_weight': 5, 'subsample': 0.8008741572004341, 'colsample_bytree': 0.8646440444691503}. Best is trial 0 with value: 0.8910427663548696.


elapsed_time 2.891800269000001
f1 score 0.8068179150954266



[I 2024-12-04 20:27:11,682] Trial 7 finished with value: 0.8130346906482963 and parameters: {'n_estimators': 174, 'max_depth': 9, 'max_leaves': 27, 'grow_policy': 'lossguide', 'learning_rate': 0.05768216232615857, 'n_jobs': 3, 'gamma': 0.19512906493995108, 'min_child_weight': 5, 'subsample': 0.8814956591669705, 'colsample_bytree': 0.6562661315967855}. Best is trial 0 with value: 0.8910427663548696.


elapsed_time 11.119512861999993
f1 score 0.8130346906482963



[I 2024-12-04 20:27:15,784] Trial 8 finished with value: 0.8566624896713665 and parameters: {'n_estimators': 447, 'max_depth': 10, 'max_leaves': 20, 'grow_policy': 'lossguide', 'learning_rate': 0.0054441086539125496, 'n_jobs': 1, 'gamma': 0.9036958140347973, 'min_child_weight': 6, 'subsample': 0.5686850965343634, 'colsample_bytree': 0.8923193253644796}. Best is trial 0 with value: 0.8910427663548696.


elapsed_time 7.571388710999997
f1 score 0.8566624896713665



[I 2024-12-04 20:27:27,291] Trial 9 finished with value: 0.815796460208515 and parameters: {'n_estimators': 430, 'max_depth': 10, 'max_leaves': 11, 'grow_policy': 'depthwise', 'learning_rate': 0.05264160306771081, 'n_jobs': 1, 'gamma': 0.030298751469984286, 'min_child_weight': 6, 'subsample': 0.6996115247115848, 'colsample_bytree': 0.7343145630497011}. Best is trial 0 with value: 0.8910427663548696.


elapsed_time 11.544659451000001
f1 score 0.815796460208515

Tiempo total de opti:  80.169018967


In [None]:
# Mejor modelo
print(study.study_name)
print('f1: ', study.best_trial.value)
best_params = study.best_trial.params
print('best params: ', best_params)

# Debemos agregar 'xgb_clf__' a cada key para poder entrenar el modelo con los parametros encontrados
xgb_best_params = {f"xgb_clf__{k}": v for k, v in best_params.items()}

XGBoost Clf optimization
f1:  0.8910427663548696
best params:  {'n_estimators': 130, 'max_depth': 7, 'max_leaves': 27, 'grow_policy': 'depthwise', 'learning_rate': 0.00612164787608574, 'n_jobs': 1, 'gamma': 0.09002119746002757, 'min_child_weight': 2, 'subsample': 0.5548786194877064, 'colsample_bytree': 0.8901226152691828}


In [None]:
best_params.items()

dict_items([('n_estimators', 130), ('max_depth', 7), ('max_leaves', 27), ('grow_policy', 'depthwise'), ('learning_rate', 0.00612164787608574), ('n_jobs', 1), ('gamma', 0.09002119746002757), ('min_child_weight', 2), ('subsample', 0.5548786194877064), ('colsample_bytree', 0.8901226152691828)])

In [None]:
# Entreno el modelo con los mejores parámetros
xgb_pipe.fit(X_train, y_train).set_params(**xgb_best_params)

## Interpretabilidad

In [None]:
X_t1 = pd.read_parquet('X_t1.parquet')
y_t1 = pd.read_parquet('y_t1.parquet')
X_t2 = pd.read_parquet('X_t2.parquet')

X_train_t1, X_test_t1, y_train_t1, y_test_t1 = train_test_split(X_t1, y_t1,
                                                    test_size=0.3, random_state=29, shuffle=True,)

# Re-entrenamiento de modelos

Con la variación y entrega de nuevos datos, un proyecto de data-science debe incluir este paso. Sin embargo, entrenar con todos los datos puede ser costoso. Es importante comprender que un re-entrenamiento puede ser caro y requiere herramientas adecuadas. Como primera aproximación a este paradigma, se les pide lo siguiente:

- Diseñar y ejecutar estrategias de re-entrenamiento para mantener la precisión y relevancia de los modelos, utilizando estrategias de ```partial_fit```
- Automatizar el proceso de actualización de modelos basados en nuevos datos y feedback recibido a través de una función.
- Acompañar el re-entrenamiento de una etapa de optimización.

Podría serles útil la inicialización de modelos en base a pesos pasados. Mayor información la pueden encontrar en el siguiente [link](https://stackoverflow.com/questions/38079853/how-can-i-implement-incremental-training-for-xgboost).

In [None]:
# xgb_pipe.named_steps['xgb_clf'].get_params()

{'objective': 'binary:logistic',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': 0.7316593364997117,
 'device': None,
 'early_stopping_rounds': None,
 'enable_categorical': True,
 'eval_metric': None,
 'feature_types': None,
 'gamma': 0.46692061985110067,
 'grow_policy': 'depthwise',
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': 0.09043057330159886,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': 9,
 'max_leaves': 7,
 'min_child_weight': 2,
 'missing': nan,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_estimators': 378,
 'n_jobs': 2,
 'num_parallel_tree': None,
 'random_state': 29,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': 0.710585955296333,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [None]:
from sklearn.metrics import mean_squared_error
import xgboost as xgb

In [None]:
model = xgb_pipe.fit(X_train, y_train).set_params(**xgb_best_params)
print("Score t0:", xgb_pipe.score(X_test, y_test))

Score t0: 0.8013394536834977


In [None]:
# Initial training
batch_size=3000
# Entreno el modelo con los mejores parámetros
xgb_pipe.fit(X_train, y_train).set_params(**xgb_best_params)
print("Score t0:", xgb_pipe.score(X_test, y_test))

for start in range(0, len(X_train_t1), batch_size):

    # Incremental training on next batch
    X_batch, y_batch = X_train_t1[start:start+batch_size], y_train_t1[start:start+batch_size]

    X_train_batch= xgb_pipe.named_steps['preprocess'].transform(X_batch)
    xgb_pipe.named_steps['xgb_clf'].fit(X_train_batch, y_batch,
                                        xgb_model = xgb_pipe.named_steps['xgb_clf']\
                                                            .get_booster())

    y_xgb_2 = xgb_pipe.predict(X_test_t1)
    print(f"Incremental model f1 score after batch {start}:", f1_score(y_test_t1, y_xgb_2, average='weighted'))

Score t0: 0.8013394536834977
Incremental model f1 score after batch 0: 0.794888265236676
Incremental model f1 score after batch 2000: 0.799355505404701
Incremental model f1 score after batch 4000: 0.8008748038477025
Incremental model f1 score after batch 6000: 0.8035382360343697
Incremental model f1 score after batch 8000: 0.8069991117302833
Incremental model f1 score after batch 10000: 0.8079212776015634
Incremental model f1 score after batch 12000: 0.8069717170808425
Incremental model f1 score after batch 14000: 0.810184792595038
Incremental model f1 score after batch 16000: 0.8125981051565923
Incremental model f1 score after batch 18000: 0.8134166040071317
Incremental model f1 score after batch 20000: 0.8133294157461598
Incremental model f1 score after batch 22000: 0.8139232849808791
Incremental model f1 score after batch 24000: 0.8139299199308677
Incremental model f1 score after batch 26000: 0.8147089354488908
Incremental model f1 score after batch 28000: 0.815016073518495
Incremen

In [None]:
y_xgb_2 = xgb_pipe.predict(X_test)

f1_score(y_test, y_xgb_2, average='weighted')

0.7577758356574243

In [None]:
from zipfile import ZipFile
import os
def generateFiles(predict_data, clf_pipe):
    """Genera los archivos a subir en CodaLab

    Input
    ---------------
    predict_data: Dataframe con los datos de entrada a predecir
    clf_pipe: pipeline del clf

    Ouput
    ---------------
    archivo de txt
    """
    y_pred_clf = clf_pipe.predict_proba(predict_data)[:, 1]
    with open('./predictions.txt', 'w') as f:
        for item in y_pred_clf:
            f.write("%s\n" % item)

    with ZipFile('predictions.zip', 'w') as zipObj:
        zipObj.write('predictions.txt')
    os.remove('predictions.txt')

generateFiles(X_t2, xgb_pipe)