In [178]:
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from collections import defaultdict
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, FunctionTransformer
from sklearn.impute import SimpleImputer


maxcols = pd.get_option("display.max_columns")
pd.set_option("display.max_columns", None)

In [179]:
# para entrenar
X_t0 = pd.read_parquet('X_t0.parquet')
y_t0 = pd.read_parquet('y_t0.parquet')
# para predecir(?)
X_t1 = pd.read_parquet('X_t1.parquet')
X_t0 = X_t0[sorted(X_t0.columns)]
X_t1 = X_t1[sorted(X_t1.columns)]

X = X_t0.copy()
y = y_t0.copy()

# Entrega 1

In [180]:
from sklearn.base import BaseEstimator, TransformerMixin

# Calcula el número de prestamos por wallet_address
class borrow_times(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.borrow_per_cli = None

    def fit(self, X, y=None):
        data = X.copy()
        object_var = data.select_dtypes(include='object').columns.to_list()
        vars = object_var + ['borrow_timestamp']

        self.borrow_per_cli = data[vars].groupby(*object_var).count()
        self.borrow_per_cli.rename(columns={'borrow_timestamp': 'borrow_times'}, inplace=True)
        self.borrow_per_cli.reset_index(inplace=True)
        return self

    def transform(self, X, y=None):
        data = X.copy()
        new_X = pd.merge(data, self.borrow_per_cli, on='wallet_address', how='left').fillna(0)
        new_X = new_X.sort_index(axis=1)
        return new_X

    def set_output(self,transform='default'):
        #No modificar este método
        return self

# Se calcula tiempo entre las_tx_timestamp y first_tx_timestamp = ts_diff_tx,
# Renombra 'risky_first_last_tx_timestamp_diff':'ts_diff_risky_tx'}
# Elimina las_tx_timestamp y first_tx_timestamp

class tx_diff(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.ts_diff_tx = None

    def fit(self, X, y=None):
        data = X.copy()
        self.data = data
        return self

    def transform(self, X, y=None):
        data = X.copy()
        data['ts_diff_tx'] = data['last_tx_timestamp'] - data['first_tx_timestamp']
        data.rename(columns={'risky_first_last_tx_timestamp_diff':'ts_diff_risky_tx'}, inplace=True)
        data.drop(columns=['last_tx_timestamp',
                           'first_tx_timestamp',
                           'risky_last_tx_timestamp',
                           'risky_first_tx_timestamp',
                        #    'borrow_timestamp'
                           ], inplace=True)

        new_data = data.sort_index(axis=1)
        return new_data

    def set_output(self,transform='default'):
        return self

class search_binary(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.binary_cols = []

    def fit(self, X, y=None):
        data = X.copy()
        for col in data.columns:
            diff_values = len(data[col].value_counts())
            is_binary = diff_values == 2
            if is_binary:
                self.binary_cols.append(col)
        return self

    def transform(self, X, y=None):
        data = X.copy()
        if self.binary_cols:
            binary_col = self.binary_cols[0] if isinstance(self.binary_cols, list) else self.binary_cols
            data[binary_col] = data[binary_col].astype('category')
        return data

    def set_output(self, transform='default'):
        return self

In [181]:
class time_tranf(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.time_cols = []

    def fit(self, X, y=None):
        data = X.copy()
        tcols = [col for col in data.columns if "timestamp" in col]
        self.time_cols = tcols
        return self

    def transform(self, X, y=None):
        data = X.copy()
        for col in self.time_cols:
            min_ms = data[col].min()
            data[col] = data[col] - min_ms
        return data

    def set_output(self,transform='default'):
        return self

class CategoryToInt(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X.astype(np.int64)

    def set_output(self,transform='default'):
        return self

In [183]:
from sklearn.compose import make_column_selector
from sklearn.preprocessing import KBinsDiscretizer

# bins_transformer = ColumnTransformer([
#     ('to_binary', KBinsDiscretizer(n_bins=2, encode='ordinal', strategy='kmeans'), ['time_since_last_liquidated'])
#     ], 
#     remainder='passthrough',
#     verbose_feature_names_out=False) 
# bins_transformer.set_output(transform='pandas')

first_transformer = ColumnTransformer([
    # ('drop_times', 'drop', make_column_selector(dtype_include=datetime)),
    ('scale_data', MinMaxScaler(), make_column_selector(dtype_include='number',)),
    # ('object', 'drop', make_column_selector(dtype_include=object)),
    ('object', 'drop', ['wallet_address']),
    ('categorical', CategoryToInt(), make_column_selector(dtype_include='category'))
    ],
    remainder='passthrough',
    verbose_feature_names_out=False)
first_transformer.set_output(transform='pandas')

In [184]:
transf_pipe = Pipeline([
                ('add_borrow', borrow_times()),
                ('diff_tranf', tx_diff()),
                ('binary_cols', search_binary()),
                # ('time_cols', time_tranf()), # Dejamos la columna temporal como numero para aplicar MinMaxScaler y meter al modelo 
                ('cols_transf', first_transformer),
                ])

In [185]:
"wallet_address" in transf_pipe.fit_transform(X_t0).columns

False

In [186]:
transf_pipe.fit_transform(X_t0)['market_ht_trendmode']

0        1
1        1
2        1
3        1
4        1
        ..
44291    0
44292    0
44293    0
44294    0
44295    0
Name: market_ht_trendmode, Length: 44296, dtype: int64

In [187]:
# transf_pipe.fit_transform(X_t0).select_dtypes(include='category').value_counts() # market_ht_trendmode

In [188]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.3, random_state=29, shuffle=True,)

# X_train.to_csv('X_train.csv')
# X_test.to_csv('X_test.csv')
# y_train.to_csv('y_train.csv')
# y_test.to_csv('y_test.csv')

In [189]:
from sklearn.dummy import DummyClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import roc_curve, roc_auc_score, auc
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

### Dummy Classifier

In [190]:
# X_train_dummy = pd.read_csv('X_train.csv')
# X_test_dummy = pd.read_csv('X_test.csv')

dummy_pipe = Pipeline([
                    ('preprocess', transf_pipe),
                    ('dummy_clf', DummyClassifier(strategy='most_frequent'))
                    ])
dummy_pipe.fit(X_train, y_train)
y_dummy = dummy_pipe.predict(X_test)
dummy_pipe.score(X_test, y_test)

0.5107231544886749

### Linear SVC

In [191]:
# Linear SVC
# X_train_lsvc = pd.read_csv('X_train.csv')
# X_test_lsvc = pd.read_csv('X_test.csv')

lsvc_pipe = Pipeline([
                    ('preprocess', transf_pipe),
                    ('lsvc_clf', LinearSVC(random_state=29))
                    ])
# lsvc_clf = LinearSVC(random_state=29)
lsvc_pipe.fit(X_train, y_train)
y_lsvc = lsvc_pipe.predict(X_test)
lsvc_pipe.score(X_test, y_test)

  y = column_or_1d(y, warn=True)


0.7543080743472045

### Decision Tree Classifier

In [192]:
from sklearn.tree import DecisionTreeClassifier

# X_train_tree = pd.read_csv('X_train.csv')
# X_test_tree = pd.read_csv('X_test.csv')

tree_pipe = Pipeline([
                    ('preprocess', transf_pipe),
                    ('tree_clf', DecisionTreeClassifier(random_state=29))
                    ])

tree_pipe.fit(X_train, y_train)
y_tree = tree_pipe.predict(X_test)
tree_pipe.score(X_test, y_test)

0.8412220633606743

### Random Forest Classifier

In [193]:
from sklearn.ensemble import RandomForestClassifier

# X_train_rf = pd.read_csv('X_train.csv')
# X_test_rf = pd.read_csv('X_test.csv')

rf_pipe = Pipeline([
                    ('preprocess', transf_pipe),
                    ('rf_clf', RandomForestClassifier(criterion='entropy',random_state=29))
                    ])

rf_pipe.fit(X_train, y_train)
y_tree = rf_pipe.predict(X_test)
rf_pipe.score(X_test, y_test)

  return fit_method(estimator, *args, **kwargs)


0.9036044849123335

# Entrega 2

In [194]:
!pip install optuna




[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [195]:
import optuna
from optuna.visualization.matplotlib import (plot_optimization_history, plot_param_importances,
                                    plot_parallel_coordinate)

In [196]:
rf_pipe.get_params()

{'memory': None,
 'steps': [('preprocess',
   Pipeline(steps=[('add_borrow', borrow_times()), ('diff_tranf', tx_diff()),
                   ('binary_cols', search_binary()),
                   ('cols_transf',
                    ColumnTransformer(remainder='passthrough',
                                      transformers=[('scale_data', MinMaxScaler(),
                                                     <sklearn.compose._column_transformer.make_column_selector object at 0x000002268D863B20>),
                                                    ('object', 'drop',
                                                     ['wallet_address']),
                                                    ('categorical',
                                                     CategoryToInt(),
                                                     <sklearn.compose._column_transformer.make_column_selector object at 0x000002268D8443A0>)],
                                      verbose_feature_names_out=False))])),

In [197]:
from sklearn.metrics import f1_score
from time import process_time

In [198]:
# Optimización de parámetros del modelo seleccionado.

def objective_function(trial):
    # Comenzamos a tomar el tiempo que tarda en ejecutarse cada iteración
    inicial_time = process_time()

    # Split into train and validation sets

    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y , test_size=0.2, random_state=29)

    y_train, y_valid = np.ravel(y_train), np.ravel(y_valid)

    # Hyperparameters
    rf_params = {
        "rf_clf__n_estimators": trial.suggest_int("n_estimators", 10, 400),
        "rf_clf__criterion": trial.suggest_categorical('criterion', ['gini', 'entropy', 'log_loss']),
        "rf_clf__max_depth": trial.suggest_int("max_depth", 3, 50),
        'rf_clf__max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 2, 60),
        'rf_clf__max_features':  trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
        }

    # rf_pipe = Pipeline([
    #                 ('preprocess', transf_pipe),
    #                 ('rf_clf', RandomForestClassifier(random_state=29))
    #                 ])
    rf_pipe.fit(
        X_train,
        y_train,
    ).set_params(**rf_params)

    # Predict and evaluate the model
    yhat = rf_pipe.predict(X_valid)
    f1 = f1_score(y_valid, yhat, average='weighted')
    elapsed_time = process_time() - inicial_time

    print('elapsed_time', elapsed_time)
    print('f1 score', f1)
    print('')
    return f1


In [199]:
# inicial_opti = process_time()

# study = optuna.create_study(study_name='RandomForest optimization', direction="maximize")
# study.optimize(objective_function, n_trials=10)
# opti_time = process_time() - inicial_opti
# print('Tiempo total de opti: ', opti_time)

### XGBoost

In [200]:
!pip install xgboost




[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [201]:
from xgboost import XGBClassifier

xgb_pipe = Pipeline([
                    ('preprocess', transf_pipe),
                    ('xgb_clf', XGBClassifier(random_state=29,
                                              enable_categorical=False))
                    ])

xgb_pipe.fit(X_train, y_train)
y_xgb= xgb_pipe.predict(X_test)
xgb_pipe.score(X_test, y_test)

0.8858454360749493

In [202]:
def objective_function(trial):
    # Comenzamos a tomar el tiempo que tarda en ejecutarse cada iteración
    inicial_time = process_time()

    # Split into train and validation sets
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y , test_size=0.2, random_state=29)

    y_train, y_valid = np.ravel(y_train), np.ravel(y_valid)

    # Hyperparameters
    xgb_params = {
            "xgb_clf__n_estimators": trial.suggest_int("n_estimators", 10, 500),
            "xgb_clf__max_depth": trial.suggest_int("max_depth", 3, 10),
            'xgb_clf__max_leaves': trial.suggest_int("max_leaves", 3, 30),
            "xgb_clf__grow_policy": trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide']),
            "xgb_clf__learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True),
            "xgb_clf__n_jobs": trial.suggest_int('n_jobs', 1, 3),
            "xgb_clf__gamma": trial.suggest_float("gamma", 0, 1),
            "xgb_clf__min_child_weight": trial.suggest_int("min_child_weight", 1, 7),
            "xgb_clf__subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "xgb_clf__colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        }

    xgb_pipe.fit(
        X_train,
        y_train,
      ).set_params(**xgb_params)

    # Predict and evaluate the model
    yhat = xgb_pipe.predict(X_valid)
    f1 = f1_score(y_valid, yhat, average='weighted')
    elapsed_time = process_time() - inicial_time

    print('elapsed_time', elapsed_time)
    print('f1 score', f1)
    print('')
    return f1


In [203]:
pip install optuna

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [204]:
import optuna

inicial_opti = process_time()

study = optuna.create_study(study_name='XGBoost Clf optimization', direction="maximize")
study.optimize(objective_function, n_trials=10)
opti_time = process_time() - inicial_opti
print('Tiempo total de opti: ', opti_time)

[I 2024-12-10 00:25:05,521] A new study created in memory with name: XGBoost Clf optimization
[I 2024-12-10 00:25:06,653] Trial 0 finished with value: 0.8910427663548696 and parameters: {'n_estimators': 83, 'max_depth': 4, 'max_leaves': 8, 'grow_policy': 'depthwise', 'learning_rate': 0.0177418595538853, 'n_jobs': 2, 'gamma': 0.5063393645554641, 'min_child_weight': 3, 'subsample': 0.8761280172829196, 'colsample_bytree': 0.6279551128223035}. Best is trial 0 with value: 0.8910427663548696.


elapsed_time 10.984375
f1 score 0.8910427663548696



[I 2024-12-10 00:25:07,679] Trial 1 finished with value: 0.7747471524853812 and parameters: {'n_estimators': 276, 'max_depth': 7, 'max_leaves': 5, 'grow_policy': 'depthwise', 'learning_rate': 0.01557733391163079, 'n_jobs': 3, 'gamma': 0.898219392516796, 'min_child_weight': 5, 'subsample': 0.5431002835266387, 'colsample_bytree': 0.9990400400598369}. Best is trial 0 with value: 0.8910427663548696.


elapsed_time 2.328125
f1 score 0.7747471524853812



[I 2024-12-10 00:25:09,417] Trial 2 finished with value: 0.7778556578217273 and parameters: {'n_estimators': 131, 'max_depth': 3, 'max_leaves': 25, 'grow_policy': 'depthwise', 'learning_rate': 0.009353441763945786, 'n_jobs': 3, 'gamma': 0.5894950173111362, 'min_child_weight': 6, 'subsample': 0.6998914671093379, 'colsample_bytree': 0.6182638892999233}. Best is trial 0 with value: 0.8910427663548696.


elapsed_time 4.0
f1 score 0.7778556578217273



[I 2024-12-10 00:25:10,444] Trial 3 finished with value: 0.7729648875091851 and parameters: {'n_estimators': 134, 'max_depth': 5, 'max_leaves': 26, 'grow_policy': 'lossguide', 'learning_rate': 0.04943069245679292, 'n_jobs': 1, 'gamma': 0.6566476672882106, 'min_child_weight': 2, 'subsample': 0.7401767251429667, 'colsample_bytree': 0.8288936856453577}. Best is trial 0 with value: 0.8910427663548696.


elapsed_time 2.734375
f1 score 0.7729648875091851



[I 2024-12-10 00:25:13,009] Trial 4 finished with value: 0.8314935955306912 and parameters: {'n_estimators': 249, 'max_depth': 9, 'max_leaves': 20, 'grow_policy': 'lossguide', 'learning_rate': 0.03134549159059514, 'n_jobs': 3, 'gamma': 0.06403173793384775, 'min_child_weight': 6, 'subsample': 0.7774115740694165, 'colsample_bytree': 0.68816197420617}. Best is trial 0 with value: 0.8910427663548696.


elapsed_time 2.796875
f1 score 0.8314935955306912



[I 2024-12-10 00:25:15,544] Trial 5 finished with value: 0.8417465055210113 and parameters: {'n_estimators': 276, 'max_depth': 5, 'max_leaves': 26, 'grow_policy': 'depthwise', 'learning_rate': 0.027937102610758234, 'n_jobs': 3, 'gamma': 0.8282339201436821, 'min_child_weight': 2, 'subsample': 0.9381581867488027, 'colsample_bytree': 0.7946726995040285}. Best is trial 0 with value: 0.8910427663548696.


elapsed_time 5.25
f1 score 0.8417465055210113



[I 2024-12-10 00:25:17,855] Trial 6 finished with value: 0.8330766184549159 and parameters: {'n_estimators': 406, 'max_depth': 10, 'max_leaves': 16, 'grow_policy': 'depthwise', 'learning_rate': 0.0012350261598113577, 'n_jobs': 1, 'gamma': 0.7064871367515569, 'min_child_weight': 6, 'subsample': 0.9934226739870469, 'colsample_bytree': 0.6922294998812054}. Best is trial 0 with value: 0.8910427663548696.


elapsed_time 4.84375
f1 score 0.8330766184549159



[I 2024-12-10 00:25:23,059] Trial 7 finished with value: 0.7836784648241615 and parameters: {'n_estimators': 370, 'max_depth': 9, 'max_leaves': 18, 'grow_policy': 'depthwise', 'learning_rate': 0.0020596756669478667, 'n_jobs': 3, 'gamma': 0.12222435288867184, 'min_child_weight': 1, 'subsample': 0.5383955640690188, 'colsample_bytree': 0.8620265749501055}. Best is trial 0 with value: 0.8910427663548696.


elapsed_time 5.640625
f1 score 0.7836784648241615



[I 2024-12-10 00:25:26,247] Trial 8 finished with value: 0.7919993203354401 and parameters: {'n_estimators': 305, 'max_depth': 4, 'max_leaves': 16, 'grow_policy': 'depthwise', 'learning_rate': 0.013812339730204869, 'n_jobs': 1, 'gamma': 0.4198717559474403, 'min_child_weight': 5, 'subsample': 0.938212824975702, 'colsample_bytree': 0.8255708701166449}. Best is trial 0 with value: 0.8910427663548696.


elapsed_time 7.265625
f1 score 0.7919993203354401



[I 2024-12-10 00:25:30,170] Trial 9 finished with value: 0.8064901611684808 and parameters: {'n_estimators': 457, 'max_depth': 9, 'max_leaves': 15, 'grow_policy': 'lossguide', 'learning_rate': 0.01199174015456294, 'n_jobs': 3, 'gamma': 0.09054973853155412, 'min_child_weight': 2, 'subsample': 0.7465610395740372, 'colsample_bytree': 0.8537566837472124}. Best is trial 0 with value: 0.8910427663548696.


elapsed_time 4.21875
f1 score 0.8064901611684808

Tiempo total de opti:  50.0625


In [205]:
# Mejor modelo
print(study.study_name)
print('f1: ', study.best_trial.value)
best_params = study.best_trial.params
print('best params: ', best_params)

# Debemos agregar 'xgb_clf__' a cada key para poder entrenar el modelo con los parametros encontrados
xgb_best_params = {f"xgb_clf__{k}": v for k, v in best_params.items()}

XGBoost Clf optimization
f1:  0.8910427663548696
best params:  {'n_estimators': 83, 'max_depth': 4, 'max_leaves': 8, 'grow_policy': 'depthwise', 'learning_rate': 0.0177418595538853, 'n_jobs': 2, 'gamma': 0.5063393645554641, 'min_child_weight': 3, 'subsample': 0.8761280172829196, 'colsample_bytree': 0.6279551128223035}


In [206]:
best_params.items()

dict_items([('n_estimators', 83), ('max_depth', 4), ('max_leaves', 8), ('grow_policy', 'depthwise'), ('learning_rate', 0.0177418595538853), ('n_jobs', 2), ('gamma', 0.5063393645554641), ('min_child_weight', 3), ('subsample', 0.8761280172829196), ('colsample_bytree', 0.6279551128223035)])

In [207]:
# Entreno el modelo con los mejores parámetros
xgb_pipe.fit(X_train, y_train).set_params(**xgb_best_params)

## Interpretabilidad

In [208]:
X_t1 = pd.read_parquet('X_t1.parquet')
y_t1 = pd.read_parquet('y_t1.parquet')
X_t2 = pd.read_parquet('X_t2.parquet')

X_train_t1, X_test_t1, y_train_t1, y_test_t1 = train_test_split(X_t1, y_t1,
                                                    test_size=0.3, random_state=29, shuffle=True,)

# Re-entrenamiento de modelos

Con la variación y entrega de nuevos datos, un proyecto de data-science debe incluir este paso. Sin embargo, entrenar con todos los datos puede ser costoso. Es importante comprender que un re-entrenamiento puede ser caro y requiere herramientas adecuadas. Como primera aproximación a este paradigma, se les pide lo siguiente:

- Diseñar y ejecutar estrategias de re-entrenamiento para mantener la precisión y relevancia de los modelos, utilizando estrategias de ```partial_fit```
- Automatizar el proceso de actualización de modelos basados en nuevos datos y feedback recibido a través de una función.
- Acompañar el re-entrenamiento de una etapa de optimización.

Podría serles útil la inicialización de modelos en base a pesos pasados. Mayor información la pueden encontrar en el siguiente [link](https://stackoverflow.com/questions/38079853/how-can-i-implement-incremental-training-for-xgboost).

In [209]:
# xgb_pipe.named_steps['xgb_clf'].get_params()

In [210]:
from sklearn.metrics import mean_squared_error
import xgboost as xgb

In [211]:
model = xgb_pipe.fit(X_train, y_train).set_params(**xgb_best_params)
print("Score t0:", xgb_pipe.score(X_test, y_test))
# Score t0: 0.8100684776883137

Score t0: 0.7785386409812627


In [212]:
import pickle
with open("./xgb_best_pipe2.pkl", 'wb') as file:
    pickle.dump(model, file)

In [213]:
model

In [214]:
# Initial training
batch_size=3000
# Entreno el modelo con los mejores parámetros
xgb_pipe.fit(X_train, y_train).set_params(**xgb_best_params)
print("Score t0:", xgb_pipe.score(X_test, y_test))

for start in range(0, len(X_train_t1), batch_size):

    # Incremental training on next batch
    X_batch, y_batch = X_train_t1[start:start+batch_size], y_train_t1[start:start+batch_size]

    X_train_batch= xgb_pipe.named_steps['preprocess'].transform(X_batch)
    xgb_pipe.named_steps['xgb_clf'].fit(X_train_batch, y_batch,
                                        xgb_model = xgb_pipe.named_steps['xgb_clf']\
                                                            .get_booster())

    y_xgb_2 = xgb_pipe.predict(X_test_t1)
    print(f"Incremental model f1 score after batch {start}:", f1_score(y_test_t1, y_xgb_2, average='weighted'))

Score t0: 0.7785386409812627
Incremental model f1 score after batch 0: 0.7925846105586122
Incremental model f1 score after batch 3000: 0.7955178717499822
Incremental model f1 score after batch 6000: 0.8014125496992272
Incremental model f1 score after batch 9000: 0.8033563999059767
Incremental model f1 score after batch 12000: 0.8056580950761837
Incremental model f1 score after batch 15000: 0.8084359060528877
Incremental model f1 score after batch 18000: 0.8081759159121878
Incremental model f1 score after batch 21000: 0.8073697293400595
Incremental model f1 score after batch 24000: 0.8090413415262214
Incremental model f1 score after batch 27000: 0.8095607027579842
Incremental model f1 score after batch 30000: 0.8104312739766814
Incremental model f1 score after batch 33000: 0.8096886248288429
Incremental model f1 score after batch 36000: 0.8108626480719334
Incremental model f1 score after batch 39000: 0.8102113503705375
Incremental model f1 score after batch 42000: 0.8127195316993996
Inc

In [215]:
y_xgb_2 = xgb_pipe.predict(X_test)

f1_score(y_test, y_xgb_2, average='weighted')

np.float64(0.7578087397569772)

In [216]:
from zipfile import ZipFile
import os
def generateFiles(predict_data, clf_pipe):
    """Genera los archivos a subir en CodaLab

    Input
    ---------------
    predict_data: Dataframe con los datos de entrada a predecir
    clf_pipe: pipeline del clf

    Ouput
    ---------------
    archivo de txt
    """
    y_pred_clf = clf_pipe.predict_proba(predict_data)[:, 1]
    with open('./predictions.txt', 'w') as f:
        for item in y_pred_clf:
            f.write("%s\n" % item)

    with ZipFile('predictions.zip', 'w') as zipObj:
        zipObj.write('predictions.txt')
    os.remove('predictions.txt')

generateFiles(X_t2, xgb_pipe)