In [1]:
import datetime
import numpy as np 
import pandas as pd 
import seaborn as sns
import scipy.stats as stats
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from collections import defaultdict
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, FunctionTransformer
from sklearn.impute import SimpleImputer


maxcols = pd.get_option("display.max_columns")
pd.set_option("display.max_columns", None)

In [2]:
# para entrenar
X_t0 = pd.read_parquet('X_t0.parquet')
y_t0 = pd.read_parquet('y_t0.parquet')
# para predecir(?)
X_t1 = pd.read_parquet('X_t1.parquet')
X_t0 = X_t0[sorted(X_t0.columns)]
X_t1 = X_t1[sorted(X_t1.columns)]

X = X_t0.copy()
y = y_t0.copy()

# Entrega 1

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin

# Calcula el número de prestamos por wallet_address
class borrow_times(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.borrow_per_cli = None
        
    def fit(self, X, y=None):
        data = X.copy()
        object_var = data.select_dtypes(include='object').columns.to_list()
        vars = object_var + ['borrow_timestamp']

        self.borrow_per_cli = data[vars].groupby(*object_var).count()
        self.borrow_per_cli.rename(columns={'borrow_timestamp': 'borrow_times'}, inplace=True)
        self.borrow_per_cli.reset_index(inplace=True)
        return self
    
    def transform(self, X, y=None):
        data = X.copy()
        new_X = pd.merge(data, self.borrow_per_cli, on='wallet_address', how='left').fillna(0)
        new_X = new_X.sort_index(axis=1)
        return new_X

    def set_output(self,transform='default'):
        #No modificar este método
        return self

# Se calcula tiempo entre las_tx_timestamp y first_tx_timestamp = ts_diff_tx,
# Renombra 'risky_first_last_tx_timestamp_diff':'ts_diff_risky_tx'}
# Elimina las_tx_timestamp y first_tx_timestamp

class tx_diff(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.ts_diff_tx = None

    def fit(self, X, y=None):
        data = X.copy()
        self.data = data
        return self
    
    def transform(self, X, y=None):
        data = X.copy()
        data['ts_diff_tx'] = data['last_tx_timestamp'] - data['first_tx_timestamp']
        data.rename(columns={'risky_first_last_tx_timestamp_diff':'ts_diff_risky_tx'}, inplace=True)
        data.drop(columns=['last_tx_timestamp', 
                           'first_tx_timestamp',
                           'risky_last_tx_timestamp', 
                           'risky_first_tx_timestamp',
                           'borrow_timestamp'], inplace=True)
        
        new_data = data.sort_index(axis=1)
        return new_data

    def set_output(self,transform='default'):
        return self

class search_binary(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.binary_cols = []

    def fit(self, X, y=None):
        data = X.copy()
        for col in data.columns:
            diff_values = len(data[col].value_counts())
            is_binary = diff_values == 2
            if is_binary:
                self.binary_cols.append(col)
        return self
    
    def transform(self, X, y=None):
        data = X.copy()
        data[self.binary_cols] = data[self.binary_cols].astype('category')
        return data

    def set_output(self,transform='default'):
        return self

In [4]:
class time_tranf(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.time_cols = []

    def fit(self, X, y=None):
        data = X.copy()
        tcols = [col for col in data.columns if "timestamp" in col]
        self.time_cols = tcols
        return self
    
    def transform(self, X, y=None):
        data = X.copy()
        for col in self.time_cols:
            data[col] = data[col].apply(lambda x: datetime.datetime.fromtimestamp(x))
        return data

    def set_output(self,transform='default'):
        return self

In [5]:
from sklearn.compose import make_column_selector

first_transformer = ColumnTransformer([
                            # ('drop_times', 'drop', make_column_selector(dtype_include=datetime)),
                            ('scale_data', MinMaxScaler(), make_column_selector(dtype_include='number') ),
                            ('object', 'drop', make_column_selector(dtype_include=object)),
                            ('categorical', 'passthrough', make_column_selector(dtype_include='category'))
                            ], 
                            remainder='passthrough', 
                            verbose_feature_names_out=False)
first_transformer.set_output(transform='pandas')

In [6]:
transf_pipe = Pipeline([
                ('add_borrow', borrow_times()),
                ('diff_tranf', tx_diff()),
                ('binary_cols', search_binary()),
                ('time_cols', time_tranf()),
                ('cols_transf', first_transformer),
                ])

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.3, random_state=29, shuffle=True,)

X_train.to_csv('X_train.csv')
X_test.to_csv('X_test.csv')
y_train.to_csv('y_train.csv')
y_test.to_csv('y_test.csv')

In [9]:
from sklearn.dummy import DummyClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import roc_curve, roc_auc_score, auc
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

### Dummy Classifier

In [9]:
X_train_dummy = pd.read_csv('X_train.csv')
X_test_dummy = pd.read_csv('X_test.csv')

dummy_pipe = Pipeline([
                    ('preprocess', transf_pipe),
                    ('dummy_clf', DummyClassifier(strategy='most_frequent'))
                    ])
dummy_pipe.fit(X_train_dummy, y_train)
y_dummy = dummy_pipe.predict(X_test_dummy)
dummy_pipe.score(X_test_dummy, y_test)

TypeError: You have to supply one of 'by' and 'level'

In [16]:
pd.read_csv('X_test.csv').shape

(13289, 78)

In [15]:
y_test.shape

(13289, 1)

### Linear SVC

In [10]:
# Linear SVC
X_train_lsvc = pd.read_csv('X_train.csv')
X_test_lsvc = pd.read_csv('X_test.csv')

lsvc_pipe = Pipeline([
                    ('preprocess', transf_pipe),
                    ('lsvc_clf', LinearSVC(random_state=29))
                    ])
# lsvc_clf = LinearSVC(random_state=29)
lsvc_pipe.fit(X_train_lsvc, y_train)
y_lsvc = lsvc_pipe.predict(X_test_lsvc)
lsvc_pipe.score(X_test_lsvc, y_test)

ValueError: Columns must be same length as key

### Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

X_train_tree = pd.read_csv('X_train.csv')
X_test_tree = pd.read_csv('X_test.csv')

tree_pipe = Pipeline([
                    ('preprocess', transf_pipe),
                    ('tree_clf', DecisionTreeClassifier(random_state=29))
                    ])

tree_pipe.fit(X_train_tree, y_train)
y_tree = tree_pipe.predict(X_test_tree)
tree_pipe.score(X_test_tree, y_test)

0.8392655579802845

### Random Forest Classifier

In [8]:
from sklearn.ensemble import RandomForestClassifier

X_train_rf = pd.read_csv('X_train.csv')
X_test_rf = pd.read_csv('X_test.csv')

rf_pipe = Pipeline([
                    ('preprocess', transf_pipe),
                    ('rf_clf', RandomForestClassifier(criterion='entropy',random_state=29))
                    ])

rf_pipe.fit(X_train_rf, y_train)
y_tree = rf_pipe.predict(X_test_rf)
rf_pipe.score(X_test_rf, y_test)

  return fit_method(estimator, *args, **kwargs)


0.9039807359470239

# Entrega 2

In [9]:
import optuna
from optuna.visualization.matplotlib import (plot_optimization_history, plot_param_importances, 
                                    plot_parallel_coordinate)

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
rf_pipe.get_params()

{'memory': None,
 'steps': [('preprocess',
   Pipeline(steps=[('add_borrow', borrow_times()), ('diff_tranf', tx_diff()),
                   ('binary_cols', search_binary()), ('time_cols', time_tranf()),
                   ('cols_transf',
                    ColumnTransformer(remainder='passthrough',
                                      transformers=[('scale_data', MinMaxScaler(),
                                                     <sklearn.compose._column_transformer.make_column_selector object at 0x00000281A77942E0>),
                                                    ('object', 'drop',
                                                     <sklearn.compose._column_transformer.make_column_selector object at 0x00000281A7797490>),
                                                    ('categorical', 'passthrough',
                                                     <sklearn.compose._column_transformer.make_column_selector object at 0x00000281A7797370>)],
                                

In [17]:
# rf_pipe.set_params(rf_clf__criterion='gini')

In [24]:
y_train_rf = y_train.copy()

In [10]:
u = pd.read_csv('X_test.csv')
rf_pipe.predict(u)

array([1, 0, 1, ..., 1, 1, 0], dtype=int64)

In [11]:
# Optimización de parámetros del modelo seleccionado. 


def objective_function(trial):
    # Split into train and validation sets
    X = pd.read_parquet('X_t0.parquet')
    y = pd.read_parquet('y_t0.parquet')

    X_train, X_valid, y_train, y_valid = train_test_split(
        X,y, test_size=0.2, random_state=29)

    # Hyperparameters 
    rf_params = {
        "rf_clf__n_estimators": trial.suggest_int("n_estimators", 10, 400),
        "rf_clf__criterion": trial.suggest_categorical('criterion', ['gini', 'entropy', 'log_loss']),
        "rf_clf__max_depth": trial.suggest_int("max_depth", 3, 50),
        'rf_clf__max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 2, 60),
        'rf_clf__max_features':  trial.suggest_categorical('max_features', ['sqrt', 'log2'])
        }

    # rf_pipe = Pipeline([
    #                 ('preprocess', transf_pipe),
    #                 ('rf_clf', RandomForestClassifier(random_state=29))
    #                 ])    
    rf_pipe.fit(
        X_train,
        y_train,
    ).set_params(**rf_params)

    # Predict and evaluate the model
    yhat = model.predict(X_valid)
    f1 = f1_score(y_valid, yhat, average='weighted')

    return f1

In [12]:
study = optuna.create_study(direction="maximize")
study.optimize(objective_function, n_trials=5)

[I 2024-12-04 00:22:27,451] A new study created in memory with name: no-name-440060f4-5cad-4292-bd07-d399bcb75feb
[W 2024-12-04 00:22:27,851] Trial 0 failed with parameters: {'n_estimators': 383, 'criterion': 'log_loss', 'max_depth': 15, 'max_leaf_nodes': 56, 'max_features': 'sqrt'} because of the following error: ValueError('Columns must be same length as key').
Traceback (most recent call last):
  File "c:\Users\caron\.conda\envs\labs_env\lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\caron\AppData\Local\Temp\ipykernel_38304\1365714880.py", line 25, in objective_function
    rf_pipe.fit(
  File "C:\Users\caron\AppData\Roaming\Python\Python310\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\caron\AppData\Roaming\Python\Python310\site-packages\sklearn\pipeline.py", line 469, in fit
    Xt = self._fit(X, y, routed_params)
  File "C:\Users\car

ValueError: Columns must be same length as key