In [1]:
import pandas as pd

# Build the modelling table in one chain: read the CSV, drop the
# 'Unnamed: 0' column (presumably a saved index -- confirm), parse the two
# date columns, derive `days`, then drop the columns excluded from modelling.
df = (
    pd.read_csv('./data/data.csv')
    .drop(columns=['Unnamed: 0'])
    .assign(
        activated_date=lambda frame: pd.to_datetime(frame['activated_date']),
        last_payment_date=lambda frame: pd.to_datetime(frame['last_payment_date']),
    )
    # Whole days between activation and the last payment.
    # Author's assumption: days < 0 means the credit was reactivated.
    .assign(
        days=lambda frame: (frame['last_payment_date'] - frame['activated_date']).dt.days
    )
    .drop(columns=[
        'cust_id',
        'activated_date',
        'last_payment_date',
        'oneoff_purchases',
        'purchases_installments_frequency',
    ])
)

# Feature matrix and target; `df` itself keeps the 'fraud' column.
X = df.drop(columns=['fraud'])
y = df['fraud']


In [None]:
# A couple of outlier-detection models
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.impute import SimpleImputer

# Flag columns appended to X at the end of this cell. They are excluded from
# the detector inputs so that (a) LOF is not fitted on the Isolation Forest
# labels and (b) re-running the cell does not feed earlier flags back into
# either detector.
OUTLIER_COLUMNS = ['IF_outlier', 'LOF_outlier']
feature_columns = [col for col in X.columns if col not in OUTLIER_COLUMNS]

# Replace missing values with the per-column median; neither detector
# accepts NaN. Assumes every feature column is numeric.
impute = SimpleImputer(strategy='median')
X_impute = impute.fit_transform(X[feature_columns])
X = pd.DataFrame(X_impute, columns=feature_columns, index=X.index)

# Isolation Forest: fit_predict returns -1 for outliers and 1 for inliers;
# contamination=0.05 sets the expected share of outliers.
iso_forest = IsolationForest(contamination=0.05, random_state=42)
if_labels = iso_forest.fit_predict(X)

# Local Outlier Factor (points with low local density), fitted on exactly
# the same feature matrix as the Isolation Forest.
# NOTE(review): LOF is distance based and the features are not scaled here;
# confirm that is intended.
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.05)
lof_labels = lof.fit_predict(X)

# Attach both flags only after both detectors have been fitted.
X['IF_outlier'] = if_labels
X['LOF_outlier'] = lof_labels

X[['IF_outlier', 'LOF_outlier']].head(10)


In [None]:
from sklearn.model_selection import learning_curve, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
import numpy as np

# Estimator under study. Alternative configuration kept for reference:
# model = LogisticRegression(C=0.1, penalty='l1', max_iter=100,
#                            solver='liblinear', random_state=42, class_weight='balanced')
model = LogisticRegression()

# Median-impute X and rebuild it as a DataFrame with the same labels.
# NOTE(review): the imputer is fitted on all rows before cross-validation,
# so validation folds contribute to the medians -- confirm this is acceptable.
impute = SimpleImputer(strategy='median')
X_impute = impute.fit_transform(X)
X = pd.DataFrame(data=X_impute, index=X.index, columns=X.columns)

# Learning curve: ten evenly spaced training-set fractions from 10% to 100%,
# each scored with F1 under 10-fold cross-validation on all available cores.
curve_options = dict(
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=10,
    scoring='f1',
    n_jobs=-1,
)
train_sizes, train_scores, validation_scores = learning_curve(model, X, y, **curve_options)

# Average over the folds (axis 1) to get one score per training size.
train_scores_mean = np.mean(train_scores, axis=1)
validation_scores_mean = np.mean(validation_scores, axis=1)
for mean_scores in (train_scores_mean, validation_scores_mean):
    print(mean_scores)


In [None]:
import matplotlib.pyplot as plt

# Draw the mean training and validation F1 against the training-set size
# on the current axes (created on demand by plt.gca()).
ax = plt.gca()
curves = (
    (train_scores_mean, 'f1 de Entrenamiento'),
    (validation_scores_mean, 'f1 de Validación'),
)
for mean_scores, curve_label in curves:
    ax.plot(train_sizes, mean_scores, label=curve_label)

ax.set_ylabel('Valor del f1')
ax.set_xlabel('Tamaño del conjunto de entrenamiento')
ax.set_title('Curva de Aprendizaje')
ax.legend()
plt.show()

In [None]:
from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.model_selection import learning_curve, train_test_split
from sklearn.metrics import mean_squared_error


base_model = Ridge()

# Bagging over the base model, each member trained on a random subset of
# rows and features.
# NOTE(review): the target is the 'fraud' column; if it is a class label,
# BaggingClassifier with a classification metric may be the better fit.
bagging_model = BaggingRegressor(  # use BaggingClassifier for a classification task
    # Passed positionally on purpose: the keyword was renamed from
    # `base_estimator` to `estimator` in scikit-learn 1.2 and the old name
    # was removed in 1.4, while the first positional slot works in both.
    base_model,
    max_features=0.8,  # fraction of features drawn for each member
    max_samples=0.8,   # fraction of rows drawn for each member
    random_state=42,
    n_estimators=10,   # number of Ridge regressors in the ensemble
)

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
# fit() returns the fitted estimator itself, so `best` is `bagging_model`.
best = bagging_model.fit(X_train, y_train)
y_pred = best.predict(X_test)
# Last expression of the cell: displays the test-set mean squared error.
mean_squared_error(y_test, y_pred)