In [1]:
#!pip install lightgbm
#!pip install optuna

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
import optuna
from sklearn.metrics import mean_absolute_percentage_error as MAPE
from sklearn.metrics import mean_squared_error 

import lightgbm as lgbm
import joblib
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import optuna 
from optuna.visualization import plot_optimization_history, plot_param_importances
from optuna import Trial, visualization
from optuna.samplers import TPESampler

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from statistics import mean
from s3_utils.s3_utils import write_parquet_from_pd, read_pd_from_parquet, start_logger,write_pickle,read_pickle

from s3_utils import read_pd_from_csv
pd.set_option('display.max_rows', 500)

In [4]:
def objective(trial):
    param = {
        'metric': 'mse',
        'random_state': 42,
        'n_estimators': trial.suggest_categorical('n_estimators',[1000,1500,2000,2500,3000,4000,5000]),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.006,0.01,0.02,0.05,0.1,0.15]),
        'max_depth': trial.suggest_categorical('max_depth', [3,4,5,6,7,8,9,10,15]),
        'num_leaves' : trial.suggest_int('num_leaves', 1, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        'cat_smooth' : trial.suggest_int('min_data_per_groups', 1, 100),
        'verbosity':-1
    }
    model = lgbm.LGBMRegressor(**param)

    model.fit(X_train, y_train, eval_set=[(X_test, y_test)],callbacks=[lgbm.early_stopping(stopping_rounds=50)])
    preds = model.predict(X_test)
    mse = mean_squared_error(y_test, preds, squared=False)

    return mse



In [5]:
import logging
import sys
from s3_utils import write_parquet_from_pd

try:
    from awsglue.utils import getResolvedOptions

    args = getResolvedOptions(sys.argv, ["path"])
    path = args["path"]
except Exception as error:
    print("Running script locally")
    path = "glue_scripts/output"


logger = logging.getLogger()
logger.setLevel(logging.INFO)
handler = logging.StreamHandler(sys.stdout)
logFormatter = logging.Formatter(
    "%(asctime)s %(message)s", datefmt="%m/%d/%Y %I:%M:%S %p"
)
handler.setFormatter(logFormatter)
handler.setLevel(logging.INFO)
logger.addHandler(handler)

Running script locally


In [6]:
prospectos = pd.read_csv("s3://data-science-kavak-dev/projects/cerberus/v2/dev/income/data/prospectos.csv", index_col=0)
prospectos.shape



(13147, 50)

In [7]:
aprobados = pd.read_csv("s3://data-science-kavak-dev/projects/cerberus/v2/dev/income/data/aprobados.csv", usecols=prospectos.columns)
aprobados.shape

(13943, 50)

In [8]:
df = prospectos.loc[~prospectos.researchable_id.isin(aprobados.researchable_id),]

In [9]:
df = df.query("ingreso_neto_comprobado < 300000")

In [10]:
print(f"Las columnas del dataset son {df.columns}")

Las columnas del dataset son Index(['researchable_id', 'declarativa',
       'fecha de apertura de la cuenta más antigua',
       'fecha de apertura de la cuenta más reciente',
       'fecha de la consulta mas reciente', 'mensaje de alerta',
       'moneda del credito', 'nueva direccion en los últimos 60 días',
       'número de cuentas', 'número de cuentas cerradas',
       'número de cuentas con historial de morosidad',
       'número de cuentas con mop = 00', 'número de cuentas con mop = 01',
       'número de cuentas con mop = 02', 'número de cuentas con mop = 03',
       'número de cuentas con mop = 04', 'número de cuentas con mop = 05',
       'número de cuentas con mop = 06', 'número de cuentas con mop = 07',
       'número de cuentas con mop = 96', 'número de cuentas con mop = 97',
       'número de cuentas con mop = 99', 'número de cuentas con mop = UR',
       'número de cuentas con morosidad actual',
       'número de cuentas de pagos fijos e hipotecarios',
       'número de

In [11]:
base = df.copy()
base = base[base["net_income_verified"]>=base["ingreso_neto_comprobado"]]

In [12]:
#base.ingreso_neto_comprobado = base.ingreso_neto_comprobado.clip(4000,100000)

## Modelo % validado

In [13]:
validated_df = base.copy()
validated_df["target"] = (base["ingreso_neto_comprobado"]/base["net_income_verified"])
del validated_df["ingreso_neto_comprobado"]

In [14]:
#validated_df = validated_df[validated_df["target"]<=1.35]

In [15]:
#validated_df = validated_df[validated_df["target"]<=1]

In [16]:
to_drop = ['estimate','researchable_id', 'declarativa' ]
target = ["target"]

In [17]:
to_drop = to_drop + ["target"]
columnas = [col for col in validated_df.columns if col not in to_drop]
X = validated_df[columnas]
y = validated_df[target]

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

optuna.logging.set_verbosity(optuna.logging.WARNING)

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

hp = study.best_params
print("  Value: {}".format(trial.value))
print("  Params: ")

for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

lgbm_model = lgbm.LGBMRegressor(**hp)



Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[281]	valid_0's l2: 0.0231788
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[43]	valid_0's l2: 0.0230564
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2886]	valid_0's l2: 0.0237133
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[250]	valid_0's l2: 0.0232185
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[144]	valid_0's l2: 0.0230446
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[73]	valid_0's l2: 0.0231918
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[164]	valid_0's l2: 0.0232416
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1995]	val

In [19]:
lgbm_model.fit(X_train, y_train, eval_set=[(X_test, y_test)],callbacks=[lgbm.early_stopping(stopping_rounds=50)])
y_pred_lgbm = lgbm_model.predict(X_test)

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[770]	valid_0's l2: 0.0208851


In [20]:
test_set = pd.merge(base[["ingreso_neto_comprobado","net_income_verified"]], y_test, left_index=True,right_index=True, how="inner")

In [21]:
test_set = test_set.join(pd.Series(y_pred_lgbm, index=y_test.index,name="y_pred"))


In [22]:
test_set["pred"] = test_set["net_income_verified"]*test_set["y_pred"]

In [23]:
MAPE(test_set["ingreso_neto_comprobado"],test_set["pred"])

0.15265498223735516

In [24]:
name = "%_validated"
write_pickle("s3://data-science-kavak-dev/projects/cerberus/v2/dev/income/models/"+f"{name}_model.pkl",lgbm_model)

## Modelo declarado con clip

In [25]:
to_drop = ['estimate','researchable_id', 'declarativa' ]
target = ["ingreso_neto_comprobado"]

In [26]:
base_clip = base.copy()
base_clip = base.query("ingreso_neto_comprobado > 4000 & ingreso_neto_comprobado < 100000")

In [27]:
to_drop = to_drop + ["ingreso_neto_comprobado"]
columnas = [col for col in base_clip.columns if col not in to_drop]
X = base_clip[columnas]
y = base_clip[target]

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
optuna.logging.set_verbosity(optuna.logging.WARNING)

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

hp = study.best_params
print("  Value: {}".format(trial.value))
print("  Params: ")

for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

lgbm_model = lgbm.LGBMRegressor(**hp)



Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[212]	valid_0's l2: 4.63613e+07
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[517]	valid_0's l2: 4.77411e+07
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[877]	valid_0's l2: 4.44698e+07
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[432]	valid_0's l2: 4.79924e+07
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[871]	valid_0's l2: 4.30028e+07
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[567]	valid_0's l2: 4.71803e+07
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[842]	valid_0's l2: 4.77246e+07
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration

In [29]:
lgbm_model.fit(X_train, y_train, eval_set=[(X_test, y_test)],callbacks=[lgbm.early_stopping(stopping_rounds=50)])
y_pred_lgbm = lgbm_model.predict(X_test)

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1157]	valid_0's l2: 4.29108e+07


In [30]:
MAPE(y_test,y_pred_lgbm)

0.16269582045710504

In [31]:
name = "declarado_final"
write_pickle("s3://data-science-kavak-dev/projects/cerberus/v2/dev/income/models/"+f"{name}_con_clip_model.pkl",lgbm_model)

# Modelo declarado sin clip

In [32]:
df = df[df["net_income_verified"]>=df["ingreso_neto_comprobado"]]

In [33]:
to_drop = ['estimate','researchable_id', 'declarativa' ]
target = ["target"]

In [34]:
to_drop = ['estimate','researchable_id', 'declarativa' ]
target = ["ingreso_neto_comprobado"]

In [35]:
df["ingreso_neto_comprobado"].describe().round(2)

count      8292.00
mean      35321.61
std       33582.38
min        5000.00
25%       15642.00
50%       24000.00
75%       40000.00
max      290000.00
Name: ingreso_neto_comprobado, dtype: float64

In [36]:
to_drop = to_drop + ["ingreso_neto_comprobado"]
columnas = [col for col in df.columns if col not in to_drop]
X = df[columnas]
y = df[target]

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
optuna.logging.set_verbosity(optuna.logging.WARNING)

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

hp = study.best_params
print("  Value: {}".format(trial.value))
print("  Params: ")

for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

lgbm_model = lgbm.LGBMRegressor(**hp)



Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[847]	valid_0's l2: 1.06581e+08
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[181]	valid_0's l2: 1.05008e+08
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[342]	valid_0's l2: 1.02034e+08
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[694]	valid_0's l2: 1.1695e+08
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[618]	valid_0's l2: 8.3167e+07
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[99]	valid_0's l2: 1.17359e+08
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[874]	valid_0's l2: 1.04884e+08
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[864]	valid

In [38]:
lgbm_model.fit(X_train, y_train, eval_set=[(X_test, y_test)],callbacks=[lgbm.early_stopping(stopping_rounds=50)])
y_pred_lgbm = lgbm_model.predict(X_test)

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[618]	valid_0's l2: 8.3167e+07


In [39]:
MAPE(y_test,y_pred_lgbm)

0.15660925532208383

In [40]:
name = "declarado_sin_clip"
write_pickle("s3://data-science-kavak-dev/projects/cerberus/v2/dev/income/models/"+f"{name}_model.pkl",lgbm_model)

# Modelo sin declarado sin clip

In [41]:
to_drop = ['estimate','researchable_id', 'declarativa' ,"net_income_verified"]
target = ["ingreso_neto_comprobado"]

In [42]:
df["ingreso_neto_comprobado"].describe().round(2)

count      8292.00
mean      35321.61
std       33582.38
min        5000.00
25%       15642.00
50%       24000.00
75%       40000.00
max      290000.00
Name: ingreso_neto_comprobado, dtype: float64

In [43]:
to_drop = to_drop + ["ingreso_neto_comprobado"]
columnas = [col for col in base.columns if col not in to_drop]
X = df[columnas]
y = df[target]

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
optuna.logging.set_verbosity(optuna.logging.WARNING)

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

hp = study.best_params
print("  Value: {}".format(trial.value))
print("  Params: ")

for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

lgbm_model = lgbm.LGBMRegressor(**hp)



Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[2000]	valid_0's l2: 6.79136e+08
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[3999]	valid_0's l2: 6.79029e+08
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1497]	valid_0's l2: 7.12489e+08
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[565]	valid_0's l2: 5.97637e+08
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1117]	valid_0's l2: 6.95963e+08
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[3317]	valid_0's l2: 7.36023e+08
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 7.37089e+08
Training until validation scores don't improve

In [45]:
lgbm_model.fit(X_train, y_train, eval_set=[(X_test, y_test)],callbacks=[lgbm.early_stopping(stopping_rounds=50)])
y_pred_lgbm = lgbm_model.predict(X_test)

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2429]	valid_0's l2: 5.60471e+08


In [46]:
MAPE(y_test,y_pred_lgbm)

0.48621439925037385

In [47]:
name = "sin_declarado_sin_clip"
write_pickle("s3://data-science-kavak-dev/projects/cerberus/v2/dev/income/models/"+f"{name}_model.pkl",lgbm_model)

# Modelo sin declarado con clip

In [48]:
to_drop = ['estimate','researchable_id', 'declarativa' ,"net_income_verified"]
target = ["ingreso_neto_comprobado"]

In [49]:
base_clip["ingreso_neto_comprobado"].describe().round(2)

count     7771.00
mean     28448.69
std      18083.28
min       5000.00
25%      15000.00
50%      22000.00
75%      35000.00
max      99999.00
Name: ingreso_neto_comprobado, dtype: float64

In [50]:
to_drop = to_drop + ["ingreso_neto_comprobado"]
columnas = [col for col in base_clip.columns if col not in to_drop]
X = base_clip[columnas]
y = base_clip[target]

In [51]:
columnas

['fecha de apertura de la cuenta más antigua',
 'fecha de apertura de la cuenta más reciente',
 'fecha de la consulta mas reciente',
 'mensaje de alerta',
 'moneda del credito',
 'nueva direccion en los últimos 60 días',
 'número de cuentas',
 'número de cuentas cerradas',
 'número de cuentas con historial de morosidad',
 'número de cuentas con mop = 00',
 'número de cuentas con mop = 01',
 'número de cuentas con mop = 02',
 'número de cuentas con mop = 03',
 'número de cuentas con mop = 04',
 'número de cuentas con mop = 05',
 'número de cuentas con mop = 06',
 'número de cuentas con mop = 07',
 'número de cuentas con mop = 96',
 'número de cuentas con mop = 97',
 'número de cuentas con mop = 99',
 'número de cuentas con mop = UR',
 'número de cuentas con morosidad actual',
 'número de cuentas de pagos fijos e hipotecarios',
 'número de cuentas en aclaración',
 'número de cuentas en despacho de cobranza o administadora de cartera',
 'número de cuentas revolventes y sin límite pre-esta

In [52]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
optuna.logging.set_verbosity(optuna.logging.WARNING)

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

hp = study.best_params
print("  Value: {}".format(trial.value))
print("  Params: ")

for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

lgbm_model = lgbm.LGBMRegressor(**hp)



Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[3999]	valid_0's l2: 1.99259e+08
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1492]	valid_0's l2: 1.8963e+08
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[978]	valid_0's l2: 1.93321e+08
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[970]	valid_0's l2: 1.8878e+08
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1094]	valid_0's l2: 1.96027e+08
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[668]	valid_0's l2: 1.89604e+08
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[337]	valid_0's l2: 1.91377e+08
Training until validation scores don't improve for 50 rounds
Early stopping, 

In [53]:
lgbm_model.fit(X_train, y_train, eval_set=[(X_test, y_test)],callbacks=[lgbm.early_stopping(stopping_rounds=50)])
y_pred_lgbm = lgbm_model.predict(X_test)

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[661]	valid_0's l2: 1.71237e+08


In [54]:
MAPE(y_test,y_pred_lgbm)

0.390339108266836

In [55]:
name = "sin_declarado_con_clip"
write_pickle("s3://data-science-kavak-dev/projects/cerberus/v2/dev/income/models/"+f"{name}_model.pkl",lgbm_model)