In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, Normalizer
from sklearn.model_selection import RandomizedSearchCV
from lightgbm import LGBMClassifier

df = pd.read_csv('/content/drive/MyDrive/ECH_2019.csv')

# df = pd.read_csv('ECH_2019.csv')

In [3]:
# Reemplazar 0 que representa no hay respuesta
no_data = {0: 'No hay dato', '0': 'No hay dato'}

mask = df.columns.difference(
    [
        'edad',
        'sueldo',
        'hijos en hogar',
        'hijos en otro hogar',
        'hijos en el extranjero',
        'suma_hijos',
        'barrio',
    ]
)

df[mask] = df[mask].replace(no_data)

In [4]:
# Personas con edad suficiente para trabajar
df = df[~df['estado_laboral'].str.contains('Menores de 14 años')]

In [5]:
# Eliminar outliers en los sueldos
quantil_99 = df['sueldo'].quantile(0.99)
print(quantil_99)

df = df[df['sueldo'] < quantil_99]

quartil_1 = df['sueldo'].quantile(0.25)
quartil_3 = df['sueldo'].quantile(0.75)

iqr = quartil_3 - quartil_1

df = df[df['sueldo'] < quartil_3 + 1.5 * iqr]

df = df[df['sueldo'] > quartil_1 - 1.5 * iqr]

84000.0


In [6]:
# Creamos categorias con rangos de sueldos y edades
df['rango_sueldos'] = pd.qcut(df['sueldo'], q=10, duplicates='drop').cat.codes

df['rango_edades'] = pd.qcut(df['edad'], q=10).cat.codes

In [7]:
# Nos quedamos con las columnas que vamos a usar para el modelo
df = df.drop(['id_hogar','nper','sueldo'], axis=1)

# Removemos indice para tener un mejor orden de tratamiento
df = df.reset_index(drop=True)

In [8]:
# Feautures
X = df.drop('rango_sueldos', axis=1).to_numpy()
y = df['rango_sueldos']

print('X shape', X.shape, '\nY shape', y.shape)

X shape (85546, 23) 
Y shape (85546,)


In [9]:
# Definimos los feautures de test y prueba
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

print('X shape', X.shape, '\nY shape', y.shape)

X shape (85546, 23) 
Y shape (85546,)


In [10]:
# Creamos el pipeline de ejecución
one_hot = OneHotEncoder(handle_unknown = 'ignore')
normalizer = Normalizer()

pipe = Pipeline(
    steps=[('encoder', one_hot),
           ('normalizer', normalizer),
           ('lightgbm', LGBMClassifier())
           ])

In [11]:
#Conocemos los hyparámetros que podemos personalizar
for key,value in LGBMClassifier().get_params().items():
    print(key,': ', value)

boosting_type :  gbdt
class_weight :  None
colsample_bytree :  1.0
importance_type :  split
learning_rate :  0.1
max_depth :  -1
min_child_samples :  20
min_child_weight :  0.001
min_split_gain :  0.0
n_estimators :  100
n_jobs :  -1
num_leaves :  31
objective :  None
random_state :  None
reg_alpha :  0.0
reg_lambda :  0.0
silent :  True
subsample :  1.0
subsample_for_bin :  200000
subsample_freq :  0


In [12]:
# Definimos los parámetros para el cross validation
param_grid = {
    'lightgbm__num_leaves':[31, 62, 70],
    'lightgbm__max_depth':[-1,5],
}

In [13]:
# Creamos y determinamos el score del modelo
search = RandomizedSearchCV(pipe,
                            param_grid,
                            n_jobs=-1,
                            scoring='accuracy',
                            cv=5,
                            n_iter=5)

search.fit(X_train, y_train)

search.score(X_test, y_test)

0.7758338528678305