In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(rc={'figure.figsize': (10, 5)}, font_scale=1.5)

df = pd.read_csv('/content/drive/MyDrive/ECH_2019.csv')

# df = pd.read_csv('ECH_2019.csv')

In [3]:
no_data = {0: 'No hay dato', '0': 'No hay dato'}

mask = df.columns.difference(['edad',
                              'sueldo',
                              'hijos en hogar',
                              'hijos en otro hogar',
                              'hijos en el extranjero',
                              'suma_hijos',
                              'barrio'])

df[mask] = df[mask].replace(no_data)

In [4]:
df['rango_edades'] = pd.qcut(df['edad'], q=10)

df['rango_edades'].value_counts()

(8.0, 16.0]      11803
(39.0, 47.0]     11347
(63.0, 73.0]     10988
(-0.001, 8.0]    10918
(55.0, 63.0]     10848
(47.0, 55.0]     10834
(16.0, 23.0]     10657
(31.0, 39.0]     10484
(23.0, 31.0]     10116
(73.0, 98.0]      9876
Name: rango_edades, dtype: int64

### AJUSTANDO DATOS PARA ENTRENAR EL MODELO

Seleccionamos personas con edad suficiente para trabajar

In [5]:
df = df[~df['estado_laboral'].str.contains('Menores de 14 años')]

In [6]:
df['estado_laboral'].value_counts()

Ocupados                                      49036
Inactivo: jubilado                            16025
Inactivo: estudiante                           8216
Inactivo: realiza los quehaceres del hogar     5936
Inactivo: pensionista                          4438
Desocupados propiamente dichos                 3409
Inactivo: otro                                 1291
Desocupados buscan trabajo por primera vez      787
Inactivo: rentista                              414
Desocupados en seguro de paro                   255
Name: estado_laboral, dtype: int64

In [7]:
df.groupby(by='estado_laboral').mean()

Unnamed: 0_level_0,id_hogar,edad,nper,sueldo,barrio,hijos en hogar,hijos en otro hogar,hijos en el extranjero,suma_hijos
estado_laboral,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Desocupados buscan trabajo por primera vez,2019029000.0,20.670902,3.035578,25.031766,11.876747,0.151207,0.022872,0.0,0.174079
Desocupados en seguro de paro,2019030000.0,39.07451,1.780392,3602.415686,12.407843,0.458824,0.254902,0.019608,0.733333
Desocupados propiamente dichos,2019029000.0,33.915518,2.206512,1559.024641,11.319742,0.596656,0.310355,0.018774,0.925785
Inactivo: estudiante,2019029000.0,17.34518,3.260346,93.646543,8.818768,0.03335,0.00426,0.0,0.03761
Inactivo: jubilado,2019029000.0,72.800936,1.348206,123.482059,10.446365,0.171108,0.959126,0.082683,1.212917
Inactivo: otro,2019029000.0,33.783114,2.883036,618.0945,10.652982,0.237026,0.302866,0.023238,0.563129
Inactivo: pensionista,2019029000.0,64.425192,1.683641,22.757999,9.698738,0.370662,1.67553,0.114015,2.160207
Inactivo: realiza los quehaceres del hogar,2019029000.0,46.889319,1.857311,317.843666,8.732817,1.107311,1.102426,0.056941,2.266678
Inactivo: rentista,2019029000.0,58.461353,1.528986,72.463768,9.009662,0.541063,0.850242,0.123188,1.514493
Ocupados,2019029000.0,42.683681,1.661249,20109.876601,10.482462,0.473183,0.304593,0.017885,0.79566


In [8]:
quantil_99 = df["sueldo"].quantile(0.99)
print(quantil_99)

df = df[df["sueldo"] < quantil_99]

quartil_1 = df["sueldo"].quantile(0.25)
quartil_3 = df["sueldo"].quantile(0.75)

iqr = quartil_3 - quartil_1

df = df[df["sueldo"] < quartil_3 + 1.5 * iqr]

df = df[df["sueldo"] > quartil_1 - 1.5 * iqr]

84000.0


In [9]:
df['rango_sueldos'] = pd.qcut(df['sueldo'], q=10, duplicates='drop').cat.codes

df['rango_edades'] = pd.qcut(df['edad'], q=10).cat.codes

In [10]:
df = df[[
    'sexo', 
    'edad', 
    'ascendencia', 
    'parentesco', 
    'estado_laboral', 
    'busca_otro_trabajo',
    'actividad',
    'asistencia alguna vez a enseñanza',
    'edu preescolar',
    'edu primaria',
    'edu media',
    'edu técnica',
    'edu univ o simil',
    'edu terciario no uni',
    'edu posgrado',
    'edu mag prof',
    'barrio',
    'estado_civil',
    'suma_hijos',
    'leer_escribir',
    'rango_sueldos',
    'rango_edades',
    'region',
    'vivienda',
    'estrato'
    ]]


In [11]:
df = df.reset_index(drop=True)

df.head(2)

Unnamed: 0,sexo,edad,ascendencia,parentesco,estado_laboral,busca_otro_trabajo,actividad,asistencia alguna vez a enseñanza,edu preescolar,edu primaria,edu media,edu técnica,edu univ o simil,edu terciario no uni,edu posgrado,edu mag prof,barrio,estado_civil,suma_hijos,leer_escribir,rango_sueldos,rango_edades,region,vivienda,estrato
0,Mujer,76.0,Blanca,Jefe/a de hogar,Inactivo: jubilado,0.0,.,Sí,No asistió,"Sí, asistió",0.0,No asistió,0.0,0.0,0.0,0.0,0.0,Viudo/a de unión libre,0.0,Sí,0,9,Interior - Localidades de 5.000 habitantes o más,Propietario de la vivienda y el terreno y ya l...,Costa Este
1,Mujer,22.0,Blanca,Otro pariente,Inactivo: realiza los quehaceres del hogar,0.0,.,Sí,"Sí, asistió","Sí, asistió","Sí, asistió",No asistió,No asistió,No asistió,0.0,No asistió,0.0,Soltero/a,0.0,Sí,0,1,Interior - Localidades de 5.000 habitantes o más,Propietario de la vivienda y el terreno y ya l...,Costa Este


In [12]:
df['rango_sueldos'].value_counts()

0    59982
1     9344
3     8546
2     7674
Name: rango_sueldos, dtype: int64

### DEFINIMOS LAS VARIABLES DEPENDIENTES E INDEPENDIENTES

In [13]:
# Feautures
X = df.drop('rango_sueldos', axis=1).to_numpy()
y = df['rango_sueldos']

print('X shape', X.shape, '\nY shape', y.shape)

X shape (85546, 24) 
Y shape (85546,)


In [14]:
y.unique()

array([0, 1, 3, 2], dtype=int8)

In [15]:
from sklearn.preprocessing import OneHotEncoder

one_hot = OneHotEncoder()

X = one_hot.fit_transform(X)

In [16]:
from sklearn.preprocessing import Normalizer

normalizer = Normalizer()

X = normalizer.fit_transform(X)

In [17]:
X.shape

(85546, 660)

## Dejamos una fila afuera para testear el modelo más adelante

In [18]:
test_sample = X[0]
test_result = y[0]

X = X[1:]
y = y[1:]

### Dividimos los datos en un 30% de test y 70% de entrenamiento

In [19]:
from sklearn.model_selection import train_test_split 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42) 

In [20]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(59881, 660) (59881,)
(25664, 660) (25664,)


# DEFINIMOS ALGORITMO DE REGRESIÓN LOGISTICA

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

logistic_reg = LogisticRegression(multi_class='multinomial',max_iter=20000)

In [22]:
#Definicion de Hyperparámetros
param_grid = {'solver':['lbfgs', 'saga'],
              'penalty': ['l1', 'l2'], 
              'C':[10, 1.0, 0.1]}

# RANDOM SEARCH

In [23]:
from sklearn.model_selection import RandomizedSearchCV

#Aplicamos la búsqueda al modelo
model = RandomizedSearchCV(logistic_reg, param_grid,n_iter=100, random_state=0, cv=5,scoring='accuracy',verbose=20)

In [24]:
model.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5; 1/12] START C=10, penalty=l1, solver=lbfgs.............................
[CV 1/5; 1/12] END C=10, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 2/5; 1/12] START C=10, penalty=l1, solver=lbfgs.............................
[CV 2/5; 1/12] END C=10, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 3/5; 1/12] START C=10, penalty=l1, solver=lbfgs.............................
[CV 3/5; 1/12] END C=10, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 4/5; 1/12] START C=10, penalty=l1, solver=lbfgs.............................
[CV 4/5; 1/12] END C=10, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 5/5; 1/12] START C=10, penalty=l1, solver=lbfgs.............................
[CV 5/5; 1/12] END C=10, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 1/5; 2/12] START C=10, penalty=l1, solver=saga..............................




[CV 1/5; 2/12] END C=10, penalty=l1, solver=saga;, score=0.772 total time= 3.5min
[CV 2/5; 2/12] START C=10, penalty=l1, solver=saga..............................
[CV 2/5; 2/12] END C=10, penalty=l1, solver=saga;, score=0.766 total time= 3.1min
[CV 3/5; 2/12] START C=10, penalty=l1, solver=saga..............................
[CV 3/5; 2/12] END C=10, penalty=l1, solver=saga;, score=0.770 total time= 5.5min
[CV 4/5; 2/12] START C=10, penalty=l1, solver=saga..............................
[CV 4/5; 2/12] END C=10, penalty=l1, solver=saga;, score=0.767 total time= 7.7min
[CV 5/5; 2/12] START C=10, penalty=l1, solver=saga..............................
[CV 5/5; 2/12] END C=10, penalty=l1, solver=saga;, score=0.765 total time= 4.2min
[CV 1/5; 3/12] START C=10, penalty=l2, solver=lbfgs.............................
[CV 1/5; 3/12] END C=10, penalty=l2, solver=lbfgs;, score=0.773 total time=  27.7s
[CV 2/5; 3/12] START C=10, penalty=l2, solver=lbfgs.............................
[CV 2/5; 3/12] END C=

15 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_logistic.py", line 449, in _check_solver
    % (solver, penalty)
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

 0.76212814 0.76204464        nan 0.74

RandomizedSearchCV(cv=5,
                   estimator=LogisticRegression(max_iter=20000,
                                                multi_class='multinomial'),
                   n_iter=100,
                   param_distributions={'C': [10, 1.0, 0.1],
                                        'penalty': ['l1', 'l2'],
                                        'solver': ['lbfgs', 'saga']},
                   random_state=0, scoring='accuracy', verbose=20)

In [25]:
print("Mejores parametros: "+str(model.best_params_))
print("Mejor Score: "+str(model.best_score_)+'\n')

Mejores parametros: {'solver': 'lbfgs', 'penalty': 'l2', 'C': 10}
Mejor Score: 0.7684072634428635



In [26]:
# Prediccion
prediction = model.predict(X_test)

In [28]:
from sklearn.metrics import confusion_matrix

# Matriz de Confusion
cm = confusion_matrix(y_test,prediction)
print("Matriz de confusión:")
print(cm)

Matriz de confusión:
[[17162   456   147   340]
 [ 1386   744   296   260]
 [  760   394   476   670]
 [  645   143   314  1471]]


# GRID SEARCH CV

In [None]:
from sklearn.model_selection import GridSearchCV

#Grid Search con hyperparámetros
model = GridSearchCV(logistic_reg, param_grid=param_grid, cv=5,scoring='accuracy',verbose=20)

In [None]:
model.fit(X_train, y_train)

In [None]:
model

In [None]:
print("Mejores parametros: "+str(model.best_params_))
print("Mejor Score: "+str(model.best_score_)+'\n')

In [None]:
print('Promedio de cross_validation ', round(scores.mean(),2))
print('Score en test ', round(logistic_reg.score(X_test, y_test),2))
print('Score en train ', round(logistic_reg.score(X_train, y_train),2))

In [None]:
from sklearn.metrics import accuracy_score

#Prediccion en Train y Test
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

#Calculo el accuracy en Train y Test
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print('Aciertos sobre train:', round(train_accuracy, 2))
print('Exactitud sobre test:', round(test_accuracy, 2))

### APLICAMOS PREDICCIÓN SOBRE EL MODELO

In [None]:
print(model.predict(test_sample))

print(test_result)

Determinamos la probabilidad de la predicción

In [None]:
probabilidades_prediccion = model.predict_proba(test_sample)
probabilidades_prediccion

In [None]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_test_pred)

print("Matriz de confusión:")
print(cm)

# REPETIMOS EL EJERCICIO CON EL MODELO DE RANDOM FOREST

In [None]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(100)

random_forest.fit(X_train, y_train)

In [None]:
scores = cross_val_score(random_forest, X, y, cv=5)

scores

In [None]:
print('Promedio de cross_validation ', round(scores.mean(),2))
print('Score en test ', round(logistic_reg.score(X_test, y_test),2))
print('Score en train ', round(logistic_reg.score(X_train, y_train),2))

In [None]:
predictions = logistic_reg.predict(X_test)

confusion_matrix(y_test, predictions)

In [None]:
#Prediccion en Train
y_train_pred = random_forest.predict(X_train)

#Prediccion en Test
y_test_pred = random_forest.predict(X_test)

#Calculo el accuracy en Train
train_accuracy = accuracy_score(y_train, y_train_pred)

#Calculo el accuracy en Test
test_accuracy = accuracy_score(y_test, y_test_pred)

print('Aciertos sobre entrenamiento:', round(train_accuracy, 2))
print('Aciertos sobre evaluación:', round(test_accuracy, 2))

In [None]:
print(random_forest.predict(test_sample))
print(test_result)

In [None]:
probabilidades_prediccion = random_forest.predict_proba(test_sample)
probabilidades_prediccion

# REPETIMOS EL EJERCICIO CON EL MODELO DE K NEAREST NEIGHBOURS

In [None]:
from sklearn.neighbors import KNeighborsClassifier

knearest_model = KNeighborsClassifier(n_neighbors=4)

knearest_model.fit(X_train, y_train)

In [None]:
scores = cross_val_score(knearest_model, X, y, cv=5)

scores           

In [None]:
print('Promedio de cross_validation ', round(scores.mean(),2))
print('Score en test ', round(knearest_model.score(X_test, y_test),2))
print('Score en train ', round(knearest_model.score(X_train, y_train),2))

In [None]:
predictions = knearest_model.predict(X_test)

confusion_matrix(y_test, predictions)

In [None]:
#Prediccion en Train
y_train_pred = knearest_model.predict(X_train)

#Prediccion en Test
y_test_pred = knearest_model.predict(X_test)

#Calculo el accuracy en Train
train_accuracy = accuracy_score(y_train, y_train_pred)

#Calculo el accuracy en Test
test_accuracy = accuracy_score(y_test, y_test_pred)

print('Aciertos sobre entrenamiento:', round(train_accuracy, 2))
print('Aciertos sobre evaluación:', round(test_accuracy, 2))

In [None]:
print(knearest_model.predict(test_sample))
print(test_result)

In [None]:
probabilidades_prediccion = knearest_model.predict_proba(test_sample)
probabilidades_prediccion

# Columna que diga por arriba de la media de hombres o no

# Cómo determino si afecta o no