In [1]:
from google.colab import drive
drive.mount('/content/drive')



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

sns.set(rc={'figure.figsize': (25, 10)}, font_scale=1.5)

df = pd.read_csv('/content/drive/MyDrive/ECH_2019.csv')

# df = pd.read_csv('ECH_2019.csv')

In [3]:
df.sample(2)



Unnamed: 0,sexo,edad,ascendencia,parentesco,estado_laboral,sueldo,busca_otro_trabajo,actividad,asistencia alguna vez a enseñanza,edu preescolar,edu primaria,edu media,edu técnica,edu univ o simil,edu terciario no uni,edu posgrado,edu mag prof,barrio,estado_civil,hijos,hijos en hogar,hijos en otro hogar,hijos en el extranjero,leer_escribir,suma_hijos
1350,Mujer,50,Blanca,Otro no pariente,Ocupados,6000,Sí,Comercio al por menor en almacenes no especial...,Sí,No asistió,"Sí, asistió","Sí, asistió",No asistió,0,0,0,0,0,Divorciado/a,Sí,1,0,0,Sí,1
5350,Hombre,63,Blanca,Jefe/a de hogar,Ocupados,0,Sí,Actividades de servicio de manutención y cuida...,Sí,No asistió,"Sí, asistió",No asistió,"Sí, asistió",0,0,0,0,0,0,0,0,0,0,Sí,0


In [4]:
no_data = {0: 'No hay dato', '0': 'No hay dato'}

mask = df.columns.difference(['edad',
                              'sueldo',
                              'hijos',
                              'hijos en hogar',
                              'hijos en otro hogar',
                              'hijos en el extranjero',
                              'suma_hijos',
                              'barrio'])

df[mask] = df[mask].replace(no_data)



### AJUSTANDO DATOS PARA ENTRENAR EL MODELO

Seleccionamos personas con edad suficiente para trabajar

In [5]:
df = df[~df['estado_laboral'].str.contains('Menores de 14 años')]
df = df[df['sueldo'] > 0]

# Categorizamos las edades y sueldos en rangos

In [6]:
df['rango_sueldos'] = pd.qcut(df['sueldo'], q=10).cat.codes

df['rango_edades'] = pd.qcut(df['edad'], q=10).cat.codes

Eliminanos las columnas que no vamos a usar o que ya categorizamos

In [7]:
df = df[[
    'sexo', 
    'edad', 
    'ascendencia', 
    'parentesco', 
    'estado_laboral', 
    'sueldo', 
    'busca_otro_trabajo',
    'actividad',
    'asistencia alguna vez a enseñanza',
    'edu preescolar',
    'edu primaria',
    'edu media',
    'edu técnica',
    'edu univ o simil',
    'edu terciario no uni',
    'edu posgrado',
    'edu mag prof',
    'barrio',
    'estado_civil',
    'suma_hijos',
    'leer_escribir',
    'rango_sueldos',
    'rango_edades'
    ]]
    

### DEFINIMOS LAS VARIABLES DEPENDIENTES E INDEPENDIENTES

In [8]:
# Feautures
X = df.drop('rango_sueldos', axis=1).to_numpy()
y = df['rango_sueldos']

print('X shape', X.shape, '\nY shape', y.shape)

X shape (34565, 22) 
Y shape (34565,)


In [9]:
y.unique()

array([1, 6, 2, 5, 0, 7, 8, 4, 3, 9], dtype=int8)

In [10]:
from sklearn.preprocessing import OneHotEncoder

one_hot = OneHotEncoder()

X = one_hot.fit_transform(X)


In [11]:
from sklearn.preprocessing import Normalizer

normalizer = Normalizer()

X = normalizer.fit_transform(X)

X.toarray()

array([[0.        , 0.21320072, 0.        , ..., 0.        , 0.        ,
        0.21320072],
       [0.        , 0.21320072, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.21320072, 0.        , ..., 0.21320072, 0.        ,
        0.        ],
       ...,
       [0.        , 0.21320072, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.21320072, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.21320072, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

### Dividimos los datos en un 30% de test y 70% de entrenamiento

In [12]:
from sklearn.model_selection import train_test_split 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42) 



In [13]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)



(24195, 1876) (24195,)
(10370, 1876) (10370,)


In [14]:
to_test_predict = X_train.toarray()[145].reshape(1, -1)

print(to_test_predict)

prediction_result = y_train.iloc[145]

print(prediction_result)

[[0.21320072 0.         0.         ... 0.         0.         0.        ]]
6


# DEFINIMOS ALGORITMO DE REGRESIÓN LOGISTICA

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

logistic_reg = LogisticRegression(multi_class='multinomial', max_iter=20000)

logistic_reg.fit(X_train, y_train)



LogisticRegression(max_iter=20000, multi_class='multinomial')

### VALOR DE CROSS VALIDATION PARA CADA FOLD

In [16]:
scores = cross_val_score(logistic_reg, X, y, cv=5)

scores



array([0.94821351, 0.94358455, 0.94430783, 0.94734558, 0.9392449 ])

In [17]:
print('Promedio de cross_validation ', round(scores.mean(),2))
print('Score en test ', round(logistic_reg.score(X_test, y_test),2))
print('Score en train ', round(logistic_reg.score(X_train, y_train),2))



Promedio de cross_validation  0.94
Score en test  0.95
Score en train  0.95


In [18]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

predictions = logistic_reg.predict(X_test)

confusion_matrix(y_test, predictions)



array([[1109,    7,    1,    0,    2,    2,    0,    0,    0,   28],
       [  56, 1222,    0,    0,    3,    0,    0,    3,    0,   18],
       [  36,   13,  872,    0,    9,    0,    0,    4,    0,   16],
       [  36,   10,    1,  822,    2,    4,    0,    2,    0,   13],
       [  26,   14,    1,    0,  923,    1,    0,    9,    3,   20],
       [  20,    7,    0,    0,    4, 1111,    0,    9,    0,    7],
       [   8,    8,    0,    0,    3,    1,  924,   17,    0,   18],
       [   9,    5,    0,    0,    3,    1,    0, 1153,    1,   34],
       [   6,    2,    0,    0,    1,    0,    0,    7,  717,   24],
       [  10,    3,    0,    0,    2,    0,    0,    5,    1,  961]])

### INVESTIGAMOS LA EFECTIVIDAD DEL MODELO SOBRE LOS DATOS

In [19]:
from sklearn.metrics import accuracy_score

#Prediccion en Train
y_train_pred = logistic_reg.predict(X_train)

#Prediccion en Test
y_test_pred = logistic_reg.predict(X_test)

#Calculo el accuracy en Train
train_accuracy = accuracy_score(y_train, y_train_pred)

#Calculo el accuracy en Test
test_accuracy = accuracy_score(y_test, y_test_pred)

print('Aciertos sobre entrenamiento:', round(train_accuracy, 2))
print('Aciertos sobre evaluación:', round(test_accuracy, 2))



Aciertos sobre entrenamiento: 0.95
Aciertos sobre evaluación: 0.95


In [20]:
y_train

80946    1
19660    4
73980    9
54683    2
92444    6
        ..
52859    4
19757    0
35470    6
2799     8
49514    9
Name: rango_sueldos, Length: 24195, dtype: int8

### APLICAMOS PREDICCIÓN SOBRE EL MODELO

In [21]:
print(logistic_reg.predict(to_test_predict))

print(prediction_result)



[6]
6


Determinamos la probabilidad de la predicción

In [22]:
probabilidades_prediccion = logistic_reg.predict_proba(to_test_predict)
probabilidades_prediccion[:,1]



array([0.00598106])

# REPETIMOS EL EJERCICIO CON EL MODELO DE RANDOM FOREST

In [23]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(100)

random_forest.fit(X_train, y_train)



RandomForestClassifier()

In [24]:
scores = cross_val_score(random_forest, X, y, cv=5)

scores



array([0.92752785, 0.92680457, 0.92810647, 0.92579199, 0.92304354])

In [25]:
print('Promedio de cross_validation ', round(scores.mean(),2))
print('Score en test ', round(logistic_reg.score(X_test, y_test),2))
print('Score en train ', round(logistic_reg.score(X_train, y_train),2))



Promedio de cross_validation  0.93
Score en test  0.95
Score en train  0.95


In [26]:
predictions = logistic_reg.predict(X_test)

confusion_matrix(y_test, predictions)



array([[1109,    7,    1,    0,    2,    2,    0,    0,    0,   28],
       [  56, 1222,    0,    0,    3,    0,    0,    3,    0,   18],
       [  36,   13,  872,    0,    9,    0,    0,    4,    0,   16],
       [  36,   10,    1,  822,    2,    4,    0,    2,    0,   13],
       [  26,   14,    1,    0,  923,    1,    0,    9,    3,   20],
       [  20,    7,    0,    0,    4, 1111,    0,    9,    0,    7],
       [   8,    8,    0,    0,    3,    1,  924,   17,    0,   18],
       [   9,    5,    0,    0,    3,    1,    0, 1153,    1,   34],
       [   6,    2,    0,    0,    1,    0,    0,    7,  717,   24],
       [  10,    3,    0,    0,    2,    0,    0,    5,    1,  961]])

In [27]:
#Prediccion en Train
y_train_pred = random_forest.predict(X_train)

#Prediccion en Test
y_test_pred = random_forest.predict(X_test)

#Calculo el accuracy en Train
train_accuracy = accuracy_score(y_train, y_train_pred)

#Calculo el accuracy en Test
test_accuracy = accuracy_score(y_test, y_test_pred)

print('Aciertos sobre entrenamiento:', round(train_accuracy, 2))
print('Aciertos sobre evaluación:', round(test_accuracy, 2))



Aciertos sobre entrenamiento: 1.0
Aciertos sobre evaluación: 0.93


In [28]:
print(random_forest.predict(to_test_predict))
print(prediction_result)



[6]
6


# REPETIMOS EL EJERCICIO CON EL MODELO DE K NEAREST NEIGHBOURS

In [29]:
from sklearn.neighbors import KNeighborsClassifier

knearest_model = KNeighborsClassifier(n_neighbors=4)

knearest_model.fit(X_train, y_train)



KNeighborsClassifier(n_neighbors=4)

In [30]:
scores = cross_val_score(knearest_model, X, y, cv=5)

scores           



array([0.43801533, 0.4355562 , 0.42861276, 0.43642413, 0.44163171])

In [31]:
print('Promedio de cross_validation ', round(scores.mean(),2))
print('Score en test ', round(knearest_model.score(X_test, y_test),2))
print('Score en train ', round(knearest_model.score(X_train, y_train),2))



Promedio de cross_validation  0.44
Score en test  0.43
Score en train  0.68


In [32]:
predictions = knearest_model.predict(X_test)

confusion_matrix(y_test, predictions)



array([[699, 211,  60,  38,  32,  32,  26,  24,   5,  22],
       [330, 612,  82,  65,  56,  50,  35,  30,  21,  21],
       [180, 153, 395,  50,  37,  51,  24,  27,  18,  15],
       [101, 147,  68, 404,  46,  35,  19,  34,  12,  24],
       [141, 161, 102,  79, 281,  93,  50,  38,  26,  26],
       [ 86, 145,  91,  69,  85, 507,  43,  58,  28,  46],
       [ 56,  98,  54,  57,  67,  85, 438,  57,  31,  36],
       [ 73,  88,  72,  64,  77, 134,  94, 466,  58,  80],
       [ 43,  55,  54,  47,  60,  69,  52,  91, 209,  77],
       [ 34,  55,  35,  41,  46,  65,  60, 112,  83, 451]])

In [33]:
#Prediccion en Train
y_train_pred = knearest_model.predict(X_train)

#Prediccion en Test
y_test_pred = knearest_model.predict(X_test)

#Calculo el accuracy en Train
train_accuracy = accuracy_score(y_train, y_train_pred)

#Calculo el accuracy en Test
test_accuracy = accuracy_score(y_test, y_test_pred)

print('Aciertos sobre entrenamiento:', round(train_accuracy, 2))
print('Aciertos sobre evaluación:', round(test_accuracy, 2))



Aciertos sobre entrenamiento: 0.68
Aciertos sobre evaluación: 0.43


In [34]:
print(knearest_model.predict(to_test_predict))
print(prediction_result)



[2]
6


# El porcentaje alto de resultados puede signifcar que hay overfitting o bias. Tanto la base como los modelos necesitan customización o se puede hacer un sampling para comparar los resultados. El modelo de KNN podría considerarse el peor condidato.