In [1]:
from google.colab import drive
drive.mount('/content/drive')



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

sns.set(rc={'figure.figsize': (25, 10)}, font_scale=1.5)

df = pd.read_csv('/content/drive/MyDrive/ECH_2019.csv')

# df = pd.read_csv('ECH_2019.csv')

In [3]:
df.sample(2)



Unnamed: 0,id_hogar,sexo,edad,ascendencia,parentesco,estado_laboral,nper,sueldo,busca_otro_trabajo,actividad,asistencia alguna vez a enseñanza,edu preescolar,edu primaria,edu media,edu técnica,edu univ o simil,edu terciario no uni,edu posgrado,edu mag prof,barrio,estado_civil,hijos,leer_escribir
89774,2019048309,Hombre,46,Blanca,Jefe/a de hogar,Ocupados,1,0,No,Instalación eléctrica y de comunicaciones,Sí,"Sí, asistió","Sí, asistió","Sí, asistió","Sí, asistió",0,0,0,0,0,0,0,Sí
92606,2019049847,Hombre,68,Blanca,Jefe/a de hogar,Ocupados,1,0,No,Actividades de apoyo a la ganadería,Sí,No asistió,"Sí, asistió",No asistió,No asistió,0,0,0,0,0,Soltero/a,0,Sí


In [4]:
no_data = {0: 'No hay dato', '0': 'No hay dato'}

mask = df.columns.difference(['edad', 'sueldo', 'hijos', 'barrio'])

df[mask] = df[mask].replace(no_data)



### AJUSTANDO DATOS PARA ENTRENAR EL MODELO

Seleccionamos personas con edad suficiente para trabajar

In [5]:
df = df[~df['estado_laboral'].str.contains('Menores de 14 años')]

In [6]:
# df = df[df['estado_laboral'].str.contains('Ocupados')]

# Categorizamos las edades y sueldos en rangos

In [7]:
df['rango_sueldos'] = pd.cut(df['sueldo'], bins=[
    1,
    5000,
    10000,
    30000,
    50000,
    100000,
    float('Inf'),
    ]).cat.codes

df['rango_edades'] = pd.cut(df['edad'], bins=[
    0,
    10,
    15,
    20,
    30,
    40,
    50,
    60,
    70,
    float('Inf'),
    ]).cat.codes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Eliminanos las columnas que no vamos a usar o que ya categorizamos

In [8]:
df = df[[
    'sexo', 
    'ascendencia', 
    'parentesco', 
    'estado_laboral', 
    'busca_otro_trabajo',
    'actividad',
    'asistencia alguna vez a enseñanza',
    'edu preescolar',
    'edu primaria',
    'edu media',
    'edu técnica',
    'edu univ o simil',
    'edu terciario no uni',
    'edu posgrado',
    'edu mag prof',
    'barrio',
    'estado_civil',
    'hijos',
    'leer_escribir',
    'rango_sueldos',
    'rango_edades'
    ]]

    

In [9]:
df = df[~df['rango_sueldos'].isna()]

### DEFINIMOS LAS VARIABLES DEPENDIENTES E INDEPENDIENTES

In [10]:
# Feautures
X = df.drop('rango_sueldos', axis=1).to_numpy()
y = df['rango_sueldos']

print('X shape', X.shape, '\nY shape', y.shape)

X shape (89807, 20) 
Y shape (89807,)


In [11]:
from sklearn.preprocessing import OneHotEncoder

one_hot = OneHotEncoder()

X = one_hot.fit_transform(X)


In [12]:
X

<89807x536 sparse matrix of type '<class 'numpy.float64'>'
	with 1796140 stored elements in Compressed Sparse Row format>

In [13]:
y

0        -1
1        -1
2        -1
3         2
4         2
         ..
107865   -1
107866   -1
107867    1
107869    2
107870    2
Name: rango_sueldos, Length: 89807, dtype: int8

In [14]:
prediction = X.toarray()
prediction = prediction[49000].reshape(1, -1)

### Dividimos los datos en un 30% de test y 70% de entrenamiento

In [15]:
from sklearn.model_selection import train_test_split 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42) 



In [16]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)



(62864, 536) (62864,)
(26943, 536) (26943,)


# DEFINIMOS ALGORITMO DE REGRESIÓN LOGISTICA

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

logistic_reg = LogisticRegression(multi_class='multinomial', max_iter=20000)

logistic_reg.fit(X_train, y_train)



LogisticRegression(max_iter=20000, multi_class='multinomial')

### VALOR DE CROSS VALIDATION PARA CADA FOLD

In [18]:
scores = cross_val_score(logistic_reg, X, y, cv=5)

scores



array([0.76027168, 0.75999332, 0.75786426, 0.76437838, 0.75959022])

In [19]:
print('Promedio de cross_validation ', round(scores.mean(),2))
print('Score en test ', round(logistic_reg.score(X_test, y_test),2))
print('Score en train ', round(logistic_reg.score(X_train, y_train),2))



Promedio de cross_validation  0.76
Score en test  0.76
Score en train  0.77


In [20]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

predictions = logistic_reg.predict(X_test)

confusion_matrix(y_test, predictions)



array([[15393,    25,    26,  1020,    98,    52,     9],
       [  153,    55,    62,   231,     2,     0,     0],
       [  238,    30,    51,   507,    13,     4,     0],
       [ 1189,    12,    44,  3955,   454,    50,     3],
       [  393,     1,     0,  1012,   774,   103,     4],
       [  115,     0,     0,   157,   324,   223,    16],
       [   25,     0,     0,    10,    28,    63,    19]])

### INVESTIGAMOS LA EFECTIVIDAD DEL MODELO SOBRE LOS DATOS

In [21]:
from sklearn.metrics import accuracy_score

#Prediccion en Train
y_train_pred = logistic_reg.predict(X_train)

#Prediccion en Test
y_test_pred = logistic_reg.predict(X_test)

#Calculo el accuracy en Train
train_accuracy = accuracy_score(y_train, y_train_pred)

#Calculo el accuracy en Test
test_accuracy = accuracy_score(y_test, y_test_pred)

print('Aciertos sobre entrenamiento:', round(train_accuracy, 2))
print('Aciertos sobre evaluación:', round(test_accuracy, 2))



Aciertos sobre entrenamiento: 0.77
Aciertos sobre evaluación: 0.76


In [22]:
y_train

96757     2
95740     3
12540    -1
102117   -1
101479    4
         ..
7592     -1
65971     4
92267    -1
1055      4
18964    -1
Name: rango_sueldos, Length: 62864, dtype: int8

### APLICAMOS PREDICCIÓN SOBRE EL MODELO

In [23]:
print(logistic_reg.predict(prediction))

print(y_train[49000])



[-1]
-1


Determinamos la probabilidad de la predicción

In [24]:
probabilidades_prediccion = logistic_reg.predict_proba(prediction)
probabilidades_prediccion[:,1]



array([0.06288286])

# REPETIMOS EL EJERCICIO CON EL MODELO DE RANDOM FOREST

In [25]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(100)

random_forest.fit(X_train, y_train)



RandomForestClassifier()

In [26]:
scores = cross_val_score(random_forest, X, y, cv=5)

scores



array([0.73059793, 0.73299187, 0.72980346, 0.73002617, 0.73141807])

In [27]:
print('Promedio de cross_validation ', round(scores.mean(),2))
print('Score en test ', round(logistic_reg.score(X_test, y_test),2))
print('Score en train ', round(logistic_reg.score(X_train, y_train),2))



Promedio de cross_validation  0.73
Score en test  0.76
Score en train  0.77


In [28]:
predictions = logistic_reg.predict(X_test)

confusion_matrix(y_test, predictions)



array([[15393,    25,    26,  1020,    98,    52,     9],
       [  153,    55,    62,   231,     2,     0,     0],
       [  238,    30,    51,   507,    13,     4,     0],
       [ 1189,    12,    44,  3955,   454,    50,     3],
       [  393,     1,     0,  1012,   774,   103,     4],
       [  115,     0,     0,   157,   324,   223,    16],
       [   25,     0,     0,    10,    28,    63,    19]])

In [29]:
#Prediccion en Train
y_train_pred = random_forest.predict(X_train)

#Prediccion en Test
y_test_pred = random_forest.predict(X_test)

#Calculo el accuracy en Train
train_accuracy = accuracy_score(y_train, y_train_pred)

#Calculo el accuracy en Test
test_accuracy = accuracy_score(y_test, y_test_pred)

print('Aciertos sobre entrenamiento:', round(train_accuracy, 2))
print('Aciertos sobre evaluación:', round(test_accuracy, 2))



Aciertos sobre entrenamiento: 0.96
Aciertos sobre evaluación: 0.73


In [30]:
print(random_forest.predict(prediction))
print(y_train[49000])



[-1]
-1


# REPETIMOS EL EJERCICIO CON EL MODELO DE K NEAREST NEIGHBOURS

In [31]:
from sklearn.neighbors import KNeighborsClassifier

knearest_model = KNeighborsClassifier(n_neighbors=4)

knearest_model.fit(X_train, y_train)



KNeighborsClassifier(n_neighbors=4)

In [32]:
scores = cross_val_score(knearest_model, X, y, cv=5)

scores           



array([0.69151542, 0.6899009 , 0.6906631 , 0.69311286, 0.69283447])

In [33]:
print('Promedio de cross_validation ', round(scores.mean(),2))
print('Score en test ', round(knearest_model.score(X_test, y_test),2))
print('Score en train ', round(knearest_model.score(X_train, y_train),2))



Promedio de cross_validation  0.69
Score en test  0.69
Score en train  0.79


In [34]:
predictions = knearest_model.predict(X_test)

confusion_matrix(y_test, predictions)



array([[14958,    72,   114,  1139,   250,    75,    15],
       [  210,    52,    72,   164,     5,     0,     0],
       [  313,    48,   117,   336,    23,     6,     0],
       [ 1938,    75,   192,  2911,   493,    94,     4],
       [  740,     9,    17,   866,   518,   125,    12],
       [  272,     1,     3,   175,   210,   147,    27],
       [   53,     0,     1,    14,    18,    45,    14]])

In [35]:
#Prediccion en Train
y_train_pred = knearest_model.predict(X_train)

#Prediccion en Test
y_test_pred = knearest_model.predict(X_test)

#Calculo el accuracy en Train
train_accuracy = accuracy_score(y_train, y_train_pred)

#Calculo el accuracy en Test
test_accuracy = accuracy_score(y_test, y_test_pred)

print('Aciertos sobre entrenamiento:', round(train_accuracy, 2))
print('Aciertos sobre evaluación:', round(test_accuracy, 2))



Aciertos sobre entrenamiento: 0.79
Aciertos sobre evaluación: 0.69


In [36]:
print(knearest_model.predict(prediction))
print(y_train[49000])



[-1]
-1
