In [1]:
# Importamos librerías
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, MinMaxScaler,  OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from joblib import dump
from sklearn.decomposition import TruncatedSVD

In [2]:
# Load the adults CSV from the project's data folder and preview the first rows
data_csv = "../Datos/data_adults.csv"
Data = pd.read_csv(data_csv)
Data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
# Quick overview of the data: column dtypes and non-null counts.
# DataFrame.info() writes its report itself and returns None, so it is called
# bare; wrapping it in print() only appends a stray "None" line to the output.
Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48842 non-null  int64 
 1   workclass       47879 non-null  object
 2   fnlwgt          48842 non-null  int64 
 3   education       48842 non-null  object
 4   education-num   48842 non-null  int64 
 5   marital-status  48842 non-null  object
 6   occupation      47876 non-null  object
 7   relationship    48842 non-null  object
 8   race            48842 non-null  object
 9   sex             48842 non-null  object
 10  capital-gain    48842 non-null  int64 
 11  capital-loss    48842 non-null  int64 
 12  hours-per-week  48842 non-null  int64 
 13  native-country  48568 non-null  object
 14  income          48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB
None


In [4]:
# Drop the columns that are not used as features; the raw frame `Data` is left untouched
Data_cop = Data.drop(columns=["fnlwgt", "education-num"])

In [5]:
# Target: True when the income label is one of the two ">50K" spellings
# (with or without the trailing dot), False otherwise
y = Data_cop["income"].isin([">50K.", ">50K"])

# Features: every remaining column except the target
X = Data_cop.drop(columns=["income"])

In [6]:
# X (features) and y (target) are already built by the two preceding cells, so
# the column drops and the X/y split are not repeated in this cell.

# Split the feature columns by dtype: object columns are treated as
# categorical, int64/float64 columns as numerical.
categorical_features = X.select_dtypes(include=['object']).columns
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns

# Preprocessing building blocks, in two variants.
#
# NOTE(review): the original notes labelled variant 1 as "logistic regression,
# Naive Bayes and gradient boosting" and variant 2 as "SVM". In the model cells
# further down, however, the SVM pipeline is built on preprocessor_1 and the
# Naive Bayes pipeline on preprocessor_2 -- confirm which pairing is intended.

# --- Variant 1: dense one-hot encoding ---
# Numeric columns: fill missing values with the column mean, then standardize.
numerical_transformer_1 = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode into a dense array; categories unseen during fit are ignored.
categorical_transformer_1 = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# --- Variant 2: one-hot encoding reduced with TruncatedSVD ---
# Numeric columns: same steps as variant 1, kept as a separate object.
numerical_transformer_2 = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: impute, one-hot encode (encoder's default output format),
# then project the encoded columns onto 50 SVD components with a fixed seed.
categorical_transformer_2 = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ('svd', TruncatedSVD(n_components=50, random_state=25)),
])
# Combine the numeric and categorical transformers column-wise.

# preprocessor_1: variant-1 transformers (dense one-hot encoding)
preprocessor_1 = ColumnTransformer([
    ('num', numerical_transformer_1, numerical_features),
    ('cat', categorical_transformer_1, categorical_features),
])

# preprocessor_2: variant-2 transformers (one-hot encoding followed by TruncatedSVD)
preprocessor_2 = ColumnTransformer([
    ('num', numerical_transformer_2, numerical_features),
    ('cat', categorical_transformer_2, categorical_features),
])

In [7]:
# Preprocessing-only pipelines (no estimator step attached):
# one wrapping preprocessor_1 and one wrapping preprocessor_2
model_pipeline = Pipeline([('preprocessor', preprocessor_1)])
model_pipeline_svm = Pipeline([('preprocessor', preprocessor_2)])

## Regresión Logística

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Hold out 20% of the rows as the test set; the fixed seed keeps the split
# reproducible. The later model cells reuse these same four variables.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=25
)

# Full pipeline: preprocessing (preprocessor_1) followed by logistic regression
lr_classifier = LogisticRegression(random_state=25)
lr_pipeline = Pipeline([
    ('preprocessor', preprocessor_1),
    ('classifier', lr_classifier),
])

# Fit on the training split, then predict the held-out rows
lr_pipeline.fit(X_train, y_train)
y_pred = lr_pipeline.predict(X_test)

# Per-class precision / recall / F1 on the test set
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.88      0.93      0.91      7418
        True       0.74      0.60      0.66      2351

    accuracy                           0.85      9769
   macro avg       0.81      0.77      0.78      9769
weighted avg       0.85      0.85      0.85      9769



## SVM

In [9]:
import warnings
from sklearn.svm import SVC

# Full pipeline: preprocessing followed by a linear-kernel SVM.
# NOTE(review): this pipeline is built on preprocessor_1, while the earlier
# `model_pipeline_svm` wraps preprocessor_2 -- confirm which preprocessing the
# SVM is meant to use.
svm_pipeline = Pipeline(steps=[('preprocessor', preprocessor_1),
                               ('classifier', SVC(kernel="linear", C=1.0, random_state=25))])

# Warnings are silenced only for this cell's fit / predict / report steps.
# A global warnings.filterwarnings('ignore') would also hide every warning
# raised by all later cells; the context manager restores the previous
# filters on exit.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    # Fit on the training split
    svm_pipeline.fit(X_train, y_train)

    # Predict the held-out rows
    y_pred = svm_pipeline.predict(X_test)

    # Per-class precision / recall / F1 on the test set
    print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.87      0.94      0.91      7418
        True       0.75      0.57      0.65      2351

    accuracy                           0.85      9769
   macro avg       0.81      0.75      0.78      9769
weighted avg       0.84      0.85      0.84      9769



## Naive Bayes

In [10]:
from sklearn.naive_bayes import GaussianNB

# Full pipeline: preprocessing (preprocessor_2) followed by Gaussian Naive Bayes
gnb_pipeline = Pipeline([
    ('preprocessor', preprocessor_2),
    ('classifier', GaussianNB()),
])

# Fit on the training split and predict the held-out rows in one chained call
# (fit returns the fitted pipeline itself)
y_pred = gnb_pipeline.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the test set
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.92      0.79      0.85      7418
        True       0.54      0.78      0.63      2351

    accuracy                           0.78      9769
   macro avg       0.73      0.78      0.74      9769
weighted avg       0.83      0.78      0.80      9769



## Gradient Boosting

In [9]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with 100 estimators.
# NOTE(review): the seed here is 42 while the other model cells use 25 -- confirm this is intentional.
gb_classifier = GradientBoostingClassifier(n_estimators=100, random_state=42)

# Full pipeline: preprocessing (preprocessor_1) followed by the classifier.
# This rebinds `model_pipeline`, replacing the preprocessing-only pipeline of the same name.
model_pipeline = Pipeline([
    ('preprocessor', preprocessor_1),
    ('classifier', gb_classifier),
])

# Fit on the training split, then predict the held-out rows
model_pipeline.fit(X_train, y_train)
y_pred = model_pipeline.predict(X_test)

# Per-class precision / recall / F1 on the test set
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.88      0.96      0.92      7418
        True       0.81      0.58      0.68      2351

    accuracy                           0.87      9769
   macro avg       0.84      0.77      0.80      9769
weighted avg       0.86      0.87      0.86      9769

