In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
from collections import Counter
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Definições gerais para o modelo

In [2]:
# Define a single seed for the whole project; it also seeds NumPy's global RNG.
seed = 42
np.random.seed(seed)

In [3]:
def card_score(score):
    """Build a Plotly figure that shows a score as a single percentage number.

    Parameters
    ----------
    score : number
        Value to display; it is multiplied by 100 before being rendered.

    Returns
    -------
    plotly.graph_objects.Figure
        Figure holding one number-only Indicator with a '%' suffix.
    """
    percentage = score * 100
    indicator = go.Indicator(
        mode="number",
        value=percentage,
        number={'suffix': '%'},
    )
    return go.Figure(indicator)


# Business Understanding

### Projeto de classificação binária que tem como objetivo prever se uma pessoa ganha ou não mais de cinquenta mil dólares anuais, usando apenas as variáveis do dataset

# Data Understanding

In [31]:
# Load the raw data. header=None keeps the first line as a data row, and the
# column names are supplied explicitly through `names`.
df = pd.read_csv(
    'adult.data',
    header=None,
    names=[
        'age', 'workclass', 'fnlwgt', 'education', 'education_num',
        'marital_status', 'occupation', 'relationship', 'race', 'sex',
        'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
        'salary',
    ],
)


##### As colunas não estão nomeadas neste dataset. Utilizei header=None para não perder a primeira linha de dados e passei os nomes das colunas pelo parâmetro names do read_csv. Além disso, há um espaço em branco no início de cada valor string, que é removido na célula abaixo.

In [32]:
# Names of the object-dtype (text) columns.
df_string = df.select_dtypes(include=[object]).columns

# Strip leading/trailing whitespace from every text column in one column-wise pass.
df[df_string] = df[df_string].apply(lambda text_column: text_column.str.strip())

In [20]:
# Print column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  salary          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


### Significado das variáveis

##### age - Idade do indivíduo
##### workclass - Setor da economia em que o indivíduo trabalha
##### fnlwgt - Peso final (final weight): estimativa de quantas pessoas da população cada registro representa
##### education - Nível de educação do indivíduo
##### education_num - Uma representação numérica do nível de educação
##### marital_status - Estado civil do indivíduo
##### occupation - Profissão do indivíduo
##### relationship - Papel do indivíduo na família (por exemplo, Husband, Wife, Own-child, Not-in-family)
##### race - Raça do indivíduo
##### sex - Sexo do indivíduo
##### capital_gain - Valor do ganho de capital (investimentos) do indivíduo no período
##### capital_loss - Valor da perda de capital (investimentos) do indivíduo no período
##### hours_per_week - Horas trabalhadas por semana
##### native_country - País de origem
##### salary - Faixa salarial anual (<=50K ou >50K); é a variável alvo

#### Uma breve sumarização dos dados. Temos 14 atributos no dataset, misturando entre dados do tipo categórico e numérico. O atributo a ser previsto está na variável salary. No total, temos 32561 observações no dataset.

### O atributo alvo

In [5]:
# Count how many rows fall into each salary class.
df['salary'].value_counts()

 <=50K    24720
 >50K      7841
Name: salary, dtype: int64

In [10]:
# Per-class row counts of the salary column, drawn as a pie chart
# (slice sizes are the counts, slice labels are the class names).
pie_data = df['salary'].value_counts()
px.pie(pie_data, values=pie_data.values, names=pie_data.index)

##### Existe um desbalanceamento considerável na variável alvo: cerca de 76% das pessoas que estão no dataset ganham até 50K.

### Variáveis numéricas

In [12]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


#### Com o método describe podemos identificar algumas informações. As variáveis possuem escalas bem diferentes umas das outras, e as com menor desvio padrão são education_num, hours_per_week e age.

#### As variáveis capital_gain e capital_loss possuem o comportamento mais suspeito no dataset, com a grande maioria dos seus valores sendo 0.

In [70]:
# Group the rows by numeric education level and salary class.
# NOTE(review): the variable shares its name with the 'education_num' column.
education_num = df.groupby(['education_num', 'salary'])

In [73]:
# Number of rows in each (education_num, salary) group.
education_num.size()

education_num  salary
1               <=50K      51
2               <=50K     162
                >50K        6
3               <=50K     317
                >50K       16
4               <=50K     606
                >50K       40
5               <=50K     487
                >50K       27
6               <=50K     871
                >50K       62
7               <=50K    1115
                >50K       60
8               <=50K     400
                >50K       33
9               <=50K    8826
                >50K     1675
10              <=50K    5904
                >50K     1387
11              <=50K    1021
                >50K      361
12              <=50K     802
                >50K      265
13              <=50K    3134
                >50K     2221
14              <=50K     764
                >50K      959
15              <=50K     153
                >50K      423
16              <=50K     107
                >50K      306
dtype: int64

# Codificando as duas classes da variável alvo

In [8]:
# Encode the target: 1 for '>50K', 0 for anything else.
# The dtype guard keeps this cell idempotent. Run a second time without it,
# every already-encoded integer would fail the comparison with '>50K' and the
# whole column would silently become 0.
if not pd.api.types.is_numeric_dtype(df['salary']):
    df['salary'] = (df['salary'] == '>50K').astype('int64')

# Criando uma baseline

In [32]:
# Baseline: one-hot encode the categorical columns, pass the numeric ones
# through unchanged, and fit a plain logistic regression.
df_pipeline = df.copy()

baseline_categorical = [
    'workclass', 'education', 'marital_status', 'occupation',
    'relationship', 'race', 'sex', 'native_country',
]

preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown='ignore': a rare category can be absent from a training
        # fold and appear only in the validation fold. With the default ('error')
        # the encoder raises at transform time for that fold.
        ('category', OneHotEncoder(handle_unknown='ignore'), baseline_categorical)
    ],
    remainder='passthrough'
)

pipeline_baseline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('logistic_regression', LogisticRegression(max_iter=300))
])

# Stratified 3-fold cross-validation scored with F1 on the positive class.
stratKfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed)
score = cross_val_score(
    estimator=pipeline_baseline,
    X=df_pipeline.drop(['salary'], axis=1),
    y=df_pipeline['salary'],
    cv=stratKfold,
    scoring='f1',
)

# Show the mean F1 across folds as a percentage card.
card_score(np.mean(score))

#### A criação de uma baseline é um processo importante para qualquer projeto de Machine Learning. Com ela, podemos estabelecer um marco inicial de comparação para ver quão melhor nosso modelo fica em comparação com apenas as features puras. 

#### Escolhi o método de Regressão Logística por ser um modelo simples de ser aplicado. A única transformação necessária foi o encoding das variáveis categóricas. Com as features cruas, nosso F1-score de referência foi de cerca de 40%

# Data Preparation

In [153]:
# Independent copy of df used as the modelling frame.
df_model = df.copy()
# Column index of the modelling frame.
# NOTE(review): `column` is not referenced in the cells visible here.
column = df_model.columns

In [119]:
# Display the modelling frame (the salary column is already encoded as 0/1 here).
df_model

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,0
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,1
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,0
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,0


#### Abaixo serão criadas algumas transformações com o auxílio do objeto ColumnTransformer. A primeira será imputar a moda nos valores faltantes (marcados como '?') das colunas workclass, occupation e native_country, seguida do one-hot encoding das variáveis categóricas. Na sequência, os dados quantitativos serão padronizados com um StandardScaler.

In [26]:
# Columns handled by each branch. 'education' is in neither list, so the
# ColumnTransformer drops it (remainder defaults to 'drop'); presumably
# intentional, since education_num is kept among the numeric columns.
model_categorical = ['workclass', 'marital_status', 'occupation', 'relationship', 'race', 'sex',  'native_country']
model_numeric = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']

# Categorical branch: replace the '?' placeholder with the most frequent value,
# then one-hot encode (categories unseen at fit time are ignored).
categorical_branch = Pipeline(
    steps=[
        ('most_frequent', SimpleImputer(missing_values='?', strategy='most_frequent')),
        ('category', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Numeric branch: standardize each column.
numeric_branch = Pipeline(
    steps=[
        ('standard_scaler', StandardScaler())
    ]
)

preprocessor_model = ColumnTransformer(
    transformers=[
        ('category', categorical_branch, model_categorical),
        ('numeric', numeric_branch, model_numeric),
    ]
)

# Modeling

In [19]:
# Fresh copy of df so this cell does not depend on earlier edits to df_model.
df_model = df.copy()

pipeline_model = Pipeline(
    steps=[
        ('preprocessing', preprocessor_model),
        # random_state=seed pins the tree's internal randomness. Without it the
        # estimator draws from NumPy's global RNG, so the scores change with how
        # many random draws earlier cells (or re-runs) have consumed.
        # NOTE(review): the step is labelled 'random_forest' but the estimator is
        # a single decision tree; the label is kept so any reference to the step
        # name elsewhere keeps working.
        ('random_forest', DecisionTreeClassifier(random_state=seed))
    ]
)

# Stratified 3-fold cross-validation scored with F1 on the positive class.
stratKfoldModel = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed)
scoreModel = cross_val_score(
    estimator=pipeline_model,
    X=df_model.drop(['salary'], axis=1),
    y=df_model['salary'],
    cv=stratKfoldModel,
    scoring='f1',
)

# Show the mean F1 across folds as a percentage card.
card_score(np.mean(scoreModel))

ValueError: 
All the 3 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\bruno.gabriel\AppData\Roaming\Python\Python310\site-packages\sklearn\compose\_column_transformer.py", line 793, in _hstack
    converted_Xs = [
  File "C:\Users\bruno.gabriel\AppData\Roaming\Python\Python310\site-packages\sklearn\compose\_column_transformer.py", line 794, in <listcomp>
    check_array(X, accept_sparse=True, force_all_finite=False)
  File "C:\Users\bruno.gabriel\AppData\Roaming\Python\Python310\site-packages\sklearn\utils\validation.py", line 856, in check_array
    array = np.asarray(array, order=order, dtype=dtype)
  File "C:\Users\bruno.gabriel\AppData\Roaming\Python\Python310\site-packages\pandas\core\generic.py", line 2064, in __array__
    return np.asarray(self._values, dtype=dtype)
ValueError: could not convert string to float: 'Bachelors'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "C:\Users\bruno.gabriel\AppData\Roaming\Python\Python310\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\bruno.gabriel\AppData\Roaming\Python\Python310\site-packages\sklearn\pipeline.py", line 378, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\bruno.gabriel\AppData\Roaming\Python\Python310\site-packages\sklearn\pipeline.py", line 336, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "C:\Users\bruno.gabriel\AppData\Roaming\Python\Python310\site-packages\joblib\memory.py", line 349, in __call__
    return self.func(*args, **kwargs)
  File "C:\Users\bruno.gabriel\AppData\Roaming\Python\Python310\site-packages\sklearn\pipeline.py", line 870, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "C:\Users\bruno.gabriel\AppData\Roaming\Python\Python310\site-packages\sklearn\compose\_column_transformer.py", line 714, in fit_transform
    return self._hstack(list(Xs))
  File "C:\Users\bruno.gabriel\AppData\Roaming\Python\Python310\site-packages\sklearn\compose\_column_transformer.py", line 798, in _hstack
    raise ValueError(
ValueError: For a sparse output, all columns should be a numeric or convertible to a numeric.


In [10]:
# Per-fold F1 scores returned by cross_val_score for the model pipeline.
scoreModel

array([0.62547529, 0.6175359 , 0.61309187])