In [1]:
!pip install xgboost



In [2]:
!pip install yellowbrick



# Projeto de Machine Learning com Pipeline e Preprocessamento

In [3]:
import pandas as pd

data = pd.read_csv("census.csv", header=None, index_col=False,
                   names=['age', 'workclass', 'fnlwgt', 'education', 'education-num',
'marital-status', 'occupation', 'relationship', 'race', 'gender',
'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
'income'])

In [4]:
data = data[['age', 'workclass', 'education', 'gender', 'hours-per-week',
'occupation', 'income']]

display(data)

Unnamed: 0,age,workclass,education,gender,hours-per-week,occupation,income
0,age,workclass,education,sex,hour-per-week,occupation,income
1,39,State-gov,Bachelors,Male,40,Adm-clerical,<=50K
2,50,Self-emp-not-inc,Bachelors,Male,13,Exec-managerial,<=50K
3,38,Private,HS-grad,Male,40,Handlers-cleaners,<=50K
4,53,Private,11th,Male,40,Handlers-cleaners,<=50K
...,...,...,...,...,...,...,...
32557,27,Private,Assoc-acdm,Female,38,Tech-support,<=50K
32558,40,Private,HS-grad,Male,40,Machine-op-inspct,>50K
32559,58,Private,HS-grad,Female,40,Adm-clerical,<=50K
32560,22,Private,HS-grad,Male,20,Adm-clerical,<=50K


In [5]:
data = data.drop(0)

# data['age'].value_counts()

In [6]:
data['age'] = data['age'].astype(int)

In [7]:
print("Original Dataset: \n", list(data.columns), "\n")

Original Dataset: 
 ['age', 'workclass', 'education', 'gender', 'hours-per-week', 'occupation', 'income'] 



In [8]:
data['income'] = data['income'].map({' <=50K':0, ' >50K':1})
data['income']

1        0
2        0
3        0
4        0
5        0
        ..
32557    0
32558    1
32559    0
32560    0
32561    1
Name: income, Length: 32561, dtype: int64

In [9]:
X = data.drop('income', axis = 1)
y = data['income']

print("X.shape: {} \ny.shape: {}".format(X.shape, y.shape))

X.shape: (32561, 6) 
y.shape: (32561,)


In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=0)

In [11]:
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
numerical_features = X_train.select_dtypes(exclude=['object']).columns.tolist()

categorical_features, numerical_features

(['workclass', 'education', 'gender', 'hours-per-week', 'occupation'], ['age'])

In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

preprocessador = ColumnTransformer(
    transformers=[
        ('categorical', OneHotEncoder(drop='first'), categorical_features),
        ('numerical', StandardScaler(), numerical_features)
    ]
)

In [13]:
from sklearn.pipeline import Pipeline
import xgboost as xgb

pipeline = Pipeline([
    ('preprocessador', preprocessador),
    ('classifier', xgb.XGBClassifier())
])

pipeline.fit(X_train, y_train)


In [14]:
print("Test score: {:.2f}".format(pipeline.score(X_test, y_test)))

Test score: 0.81


In [15]:
from sklearn.metrics import classification_report


print(classification_report(y_test, pipeline.predict(X_test)))

              precision    recall  f1-score   support

           0       0.84      0.92      0.88      4918
           1       0.65      0.47      0.55      1595

    accuracy                           0.81      6513
   macro avg       0.75      0.69      0.71      6513
weighted avg       0.80      0.81      0.80      6513

