In [1]:
#Bibliotécas
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [2]:
#DataFrame
headers = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'salary']

df = pd.read_csv('adult.data.csv', names=headers)

In [3]:
#Tratamento de dados faltantes para o mais representatívo
columns = df.columns
for i in columns:
    missing = df[i].isin([' ?']).sum()
    df[i] = df[i].replace(' ?', np.NaN)
df = df.apply(lambda x:x.fillna(x.value_counts().index[0]))

In [4]:
#Divisão em Parâmetro e Classe
X_df = df.iloc[:, 0:13].values
y_df = df.iloc[:, 14].values

In [5]:
#LabelEncoder
def labelencoder(pd_serie):
    labelencoder = LabelEncoder()
    pd_serie = labelencoder.fit_transform(pd_serie)
    return pd_serie

X_df[:, 1] = labelencoder(X_df[:, 1])
X_df[:, 3] = labelencoder(X_df[:, 3])
X_df[:, 5] = labelencoder(X_df[:, 5])
X_df[:, 6] = labelencoder(X_df[:, 6])
X_df[:, 7] = labelencoder(X_df[:, 7])
X_df[:, 8] = labelencoder(X_df[:, 8])
X_df[:, 9] = labelencoder(X_df[:, 9])
y_df = labelencoder(y_df)

In [6]:
#One Hot Encoder
onehotencorder = ColumnTransformer(transformers=[("OneHot", OneHotEncoder(), [1,3,5,6,7,8,9])], remainder='passthrough')
X_df = onehotencorder.fit_transform(X_df).toarray()

In [7]:
#Standard Scaler
scaler = StandardScaler()
X_df = scaler.fit_transform(X_df)

In [8]:
#Divisão df de treinamento e teste 15%
X_df_train, X_df_test, y_df_train, y_df_test =  train_test_split(X_df, y_df, test_size=0.15, random_state=0)
print(X_df_train.shape, X_df_test.shape, y_df_train.shape, y_df_test.shape)

(27676, 64) (4885, 64) (27676,) (4885,)


In [9]:
#treinamento Regressão Logistica
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(multi_class='ovr', solver='liblinear', random_state=1)
lr.fit(X_df_train, y_df_train)

LogisticRegression(multi_class='ovr', random_state=1, solver='liblinear')

In [10]:
predict_lr = lr.predict(X_df_test)

In [11]:
accuracy_score(y_df_test, predict_lr)

0.8483111566018424

In [12]:
confusion_matrix(y_df_test, predict_lr)

array([[3433,  260],
       [ 481,  711]], dtype=int64)

In [13]:
print(classification_report(y_df_test, predict_lr))

              precision    recall  f1-score   support

           0       0.88      0.93      0.90      3693
           1       0.73      0.60      0.66      1192

    accuracy                           0.85      4885
   macro avg       0.80      0.76      0.78      4885
weighted avg       0.84      0.85      0.84      4885



In [14]:
lr_hold = lr.fit(X_df, y_df)

In [15]:
#treinamento SVM
from sklearn.svm import SVC
svm = SVC(kernel='linear', random_state=1, C=1)
svm.fit(X_df_train, y_df_train)

SVC(C=1, kernel='linear', random_state=1)

In [16]:
predict_svm = svm.predict(X_df_test)

In [17]:
accuracy_score(y_df_test, predict_svm)

0.8448311156601842

In [18]:
confusion_matrix(y_df_test, predict_svm)

array([[3453,  240],
       [ 518,  674]], dtype=int64)

In [19]:
print(classification_report(y_df_test, predict_svm))

              precision    recall  f1-score   support

           0       0.87      0.94      0.90      3693
           1       0.74      0.57      0.64      1192

    accuracy                           0.84      4885
   macro avg       0.80      0.75      0.77      4885
weighted avg       0.84      0.84      0.84      4885



In [20]:
svm_hold = svm.fit(X_df, y_df)

In [21]:
#treinamento Naive Bayes
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB(priors=None, var_smoothing=1e-09)
nb.fit(X_df_train, y_df_train)

GaussianNB()

In [22]:
predict_nb = nb.predict(X_df_test)

In [23]:
accuracy_score(y_df_test, predict_nb)

0.5733879222108496

In [24]:
confusion_matrix(y_df_test, predict_nb)

array([[1668, 2025],
       [  59, 1133]], dtype=int64)

In [25]:
print(classification_report(y_df_test, predict_nb))

              precision    recall  f1-score   support

           0       0.97      0.45      0.62      3693
           1       0.36      0.95      0.52      1192

    accuracy                           0.57      4885
   macro avg       0.66      0.70      0.57      4885
weighted avg       0.82      0.57      0.59      4885



In [26]:
nb_hold = nb.fit(X_df, y_df)

In [27]:
#DataFrame
headers = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'salary']

df = pd.read_csv('adult.test.csv', names=headers)

#Tratamento de dados faltantes para o mais representatívo
columns = df.columns
for i in columns:
    missing = df[i].isin([' ?']).sum()
    df[i] = df[i].replace(' ?', np.NaN)
df = df.apply(lambda x:x.fillna(x.value_counts().index[0]))

#Divisão em Parâmetro e Classe
X_df = df.iloc[:, 0:13].values
y_df = df.iloc[:, 14].values

#LabelEncoder
def labelencoder(pd_serie):
    labelencoder = LabelEncoder()
    pd_serie = labelencoder.fit_transform(pd_serie)
    return pd_serie

X_df[:, 1] = labelencoder(X_df[:, 1])
X_df[:, 3] = labelencoder(X_df[:, 3])
X_df[:, 5] = labelencoder(X_df[:, 5])
X_df[:, 6] = labelencoder(X_df[:, 6])
X_df[:, 7] = labelencoder(X_df[:, 7])
X_df[:, 8] = labelencoder(X_df[:, 8])
X_df[:, 9] = labelencoder(X_df[:, 9])
y_df = labelencoder(y_df)

#One Hot Encoder
onehotencorder = ColumnTransformer(transformers=[("OneHot", OneHotEncoder(), [1,3,5,6,7,8,9])], remainder='passthrough')
X_df = onehotencorder.fit_transform(X_df).toarray()

#Standard Scaler
scaler = StandardScaler()
X_df = scaler.fit_transform(X_df)

In [31]:
#previsão Regressão Logistica
predict_lr_hold = lr_hold.predict(X_df)

In [32]:
accuracy_score(y_df, predict_lr_hold)

0.8503777409250046

In [33]:
confusion_matrix(y_df, predict_lr_hold)

array([[11572,   863],
       [ 1573,  2273]], dtype=int64)

In [34]:
print(classification_report(y_df, predict_lr_hold))

              precision    recall  f1-score   support

           0       0.88      0.93      0.90     12435
           1       0.72      0.59      0.65      3846

    accuracy                           0.85     16281
   macro avg       0.80      0.76      0.78     16281
weighted avg       0.84      0.85      0.84     16281



In [35]:
#previsão SVM
predict_svm_hold = svm_hold.predict(X_df)

In [36]:
accuracy_score(y_df, predict_svm_hold)

0.8497635280388183

In [37]:
confusion_matrix(y_df, predict_svm_hold)

array([[11654,   781],
       [ 1665,  2181]], dtype=int64)

In [38]:
print(classification_report(y_df, predict_svm_hold))

              precision    recall  f1-score   support

           0       0.87      0.94      0.91     12435
           1       0.74      0.57      0.64      3846

    accuracy                           0.85     16281
   macro avg       0.81      0.75      0.77     16281
weighted avg       0.84      0.85      0.84     16281



In [39]:
#previsão NB
predict_nb_hold = nb_hold.predict(X_df)

In [40]:
accuracy_score(y_df, predict_nb_hold)

0.7637737239727289

In [41]:
confusion_matrix(y_df, predict_nb_hold)

array([[12435,     0],
       [ 3846,     0]], dtype=int64)

In [42]:
print(classification_report(y_df, predict_nb_hold))

              precision    recall  f1-score   support

           0       0.76      1.00      0.87     12435
           1       0.00      0.00      0.00      3846

    accuracy                           0.76     16281
   macro avg       0.38      0.50      0.43     16281
weighted avg       0.58      0.76      0.66     16281

  _warn_prf(average, modifier, msg_start, len(result))
