In [1]:
import pandas as pd

In [2]:
# Load the credit dataset into `base` and preview its first five rows.
base = pd.read_csv("credit_data.csv")
base.head(n=5)

Unnamed: 0,clientid,income,age,loan,default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.64226,0
4,5,66952.688845,18.584336,8770.099235,1


In [3]:
# Summary statistics for every numeric column (quartiles spelled out explicitly;
# these are the same percentiles describe() reports by default).
base.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,clientid,income,age,loan,default
count,2000.0,2000.0,1997.0,2000.0,2000.0
mean,1000.5,45331.600018,40.807559,4444.369695,0.1415
std,577.494589,14326.327119,13.624469,3045.410024,0.348624
min,1.0,20014.48947,-52.42328,1.37763,0.0
25%,500.75,32796.459717,28.990415,1939.708847,0.0
50%,1000.5,45789.117313,41.317159,3974.719419,0.0
75%,1500.25,57791.281668,52.58704,6432.410625,0.0
max,2000.0,69995.685578,63.971796,13766.051239,1.0


### Filtrar registros com idades negativas

In [4]:
# Show the records whose age is negative (inconsistent data).
base[base["age"].lt(0)]

Unnamed: 0,clientid,income,age,loan,default
15,16,50501.726689,-28.218361,3977.287432,0
21,22,32197.620701,-52.42328,4244.057136,0
26,27,63287.038908,-36.496976,9595.286289,0


### Estratégias para deixar os dados consistentes

In [5]:
# Index labels of the records whose age is negative.
base.loc[base["age"].lt(0)].index

Int64Index([15, 21, 26], dtype='int64')

In [6]:
# Possible strategies for the negative ages:
#   1. drop the whole column:
#        base.drop(columns="age", inplace=True)
#   2. drop only the offending records:
#        base.drop(base[base["age"] < 0].index, inplace=True)
#   3. contact the clients and correct the data at the source
#   4. replace the offending values with the mean age (the option used below)

# Column means of the raw data. The "age" mean shown here is still skewed,
# because the negative ages take part in the calculation.
print(base.mean(), "\n")

# Mean age computed from the strictly positive ages only.
m = base.loc[base["age"] > 0, "age"].mean()
print(m)

# Overwrite every negative age with that mean.
base.loc[base["age"].lt(0), "age"] = m

clientid     1000.500000
income      45331.600018
age            40.807559
loan         4444.369695
default         0.141500
dtype: float64 

40.92770044906149


### Procurar valores faltantes

In [7]:
# Show the records whose age is missing (NaN).
base[base["age"].isna()]

Unnamed: 0,clientid,income,age,loan,default
28,29,59417.805406,,2082.625938,0
30,31,48528.852796,,6155.78467,0
31,32,23526.302555,,2862.010139,0


### Atributos previsores e classe

In [8]:
# Predictor attributes: all rows of the columns at positions 1, 2 and 3
# (income, age and loan), selected here by name.
previsores = base[["income", "age", "loan"]]
previsores.head(n=5)

Unnamed: 0,income,age,loan
0,66155.925095,59.017015,8106.532131
1,34415.153966,48.117153,6564.745018
2,57317.170063,63.108049,8020.953296
3,42709.534201,45.751972,6103.64226
4,66952.688845,18.584336,8770.099235


In [9]:
# Class attribute: all rows of the column at position 4 ("default").
# `.values` hands back an array, so `classe` is no longer a pandas object.
classe = base["default"].values
classe

array([0, 0, 0, ..., 1, 0, 0])

### Pré-processamento com scikit-learn

In [10]:
from sklearn.impute import SimpleImputer

In [11]:
# Fit a SimpleImputer (default settings) on the predictors and replace the
# predictors with the imputed array in a single step.
imputer = SimpleImputer()
previsores = imputer.fit_transform(previsores)
previsores

array([[6.61559251e+04, 5.90170151e+01, 8.10653213e+03],
       [3.44151540e+04, 4.81171531e+01, 6.56474502e+03],
       [5.73171701e+04, 6.31080495e+01, 8.02095330e+03],
       ...,
       [4.43114493e+04, 2.80171669e+01, 5.52278669e+03],
       [4.37560566e+04, 6.39717958e+01, 1.62272260e+03],
       [6.94365796e+04, 5.61526170e+01, 7.37883360e+03]])

### Escalonamento de atributos

Padronização (Standardisation)

$$x = \frac{x - \text{média}(x)}{\text{desvio padrão}(x)}$$

Normalização (Normalization)

$$x = \frac{x - \text{mínimo}(x)}{\text{máximo}(x) - \text{mínimo}(x)}$$

In [12]:
from sklearn.preprocessing import StandardScaler

In [13]:
# Standardise the predictors: learn the scaling parameters from the data,
# then apply them to that same data.
scaler = StandardScaler().fit(previsores)
previsores = scaler.transform(previsores)
previsores

array([[ 1.45393393,  1.36538005,  1.20281942],
       [-0.76217555,  0.54265932,  0.69642695],
       [ 0.83682073,  1.67417101,  1.17471147],
       ...,
       [-0.07122592, -0.97448606,  0.35420081],
       [-0.11000289,  1.73936652, -0.92675625],
       [ 1.682986  ,  1.14917551,  0.96381038]])

### Base de dados do censo

[UC Irvine Machine Learning Repository](https://archive.ics.uci.edu/ml/index.php)

In [14]:
# Load the census dataset and preview its first five rows.
# NOTE: this rebinds `base`, which previously held the credit dataset.
base = pd.read_csv("census.csv")
base.head(n=5)

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


### Atributos previsores e classes

In [15]:
# Predictor attributes: all rows of the first 14 columns (positions 0-13),
# extracted as an array.
previsores = base.iloc[:, 0:14].values
previsores

array([[39, ' State-gov', 77516, ..., 0, 40, ' United-States'],
       [50, ' Self-emp-not-inc', 83311, ..., 0, 13, ' United-States'],
       [38, ' Private', 215646, ..., 0, 40, ' United-States'],
       ...,
       [58, ' Private', 151910, ..., 0, 40, ' United-States'],
       [22, ' Private', 201490, ..., 0, 20, ' United-States'],
       [52, ' Self-emp-inc', 287927, ..., 0, 40, ' United-States']],
      dtype=object)

In [16]:
# Class attribute: all rows of the column at position 14 ("income"),
# extracted as an array.
classe = base["income"].values
classe

array([' <=50K', ' <=50K', ' <=50K', ..., ' <=50K', ' <=50K', ' >50K'],
      dtype=object)

### Transformação Variáveis Categóricas

In [17]:
from sklearn.preprocessing import LabelEncoder

O `LabelEncoder` transforma variáveis categóricas em códigos numéricos inteiros (um código por categoria).

In [18]:
# Integer-encode the categorical predictor columns of the census data.
#
# One LabelEncoder is fitted per column and stored in `encoders_by_column`
# (keyed by column position). Refitting a single shared encoder for every
# column would keep only the last column's mapping, so the codes of the other
# columns could not be decoded or applied consistently to new rows.
categorical_columns = [1, 3, 5, 6, 7, 8, 9, 13]
encoders_by_column = {}
for i in categorical_columns:
    labelencoder_previsores = LabelEncoder()
    previsores[:, i] = labelencoder_previsores.fit_transform(previsores[:, i])
    encoders_by_column[i] = labelencoder_previsores
# After the loop `labelencoder_previsores` is still a fitted encoder for the
# last categorical column, exactly as it was with the shared-encoder version.
previsores

array([[39, 7, 77516, ..., 0, 40, 39],
       [50, 6, 83311, ..., 0, 13, 39],
       [38, 4, 215646, ..., 0, 40, 39],
       ...,
       [58, 4, 151910, ..., 0, 40, 39],
       [22, 4, 201490, ..., 0, 20, 39],
       [52, 5, 287927, ..., 0, 40, 39]], dtype=object)

In [19]:
from sklearn.preprocessing import OneHotEncoder