In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib notebook
import plotly.express as px


In [4]:
base_credt = pd.read_csv("credit_data.csv")

In [5]:
base_credt.head()

Unnamed: 0,clientid,income,age,loan,default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.64226,0
4,5,66952.688845,18.584336,8770.099235,1


In [6]:
base_credt.describe()

Unnamed: 0,clientid,income,age,loan,default
count,2000.0,2000.0,1997.0,2000.0,2000.0
mean,1000.5,45331.600018,40.807559,4444.369695,0.1415
std,577.494589,14326.327119,13.624469,3045.410024,0.348624
min,1.0,20014.48947,-52.42328,1.37763,0.0
25%,500.75,32796.459717,28.990415,1939.708847,0.0
50%,1000.5,45789.117313,41.317159,3974.719419,0.0
75%,1500.25,57791.281668,52.58704,6432.410625,0.0
max,2000.0,69995.685578,63.971796,13766.051239,1.0


In [7]:
np.unique(base_credt["default"], return_counts = True) #Olha os dados unicos de deteerminada coluna

(array([0, 1], dtype=int64), array([1717,  283], dtype=int64))

## Tratando valores inconsitentes

In [8]:
base_credt.loc[base_credt['age'] < 0  ]

Unnamed: 0,clientid,income,age,loan,default
15,16,50501.726689,-28.218361,3977.287432,0
21,22,32197.620701,-52.42328,4244.057136,0
26,27,63287.038908,-36.496976,9595.286289,0


In [9]:
## Apagando coluna inteira
credit2 = base_credt.drop('age', axis = 1)
credit2

Unnamed: 0,clientid,income,loan,default
0,1,66155.925095,8106.532131,0
1,2,34415.153966,6564.745018,0
2,3,57317.170063,8020.953296,0
3,4,42709.534201,6103.642260,0
4,5,66952.688845,8770.099235,1
...,...,...,...,...
1995,1996,59221.044874,1926.729397,0
1996,1997,69516.127573,3503.176156,0
1997,1998,44311.449262,5522.786693,1
1998,1999,43756.056605,1622.722598,0


In [10]:
## Apagando registros com valores incosistentes da coluna Age 
credit3 = base_credt.drop(base_credt[base_credt['age'] < 0].index)
credit3

Unnamed: 0,clientid,income,age,loan,default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.642260,0
4,5,66952.688845,18.584336,8770.099235,1
...,...,...,...,...,...
1995,1996,59221.044874,48.518179,1926.729397,0
1996,1997,69516.127573,23.162104,3503.176156,0
1997,1998,44311.449262,28.017167,5522.786693,1
1998,1999,43756.056605,63.971796,1622.722598,0


In [11]:
credit3.loc[credit3['age'] < 0]

Unnamed: 0,clientid,income,age,loan,default


In [12]:
## Preencher valores faltantes com a media 
base_credt.mean()

clientid     1000.500000
income      45331.600018
age            40.807559
loan         4444.369695
default         0.141500
dtype: float64

In [13]:
base_credt['age'][base_credt['age'] > 0].mean()

40.92770044906149

In [14]:
base_credt.loc[base_credt['age'] < 0] = 40.92

In [15]:
base_credt.loc[base_credt['age'] < 0]

Unnamed: 0,clientid,income,age,loan,default


 ## Tratando valores faltantes

In [16]:
base_credt.isnull().sum()

clientid    0
income      0
age         3
loan        0
default     0
dtype: int64

In [17]:
base_credt.loc[pd.isnull(base_credt['age'])]

Unnamed: 0,clientid,income,age,loan,default
28,29.0,59417.805406,,2082.625938,0.0
30,31.0,48528.852796,,6155.78467,0.0
31,32.0,23526.302555,,2862.010139,0.0


In [18]:
base_credt['age'].fillna(base_credt['age'].mean(), inplace = True)

In [19]:
base_credt.loc[pd.isnull(base_credt['age'])]


Unnamed: 0,clientid,income,age,loan,default


##### Divisão entre previsores e classe

In [20]:
Xcredit = base_credt.iloc[:,1:4].values
Xcredit

array([[6.61559251e+04, 5.90170151e+01, 8.10653213e+03],
       [3.44151540e+04, 4.81171531e+01, 6.56474502e+03],
       [5.73171701e+04, 6.31080495e+01, 8.02095330e+03],
       ...,
       [4.43114493e+04, 2.80171669e+01, 5.52278669e+03],
       [4.37560566e+04, 6.39717958e+01, 1.62272260e+03],
       [6.94365796e+04, 5.61526170e+01, 7.37883360e+03]])

In [21]:
Ycredit = base_credt.iloc[:, 4].values
Ycredit

array([0., 0., 0., ..., 1., 0., 0.])

##### Escalonamento de valores

In [22]:
from sklearn.preprocessing import StandardScaler
scaler_credit = StandardScaler()

In [23]:
Xcredit = scaler_credit.fit_transform(Xcredit)

In [24]:
Xcredit

array([[ 1.44913211,  1.36538093,  1.2047111 ],
       [-0.75194963,  0.5426602 ,  0.69874452],
       [ 0.83620364,  1.67417189,  1.17662679],
       ...,
       [-0.06568543, -0.97448519,  0.35680621],
       [-0.10419945,  1.73936739, -0.92307353],
       [ 1.67663095,  1.14917639,  0.96590308]])

#### Divisão de dados para treino e testes

In [25]:
from sklearn.model_selection import train_test_split

In [30]:
Xcredito_trein, Xcredito_teste, Ycredtio_trein, Ycredtio_teste = train_test_split(Xcredit, Ycredit, test_size = 0.25, random_state = 0)

In [31]:
Xcredito_teste.shape

(500, 3)

##### Salvando variaveis

In [32]:
import pickle

In [33]:
 with open ('credit.pkl', mode = 'wb') as f:
        pickle.dump([Xcredito_trein, Xcredito_teste, Ycredtio_trein, Ycredtio_teste], f)