## Credit Classifier

## Passo 1. Setup do projeto

In [25]:
import joblib
import os
import urllib

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn 

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.metrics import confusion_matrix, classification_report

print('Matplot version : {}'.format(matplotlib.__version__))
print('Numpy version : {}'.format(np.__version__))
print('Pandas version : {}'.format(pd.__version__))
print('Sklearn version : {}'.format(sklearn.__version__))

# pacotes incluidos.
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

CREDIT_ROOT = 'http://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/'
CREDIT_URL  = CREDIT_ROOT + 'crx.data'

DATA_PATH = os.path.join('..', 'data', 'raw')
DATA_FILE = 'wine.csv'
CREDIT_DATA = os.path.join(DATA_PATH, DATA_FILE)

Matplot version : 3.3.2
Numpy version : 1.19.2
Pandas version : 1.1.3
Sklearn version : 0.23.2


## Passo 2. Recuperação dos dados

In [13]:
def download_data(data_url, data_path, data_file):
    os.makedirs(data_path, exist_ok=True)
    local_path = os.path.join(data_path, data_file)
    urllib.request.urlretrieve(data_url, local_path)

download_data(CREDIT_URL, DATA_PATH, DATA_FILE)

def load_data(data_path, data_file):
    local_path = os.path.join(data_path, data_file)
    return pd.read_csv(local_path, header=None, names=['A1', 'A2', 'A3', 
                                                        'A4', 'A5', 'A6',
                                                        'A7', 'A8', 'A9',
                                                        'A10', 'A11', 'A12', 'A13',
                                                        'A14', 'A15', 'A16'])
credit_data = load_data(DATA_PATH, DATA_FILE)

## Passo 3. Análise Exploratória de Dados

In [14]:
nRow, nCol = credit_data.shape
print(f'Existe um total de {nRow} linhas e {nCol} colunas')

Existe um total de 690 linhas e 16 colunas


In [15]:
credit_data.head(5)

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [29]:
credit_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A1      678 non-null    object 
 1   A2      678 non-null    object 
 2   A3      690 non-null    float64
 3   A4      684 non-null    object 
 4   A5      684 non-null    object 
 5   A6      681 non-null    object 
 6   A7      681 non-null    object 
 7   A8      690 non-null    float64
 8   A9      690 non-null    object 
 9   A10     690 non-null    object 
 10  A11     690 non-null    int64  
 11  A12     690 non-null    object 
 12  A13     690 non-null    object 
 13  A14     677 non-null    object 
 14  A15     690 non-null    int64  
 15  A16     690 non-null    object 
dtypes: float64(2), int64(2), object(12)
memory usage: 86.4+ KB


In [31]:
# Verifica por dados ausentes
credit_data.isnull().sum()

A1     12
A2     12
A3      0
A4      6
A5      6
A6      9
A7      9
A8      0
A9      0
A10     0
A11     0
A12     0
A13     0
A14    13
A15     0
A16     0
dtype: int64

## Passo 4. Preparação de dados

In [35]:
# Troca os '?' por NaN
credit_data = credit_data.replace('?', np.NaN)

# Imputa os valores ausentes com a média
credit_data.fillna(credit_data.mean(), inplace=True)

# Verifica por dados ausentes
credit_data.isnull().sum()

A1     12
A2     12
A3      0
A4      6
A5      6
A6      9
A7      9
A8      0
A9      0
A10     0
A11     0
A12     0
A13     0
A14    13
A15     0
A16     0
dtype: int64

In [36]:
# Percorre cada coluna de credit_data
for col in credit_data.columns:
    # Verifica se a coluna tem o tipo object
    if credit_data[col].dtype == 'object':
        # Coloca o valor mais frequente
        credit_data = credit_data.fillna(credit_data[col].value_counts().index[0])

# Verifica por dados ausentes
credit_data.isnull().sum()

A1     0
A2     0
A3     0
A4     0
A5     0
A6     0
A7     0
A8     0
A9     0
A10    0
A11    0
A12    0
A13    0
A14    0
A15    0
A16    0
dtype: int64

## Passo 5. Transformação de dados