In [None]:
import os
import re
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

## Carregando a base de dados

In [None]:
# Read the loan table from the CSV file (path is relative to the working directory).
DATASET_CSV = 'dataset/loan.csv'
data = pd.read_csv(DATASET_CSV)

In [None]:
# Preview the first 100 rows of the freshly loaded table.
data.head(100)

In [None]:
# Print the column summary (dtypes and non-null counts) of the loaded table.
data.info()

In [None]:
# Class distribution of the target column before balancing.
data['Loan_Status'].value_counts()

In [None]:
# Draw 200 rows whose Loan_Status is 'Y' as the first half of a balanced subset.
# A fixed seed makes the draw repeatable after a kernel restart.
data2 = data[data.Loan_Status == 'Y'].sample(n=200, random_state=42)

In [None]:
# Combine the 'Y' sample with 192 sampled 'N' rows to form the working data set.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
# The seed keeps the sampled rows (and their order) repeatable.
data = pd.concat(
    [data2, data[data.Loan_Status == 'N'].sample(n=192, random_state=42)]
)

In [None]:
# Class distribution of the target column after balancing.
data['Loan_Status'].value_counts()

## Checando Missing Values

In [None]:
# Count missing values per column on the working frame `data` — the frame that the
# imputation cells below modify — rather than on `data2`, which holds only the 'Y' sample.
data.isnull().sum()

Preenchendo Missing Values:

- `Dependents`: Assumindo o valor majoritário da coluna.
- `Self_Employed`: Assumindo o valor majoritário da coluna.
- `Loan_Amount_Term`: Preenchendo com o valor médio da coluna.
- `LoanAmount`: Preenchendo com o valor médio da coluna.
- `Credit_History`: Assumindo o valor majoritário da coluna.
- `Married`: Assumindo o valor majoritário da coluna.
- `Gender`: Assumindo o valor majoritário da coluna.

In [None]:
# Replace missing Gender entries with 'Male'; all other columns are left untouched.
data = data.fillna({'Gender': 'Male'})

In [None]:
# Fill missing Married entries with the most frequent value of the column, as the
# imputation notes above state, instead of a hard-coded 'No' that may not be the majority.
# Series.mode() ignores NaN and returns the modes sorted, so .iloc[0] is deterministic.
married_majority = data['Married'].mode().iloc[0]
data['Married'] = data['Married'].fillna(married_majority)

In [None]:
# Replace missing Dependents entries with the string '0'.
data = data.fillna({'Dependents': '0'})

In [None]:
# Replace missing Self_Employed entries with 'No'.
data = data.fillna({'Self_Employed': 'No'})

In [None]:
# Replace missing LoanAmount entries with the column mean (NaN is skipped by mean()).
loan_amount_mean = data['LoanAmount'].mean()
data['LoanAmount'] = data['LoanAmount'].fillna(loan_amount_mean)

In [None]:
# Replace missing Credit_History entries with 1.0.
data = data.fillna({'Credit_History': 1.0})

In [None]:
# Replace missing Loan_Amount_Term entries with the column mean.
loan_term_mean = data['Loan_Amount_Term'].mean()
data['Loan_Amount_Term'] = data['Loan_Amount_Term'].fillna(loan_term_mean)

In [None]:
# Value distribution of Credit_History after imputation.
data['Credit_History'].value_counts()

### Checando novamente Missing Values

In [None]:
# Per-column count of remaining missing values (isna is the alias of isnull).
data.isna().sum()

**Transformando dados categóricos**

Várias colunas do dataframe são categóricas e precisamos transformá-las em valores numéricos. São elas: `Gender`, `Married`, `Education`, `Self_Employed`, `Dependents` e `Loan_Status` (a coluna `Property_Area` é descartada mais adiante).

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Integer codes for each categorical column.
gender_values = {'Female': 0, 'Male': 1}
married_values = {'No': 0, 'Yes': 1}
education_values = {'Graduate': 0, 'Not Graduate': 1}
employed_values = {'No': 0, 'Yes': 1}
dependent_values = {'3+': 3, '0': 0, '2': 2, '1': 1}
loan_values = {'Y': 1, 'N': 0}

# Column name -> {original value: integer code}; values not listed are left as they are.
column_encodings = {
    'Gender': gender_values,
    'Married': married_values,
    'Education': education_values,
    'Self_Employed': employed_values,
    'Dependents': dependent_values,
    'Loan_Status': loan_values,
}
data = data.replace(column_encodings)

In [None]:
# Columns that are not used as model inputs.
unused_columns = [
    'Loan_ID',
    'CoapplicantIncome',
    'Loan_Amount_Term',
    'Credit_History',
    'Property_Area',
]
data = data.drop(columns=unused_columns)

In [None]:
# Preview the first rows after encoding and dropping columns.
data.head()

Selecionando o melhor classificador através de Pipeline e GridSearchCV

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
#from sklearn.externals import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Standardise the features, then classify with a random forest.
# The step names ('scl', 'clf') are what the 'clf__*' grid keys refer to.
random_forest_steps = [
    ('scl', StandardScaler()),
    ('clf', RandomForestClassifier()),
]
pipe_random_forest = Pipeline(steps=random_forest_steps)

In [None]:
# Standardise the features, then classify with a support vector machine.
svm_steps = [
    ('scl', StandardScaler()),
    ('clf', svm.SVC()),
]
pipe_svm = Pipeline(steps=svm_steps)

In [None]:
# Standardise the features, then classify with k-nearest neighbours.
knn_steps = [
    ('scl', StandardScaler()),
    ('clf', KNeighborsClassifier()),
]
pipe_knn = Pipeline(steps=knn_steps)

#### Valores para Grid

In [None]:
# Candidate integer values 1..10 shared by all hyper-parameter grids.
valores = list(range(1, 11))

In [None]:
# Random-forest search space; keys address the 'clf' step of the pipeline.
# min_samples_split skips the leading 1 (scikit-learn requires an integer >= 2).
grid_params_rf = [
    dict(
        clf__criterion=['gini', 'entropy'],
        clf__min_samples_leaf=valores,
        clf__max_depth=valores,
        clf__min_samples_split=valores[1:],
    )
]

In [None]:
# SVM search space: kernel type and regularisation strength C.
grid_params_svm = [
    dict(
        clf__kernel=['linear', 'rbf'],
        clf__C=valores,
    )
]

In [None]:
# KNN search space: number of neighbours.
grid_params_knn = [dict(clf__n_neighbors=valores)]

#### Construindo GridSearch

In [None]:
# Exhaustive search over the random-forest grid, scored by 10-fold CV accuracy.
gs_rf = GridSearchCV(pipe_random_forest, grid_params_rf, scoring='accuracy', cv=10)

In [None]:
# Exhaustive search over the SVM grid, scored by 10-fold CV accuracy.
gs_svm = GridSearchCV(pipe_svm, grid_params_svm, scoring='accuracy', cv=10)

In [None]:
# Exhaustive search over the KNN grid, scored by 10-fold CV accuracy.
gs_knn = GridSearchCV(pipe_knn, grid_params_knn, scoring='accuracy', cv=10)

In [None]:
# Separate the target column from the feature columns.
y = data.loc[:, 'Loan_Status']
X_train = data.drop(columns='Loan_Status')

#### Computando o GridSearch para Random Forest

In [None]:
# Run the random-forest grid search on the full feature matrix and target.
gs_rf.fit(X=X_train, y=y)

#### Melhores parâmetros e scoring

In [None]:
# Best hyper-parameters and mean cross-validated accuracy for the random forest.
rf_best_params = gs_rf.best_params_
rf_best_score = gs_rf.best_score_
print('Melhores parâmetros: %s' % (rf_best_params,))
print('Melhores Acurácia: %.3f' % (rf_best_score,))

#### Computando o GridSearch para SVM

In [None]:
# Run the SVM grid search on the full feature matrix and target.
gs_svm.fit(X=X_train, y=y)

#### Melhores parâmetros e scoring

In [None]:
# Best hyper-parameters and mean cross-validated accuracy for the SVM.
svm_best_params = gs_svm.best_params_
svm_best_score = gs_svm.best_score_
print('Melhores parâmetros: %s' % (svm_best_params,))
print('Melhores Acurácia: %.3f' % (svm_best_score,))

#### Computando o GridSearch para KNN

In [None]:
# Run the KNN grid search on the full feature matrix and target.
gs_knn.fit(X=X_train, y=y)

#### Melhores parâmetros e scoring

In [None]:
# Best hyper-parameters and mean cross-validated accuracy for KNN.
knn_best_params = gs_knn.best_params_
knn_best_score = gs_knn.best_score_
print('Melhores parâmetros: %s' % (knn_best_params,))
print('Melhores Acurácia: %.3f' % (knn_best_score,))

### Métricas de Validação ###

In [None]:
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [None]:
# Reproducible, class-stratified split used by the evaluation cells that follow.
# NOTE(review): gs_rf was already fitted on the full X_train / y above, so X_teste is
# NOT unseen data and metrics computed on it are optimistic. For an honest estimate,
# split before running the grid searches and fit them on X_treino / y_treino only.
X_treino, X_teste, y_treino, y_teste = train_test_split(
    X_train, y, random_state=42, stratify=y
)

In [None]:
# Cross-tabulate actual labels (rows) against predicted labels (columns), with totals.
rf_test_predictions = gs_rf.predict(X_teste)
confusion_table = pd.crosstab(
    y_teste,
    rf_test_predictions,
    rownames=['Real'],
    colnames=['Predito'],
    margins=True,
)
print(confusion_table, '')

In [None]:
# Per-class precision / recall / F1 summary for the tuned random forest.
rf_report = metrics.classification_report(y_teste, gs_rf.predict(X_teste))
print(rf_report)

### Persistindo o modelo de Machine Learning para o disco. ###

In [None]:
#from sklearn.externals import joblib

#### Persistindo o melhor modelo em disco.

In [None]:
import pickle

# Destination of the serialized grid-search object (path relative to the working directory).
filename = 'api/model/modelo.pkl'

# Make sure the target directory exists so open() does not fail on a fresh checkout.
os.makedirs(os.path.dirname(filename), exist_ok=True)

# pickle.dump returns None, so its result is not bound to `model`; the `with` block
# guarantees the file is flushed and closed before it is read back.
with open(filename, 'wb') as model_file:
    pickle.dump(gs_rf, model_file)

filename

In [None]:
# Read the persisted model back from disk, closing the file handle afterwards.
# pickle.load can execute arbitrary code: only load files written by this notebook.
with open(filename, 'rb') as model_file:
    model = pickle.load(model_file)

#### Carregando o modelo a partir do disco para a memória.

In [None]:
# Show the class labels and the base estimator exposed by the reloaded search object.
summary_template = "Atributos do Modelo:\n\nClasses:{}\nEstimator:{}"
print(summary_template.format(model.classes_, model.estimator))

**Verificando o Dataset final gerado.**

In [None]:
# Preview the feature matrix; its column order is the order expected in prediction inputs.
X_train.head()

**Teste de Classificação.**

In [None]:
# Single hand-written sample shaped (1, n_features) for a smoke-test prediction.
sample_values = [1, 1, 3, 0, 0, 9504, 275.0]
teste = np.array(sample_values).reshape(1, -1)

In [None]:
# Display the sample array built in the previous cell.
teste

In [None]:
# Predicted class label for the hand-written sample.
model.predict(X=teste)

**Probabilidades de Classes.**

In [None]:
# NOTE(review): this only constructs (and displays) a new, unfitted SVC with probability
# estimates enabled; the object is not assigned, so `model` is unaffected by this cell.
svm.SVC(probability=True)
#model.predict_proba(teste)

In [None]:
# Per-class probability estimates for the hand-written sample.
model.predict_proba(X=teste)