# Importar bibliotecas básicas

In [1]:
!pip install sklearn



In [2]:
!pip install scikit-learn



In [3]:
!pip install sklearn.preprocessing



In [4]:
!pip install cython



In [5]:
!pip install --upgrade pip



In [6]:
!pip install seaborn



In [7]:
!pip install plotly



In [8]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import sklearn

# Importar dados

In [9]:
# Load the raw loan dataset; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='data/raw/loan_data.csv')

In [10]:
# Peek at a handful of random rows instead of dumping most of the frame;
# a fixed `random_state` makes the preview identical on every run.
data.sample(10, random_state=0)

# Tratamento de dados

In [11]:
# Print each column's dtype and non-null count to spot columns with missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381 entries, 0 to 380
Data columns (total 13 columns):
Loan_ID              381 non-null object
Gender               376 non-null object
Married              381 non-null object
Dependents           373 non-null object
Education            381 non-null object
Self_Employed        360 non-null object
ApplicantIncome      381 non-null int64
CoapplicantIncome    381 non-null float64
LoanAmount           381 non-null float64
Loan_Amount_Term     370 non-null float64
Credit_History       351 non-null float64
Property_Area        381 non-null object
Loan_Status          381 non-null object
dtypes: float64(4), int64(1), object(8)
memory usage: 38.8+ KB


Como é possível ver abaixo, há diversos valores nulos. Algumas estratégias sugeridas para cada atributo seriam:
-    Gender -> Remoção
-    Dependents -> Remoção
-    Self_Employed -> Substituir por moda
-    Loan_Amount_Term -> Substituir por moda
-    Credit_History -> Substituir por moda

In [12]:
# Count the missing entries in every column.
data.isna().sum()

Loan_ID               0
Gender                5
Married               0
Dependents            8
Education             0
Self_Employed        21
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term     11
Credit_History       30
Property_Area         0
Loan_Status           0
dtype: int64

In [13]:
# Impute missing Credit_History entries with the column's most frequent value.
moda = data['Credit_History'].mode()
print(f"moda do histórico de crédito: {moda.iloc[0]}")
data['Credit_History'] = data['Credit_History'].fillna(value=moda.iloc[0])

moda do histórico de crédito: 1.0


In [14]:
# Impute missing Self_Employed entries with the column's most frequent value.
moda = data['Self_Employed'].mode()
# The label now names the column actually being reported (it previously
# repeated the credit-history label from the cell it was copied from).
print(f"moda de Self_Employed: {moda[0]}")
data['Self_Employed'] = data['Self_Employed'].fillna(moda[0])

moda do histórico de crédito: No


In [15]:
# Impute missing Loan_Amount_Term entries with the column's most frequent value.
moda = data['Loan_Amount_Term'].mode()
# The label now names the column actually being reported (it previously
# repeated the credit-history label from the cell it was copied from).
print(f"moda de Loan_Amount_Term: {moda[0]}")
data['Loan_Amount_Term'] = data['Loan_Amount_Term'].fillna(moda[0])

moda do histórico de crédito: 360.0


In [16]:
# Remove every row that still contains a missing value in any column, mutating
# `data` in place. With the mode imputations already applied to Credit_History,
# Self_Employed and Loan_Amount_Term, the rows removed here are those missing
# Gender or Dependents.
# NOTE(review): the in-place mutation makes the earlier displays of `data` stale;
# consider a separately named cleaned frame if the raw rows are needed later.
data.dropna(inplace=True)

In [17]:
# Re-inspect dtypes and non-null counts after imputation and row removal.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 368 entries, 0 to 380
Data columns (total 13 columns):
Loan_ID              368 non-null object
Gender               368 non-null object
Married              368 non-null object
Dependents           368 non-null object
Education            368 non-null object
Self_Employed        368 non-null object
ApplicantIncome      368 non-null int64
CoapplicantIncome    368 non-null float64
LoanAmount           368 non-null float64
Loan_Amount_Term     368 non-null float64
Credit_History       368 non-null float64
Property_Area        368 non-null object
Loan_Status          368 non-null object
dtypes: float64(4), int64(1), object(8)
memory usage: 40.2+ KB


# LabelEncoder

In [18]:
from sklearn.preprocessing import LabelEncoder

# String-valued columns that are converted to integer codes.
# NOTE(review): Loan_ID looks like a per-row identifier; encoding it here means
# it stays in the predictor matrix built later. Confirm that is intended,
# otherwise drop the column instead of encoding it.
atributos_para_encode = ['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
                         'Self_Employed', 'Property_Area', 'Loan_Status']

# Keep every fitted encoder so the integer codes can be mapped back to the
# original labels later via `encoders[coluna].inverse_transform(...)`
# (previously each encoder was overwritten and lost on the next iteration).
encoders = {}
for atributo in atributos_para_encode:
    le = LabelEncoder()
    data[atributo] = le.fit_transform(data[atributo].values)
    encoders[atributo] = le

# A bare `data.head()` used to sit here; since it was not the last expression
# of the cell its result was never displayed, so it has been removed.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 368 entries, 0 to 380
Data columns (total 13 columns):
Loan_ID              368 non-null int64
Gender               368 non-null int64
Married              368 non-null int64
Dependents           368 non-null int64
Education            368 non-null int64
Self_Employed        368 non-null int64
ApplicantIncome      368 non-null int64
CoapplicantIncome    368 non-null float64
LoanAmount           368 non-null float64
Loan_Amount_Term     368 non-null float64
Credit_History       368 non-null float64
Property_Area        368 non-null int64
Loan_Status          368 non-null int64
dtypes: float64(4), int64(9)
memory usage: 40.2 KB


# Divisão entre previsores e classe

In [19]:
# Target vector: the encoded Loan_Status column as a NumPy array.
Y = data['Loan_Status'].values

# Predictor matrix: every remaining column once the target has been removed.
data = data.drop('Loan_Status', axis=1)
X = data.values


# Normalização

In [21]:
from sklearn.preprocessing import MinMaxScaler

# Learn each predictor column's min/max from X, then rescale X with them
# (MinMaxScaler's default output range is [0, 1]).
# NOTE(review): the scaler is fitted on all rows; if a train/test split happens
# later, fit on the training portion only to avoid leakage.
minmax_scaler = MinMaxScaler()
minmax_scaler.fit(X)
x_norm = minmax_scaler.transform(X)

# Wrap the scaled array in a DataFrame for inspection.
# NOTE(review): built from a bare array, so the original column names and row
# index are replaced by default integer labels.
data = pd.DataFrame(data=x_norm)
data.head()

# PCA - redução de dimensões