# Seleção de características para aprovação de crédito

In [None]:
import time
import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

from csv import reader
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler, PolynomialFeatures

In [None]:
# Fix the seed of NumPy's global random number generator
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [None]:
# Show every column when a DataFrame is displayed
pd.set_option('display.max_columns', None)

### 1. Explorando o dataset

In [None]:
# Load the dataset (comma-separated, read as Latin-1) and preview its first 10 rows
df = pd.read_csv("datasets/application_data.csv", sep=",", encoding="latin-1")
df.head(10)

In [None]:
# Summary statistics of the dataset as computed by DataFrame.describe()
df.describe()

In [None]:
# Number of rows per distinct value of NAME_CONTRACT_TYPE
df["NAME_CONTRACT_TYPE"].value_counts()

In [None]:
# Number of rows per distinct value of CODE_GENDER
df["CODE_GENDER"].value_counts()

In [None]:
# Number of rows per distinct value of FLAG_OWN_CAR
df["FLAG_OWN_CAR"].value_counts()

In [None]:
# Number of rows per distinct value of FLAG_OWN_REALTY
df["FLAG_OWN_REALTY"].value_counts()

### 2. Limpando o dataset

#### 2.1 Substituindo valores NaN

In [None]:
# Names of every column that holds at least one missing value, in column order
columns_with_null_values = [name for name in df if df[name].isnull().any()]

len(columns_with_null_values)

In [None]:
# Missing-value count of each affected column, largest first
df[columns_with_null_values].isnull().sum().sort_values(ascending = False)

In [None]:
# Split the columns that have missing values: a column whose missing count
# exceeds half the number of rows is marked for removal, every other one is
# kept for imputation.
half_of_rows = len(df) / 2
null_counts = df[columns_with_null_values].isnull().sum()

drop_columns = [
    name for name in columns_with_null_values if null_counts[name] > half_of_rows
]
columns_to_analyze = [
    name for name in columns_with_null_values if null_counts[name] <= half_of_rows
]

len(drop_columns)

In [None]:
# Remove the columns collected in drop_columns and report the column
# count before the drop, the number dropped, and the count afterwards.
total_before_drop = df.shape[1]
df = df.drop(columns=drop_columns)
total_after_drop = df.shape[1]

print("{0} - {1} = {2}".format(total_before_drop, len(drop_columns), total_after_drop))

In [None]:
# Missing-value count of each column kept for imputation, largest first
df[columns_to_analyze].isnull().sum().sort_values(ascending = False)

In [None]:
# Summary statistics (DataFrame.describe) of the columns kept for imputation
df[columns_to_analyze].describe()

In [None]:
# Columns whose names end in "MODE" and in "MEDI", respectively
mode_list = [name for name in columns_to_analyze if name.endswith("MODE")]
median_list = [name for name in columns_to_analyze if name.endswith("MEDI")]

In [None]:
# MODE
# EMERGENCYSTATE_MODE is taken out of the list, so it is not mode-filled here
mode_list.remove('EMERGENCYSTATE_MODE')

for name in mode_list:
    # Series.mode() can return several values; the first one is used
    most_frequent = df[name].mode()[0]
    df[name] = df[name].fillna(most_frequent)

    # The column is handled, so take it off the pending list
    columns_to_analyze.remove(name)

In [None]:
# MEDIAN
for name in median_list:
    middle_value = df[name].median()
    df[name] = df[name].fillna(middle_value)

    # The column is handled, so take it off the pending list
    columns_to_analyze.remove(name)

In [None]:
# Columns to fill with the mean: names ending in "AVG", names starting with
# "AMT" or "EXT", and any column with fewer than 1050 missing values.
avg_list = [
    name
    for name in columns_to_analyze
    if name.endswith("AVG")
    or name.startswith(("AMT", "EXT"))
    or df[name].isnull().sum() < 1050
]

In [None]:
# MEAN
for name in avg_list:
    average = df[name].mean()
    df[name] = df[name].fillna(average)
    # The column is handled, so take it off the pending list
    columns_to_analyze.remove(name)

In [None]:
# Missing-value count of the columns still on the pending list, largest first
df[columns_to_analyze].isnull().sum().sort_values(ascending = False)

In [None]:
# Fill the missing values of three columns with fixed placeholder labels
placeholders = {
    'NAME_TYPE_SUITE': 'Unaccompanied',
    'EMERGENCYSTATE_MODE': 'Undefined',
    'OCCUPATION_TYPE': 'Other',
}
for name, label in placeholders.items():
    df[name] = df[name].fillna(label)

In [None]:
# Per-column flag: True when the column still contains a missing value
df.isnull().any()

#### 2.2 Removendo colunas irrelevantes

In [None]:
# Columns to discard: every name starting with "EXT" or with "FLAG_DOC"
remove_data = [name for name in df.columns if name.startswith(('EXT', 'FLAG_DOC'))]

len(remove_data)

In [None]:
# Drop the columns collected in remove_data
df = df.drop(columns=remove_data)

#### 2.3 Convertendo categóricas

In [None]:
# Replace the categorical columns with indicator (dummy) columns via
# pd.get_dummies, then preview the resulting frame
df = pd.get_dummies(df)
df.head()

### 3. Dividindo conjuntos de teste e treinamento

In [None]:
# Hold out 20% of the rows as the test set, using the fixed seed for the shuffle
train_set, test_set = train_test_split(
    df,
    test_size=0.2,
    random_state=RANDOM_SEED,
)
print("{} train + {} test".format(train_set.shape[0], test_set.shape[0]))

### 4. Correlação entre as features

In [None]:
def correlation_color(value):
    """Return the CSS color declaration used to style one correlation value.

    The first matching rule wins: exactly 1 is gold, an absolute value above
    0.75 is royalblue, any other negative value is red, and everything else
    is green.
    """
    shade = (
        'gold' if value == 1
        else 'royalblue' if abs(value) > 0.75
        else 'red' if value < 0
        else 'green'
    )
    return 'color: %s' % shade

In [None]:
# Show every row when a DataFrame is displayed
pd.set_option('display.max_rows', None)
# Pairwise correlations of the training-set columns, with each cell's text
# colored by correlation_color
correlation = train_set.corr().style.applymap(correlation_color)
correlation

### 4. Separando a variável dependente: TARGET

In [None]:
# Separate the dependent variable (TARGET) from the features in both splits
y_train = train_set["TARGET"]
x_train = train_set.drop(columns="TARGET")
y_test = test_set["TARGET"]
x_test = test_set.drop(columns="TARGET")

### 5. Ajustando variáveis categóricas

In [None]:
# Names of the training columns stored with the object dtype
object_columns = x_train.select_dtypes('object').columns
categoricas = object_columns.tolist()
categoricas

In [None]:
# One-hot encoding
# NOTE(review): pd.get_dummies was already applied to df earlier in this file,
# so `categoricas` may be empty at this point — verify before relying on it.
encoder = OneHotEncoder(categories="auto")

# Fit the encoder on the categorical training columns and show the result as a dense array
cat_1hot = encoder.fit_transform(x_train[categoricas])
cat_1hot.toarray()

In [None]:
# Numeric-only copies of the feature sets: the columns named in `categoricas` are removed
x_train_num = x_train.drop(columns=categoricas)
x_test_num = x_test.drop(columns=categoricas)
x_train_num.head()

In [None]:
# Numeric preprocessing: fill missing values with the column median, then standardize
num_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("std_scaler", StandardScaler()),
    ]
)

# Learn the medians and the scaling statistics from the training data only...
x_train_num_tr = num_pipeline.fit_transform(x_train_num)
# ...and reuse them on the test data. Calling fit_transform here would refit
# the imputer and scaler on the test set, leaking its statistics and putting
# the two splits on different scales.
x_test_num_tr = num_pipeline.transform(x_test_num)

x_train_num_tr

In [None]:
# Combine the numeric and the categorical preprocessing into one transformer
num_attribs_train = list(x_train_num)
num_attribs_test = list(x_test_num)

# NOTE(review): newer scikit-learn releases renamed OneHotEncoder's `sparse`
# keyword to `sparse_output` — confirm against the installed version.
full_pipeline = ColumnTransformer(
    [
        ("num", num_pipeline, num_attribs_train),
        ("cat", OneHotEncoder(sparse=False), categoricas),
    ]
)

# Second transformer with the same steps, built from the test-set column list.
# NOTE(review): the commented-out call below would fit it on the test data;
# full_pipeline.transform(x_test) looks like the safer choice — TODO confirm.
full_pipeline_test = ColumnTransformer(
    [
        ("num", num_pipeline, num_attribs_test),
        ("cat", OneHotEncoder(sparse=False), categoricas),
    ]
)

# x_train_prepared = full_pipeline.fit_transform(x_train)
# x_test_prepared = full_pipeline_test.fit_transform(x_test)

In [None]:
# x_train.to_csv("datasets/train_dataset.csv", index=True)

### 5. Lasso Regression

In [None]:
def experiment(msg, pipeline, X_train, y_train, X_test, y_test):
    """Fit ``pipeline`` on the training data and print its parameters and test RMSE.

    Args:
        msg: Label printed ahead of the fitted parameters.
        pipeline: Pipeline with ``fit``/``predict`` and a step named
            ``'lin_reg'`` that exposes ``intercept_`` and ``coef_`` once fitted.
        X_train: Training features.
        y_train: Training target values.
        X_test: Test features.
        y_test: Test target values.

    Returns:
        None. The intercept, the coefficients and the RMSE are printed.
    """
    pipeline.fit(X_train, y_train)

    model = pipeline.named_steps['lin_reg']
    print('{}: \nintercept = {},\ncoefs = {}'.format(msg, model.intercept_, model.coef_))

    y_pred = pipeline.predict(X_test)
    # mean_squared_error is documented as (y_true, y_pred); keep that order
    RMSE = np.sqrt(mean_squared_error(y_test, y_pred))
    print('RMSE: {}'.format(RMSE))
    print()
    
# Lasso regularization strength
alpha = 1e-3

# Try out the fit with Lasso regularization: standardize, then fit the Lasso model.
lasso_steps = [
    ("std_scaler", StandardScaler()),
    ("lin_reg", Lasso(alpha=alpha)),
]
reg_lasso = Pipeline(lasso_steps)
experiment('Regularização Lasso', reg_lasso, x_train, y_train, x_test, y_test)

### 6. Forward Stepwise Selection

In [None]:
# Keep only the first 5000 rows of each split (presumably to keep the
# stepwise searches fast — TODO confirm)
x_train_short = x_train.iloc[:5000]
x_test_short = x_test.iloc[:5000]
y_train_short = y_train.iloc[:5000]
y_test_short = y_test.iloc[:5000]

In [None]:
def processSubset(feature_set):
    """Fit an OLS model on ``feature_set`` and score it on the short test split.

    The model is fitted on ``x_train_short``/``y_train_short`` restricted to
    the given columns. The residual sum of squares is computed from its
    predictions on ``x_test_short`` against ``y_test_short``.

    Args:
        feature_set: Iterable of column names to use as predictors.

    Returns:
        dict with the fitted results under ``"model"`` and the residual sum
        of squares under ``"RSS"``.
    """
    columns = list(feature_set)
    regr = sm.OLS(y_train_short, x_train_short[columns]).fit()
    residuals = regr.predict(x_test_short[columns]) - y_test_short
    RSS = (residuals ** 2).sum()
    return {"model": regr, "RSS": RSS}

In [None]:
def forward(predictors):
    """Run one step of forward stepwise selection.

    Each column of ``x_train_short`` that is not yet in ``predictors`` is
    added in turn. Every candidate set is fitted and scored with
    ``processSubset``, and the candidate with the lowest RSS is kept.

    Args:
        predictors: List of the column names selected so far.

    Returns:
        The winning row of the results frame, a pandas Series with the
        entries ``"model"`` and ``"RSS"``.
    """
    # Pull out predictors we still need to process
    remaining_predictors = [p for p in x_train_short.columns if p not in predictors]

    tic = time.time()

    results = [processSubset(predictors + [p]) for p in remaining_predictors]

    # Wrap everything up in a nice dataframe
    models = pd.DataFrame(results)

    # Choose the model with the lowest RSS. idxmin returns the row label,
    # which is what .loc expects (argmin returns a position instead).
    best_model = models.loc[models['RSS'].idxmin()]

    toc = time.time()
    print("Processed ", models.shape[0], "models on", len(predictors)+1, "predictors in", (toc-tic), "seconds.")

    # Return the best model, along with some other useful information about the model
    return best_model

In [None]:
# Results table: one row per step, holding the best model of that step and its RSS
models_fwd = pd.DataFrame(columns=["RSS", "model"])

tic = time.time()
predictors = []

# Grow the predictor set one step at a time, once per column of x_train_short
for i in range(1,len(x_train_short.columns)+1):    
    models_fwd.loc[i] = forward(predictors)
    # Carry the winning model's column names into the next step
    predictors = models_fwd.loc[i]["model"].model.exog_names

toc = time.time()
print("Total elapsed time:", (toc-tic), "seconds.")

In [None]:
# Print the summary of the model stored under row label 10 of models_fwd
print(models_fwd.loc[10, "model"].summary())

### 7. Backward Elimination

In [None]:
def backward(predictors):
    """Run one step of backward elimination.

    Every subset of ``predictors`` that leaves out exactly one column is
    fitted and scored with ``processSubset``, and the subset with the lowest
    RSS is kept.

    Args:
        predictors: Sequence of the column names currently in the model.

    Returns:
        The winning row of the results frame, a pandas Series with the
        entries ``"model"`` and ``"RSS"``.
    """
    tic = time.time()

    subset_size = len(predictors) - 1
    results = [
        processSubset(combo)
        for combo in itertools.combinations(predictors, subset_size)
    ]

    # Wrap everything up in a nice dataframe
    models = pd.DataFrame(results)

    # Choose the model with the lowest RSS. idxmin returns the row label,
    # which is what .loc expects (argmin returns a position instead).
    best_model = models.loc[models['RSS'].idxmin()]

    toc = time.time()
    print("Processed ", models.shape[0], "models on", len(predictors)-1, "predictors in", (toc-tic), "seconds.")

    # Return the best model, along with some other useful information about the model
    return best_model

In [None]:
# Results table with one row reserved per predictor count, from 1 up to one
# less than the number of columns
models_bwd = pd.DataFrame(columns=["RSS", "model"], index = range(1,len(x_train_short.columns)))

tic = time.time()
# Start from all columns and remove one predictor per iteration
predictors = x_train_short.columns

while(len(predictors) > 1):  
    # Store the best reduced model under its predictor count
    models_bwd.loc[len(predictors)-1] = backward(predictors)
    # Continue from the columns of the model that was just stored
    predictors = models_bwd.loc[len(predictors)-1]["model"].model.exog_names

toc = time.time()
print("Total elapsed time:", (toc-tic), "seconds.")

In [None]:
# Print the summary of the model stored under row label 10 of models_bwd
print(models_bwd.loc[10, "model"].summary())

In [None]:
# Print the fitted parameters of the forward-selection model stored under label 10
print("------------------")
print("Forward Selection:")
print("------------------")
print(models_fwd.loc[10, "model"].params)

In [None]:
# Print the fitted parameters of the backward-elimination model stored under
# label 10, below a three-line header
print("-------------------", "Backward Selection:", "-------------------", sep="\n")
selected_model = models_bwd.loc[10, "model"]
print(selected_model.params)

In [None]:
# Preview the first 25 rows of the training features
x_train.head(25)