<a href="https://colab.research.google.com/github/cbsobral/ml-fies/blob/main/Module00_LoadingData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Module 00 - Loading data and creating training and test sets

In this first module, we perform the following steps:

1. Load the data from Google Drive in two parts (the file is too big to load at once);
2. Exclude irrelevant variables or variables with too many missing values;
3. Rename variables in English and possibly with shorter names;
4. Divide training and test set.

### 1 - Load the data in two parts:

In [1]:
import pandas as pd

url_a = "https://drive.google.com/file/d/1prPbFSiXFTHmTHzXTGxy4HrtRxXUHhce/view?usp=sharing"
path_a = "https://drive.google.com/uc?export=download&id="+url_a.split("/")[-2]
base_df_a = pd.read_excel(path_a)
base_df_a.shape

(309999, 37)

In [2]:
url_b = "https://drive.google.com/file/d/1nGckSszPPifPvR3o5FeYaKArUYbfjHGn/view?usp=sharing"
path_b = "https://drive.google.com/uc?export=download&id="+url_b.split("/")[-2]
base_df_b = pd.read_excel(path_b)
base_df_b.shape

(327823, 37)

In [83]:
complete_set = base_df_a.append(base_df_b)
complete_set.shape

(637822, 37)

### 2 - Renaming variables

New variables:

In [84]:
complete_set = complete_set[pd.to_numeric(complete_set["ANO_NASC"],errors='coerce').notna()]

# student
complete_set = complete_set.assign(student = 1*(complete_set["DS_OCUPACAO"]=="Estudante"))

# default
complete_set = complete_set.assign(default = 1*(complete_set["nu_dias_atraso"]>=365))

# age
complete_set = complete_set.assign(age     = 2015 - (complete_set["ANO_NASC"]))

In [85]:
new_names ={"NU_ANO_SEMESTRE_INSCRICAO":"semester_enroll",
            "SG_SEXO":"gender",
            "DS_OCUPACAO":"occupation",
            "DS_RACA_COR" : "ethnicity",
            "NU_ANO_CONCLUSAO_ENSINO_MEDIO":"high_school_endyear",
            "SG_UF_CURSO": "state_course",
            "QT_SEMESTRES_CURSO": "n_sem_course",
            "VL_SEMESTRE_COM_DESCONTO": "tuition_discounted",
            "NU_PERCENT_SOLICITADO_FINANC":"perc_requested",
            "VL_FINANC_RECURSO_ALUNO":"student_resource",
            #"nu_dias_atraso":"days_delay",
            "ANO_NASC":"birth_year",
            "DS_ESTADO_CIVIL":"marital_status", 
            "VL_RENDA_PESSOAL_BRUTA_MENSAL": "personal_income",
            "VL_RENDA_FAMILIAR_BRUTA_MENSAL": "family_income",
            "ST_ENSINO_MEDIO_ESCOLA_PUBLICA": "public_hs",
            "NO_CURSO": "degree",
            "QT_SEMESTRE_CONCLUIDO": "n_completed_sem",
            "VL_SEMESTRE_ATUAL": "tuition_current",
            "VL_FINANCIADO_SEMESTRE": "loan_value_sem",
            "fase_contrato": "contract_phase",
            "vl_divida": "total_debt",            
            "VL_AVALIACAO_IGC": "igc",
            "VL_FAIXA_CPC": "cpc",
            "VL_FAIXA_CC": "cc",
            "QT_SEMESTRE_FINANCIAMENTO": "sem_funded",
            "QT_MEMBRO": "fam_size",
            "VL_RENDA_PER_CAPITA":"income_pc",
            "NU_PERCENTUAL_COMPROMETIMENTO":"inc_prop",
            "VL_TOTAL_FINANCIAMENTO":"loan_value",
            "VL_LIMITE_GLOBAL":"loan_limit",
            "dt_inicio_cont":"date_contract"
}

complete_set = complete_set.rename(index=str, columns=new_names)
#complete_set.dtypes
print(complete_set.columns)

Index(['semester_enroll', 'gender', 'occupation', 'marital_status',
       'family_income', 'personal_income', 'NO_MUNICIPIO', 'SG_UF',
       'ethnicity', 'public_hs', 'high_school_endyear',
       'NU_SEMESTRE_REFERENCIA', 'state_course', 'degree', 'igc', 'cpc', 'cc',
       'n_sem_course', 'n_completed_sem', 'sem_funded',
       'QT_MESES_FINANC_SEMESTRE_ATUAL', 'fam_size', 'income_pc',
       'VL_SEMESTRE_SEM_DESCONTO', 'tuition_discounted', 'tuition_current',
       'inc_prop', 'perc_requested', 'loan_value_sem', 'student_resource',
       'loan_value', 'loan_limit', 'date_contract', 'contract_phase',
       'nu_dias_atraso', 'total_debt', 'birth_year', 'student', 'default',
       'age'],
      dtype='object')


In [86]:
import numpy as np

# Calculate values for wrong entries (larger than 300,000 reais)
complete_set['loan_value_sem'] = np.where((complete_set.loan_value_sem > 300000), (complete_set["perc_requested"]/100 * complete_set["tuition_current"]), 
                                          complete_set.loan_value_sem)

### 3 - Cleaning Dataset

In [87]:
# Drop variables
variables_to_drop = ["NO_MUNICIPIO",
                     "SG_UF",
                     "VL_SEMESTRE_SEM_DESCONTO",
                     "NU_SEMESTRE_REFERENCIA", 
                     "nu_dias_atraso", 
                     "QT_MESES_FINANC_SEMESTRE_ATUAL"] # adicionei aqui pq não estava sendo usada -- carol 

complete_set_clean = complete_set.drop(variables_to_drop, axis = 1)                

In [88]:
import datetime as dt
                    
complete_set_clean["date_contract"]= complete_set_clean["date_contract"].map(dt.datetime.toordinal)

Eliminating outliers for *family income* (4 obs.):

In [89]:
complete_set_clean = complete_set_clean.loc[(complete_set_clean['family_income'] < 100000)] 

In [90]:
complete_set_clean.shape

(637715, 34)

Limiting the variable *age* to interval (10,100):

In [91]:
complete_set_clean["age"].unique()

array([24, 25, 19, 23, 22, 20, 26, 27, 32, 18, 21, 62, 46, 30, 31, 33, 34,
       48, 52, 35, 36, 50, 45, 60, 28, 44, 47, 37, 56, 39, 43, 29, 49, 59,
       51, 55, 57, 63, 38, 65, 58, 42, 61, 40, 54, 41, 53, 64, 68, 66, 70,
       72, 7, 17, 71, 75, 67, 69, 74, 16, 8, 76, 115, 10, 77, 78, 79, 73,
       87, 15, 13, 12, 11, 118], dtype=object)

In [92]:
complete_set_clean = complete_set_clean.loc[(complete_set_clean['age'] < 100) & (complete_set_clean['age'] > 10)] 

In [98]:
complete_set_clean["age"].value_counts()

21    63278
20    58433
22    54722
23    45584
24    38360
      ...  
79        1
15        1
13        1
12        1
11        1
Name: age, Length: 69, dtype: int64

Some variables are better interprerted as categories.

In [94]:
# Calculate correlations with target (default)
corr_matrix = complete_set_clean.corr()
corr_matrix["default"].sort_values(ascending=False, key=pd.Series.abs)

default                1.000000
n_sem_course          -0.225095
family_income         -0.224988
income_pc             -0.206732
sem_funded            -0.191537
total_debt            -0.169225
loan_value            -0.166834
tuition_discounted    -0.150613
perc_requested         0.149974
tuition_current       -0.149190
student_resource      -0.144995
loan_limit            -0.139687
loan_value_sem        -0.112047
fam_size              -0.102780
igc                   -0.080037
student               -0.079215
cc                    -0.054754
cpc                   -0.050746
inc_prop               0.046422
n_completed_sem        0.043032
date_contract         -0.041759
personal_income       -0.017000
high_school_endyear    0.013630
semester_enroll        0.006785
Name: default, dtype: float64

### 4 - Add atributes

Adds a columns for a dummy that defines if the main occupation is student. An another one to define if the delay in repayment is more than one year (365 days)

In [95]:
#Recode ocupacao  - included in the beginning
#complete_set_clean=complete_set_clean.assign(student=1*(complete_set_clean["occupation"]=="Estudante"))
#complete_set_clean=complete_set_clean.assign(default=1*(complete_set_clean["days_delay"]>=365))

### 5 - Cleaning Data 

In [99]:
#Here are the columns we want to drop NA's and not treat like using median, or something else
complete_set_clean = complete_set_clean.dropna(subset=["semester_enroll",
                                                       "gender",
                                                       "occupation",
                                                       "marital_status",
                                                       "ethnicity",
                                                       "public_hs",
                                                       "state_course",
                                                       "degree",
                                                       "igc","cpc","cc",
                                                       "contract_phase",
                                                       "high_school_endyear"])

In [100]:
#Birth year has to be corrected in the excel file
complete_set_clean = complete_set_clean.astype({"semester_enroll":str,
                                                "igc":'int64',"cpc":'int64',"cc":'int64'
                                               })

In [101]:
#Get only categorical
complete_set_cat = complete_set_clean.select_dtypes(include=["object"])

### 6 - Creating a training and a test set

In this section we will create the training and test set using the function *train_test_split* from Scikit-Learn. Two important considerations about our choice:

* Our dataset is a sample provided by the Brazilian Governent and will not be updated. Therefore, we chose not to be concerned about future splits with updated data;

* Our data includes 637,822 instances. We assume it is big enough and do not employ stratified sampling.

In [116]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(complete_set_clean, test_size=0.2, random_state=42)

In [117]:
fies = train_set.drop("default", axis=1) # drop labels for training set
fies_labels = train_set["default"].copy()

In [118]:
sample_incomplete_rows = fies[fies.isnull().any(axis=1)].head()
sample_incomplete_rows.describe()

Unnamed: 0,family_income,personal_income,high_school_endyear,igc,cpc,cc,n_sem_course,n_completed_sem,sem_funded,fam_size,...,tuition_current,inc_prop,perc_requested,loan_value_sem,student_resource,loan_value,loan_limit,date_contract,total_debt,student
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,4.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,2560.3,1420.3,204.2,3.4,3.0,4.2,8.4,2.6,8.2,3.25,...,4682.366,195.374,100.0,4682.366,0.0,37816.16,46957.9,735221.2,26449.566,0.2
std,1803.03985,1597.124823,438.751296,0.547723,0.707107,0.447214,2.607681,1.949359,2.48998,0.957427,...,2094.114767,159.839277,0.0,2094.114767,0.0,11197.141124,14497.239533,238.084439,12335.191915,0.447214
min,700.0,0.0,2.0,3.0,2.0,4.0,4.0,0.0,5.0,2.0,...,2181.0,63.42,100.0,2181.0,0.0,23694.0,28056.0,734955.0,9250.76,0.0
25%,1141.0,0.0,3.0,3.0,3.0,4.0,8.0,1.0,6.0,2.75,...,3188.15,102.67,100.0,3188.15,0.0,30596.4,38245.5,734981.0,18226.22,0.0
50%,2198.0,1141.0,13.0,3.0,3.0,4.0,10.0,4.0,10.0,3.5,...,4628.0,116.0,100.0,4628.0,0.0,36960.0,46200.0,735298.0,32331.59,0.0
75%,3762.5,2198.0,14.0,4.0,3.0,4.0,10.0,4.0,10.0,4.0,...,6056.28,238.6,100.0,6056.28,0.0,47725.0,59656.25,735435.0,33081.24,0.0
max,5000.0,3762.5,989.0,4.0,4.0,5.0,10.0,4.0,10.0,4.0,...,7358.4,456.18,100.0,7358.4,0.0,50105.4,62631.75,735437.0,39358.02,1.0


### 7 - Pipeline

Functions that will be used to tranform our dataset

In [130]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer


num_pipeline = Pipeline([
        ("num_imputer", SimpleImputer(strategy="median")),
        ("std_scaler", StandardScaler()),
    ])

In [131]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder

ord_pipeline = Pipeline([
        ("ord_imputer", SimpleImputer(strategy="most_frequent")),
        ("ord_encoder", OrdinalEncoder()),
    ])

In [132]:
ord_attribs = ['igc','cpc','cc','date_contract'] # 4 attributes

num_attribs = ['family_income',   #17
               'personal_income',
               'high_school_endyear',
               'n_sem_course',
               'n_completed_sem',
               'sem_funded',
               'fam_size',
               'income_pc',
               'tuition_discounted',
               'tuition_current',
               'inc_prop',
               'perc_requested',
               'loan_value_sem',
               'student_resource',
               'loan_value',
               'loan_limit',
               'total_debt',
               'age']
  

cat_attribs = ['semester_enroll',  #9
               'gender',
               'occupation', 
               'marital_status',
               'ethnicity', 
               'public_hs', 
               'state_course', 
               'degree', 
               'contract_phase']

In [133]:
# Full pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
        ("ord", ord_pipeline,ord_attribs)
        ])

In [134]:
fies_prepared = full_pipeline.fit_transform(fies)

In [135]:
fies_prepared[:1]

<1x403 sparse matrix of type '<class 'numpy.float64'>'
	with 31 stored elements in Compressed Sparse Row format>

### 8 - Classifiers



In [128]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score

sgd_clf = SGDClassifier(max_iter=1000, random_state=42)
sgd_clf.fit(fies_prepared, fies_labels)
(cross_val_score(sgd_clf, fies_prepared, fies_labels, cv=3, scoring="roc_auc")).mean()

0.7386817068514108

In [136]:
from sklearn.linear_model import LogisticRegression

logr = LogisticRegression(max_iter=1000, random_state=42, solver='lbfgs')
logr.fit(fies_prepared, fies_labels)
(cross_val_score(logr, fies_prepared, fies_labels, cv=3, scoring="roc_auc")).mean()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.7547940144606627

In [56]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(fies_prepared, fies_labels)
(cross_val_score(rf, fies_prepared, fies_labels, cv=3, scoring="roc_auc")).mean()

0.797843514468243

In [54]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(10, 10, 10), max_iter=500)
mlp.fit(fies_prepared, fies_labels.values.ravel())

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(10, 10, 10), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=500,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [55]:
(cross_val_score(mlp, fies_prepared, fies_labels, cv=3, scoring="roc_auc")).mean()

0.8324472103276103

In [138]:
from sklearn.svm import LinearSVC

svm_clf = LinearSVC(C=1, loss="hinge")

svm_clf.fit(fies_prepared, fies_labels)



LinearSVC(C=1, loss='hinge')

In [139]:
(cross_val_score(svm_clf, fies_prepared, fies_labels, cv=3, scoring="roc_auc")).mean()



0.6390250782163186