# Modelo oficial

- **Objetivo:** Identificar se a pessoa quer ou não trocar de trabalho com base em suas características

- **Métricas**: 
    - Acurácia
    - Recall
    - Precision
    - F1-score

- **Modelo de ML**: Light Gradient Boosting Machine

## 0. Setup

In [1]:
import pandas as pd
import numpy as np

## 1. Carregando os dados

In [2]:
# Load the raw training set; the path is relative to the notebook's folder.
dados = pd.read_csv('../data/raw/aug_train.csv')

# Preview the first three rows.
dados.head(3)

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0


## 2. Criar as novas features

### 2.1. Agrupar os valores de company_size em PP, P, M e G

In [3]:
def add_feature_company_size(df):
    """Return a copy of ``df`` with a coarse ``company_size_cat`` column.

    The raw ``company_size`` categories are grouped as follows:

    * ``PP``: ``'<10'``, ``'10/49'``
    * ``P``:  ``'50-99'``, ``'100-500'``
    * ``M``:  ``'500-999'``, ``'1000-4999'``
    * ``G``:  ``'5000-9999'``, ``'10000+'``

    Missing or unrecognised sizes become a real ``NaN`` (not the string
    ``'nan'``), so they can still be detected with ``isna()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``company_size`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the extra ``company_size_cat`` column.
    """
    size_to_bucket = {
        '<10': 'PP',
        '10/49': 'PP',
        '50-99': 'P',
        '100-500': 'P',
        '500-999': 'M',
        '1000-4999': 'M',
        '5000-9999': 'G',
        '10000+': 'G',
    }

    df1 = df.copy()

    # Read from the frame that was passed in (not a notebook-level global) so
    # the function works on any split; unmapped values map to NaN.
    df1['company_size_cat'] = df1['company_size'].map(size_to_bucket)

    return df1

### 2.2. Fazer uma feature que divide a quantidade de horas treinadas por 24 (resultando em quantos dias de treinamento a pessoa participou)

In [4]:
def add_feature_training_hours(df):
    """Return a copy of ``df`` with ``days_training_hours`` added.

    The new column is ``training_hours`` divided by 24, i.e. the training
    time expressed in days. The input frame is left untouched.
    """
    hours_per_day = 24
    return df.assign(days_training_hours=df['training_hours'] / hours_per_day)

### 2.3. Criar uma variável categórica que diz se a pessoa é nova ou não no mercado de trabalho. Ex.: Se a pessoa tem menos de 10 anos de experiência, ela é nova (0); senão, ela é "velha" (1)

In [5]:
def add_feature_experience(df):
    """Return a copy of ``df`` with a binary ``experience_cat`` column.

    ``experience_cat`` is ``0`` when ``experience`` is below ten years
    (``'<1'`` and ``'1'`` .. ``'9'``), ``1`` for ten years or more
    (``'10'`` .. ``'20'`` and ``'>20'``), and ``NaN`` for missing or
    unrecognised values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``experience`` column of strings. Not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the extra ``experience_cat`` column.
    """
    under_ten_years = ['<1'] + [str(years) for years in range(1, 10)]
    ten_years_or_more = [str(years) for years in range(10, 21)] + ['>20']

    df1 = df.copy()

    # Use the frame that was passed in rather than a notebook-level global, so
    # the function works on any split and row alignment is guaranteed.
    experience = df1['experience']

    df1['experience_cat'] = np.where(experience.isin(under_ten_years), 0,
                                     np.where(experience.isin(ten_years_or_more), 1,
                                              np.nan))

    return df1

### 2.4. Agrupar os valores de company_type relacionadas a startup

In [6]:
def add_feature_company_type(df):
    """Return a copy of ``df`` with a startup flag in ``company_type_cat``.

    ``company_type_cat`` is ``1`` for ``'Funded Startup'`` and
    ``'Early Stage Startup'``, ``0`` for ``'Pvt Ltd'``, ``'Other'``,
    ``'Public Sector'`` and ``'NGO'``, and ``NaN`` for missing or
    unrecognised values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``company_type`` column. Not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the extra ``company_type_cat`` column.
    """
    startup_types = ['Funded Startup', 'Early Stage Startup']
    non_startup_types = ['Pvt Ltd', 'Other', 'Public Sector', 'NGO']

    df1 = df.copy()

    # Use the frame that was passed in rather than a notebook-level global, so
    # the function works on any split and row alignment is guaranteed.
    company_type = df1['company_type']

    df1['company_type_cat'] = np.where(company_type.isin(startup_types), 1,
                                       np.where(company_type.isin(non_startup_types), 0,
                                                np.nan))

    return df1

### 2.5. Criando função para identificar nulos em qualquer variável (1 se for nulo, 0 caso contrário)

In [7]:
def add_feature_null_column(df, col):
    """Return a copy of ``df`` with a missing-value indicator for ``col``.

    The indicator is stored in ``'check_null_' + col`` and holds ``1`` where
    ``col`` is null and ``0`` elsewhere. The input frame is not modified.
    """
    flagged = df.copy()
    is_missing = flagged[col].isna()
    flagged['check_null_' + col] = is_missing.astype(int)
    return flagged

### 2.6. Criando função para preencher nulos em variáveis qualitativas (com a categoria 'Outras')

In [8]:
def add_feature_null_qualitative(df, col):
    """Return a copy of ``df`` where nulls in ``col`` are replaced by 'Outras'.

    Non-null entries of ``col`` are carried over; the input frame is not
    modified.
    """
    filled = df.copy()
    original_values = filled[col]
    missing_mask = original_values.isna()
    filled[col] = np.where(missing_mask, 'Outras', original_values)
    return filled

### 2.7. Criando função para preencher nulos em variáveis quantitativas (com o valor sentinela 99999)

In [9]:
def add_feature_null_quantitative(df, col):
    """Return a copy of ``df`` where nulls in ``col`` are replaced by 99999.

    99999 acts as a sentinel for "missing"; non-null entries of ``col`` are
    carried over and the input frame is not modified.
    """
    filled = df.copy()
    original_values = filled[col]
    missing_mask = original_values.isna()
    filled[col] = np.where(missing_mask, 99999, original_values)
    return filled

## 3. Criação do modelo

### 3.0. Setup

In [58]:
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import make_pipeline 
from sklearn.compose import make_column_transformer
from sklearn import set_config 
from sklearn.metrics import classification_report, accuracy_score, recall_score, f1_score, precision_score
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, Normalizer, QuantileTransformer, StandardScaler, OneHotEncoder
import category_encoders as ce
import lightgbm as lgb


# Show estimators/pipelines as diagrams when a cell displays them.
set_config(display = "diagram")

### 3.1. Divisão da base de treino e teste

In [11]:
# Separate the label from the predictors.
y = dados['target']

X = dados.drop(columns = ['target'])

In [12]:
# 70/30 hold-out split, stratified on the target and seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, random_state = 19, stratify = y)

In [13]:
# Report the row counts of each split (adjacent f-strings are concatenated).
print(
    f'Quantidade de linhas do X_train: {X_train.shape[0]} \n '
    f'Quantidade de linhas do X_test: {X_test.shape[0]}\n '
    f'Quantidade de linhas do y_train: {y_train.shape[0]}\n '
    f'Quantidade de linhas do y_test: {y_test.shape[0]}'
)

Quantidade de linhas do X_train: 13410 
 Quantidade de linhas do X_test: 5748
 Quantidade de linhas do y_train: 13410
 Quantidade de linhas do y_test: 5748


### 3.2. Definindo os passos do Pipeline de Feature Engineering

In [14]:
# Candidate encoders for the qualitative columns (encoder1..encoder18).
# encoder12 (one-hot) is the one used by the baseline pipeline; the others
# are swapped in during the grid searches.
encoder1 = ce.BackwardDifferenceEncoder()
encoder2 = ce.BaseNEncoder()
encoder3 = ce.BinaryEncoder()
encoder4 = ce.CatBoostEncoder()
encoder5 = ce.CountEncoder()
encoder6 = ce.GLMMEncoder()
encoder7 = ce.HashingEncoder()
encoder8 = ce.HelmertEncoder()
encoder9 = ce.JamesSteinEncoder()
encoder10 = ce.LeaveOneOutEncoder()
encoder11 = ce.MEstimateEncoder()
encoder12 = OneHotEncoder(handle_unknown = "ignore")
encoder13 = ce.OrdinalEncoder()
encoder14 = ce.SumEncoder()
encoder15 = ce.PolynomialEncoder()
encoder16 = ce.TargetEncoder()
encoder17 = ce.WOEEncoder()
encoder18 = ce.QuantileEncoder()
# Candidate scalers/transformers for the quantitative columns
# (encoder19..encoder23; they are scalers despite the "encoder" name).
# encoder23 (StandardScaler) is the one used by the baseline pipeline.
encoder19 = MaxAbsScaler()
encoder20 = MinMaxScaler()
encoder21 = Normalizer()
encoder22 = QuantileTransformer()
encoder23 = StandardScaler()

# Classifier placed at the end of the pipeline; seeded for reproducibility.
model = lgb.LGBMClassifier(random_state = 42)



In [15]:
# Column groups fed to the column transformer: object-typed columns are the
# qualitative features; int/float columns other than the id and the label are
# the quantitative ones.
features_qual = dados.select_dtypes(include = ['object']).columns.tolist()
features_quant = (
    dados
    .drop(columns = ['enrollee_id', 'target'])
    .select_dtypes(include = [int, float])
    .columns
    .tolist()
)

In [16]:
# Baseline preprocessing: one-hot encode the qualitative columns, standardise
# the quantitative ones, and drop every column not listed in either group.
pipeline_inicial = make_column_transformer(
    (encoder12, features_qual),
    (encoder23, features_quant),
    remainder = 'drop',
)

pipeline_inicial

In [17]:
# Chain preprocessing and classifier. The search cells address the steps by
# the names 'columntransformer' and 'lgbmclassifier'.
pipeline_com_modelo = make_pipeline(pipeline_inicial, model)

pipeline_com_modelo

In [18]:
# Fit the baseline pipeline (preprocessing + classifier) on the training split.
pipeline_com_modelo.fit(X_train, y_train)

In [19]:
# Predict labels for the held-out test split with the fitted baseline pipeline.
y_pred = pipeline_com_modelo.predict(X_test)

y_pred

array([0., 0., 0., ..., 0., 0., 0.])

In [20]:
# Confusion matrix with row/column totals: actual labels vs. predicted labels.
pd.crosstab(
    index = y_test,
    columns = y_pred,
    rownames = ['Vida real'],
    colnames = ['Predito'],
    margins = True,
)

Predito,0.0,1.0,All
Vida real,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,3733,582,4315
1.0,581,852,1433
All,4314,1434,5748


In [21]:
# Per-class precision / recall / F1 on the test split.
relatorio = classification_report(y_test, y_pred)
print(relatorio)

              precision    recall  f1-score   support

         0.0       0.87      0.87      0.87      4315
         1.0       0.59      0.59      0.59      1433

    accuracy                           0.80      5748
   macro avg       0.73      0.73      0.73      5748
weighted avg       0.80      0.80      0.80      5748



In [22]:
# Test-set accuracy, as a percentage.
100 * accuracy_score(y_test, y_pred)

79.7668754349339

In [23]:
# Test-set recall for the positive class, as a percentage.
100 * recall_score(y_test, y_pred)

59.45568736915562

In [24]:
# Test-set precision for the positive class, as a percentage.
100 * precision_score(y_test, y_pred)

59.41422594142259

In [25]:
# Test-set F1 for the positive class, as a percentage.
100 * f1_score(y_test, y_pred)

59.43494942448553

## 4. Tuning do modelo anterior

### 4.1. Testando novos encoders/transformações para as variáveis qualitativas

In [31]:
# Search space: swap the transformer applied to the qualitative columns
# (the 'onehotencoder' slot of the column transformer) for each candidate.
params = {
    'columntransformer__onehotencoder': [
        encoder1, encoder2, encoder3, encoder4, encoder5, encoder6,
        encoder7, encoder8, encoder9, encoder10, encoder11, encoder12,
        encoder13, encoder14, encoder15, encoder16, encoder17, encoder18,
    ],
}

In [33]:
# Exhaustive search over the candidate encoders for the qualitative columns,
# scored by recall with 4-fold cross-validation, using all available cores.
grid = GridSearchCV(estimator = pipeline_com_modelo, 
                    param_grid = params,
                    scoring = 'recall',
                    n_jobs = -1,
                    cv = 4
                   )

# NOTE: the search is fitted in the following cell with (X_train, y_train).
# A bare `grid.fit()` was removed from here: it always raises TypeError
# because the training data is a required argument.

In [35]:
# Run the encoder search on the training split (18 candidates x 4 folds).
grid.fit(X_train, y_train)

4 fits failed out of a total of 72.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "/home/rafael/Documentos/Github/hr_analysis/env/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/rafael/Documentos/Github/hr_analysis/env/lib/python3.10/site-packages/sklearn/pipeline.py", line 378, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/home/rafael/Documentos/Github/hr_analysis/env/lib/python3.10/site-packages/sklearn/pipeline.py", line 336, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/home/rafael/Documentos/Github/

In [41]:
# Five best candidates by cross-validated score (rank 1 = best).
(
    pd.DataFrame(grid.cv_results_)
    .sort_values(by = 'rank_test_score')
    .head(5)
)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_columntransformer__onehotencoder,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
8,1.197525,0.116292,0.169009,0.03076,JamesSteinEncoder(),{'columntransformer__onehotencoder': JamesStei...,0.572967,0.576555,0.559809,0.563397,0.568182,0.006819,1
1,1.166486,0.227653,0.214309,0.062656,BaseNEncoder(),{'columntransformer__onehotencoder': BaseNEnco...,0.570574,0.568182,0.564593,0.553828,0.564294,0.006407,2
2,1.023903,0.117201,0.14968,0.038793,BinaryEncoder(),{'columntransformer__onehotencoder': BinaryEnc...,0.570574,0.568182,0.564593,0.553828,0.564294,0.006407,2
4,0.740691,0.140218,0.136731,0.038292,CountEncoder(combine_min_nan_groups=True),{'columntransformer__onehotencoder': CountEnco...,0.570574,0.570574,0.55622,0.558612,0.563995,0.006633,4
5,23.394788,0.828549,0.065274,0.014622,GLMMEncoder(),{'columntransformer__onehotencoder': GLMMEncod...,0.566986,0.570574,0.557416,0.559809,0.563696,0.005307,5


### 4.2. Tunando as variáveis quantitativas e qualitativas

In [46]:
# Search space: every candidate encoder for the qualitative columns combined
# with every candidate scaler for the quantitative columns.
params = {
    'columntransformer__onehotencoder': [
        encoder1, encoder2, encoder3, encoder4, encoder5, encoder6,
        encoder7, encoder8, encoder9, encoder10, encoder11, encoder12,
        encoder13, encoder14, encoder15, encoder16, encoder17, encoder18,
    ],
    'columntransformer__standardscaler': [
        encoder19, encoder20, encoder21, encoder22, encoder23,
    ],
}

In [48]:
# Exhaustive search over encoder x scaler combinations, scored by recall with
# 4-fold cross-validation on all available cores.
grid = GridSearchCV(
    pipeline_com_modelo,
    params,
    scoring = 'recall',
    n_jobs = -1,
    cv = 4,
)

In [49]:
# Run the encoder x scaler search on the training split (90 candidates x 4 folds).
grid.fit(X_train, y_train)

20 fits failed out of a total of 360.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/home/rafael/Documentos/Github/hr_analysis/env/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/rafael/Documentos/Github/hr_analysis/env/lib/python3.10/site-packages/sklearn/pipeline.py", line 378, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/home/rafael/Documentos/Github/hr_analysis/env/lib/python3.10/site-packages/sklearn/pipeline.py", line 336, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/home/rafael/Documentos/Gith

In [50]:
# Five best candidates by cross-validated score (rank 1 = best).
(
    pd.DataFrame(grid.cv_results_)
    .sort_values(by = 'rank_test_score')
    .head(5)
)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_columntransformer__onehotencoder,param_columntransformer__standardscaler,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
44,0.789047,0.151458,0.108159,0.019682,JamesSteinEncoder(),StandardScaler(),{'columntransformer__onehotencoder': JamesStei...,0.572967,0.576555,0.559809,0.563397,0.568182,0.006819,1
43,0.776704,0.090444,0.124026,0.033233,JamesSteinEncoder(),QuantileTransformer(),{'columntransformer__onehotencoder': JamesStei...,0.572967,0.576555,0.559809,0.563397,0.568182,0.006819,1
41,0.897332,0.100427,0.098119,0.014403,JamesSteinEncoder(),MinMaxScaler(),{'columntransformer__onehotencoder': JamesStei...,0.572967,0.576555,0.559809,0.563397,0.568182,0.006819,1
40,0.939258,0.081043,0.126997,0.035658,JamesSteinEncoder(),MaxAbsScaler(),{'columntransformer__onehotencoder': JamesStei...,0.572967,0.576555,0.559809,0.563397,0.568182,0.006819,1
14,1.047245,0.045237,0.133891,0.012376,BinaryEncoder(),StandardScaler(),{'columntransformer__onehotencoder': BinaryEnc...,0.570574,0.568182,0.564593,0.553828,0.564294,0.006407,5


### 4.3. Tunando o lightgbm

- Baseado em árvores
- Xgboost

In [54]:
# Search space for the randomized search: preprocessing choices plus the
# LightGBM hyper-parameters.
params = {}

params['columntransformer__onehotencoder'] = [encoder1, encoder2, encoder3, encoder4, encoder5, encoder6, encoder7, 
                                              encoder8, encoder9, encoder10, encoder11, encoder12, encoder13, 
                                              encoder14, encoder15, encoder16, encoder17, encoder18]

params['columntransformer__standardscaler'] = [encoder19, encoder20, encoder21, encoder22, encoder23]

# The list previously contained 100 twice (100, 200, 500, 100, ...), which
# doubled its sampling probability and left 1000 out of the progression.
params['lgbmclassifier__n_estimators'] = [100, 200, 500, 1000, 5000, 10000]
params['lgbmclassifier__max_depth'] = [3, 4, 5, 6, 7, 8, 9, 10]
params['lgbmclassifier__num_leaves'] = [2, 5, 7, 9, 11, 15, 17, 20]
params['lgbmclassifier__learning_rate'] = [0.1, 0.3, 0.5, 0.7, 0.9]

In [60]:
# Randomized search over preprocessing + LightGBM settings, scored by recall
# with 4-fold cross-validation. n_iter is left at its default.
# random_state is fixed so the sampled candidates (and therefore the reported
# ranking) are reproducible across runs, like the seeded split and model.
grid = RandomizedSearchCV(estimator = pipeline_com_modelo, 
                          param_distributions = params,
                          scoring = 'recall',
                          n_jobs = -1,
                          cv = 4,
                          random_state = 42
                   )

In [61]:
# Run the randomized search on the training split.
grid.fit(X_train, y_train)

4 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "/home/rafael/Documentos/Github/hr_analysis/env/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/rafael/Documentos/Github/hr_analysis/env/lib/python3.10/site-packages/sklearn/pipeline.py", line 378, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/home/rafael/Documentos/Github/hr_analysis/env/lib/python3.10/site-packages/sklearn/pipeline.py", line 336, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/home/rafael/Documentos/Github/

In [62]:
# Five best sampled candidates by cross-validated score (rank 1 = best).
(
    pd.DataFrame(grid.cv_results_)
    .sort_values(by = 'rank_test_score')
    .head(5)
)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_lgbmclassifier__num_leaves,param_lgbmclassifier__n_estimators,param_lgbmclassifier__max_depth,param_lgbmclassifier__learning_rate,param_columntransformer__standardscaler,param_columntransformer__onehotencoder,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
4,0.977543,0.143476,0.153296,0.031558,17,100,8,0.1,Normalizer(),TargetEncoder(),"{'lgbmclassifier__num_leaves': 17, 'lgbmclassi...",0.575359,0.583732,0.587321,0.580144,0.581639,0.004425,1
7,0.744699,0.061902,0.139111,0.008077,20,200,5,0.1,MaxAbsScaler(),OrdinalEncoder(),"{'lgbmclassifier__num_leaves': 20, 'lgbmclassi...",0.5,0.563397,0.534689,0.533493,0.532895,0.02245,2
3,10.800696,1.417751,5.136325,0.509938,5,5000,7,0.3,StandardScaler(),MEstimateEncoder(),"{'lgbmclassifier__num_leaves': 5, 'lgbmclassif...",0.434211,0.446172,0.456938,0.467703,0.451256,0.012442,3
9,9.721816,0.497888,8.116082,0.267395,15,5000,8,0.5,Normalizer(),TargetEncoder(),"{'lgbmclassifier__num_leaves': 15, 'lgbmclassi...",0.448565,0.440191,0.447368,0.466507,0.450658,0.009695,4
0,12.528738,1.322527,9.748416,0.72328,5,10000,6,0.3,Normalizer(),TargetEncoder(),"{'lgbmclassifier__num_leaves': 5, 'lgbmclassif...",0.434211,0.454545,0.434211,0.455742,0.444677,0.010475,5


In [50]:
# Five best candidates of whichever search `grid` currently holds (rank 1 = best).
(
    pd.DataFrame(grid.cv_results_)
    .sort_values(by = 'rank_test_score')
    .head(5)
)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_columntransformer__onehotencoder,param_columntransformer__standardscaler,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
44,0.789047,0.151458,0.108159,0.019682,JamesSteinEncoder(),StandardScaler(),{'columntransformer__onehotencoder': JamesStei...,0.572967,0.576555,0.559809,0.563397,0.568182,0.006819,1
43,0.776704,0.090444,0.124026,0.033233,JamesSteinEncoder(),QuantileTransformer(),{'columntransformer__onehotencoder': JamesStei...,0.572967,0.576555,0.559809,0.563397,0.568182,0.006819,1
41,0.897332,0.100427,0.098119,0.014403,JamesSteinEncoder(),MinMaxScaler(),{'columntransformer__onehotencoder': JamesStei...,0.572967,0.576555,0.559809,0.563397,0.568182,0.006819,1
40,0.939258,0.081043,0.126997,0.035658,JamesSteinEncoder(),MaxAbsScaler(),{'columntransformer__onehotencoder': JamesStei...,0.572967,0.576555,0.559809,0.563397,0.568182,0.006819,1
14,1.047245,0.045237,0.133891,0.012376,BinaryEncoder(),StandardScaler(),{'columntransformer__onehotencoder': BinaryEnc...,0.570574,0.568182,0.564593,0.553828,0.564294,0.006407,5


## Próximos passos:

1. Avaliar as métricas comparando com o modelo sem tuning e a baseline

In [68]:
# PIPELINE IDEA: lay out, step by step, what the model pipeline will do
# - Drop a variable
# - Apply the add_feature_null_column function
# - Apply the add_feature_null_qualitative function
# - Apply the add_feature_null_quantitative function
# - Use the encoders on the qualitative features