In [33]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd

In [11]:
# Load the attrition dataset and, in the same step, discard the two columns
# that are not used as features: "Unnamed: 0" and "ID".
df = pd.read_csv("base_atrito.csv").drop(columns=["Unnamed: 0", "ID"])

In [12]:
# Feature matrix: every column except the target.
X = df.drop(columns="Deixou a empresa")
# Target vector: the "Deixou a empresa" column.
y = df["Deixou a empresa"]

In [15]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [13]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer


In [16]:
# Split the feature names by dtype so each group gets its own transformer.
#
# "number" matches every numeric dtype (int32, float32, uint8, ...), not only
# int64/float64. This matters because the ColumnTransformer built from these
# lists uses remainder='drop': a numeric column left out of the list would be
# silently discarded instead of being scaled and fed to the model.
colunas_numericas = X_train.select_dtypes(include="number").columns.tolist()
# Text columns (object dtype) are treated as categorical features.
colunas_categoricas = X_train.select_dtypes(include=["object"]).columns.tolist()

In [None]:
# Build the preprocessor: each column group is routed to its own transformer.
preprocessor = ColumnTransformer(
    [
        # Numeric features are standardized.
        ("num", StandardScaler(), colunas_numericas),
        # Categorical features are one-hot encoded; categories not seen during
        # fit are ignored at transform time instead of raising.
        ("cat", OneHotEncoder(handle_unknown="ignore"), colunas_categoricas),
    ]
)

In [19]:
# Full model: preprocessing followed by a random forest with a fixed seed.
pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(random_state=42)),
    ]
)

In [20]:
# Fit the preprocessing + classifier pipeline on the training split only.
# As the last expression of the cell, the fitted pipeline is displayed below.
pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [21]:
# Predict the target for the held-out test split.
y_pred = pipeline.predict(X_test)


In [23]:
# Per-class precision / recall / F1 on the test split. print() is used so the
# text report is rendered with its line breaks.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.77      0.76      7804
           1       0.74      0.73      0.74      7096

    accuracy                           0.75     14900
   macro avg       0.75      0.75      0.75     14900
weighted avg       0.75      0.75      0.75     14900



In [26]:
import joblib

In [27]:
# Persist the fitted pipeline (preprocessing + model) to "modelo_atrito.pkl".
# NOTE(review): joblib files are pickle-based; only load them from trusted sources.
joblib.dump(pipeline, "modelo_atrito.pkl")

['modelo_atrito.pkl']

In [28]:
# Adding more preprocessing steps: missing-value imputation before scaling/encoding

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Split the feature names by dtype
colunas_numericas = X.select_dtypes(include=["number"]).columns.tolist()
colunas_categoricas = X.select_dtypes(include=["object"]).columns.tolist()

# Numeric branch: fill missing values with the column mean, then standardize
pipeline_num = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: fill missing values with the placeholder "-1", then
# one-hot encode (categories unseen during fit are ignored at transform time)
pipeline_cat = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="constant", fill_value="-1")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group to its branch
preprocessador = ColumnTransformer(
    [
        ("num", pipeline_num, colunas_numericas),
        ("cat", pipeline_cat, colunas_categoricas),
    ]
)

In [31]:
# Assemble the model: imputing preprocessor + a random forest limited to
# depth 10 with 100 trees. This rebinds the name `pipeline`.
pipeline = Pipeline(
    [
        ("preprocessamento", preprocessador),
        ("modelo", RandomForestClassifier(random_state=42, max_depth=10, n_estimators=100)),
    ]
)

In [32]:
# Train the pipeline on the training split; the fitted pipeline is displayed
# as the cell output.
pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessamento', ...), ('modelo', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'-1'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [46]:
# Adding more preprocessing steps: gender is ordinal-encoded separately from the other categorical columns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Split the feature names by dtype
colunas_numericas = X.select_dtypes(include=["number"]).columns.tolist()
colunas_categoricas = X.select_dtypes(include=["object"]).columns.tolist()

# Gender is handled on its own; all remaining categorical columns go together
coluna_genero = ['Genero']
colunas_categoricas_sem_genero = [
    nome for nome in colunas_categoricas if nome not in coluna_genero
]

# Numeric branch: fill missing values with the column mean, then standardize
pipeline_num = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Gender branch: fill missing values with the placeholder "-1", then map
# 'Masculino' -> 0 and 'Feminino' -> 1. Any other value (including the
# placeholder) is encoded as -1.
pipeline_genero = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="constant", fill_value="-1")),
        (
            "encoder",
            OrdinalEncoder(
                categories=[['Masculino', 'Feminino']],
                handle_unknown='use_encoded_value',
                unknown_value=-1,
            ),
        ),
    ]
)

# Branch for the other categorical columns: fill missing values with "-1",
# then one-hot encode (categories unseen during fit are ignored)
pipeline_outras = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="constant", fill_value="-1")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Combine the three branches into a single preprocessor
preprocessador = ColumnTransformer(
    [
        ("num", pipeline_num, colunas_numericas),
        ("genero", pipeline_genero, coluna_genero),
        ("cat_outras", pipeline_outras, colunas_categoricas_sem_genero),
    ]
)

In [47]:
# Assemble the model: three-branch preprocessor + a random forest limited to
# depth 10 with 100 trees. This rebinds the name `pipeline`.
pipeline = Pipeline(
    [
        ("preprocessamento", preprocessador),
        ("modelo", RandomForestClassifier(random_state=42, max_depth=10, n_estimators=100)),
    ]
)

In [48]:
# Train the pipeline on the training split; the fitted pipeline is displayed
# as the cell output.
pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessamento', ...), ('modelo', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('genero', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'-1'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,"[['Masculino', 'Feminino']]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'-1'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True
