In [0]:
import pandas as pd

In [0]:
## Load the training set from a hard-coded absolute path.
## NOTE(review): '/content' looks like a Colab working directory - adjust the path when running elsewhere.
df = pd.read_csv('/content/train.csv')

In [0]:
## Normalise every column label to lower case; later cells refer to the lower-case names
df.columns = list(map(str.lower, df.columns))

In [8]:
## Preview the first rows of the frame after the column rename
df.head()

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [0]:
## Feature matrix: the seven predictor columns fed to the models
x = df[['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']]
## Target vector: the survival label
y = df['survived']

In [13]:
## Count missing values in the target
y.isna().sum()

0

In [14]:
## Count missing values per feature column
x.isna().sum()

pclass        0
sex           0
age         177
sibsp         0
parch         0
fare          0
embarked      2
dtype: int64

In [0]:
## Data preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

## Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier

## Model evaluation
from sklearn.model_selection import cross_val_score, GridSearchCV

## Automating preprocessing and training
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [0]:
## Split the feature names into two groups; each group is routed to its own preprocessing pipeline

cat_features = ['pclass','sex','embarked']
num_features = ['age','sibsp','parch','fare']

In [0]:
## Preprocessing steps for each feature group

## Categorical: fill gaps with the most frequent value, then one-hot encode
steps_cat = [
    ('imputer_categorica', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder()),
]
## Numeric: fill gaps with the column mean
steps_num = [
    ('imputer_numericas', SimpleImputer(strategy='mean')),
]

In [0]:
## Wrap each list of steps in its own Pipeline
pipe_cat = Pipeline(steps=steps_cat)
pipe_num = Pipeline(steps=steps_num)

In [0]:
## Route each column group to its pipeline; every entry is (name, transformer, columns)
transformers = [
    ('cat', pipe_cat, cat_features),
    ('num', pipe_num, num_features),
]
col_transformer = ColumnTransformer(transformers=transformers)

In [35]:
## Display the configured ColumnTransformer
col_transformer

ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('cat',
                                 Pipeline(memory=None,
                                          steps=[('imputer_categorica',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='most_frequent',
                                                                verbose=0)),
                                                 ('ohe',
                                                  OneHotEncoder(categories='auto',
                                                                drop=None,
           

In [37]:
## Fit the preprocessing on x and count the missing values left in each output column
pd.DataFrame(col_transformer.fit_transform(x)).isna().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
dtype: int64

In [38]:
## Show the transformed feature matrix as a DataFrame (this refits the transformer on x)
pd.DataFrame(col_transformer.fit_transform(x))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,22.000000,1.0,0.0,7.2500
1,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,38.000000,1.0,0.0,71.2833
2,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,26.000000,0.0,0.0,7.9250
3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,35.000000,1.0,0.0,53.1000
4,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,35.000000,0.0,0.0,8.0500
...,...,...,...,...,...,...,...,...,...,...,...,...
886,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,27.000000,0.0,0.0,13.0000
887,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,19.000000,0.0,0.0,30.0000
888,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,29.699118,1.0,2.0,23.4500
889,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,26.000000,0.0,0.0,30.0000


In [0]:
## Full modelling pipeline: preprocessing followed by a random forest.
## random_state is pinned so that repeated runs of the notebook produce the same scores;
## without it every cross-validation run trains a differently-seeded forest.
pipe_final = Pipeline(steps=[
    ('pre_processing', col_transformer),
    ('random_forest', RandomForestClassifier(n_estimators=1000, random_state=42)),
])

In [41]:
## List the pipeline's parameters, including the nested ones of each step
pipe_final.get_params()

{'memory': None,
 'pre_processing': ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                   transformer_weights=None,
                   transformers=[('cat',
                                  Pipeline(memory=None,
                                           steps=[('imputer_categorica',
                                                   SimpleImputer(add_indicator=False,
                                                                 copy=True,
                                                                 fill_value=None,
                                                                 missing_values=nan,
                                                                 strategy='most_frequent',
                                                                 verbose=0)),
                                                  ('ohe',
                                                   OneHotEncoder(categories='auto',
                                      

In [0]:
def compare_models(model1,model2,model3,model4,x,y,cv):
  """Print the mean cross-validation score of four estimators.

  Each model is scored with ``cross_val_score`` on the same ``x``, ``y`` and
  ``cv`` setting, in the order given, and the mean of its fold scores is
  printed on a line labelled ``Model 1`` .. ``Model 4``. Nothing is returned.
  """
  print('-- COMPARANDO MODELOS --')
  for position, estimator in enumerate((model1, model2, model3, model4), start=1):
    mean_score = cross_val_score(estimator, X=x, y=y, cv=cv).mean()
    print(f'Model {position}: {mean_score}')

In [45]:
## Class balance of the target, as proportions
df.survived.value_counts(normalize=True)

0    0.616162
1    0.383838
Name: survived, dtype: float64

In [0]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier  # used below; it was never imported, so a fresh kernel raised NameError

## Baseline that always predicts the most frequent class
dummy = DummyClassifier(strategy='most_frequent')
## Same preprocessing as pipe_final with a different final estimator.
## The final step is named after the estimator it actually holds (it used to be
## labelled 'random_forest' in both pipelines), so nested parameter names are accurate.
pipe_final_DT = Pipeline(steps=[('pre_processing', col_transformer), ('decision_tree', DecisionTreeClassifier())])
pipe_final_SVC = Pipeline(steps=[('pre_processing', col_transformer), ('svc', SVC())])

In [64]:
## Score the baseline and the three pipelines (random forest, decision tree, SVC) with cv=10
compare_models(dummy,pipe_final,pipe_final_DT,pipe_final_SVC,x,y,10)

-- COMPARANDO MODELOS --
Model 1: 0.616167290886392
Model 2: 0.8070411985018726
Model 3: 0.7878901373283396
Model 4: 0.6813233458177278


In [65]:
## List the pipeline's parameter names again; the grid-search keys in the next cell use these names
pipe_final.get_params()

{'memory': None,
 'pre_processing': ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                   transformer_weights=None,
                   transformers=[('cat',
                                  Pipeline(memory=None,
                                           steps=[('imputer_categorica',
                                                   SimpleImputer(add_indicator=False,
                                                                 copy=True,
                                                                 fill_value=None,
                                                                 missing_values=nan,
                                                                 strategy='most_frequent',
                                                                 verbose=0)),
                                                  ('ohe',
                                                   OneHotEncoder(categories='auto',
                                      

In [75]:
## Parameter grid for the random-forest step of pipe_final; keys use the
## '<step name>__<parameter>' form.
## A stray 'random_fores' literal was removed here: Python joins adjacent string
## literals, so the key became 'random_foresrandom_forest__random_state', which is
## not a parameter of the pipeline and makes the search fail.
params = {
    'random_forest__n_estimators': [100, 200, 300, 400, 500, 600, 700],
    'random_forest__random_state': [42],  # single fixed seed so candidates differ only in forest size
}

## Search over the grid with cv=2
grid = GridSearchCV(pipe_final, param_grid=params, cv=2)
grid.fit(x, y)

GridSearchCV(cv=2, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('pre_processing',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('cat',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer_categorica',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                       

In [76]:
## Parameter combination that scored best in the search
grid.best_params_

{'random_forest__n_estimators': 100, 'random_forest__random_state': 42}

In [77]:
## Cross-validation results of the search as a table, one row per parameter combination
pd.DataFrame(grid.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_random_forest__n_estimators,param_random_forest__random_state,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score
0,0.174622,0.004543,0.022013,0.004759,100,42,"{'random_forest__n_estimators': 100, 'random_f...",0.789238,0.782022,0.78563,0.003608,1
1,0.322905,0.006252,0.031207,0.000675,200,42,"{'random_forest__n_estimators': 200, 'random_f...",0.778027,0.779775,0.778901,0.000874,7
2,0.469691,0.014143,0.043386,0.001609,300,42,"{'random_forest__n_estimators': 300, 'random_f...",0.782511,0.777528,0.78002,0.002492,6
3,0.621233,0.005715,0.053981,0.002457,400,42,"{'random_forest__n_estimators': 400, 'random_f...",0.780269,0.782022,0.781146,0.000877,4
4,0.767691,0.002557,0.068966,0.000397,500,42,"{'random_forest__n_estimators': 500, 'random_f...",0.782511,0.782022,0.782267,0.000244,2
5,0.91698,0.013592,0.080565,0.000642,600,42,"{'random_forest__n_estimators': 600, 'random_f...",0.784753,0.779775,0.782264,0.002489,3
6,1.063825,0.010071,0.091144,0.000596,700,42,"{'random_forest__n_estimators': 700, 'random_f...",0.782511,0.779775,0.781143,0.001368,5
