# Titanic (Machine Learning from Disaster) - Model1-tester


In [1]:
import numpy as np
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.patches as mpatches


from setup_notebook import setup_path
setup_path()
from src.functions import *

from matplotlib.colors import LinearSegmentedColormap
import warnings

warnings.filterwarnings("ignore")

In [2]:
# Carregando os dados
dfo = pd.read_csv("/home/akel/PycharmProjects/Kaggle/Titanic/data/raw/train.csv")
df=dfo.drop(columns='PassengerId')
df.sample(5)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
195,1,1,"Lurette, Miss. Elise",female,58.0,0,0,PC 17569,146.5208,B80,C
442,0,3,"Petterson, Mr. Johan Emil",male,25.0,1,0,347076,7.775,,S
54,0,1,"Ostby, Mr. Engelhart Cornelius",male,65.0,0,1,113509,61.9792,B30,C
317,0,2,"Moraweck, Dr. Ernest",male,54.0,0,0,29011,14.0,,S
261,1,3,"Asplund, Master. Edvin Rojj Felix",male,3.0,4,2,347077,31.3875,,S


### Pre-processamento

In [3]:


#transformação das variaveis

if 'Cabin' in df.columns:
    # Variable HasCabin
    df['HasCabin'] = df['Cabin'].notnull().astype(int)
    # Variable HasCabin 
    df['Deck'] = df['Cabin'].apply(lambda x: x[0] if pd.notnull(x) else 'U')
    df.drop(columns='Cabin', inplace=True)
  
# fill with mode
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0]) 

# Variable Age imputed is Age2
if 'Age' in df.columns:
    coll_age=['Sex','Pclass','HasCabin']
    df['Age2']=df['Age'].copy()
    for i in range(len(coll_age)):
        if df['Age2'].isnull().sum()>0:
            df['Age2'] = df['Age2'].fillna(df.groupby(coll_age[0:3-i])['Age'].transform('median'))
        df['Age2'] = df['Age2'].fillna(df['Age'].median())
    df.drop(columns='Age', inplace=True)



# df['Age_Group'] = pd.cut(df['Age2'], bins=[0, 12, 18, 30, 50, 80], 
#                          labels=['Criança (<12)', 'Adolescente (12-18)', 
#                                  'Adulto Jovem (19-30)', 'Adulto (31-50)', 
#                                  'Idoso (>50)'])
  
# Variable FamilySize
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

drop_cols = ['Name', 'Ticket']
df = df.drop(drop_cols, axis=1)


In [4]:

train = pd.get_dummies(df)
display(train.head(5))
print("=============================")
print("\n Data Ready for Model!")
train

Unnamed: 0,Survived,Pclass,SibSp,Parch,Fare,HasCabin,Age2,FamilySize,Sex_female,Sex_male,...,Embarked_S,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_U
0,0,3,1,0,7.25,0,22.0,2,False,True,...,True,False,False,False,False,False,False,False,False,True
1,1,1,1,0,71.2833,1,38.0,2,True,False,...,False,False,False,True,False,False,False,False,False,False
2,1,3,0,0,7.925,0,26.0,1,True,False,...,True,False,False,False,False,False,False,False,False,True
3,1,1,1,0,53.1,1,35.0,2,True,False,...,True,False,False,True,False,False,False,False,False,False
4,0,3,0,0,8.05,0,35.0,1,False,True,...,True,False,False,False,False,False,False,False,False,True



 Data Ready for Model!


Unnamed: 0,Survived,Pclass,SibSp,Parch,Fare,HasCabin,Age2,FamilySize,Sex_female,Sex_male,...,Embarked_S,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_U
0,0,3,1,0,7.2500,0,22.0,2,False,True,...,True,False,False,False,False,False,False,False,False,True
1,1,1,1,0,71.2833,1,38.0,2,True,False,...,False,False,False,True,False,False,False,False,False,False
2,1,3,0,0,7.9250,0,26.0,1,True,False,...,True,False,False,False,False,False,False,False,False,True
3,1,1,1,0,53.1000,1,35.0,2,True,False,...,True,False,False,True,False,False,False,False,False,False
4,0,3,0,0,8.0500,0,35.0,1,False,True,...,True,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,0,0,13.0000,0,27.0,1,False,True,...,True,False,False,False,False,False,False,False,False,True
887,1,1,0,0,30.0000,1,19.0,1,True,False,...,True,False,True,False,False,False,False,False,False,False
888,0,3,1,2,23.4500,0,21.0,4,True,False,...,True,False,False,False,False,False,False,False,False,True
889,1,1,0,0,30.0000,1,26.0,1,False,True,...,False,False,False,True,False,False,False,False,False,False


In [5]:
# Align Train and Test columns
train_cols = train.columns.tolist()
#test_cols = test.columns.tolist()
features = [c for c in train_cols if c != 'Survived']



print("Columns Aligned! Ready for Training.")


Columns Aligned! Ready for Training.


In [6]:
### Training

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

X = train.drop(columns=['Survived']) 
y = train['Survived']

# Initialize Model
model = RandomForestClassifier(n_estimators=100, max_depth=8, random_state=42)

# Cross Validation Score Check 
cv_scores = cross_val_score(model, X, y, cv=5)
print(f"Average CV Accuracy: {np.mean(cv_scores)*100:.2f}%")

# Fit Model
model.fit(X, y)
print("Model Trained Successfully!")

Average CV Accuracy: 81.93%
Model Trained Successfully!
