In [1]:
import numpy as np
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.patches as mpatches

from sklearn.model_selection import train_test_split


from setup_notebook import setup_path
setup_path()
from src.functions import *

from matplotlib.colors import LinearSegmentedColormap
import warnings

warnings.filterwarnings("ignore")


from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, KFold, cross_validate, cross_val_score,RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.base import clone


from sklearn.base import BaseEstimator, TransformerMixin


In [2]:
# Load the raw Titanic training file and split it into train / test partitions.

# Split configuration
RANDOM_STATE = 42
TEST_SIZE = 0.3
TARGET = 'Survived'

# Raw frame is kept as `dfo`; the working frame drops the passenger identifier column.
dfo = pd.read_csv("/home/akel/PycharmProjects/Kaggle/Titanic/data/raw/train.csv")
df = dfo.drop(columns='PassengerId')

# Features / target
X, y = df.drop(columns=TARGET), df[TARGET]

# Hold-out split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

# Re-number the training rows 0..n-1 so that row labels coincide with row
# positions (the original author flagged this step as vital).
X_train, y_train = X_train.reset_index(drop=True), y_train.reset_index(drop=True)

In [3]:
# Definition of the custom preprocessing transformer
class preprocessador_titanic(BaseEstimator, TransformerMixin):
    """Feature-engineering transformer for the Titanic passenger table.

    ``fit`` learns values from the frame it is given:

    * ``embarked_mode_``     - most frequent ``Embarked`` value (``'S'`` if the
      column has no non-null value);
    * ``global_age_median_`` - median of ``Age`` over all rows;
    * ``age_medians_``       - dict mapping a tuple of grouping columns to the
      per-group median of ``Age``, from the most specific grouping to the
      least specific one;
    * ``dummy_columns_``     - column layout produced by one-hot encoding the
      engineered frame.

    ``transform`` applies them: it derives ``HasCabin``/``Deck`` from ``Cabin``,
    fills ``Embarked``, builds ``Age2`` by hierarchical median imputation of
    ``Age``, adds ``FamilySize``, drops ``Name``/``Ticket``, one-hot encodes and
    finally re-indexes the columns to ``dummy_columns_``.
    """

    # Grouping keys used for age imputation; prefixes of this tuple define the
    # fallback hierarchy (all three keys -> first two -> first one).
    _AGE_GROUP_COLS = ('Sex', 'Pclass', 'HasCabin')

    def __init__(self):
        self.embarked_mode_ = None
        self.age_medians_ = {}
        self.global_age_median_ = None

    @staticmethod
    def _deck_letter(cabin):
        """Return the first character of a cabin code, or 'U' when it is
        missing, empty, or starts with 'T'."""
        if pd.isnull(cabin):
            return 'U'
        text = str(cabin)
        # An empty string has no first character; treat it as unknown instead
        # of raising IndexError.
        if not text or text[0] == 'T':
            return 'U'
        return text[0]

    # =========================
    # FIT
    # =========================
    def fit(self, X, y=None):
        """Learn imputation values and the one-hot column layout from ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain ``SibSp`` and ``Parch``; ``Embarked``, ``Cabin``,
            ``Age``, ``Name`` and ``Ticket`` are used when present.
        y : ignored

        Returns
        -------
        self
        """
        X = X.copy()

        # Start from a clean state so a refit never keeps values learned from
        # a previous call.
        self.embarked_mode_ = None
        self.age_medians_ = {}
        self.global_age_median_ = None

        if 'Embarked' in X.columns:
            # mode() returns a Series; keep only the scalar value
            modes = X['Embarked'].mode()
            self.embarked_mode_ = modes.iloc[0] if not modes.empty else 'S'

        # 1. HasCabin is created first because it is one of the grouping keys
        if 'Cabin' in X.columns:
            X['HasCabin'] = X['Cabin'].notnull().astype(int)

        # 2. Age medians: global, then one Series per grouping level
        if 'Age' in X.columns:
            self.global_age_median_ = X['Age'].median()
            # Only group by keys that exist in this frame (HasCabin is absent
            # when there is no Cabin column).
            group_cols = [c for c in self._AGE_GROUP_COLS if c in X.columns]
            for depth in range(len(group_cols), 0, -1):
                cols = group_cols[:depth]
                self.age_medians_[tuple(cols)] = X.groupby(cols)['Age'].median()

        # 3. Apply the same structural changes transform() makes, in the same
        #    order as before, so dummy_columns_ keeps its column order.
        X['FamilySize'] = X['SibSp'] + X['Parch'] + 1

        if 'Cabin' in X.columns:
            X['Deck'] = X['Cabin'].apply(self._deck_letter)
            X = X.drop(columns='Cabin')

        if 'Age' in X.columns:
            X['Age2'] = X['Age']
            X = X.drop(columns='Age')

        X = X.drop(columns=[c for c in ['Name', 'Ticket'] if c in X.columns])

        # 4. Record the final one-hot layout
        self.dummy_columns_ = pd.get_dummies(X, drop_first=False).columns

        return self

    # =========================
    # TRANSFORM
    # =========================
    def transform(self, X):
        """Return the engineered, one-hot encoded copy of ``X``.

        The result always has exactly the columns stored in ``dummy_columns_``
        (extra dummy columns are dropped, missing ones are filled with 0).
        ``fit`` must have been called first.
        """
        X = X.copy()

        # -----------------------
        # Cabin -> HasCabin + Deck
        # -----------------------
        if 'Cabin' in X.columns:
            X['HasCabin'] = X['Cabin'].notnull().astype(int)
            X['Deck'] = X['Cabin'].apply(self._deck_letter)
            X = X.drop(columns='Cabin')

        # -----------------------
        # Embarked
        # -----------------------
        # embarked_mode_ stays None when fit() saw no Embarked column;
        # fillna(None) would raise, so skip in that case.
        if 'Embarked' in X.columns and self.embarked_mode_ is not None:
            X['Embarked'] = X['Embarked'].fillna(self.embarked_mode_)

        # -----------------------
        # Age -> Age2 (hierarchical imputation)
        # -----------------------
        if 'Age' in X.columns:
            age = X['Age'].copy()

            for cols, medians in self.age_medians_.items():
                if not age.isna().any():
                    break
                keys = list(cols)
                if any(c not in X.columns for c in keys):
                    continue
                # Look the group medians up with a left join on the key
                # columns. This works for both the multi-key levels and the
                # single-key level, whereas mapping tuple keys fails to match
                # the plain (non-tuple) index of the single-key medians.
                looked_up = X[keys].join(
                    medians.rename('_group_median'), on=keys
                )['_group_median']
                age = age.fillna(looked_up)

            X['Age2'] = age.fillna(self.global_age_median_)
            X = X.drop(columns='Age')

        # -----------------------
        # FamilySize
        # -----------------------
        X['FamilySize'] = X['SibSp'] + X['Parch'] + 1

        # -----------------------
        # Drop columns
        # -----------------------
        X = X.drop(columns=[c for c in ['Name', 'Ticket'] if c in X.columns])

        # -----------------------
        # One-hot encoding
        # -----------------------
        X = pd.get_dummies(X, drop_first=False)
        X = X.reindex(columns=self.dummy_columns_, fill_value=0)

        return X

In [7]:
# Manual computation, without a Pipeline: cross-validation loop written by hand,
# then a final model evaluated on the hold-out set.

from sklearn.model_selection import StratifiedKFold

# 1. Same model configuration as the scikit-learn pipeline run
rf_model = RandomForestClassifier(n_estimators=100, max_depth=8, random_state=42)
pp_base = preprocessador_titanic()

# For a classifier and an integer `cv`, cross_val_score uses an unshuffled
# StratifiedKFold, so the same splitter is used here.
skf = StratifiedKFold(n_splits=5, shuffle=False)

scores_manuais = []

# split() needs y_train to stratify the folds; it yields positional indices
for train_index, val_index in skf.split(X_train, y_train):

    X_tr, X_val = X_train.iloc[train_index].copy(), X_train.iloc[val_index].copy()
    # Positional selection: plain y_train[...] would look the indices up as
    # labels, which is only correct when the index happens to be 0..n-1.
    y_tr, y_val = y_train.iloc[train_index], y_train.iloc[val_index]

    # Preprocessing is fitted on the training part of each fold only
    pp_fold = clone(pp_base)
    X_tr_transformed = pp_fold.fit_transform(X_tr)
    X_val_transformed = pp_fold.transform(X_val)

    model_fold = clone(rf_model)
    model_fold.fit(X_tr_transformed, y_tr)

    scores_manuais.append(model_fold.score(X_val_transformed, y_val))

print(f"Average Manual Stratified CV Accuracy: {np.mean(scores_manuais)*100:.2f}%")

# Final model: fit on the whole training set, evaluate on the test set
PP_final = preprocessador_titanic()
X_train_transformed = PP_final.fit_transform(X_train)
X_test_transformed = PP_final.transform(X_test)

# Unfitted copy with the same hyperparameters as rf_model
model_final = clone(rf_model)
model_final.fit(X_train_transformed, y_train)

y_pred = model_final.predict(X_test_transformed)

print(f"{'='*70}")
print(f"üéØ Random Forest Final (Manual)")
print(f"{'='*70}")
print(f"üìä **Acur√°cia no Teste**: {accuracy_score(y_test, y_pred):.6f}")
print(f"\nüìã **Relat√≥rio de Classifica√ß√£o**:")
print(classification_report(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
print(f"üéØ **Matriz de Confus√£o**:")
print(f"                Previsto 0   Previsto 1")
print(f"Real 0          {cm[0,0]:<11} {cm[0,1]:<11}")
print(f"Real 1          {cm[1,0]:<11} {cm[1,1]:<11}")
print(f"{'‚îÄ'*70}")


Average Manual Stratified CV Accuracy: 82.51%
üéØ Random Forest Final (Manual)
üìä **Acur√°cia no Teste**: 0.783582

üìã **Relat√≥rio de Classifica√ß√£o**:
              precision    recall  f1-score   support

           0       0.79      0.86      0.82       157
           1       0.77      0.68      0.72       111

    accuracy                           0.78       268
   macro avg       0.78      0.77      0.77       268
weighted avg       0.78      0.78      0.78       268

üéØ **Matriz de Confus√£o**:
                Previsto 0   Previsto 1
Real 0          135         22         
Real 1          36          75         
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ


In [5]:
# Same experiment inside a scikit-learn Pipeline

# Number of CV folds. The hand-written loop this cell is compared against uses
# 5 stratified folds, and the recorded output of this cell matches that 5-fold
# result; the previous value of 50 did not match either.
N_SPLITS = 5

model_RF = RandomForestClassifier(n_estimators=100, max_depth=8, random_state=42)
pipe = Pipeline([
    ('feature_engineering', preprocessador_titanic()),
    ('model', model_RF)
])

# Cross-validation score check: the whole pipeline is re-fitted on each fold,
# so the preprocessing only sees that fold's training rows.
cv_scores = cross_val_score(pipe, X_train, y_train, cv=N_SPLITS)
print(f"Average CV Accuracy: {np.mean(cv_scores)*100:.2f}%")

# Final model: fit on the whole training set, evaluate on the test set
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

print(f"{'='*70}")
print(f"üéØ Random Forest Padr√£o")
print(f"{'='*70}")
print(f"üìä **Acur√°cia no Teste**: {accuracy_score(y_test, y_pred):.4f}")
print(f"\nüìã **Relat√≥rio de Classifica√ß√£o**:")
print(classification_report(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
print(f"üéØ **Matriz de Confus√£o**:")
print(f"               Previsto 0   Previsto 1")
print(f"Real 0         {cm[0,0]:<11} {cm[0,1]:<11}")
print(f"Real 1         {cm[1,0]:<11} {cm[1,1]:<11}")
print(f"{'‚îÄ'*70}")


Average CV Accuracy: 82.51%
üéØ Random Forest Padr√£o
üìä **Acur√°cia no Teste**: 0.7836

üìã **Relat√≥rio de Classifica√ß√£o**:
              precision    recall  f1-score   support

           0       0.79      0.86      0.82       157
           1       0.77      0.68      0.72       111

    accuracy                           0.78       268
   macro avg       0.78      0.77      0.77       268
weighted avg       0.78      0.78      0.78       268

üéØ **Matriz de Confus√£o**:
               Previsto 0   Previsto 1
Real 0         135         22         
Real 1         36          75         
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
