In [5]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Carregar o dataset
url = "https://github.com/cassiusf/datasets/raw/refs/heads/main/titanic_data.csv"
df = pd.read_csv(url)

# 1.1. Eliminar as variáveis “PassengerId”, “Name”, “Ticket” e “Cabin”
df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"], inplace=True)

# 1.2. Eliminar observações com valores ausentes
df.dropna(inplace=True)

# 1.3. Aplicar LabelEncoder em "Embarked" e "Sex"
label_encoder = LabelEncoder()
for col in ["Embarked", "Sex"]:
    df[col] = label_encoder.fit_transform(df[col])

# 1.4. Separar o dataframe em Treino (80%) e Teste (20%)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Saída
print(f"Tamanho do treino: {train_df.shape}")
print(f"Tamanho do teste: {test_df.shape}")
print(train_df.head())


Tamanho do treino: (569, 8)
Tamanho do teste: (143, 8)
     Survived  Pclass  Sex   Age  SibSp  Parch     Fare  Embarked
472         1       2    0  33.0      1      2  27.7500         2
432         1       2    0  42.0      1      0  26.0000         2
666         0       2    1  25.0      0      0  13.0000         2
30          0       1    1  40.0      0      0  27.7208         0
291         1       1    0  19.0      1      0  91.0792         0


In [7]:
X = df.drop(columns=["Survived"])
y = df["Survived"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
#2. Treinar o modelo de Árvore de Decisão
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

#2.1 Matriz de confusão
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()

#2.3 e 2.4 Métricas
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Matriz de Confusão:")
print(cm)
print(f"\nTrue Negative (TN): {tn}")
print(f"False Positive (FP): {fp}")
print(f"False Negative (FN): {fn}")
print(f"True Positive (TP): {tp}")

print(f"\nAcurácia: {accuracy:.4f}")
print(f"Precisão (Precision): {precision:.4f}")
print(f"Revocação (Recall): {recall:.4f}")
print(f"F1-score: {f1:.4f}")

Matriz de Confusão:
[[63 17]
 [26 37]]

True Negative (TN): 63
False Positive (FP): 17
False Negative (FN): 26
True Positive (TP): 37

Acurácia: 0.6993
Precisão (Precision): 0.6852
Revocação (Recall): 0.5873
F1-score: 0.6325


In [None]:
# 3. Função para treinar e avaliar o modelo
def avaliar_arvore(max_depth=None):
    model = DecisionTreeClassifier(max_depth=max_depth, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()
    metrics = {
        "max_depth": max_depth,
        "TN": tn,
        "FP": fp,
        "FN": fn,
        "TP": tp,
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred)
    }
    return metrics

# 3.Avaliar profundidades
resultados = []
for depth in [None, 3, 5]:  # None = default (sem limite)
    resultados.append(avaliar_arvore(depth))

df_resultados = pd.DataFrame(resultados)
# 4. Apresente as visualizações da árvore original e das duas árvores criadas na questão 3.
print(df_resultados)

   max_depth  TN  FP  FN  TP  accuracy  precision    recall        f1
0        NaN  63  17  25  38  0.706294   0.690909  0.603175  0.644068
1        3.0  69  11  25  38  0.748252   0.775510  0.603175  0.678571
2        5.0  76   4  33  30  0.741259   0.882353  0.476190  0.618557
