In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas chained-assignment
# warnings and scikit-learn convergence warnings that later cells may raise —
# consider narrowing it to specific warning categories.
warnings.filterwarnings("ignore")

In [None]:
# Read the training data from the CSV located one directory above the
# current working directory.
df = pd.read_csv(filepath_or_buffer='../train.csv')

# Show the first five rows as a quick sanity check of the load.
df.head(n=5)

In [None]:
# Select the model inputs (X) and the target (y).
#
# Take an explicit copy so X is an independent frame: later cells modify X,
# and pandas would otherwise flag those writes as operating on a selection
# taken from `df` (SettingWithCopyWarning).
features = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
X = df[features].copy()
y = df['Survived']

In [None]:
# Handle missing values
#
# Age is filled with its median and Embarked with its most frequent value.
# The filled columns are assigned back to X instead of calling
# `X[col].fillna(..., inplace=True)`: the in-place form is chained assignment
# on a temporary Series, which pandas warns about and which, with
# Copy-on-Write enabled (the default from pandas 3.0), leaves X unchanged.
# `assign` returns a new frame, so re-running this cell is safe.
# NOTE(review): the median/mode are computed on the full dataset, before the
# train/test split, so test rows influence the fill values — consider moving
# imputation into the pipeline.
X = X.assign(
    Age=X['Age'].fillna(X['Age'].median()),
    Embarked=X['Embarked'].fillna(X['Embarked'].mode()[0]),
)

In [None]:
# Preprocessing
categorical_cols = ["Sex", "Embarked"]
numerical_cols = ["Pclass", "Age", "SibSp", "Parch", "Fare"]

# Numeric columns are forwarded untouched; categorical columns are one-hot
# encoded. handle_unknown='ignore' makes a category that was absent when the
# encoder was fitted encode as all zeros at predict time instead of raising.
preprocess = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ]
)

In [None]:
# Build the pipeline: the preprocessing step feeds the classifier, so a single
# fit/predict call applies both.
#
# max_iter is raised above the default of 100 because the numeric features
# are passed through unscaled, and the solver can hit the iteration cap
# before converging (the resulting ConvergenceWarning is hidden by the global
# warnings filter set at the top of the notebook).
model = Pipeline(steps=[
    ('preprocess', preprocess),
    ('classifier', LogisticRegression(max_iter=1000)),
])

In [None]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# ratio the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Fit the preprocessing step and the classifier on the training split.
model.fit(X=X_train, y=y_train)


In [None]:
# Predict on the held-out split, then compute each metric before printing.
y_pred = model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

# Printed layout: accuracy, per-class report, then the confusion matrix.
print("Accuracy:", test_accuracy)
print()
print("\nClassification Report:")
print(report_text)
print()
print("Confusion Matrix:")
print(confusion)