# Titanic Survival Prediction
This notebook uses XGBoost to predict survival on the Titanic dataset, with feature engineering and preprocessing pipelines.

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

## Load and preprocess data

In [4]:
# Load Titanic dataset
data = pd.read_csv("titanic/train.csv")

# Feature Engineering
data["FamilySize"] = data["SibSp"] + data["Parch"]
data["IsAlone"] = (data["FamilySize"] == 0).astype(int)
data["Title"] = data["Name"].str.extract(" ([A-Za-z]+)\\.", expand=False)
data["Title"] = data["Title"].replace(
    ['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 
     'Sir', 'Jonkheer', 'Dona'], 'Rare')
data["Title"] = data["Title"].replace(['Mlle', 'Ms'], 'Miss')
data["Title"] = data["Title"].replace('Mme', 'Mrs')

# Drop columns not needed
data.drop(["PassengerId", "Name", "Ticket", "Cabin"], axis=1, inplace=True)

## Split data

In [5]:
# Split features and target
y = data["Survived"]
X = data.drop("Survived", axis=1)

# Split into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

## Build preprocessing pipeline

In [6]:
categorical_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]
numerical_cols = [col for col in X_train.columns if X_train[col].dtype in ['int64', 'float64']]

# Define transformers
numerical_transformer = SimpleImputer(strategy='mean')
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
])

## Define and train model

In [7]:
# Create full pipeline with XGBoost
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(n_estimators=100, learning_rate=0.1,
                                 use_label_encoder=False, eval_metric='logloss',
                                 random_state=42))
])

# Train model
model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


## Evaluate accuracy

In [8]:
# Predict and evaluate
predictions = model.predict(X_valid)
accuracy = accuracy_score(y_valid, predictions)
print("Validation Accuracy:", accuracy)

Validation Accuracy: 0.8268156424581006


## Cross-validation

In [9]:
cv_scores = cross_val_score(model, X, y, cv=5)
print("Cross-validated Accuracy:", cv_scores.mean())

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Cross-validated Accuracy: 0.8350197727700708
