In [48]:
################################################################################
# Jupyter Notebook: Titanic Survival Prediction using XGBoost (Updated)
################################################################################

# In[1]: Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import xgboost as xgb
import warnings

# OPTIONAL: to hide the FutureWarning related to __sklearn_tags__
warnings.filterwarnings('ignore', category=FutureWarning)

In [49]:
################################################################################
# In[2]: Load the data
################################################################################
# Direct-download link to the Titanic CSV.
url = 'https://drive.google.com/uc?export=download&id=1Oytm0kGCmWsydZrRvCyK_cOE2WfnaVIA'

# Column labels applied to the loaded frame. Because header=0 is passed
# together with names=..., the file's own header row is replaced by these.
titanic_col_names = [
    'PassengerID', 'Survived', 'Pclass', 'Name',
    'Sex', 'Age', 'SibSp', 'Parch',
    'Ticket', 'Fare', 'Embarked',
]

titanic = pd.read_csv(url, names=titanic_col_names, header=0)

# Quick sanity check: dimensions first, then a preview as the cell's output.
print("Shape of the dataset:", titanic.shape)
titanic.head()

Shape of the dataset: (711, 11)


Unnamed: 0,PassengerID,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,159,0,3,"Smiljanic, Mr. Mile",male,25.0,0,0,315037,8.6625,S
1,344,0,2,"Sedgwick, Mr. Charles Frederick Waddington",male,25.0,0,0,244361,13.0,S
2,224,0,3,"Nenkoff, Mr. Christo",male,25.0,0,0,349234,7.8958,S
3,532,0,3,"Toufik, Mr. Nakli",male,25.0,0,0,2641,7.2292,C
4,760,1,1,"Rothes, the Countess. of (Lucy Noel Martha Dye...",female,33.0,0,0,110152,86.5,S


In [50]:
################################################################################
# In[3]: Data Cleaning & Feature Engineering
################################################################################
# This cell updates `titanic` in place. The label encodings are guarded so that
# running the cell a second time leaves the already-encoded columns alone;
# without the guard, Series.map would turn every (now numeric) value into NaN.


def _encode_labels(series, mapping, fill=None):
    """Return `series` with its labels replaced by the integers in `mapping`.

    A series that is already numeric is returned unchanged, which makes the
    encoding safe to re-run. Missing labels are replaced by `fill` first when
    one is given.

    Raises:
        ValueError: if a non-missing label has no entry in `mapping` (instead
            of letting Series.map convert it to NaN silently).
    """
    if pd.api.types.is_numeric_dtype(series):
        return series  # already encoded by a previous run of this cell
    if fill is not None:
        series = series.fillna(fill)
    encoded = series.map(mapping)
    # Labels that were present before mapping but are NaN afterwards had no
    # entry in `mapping`.
    unknown = series[encoded.isnull() & series.notnull()].unique()
    if len(unknown) > 0:
        raise ValueError(f"Unmapped labels in '{series.name}': {list(unknown)}")
    return encoded


# 1. Check for missing values
print("\nMissing values before cleaning:")
print(titanic.isnull().sum())

# 2. Fill missing 'Age' with median
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())

# 3. Fill missing 'Embarked' with 'S', then
# 5. convert 'Embarked' to numeric labels (S=0, C=1, Q=2)
embarked_map = {'S': 0, 'C': 1, 'Q': 2}
titanic['Embarked'] = _encode_labels(titanic['Embarked'], embarked_map, fill='S')

# 4. Convert 'Sex' to numeric (male=1, female=0)
titanic['Sex'] = _encode_labels(titanic['Sex'], {'male': 1, 'female': 0})

# 6. Fill any remaining missing 'Fare' if present (using median)
titanic['Fare'] = titanic['Fare'].fillna(titanic['Fare'].median())

print("\nMissing values after cleaning:")
print(titanic.isnull().sum())


Missing values before cleaning:
PassengerID    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

Missing values after cleaning:
PassengerID    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64


In [51]:
################################################################################
# In[4]: Prepare features (X) and target (y)
################################################################################
# Columns fed to the model; 'Survived' is the label to predict.
features = [
    'Pclass', 'Sex', 'Age',
    'SibSp', 'Parch', 'Fare',
    'Embarked',
]
X = titanic.loc[:, features]
y = titanic.loc[:, 'Survived']

# Report the dimensions of both pieces.
print("\nFeature matrix shape:", X.shape)
print("Target vector shape:", y.shape)


Feature matrix shape: (711, 7)
Target vector shape: (711,)


In [52]:
################################################################################
# In[5]: Train-test split
################################################################################
# Hold out 20% of the rows for testing. stratify=y is passed so the split is
# stratified on the target, and random_state fixes the shuffle.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

print("\nTraining set size:", len(X_train))
print("Test set size:", len(X_test))


Training set size: 568
Test set size: 143


In [53]:
################################################################################
# In[6]: Build and train an XGBoost model
################################################################################

# Classifier settings, collected in one place ('use_label_encoder' is
# deliberately not passed).
xgb_params = dict(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
    random_state=42,
    eval_metric='logloss',  # log loss is a suitable metric for a binary target
)
model = xgb.XGBClassifier(**xgb_params)

# Train on the training split only.
model.fit(X_train, y_train)

# Print the fitted estimator explicitly rather than relying on Jupyter's
# automatic display of the last expression.
print("Training complete. Model:\n", model)

Training complete. Model:
 XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=0.1, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=3,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=100,
              n_jobs=None, num_parallel_tree=None, random_state=42, ...)


In [54]:
################################################################################
# In[7]: Make predictions on the test set
################################################################################
# Predict class labels for the held-out rows, then score them against the
# true labels.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"\nAccuracy on the test set: {accuracy:.4f}")


Accuracy on the test set: 0.8322


In [55]:
################################################################################
# In[8]: Determine the number of predicted survivors and their PassengerIDs
################################################################################
# X_test.index holds index *labels* inherited from `titanic`, so the matching
# rows must be looked up by label with .loc. Positional .iloc only gives the
# same rows when the index happens to be 0..n-1; with any other index it would
# pick the wrong passengers or raise.
test_df = titanic.loc[X_test.index].copy()

# y_pred is a positional array in X_test row order, which is the order the
# .loc lookup above preserves.
test_df['PredictedSurvived'] = y_pred

# Predictions are 0/1, so their sum is the number of predicted survivors.
num_survivors = test_df['PredictedSurvived'].sum()
print(f"\nNumber of survivors predicted in the test set: {num_survivors}")

survived_passenger_ids = test_df.loc[test_df['PredictedSurvived'] == 1, 'PassengerID'].values
print("\nPassenger IDs predicted to survive:")
print(survived_passenger_ids)


Number of survivors predicted in the test set: 42

Passenger IDs predicted to survive:
[646 428 742   2 212 682 368 584 850 152 872 582 547 507 671 781 600 709
 397 319 594 408 497 140 388 330 721 778 727 485 586 400 474 349 185 642
 555 141 316 760 242 577]


In [56]:
# Mean of the 0/1 'Survived' flag within each passenger class, i.e. the
# survival rate per class.
survival_by_class = titanic['Survived'].groupby(titanic['Pclass']).mean()
print("Survival rate by passenger class:\n", survival_by_class)


Survival rate by passenger class:
 Pclass
1    0.614943
2    0.496599
3    0.230769
Name: Survived, dtype: float64


In [57]:
# Median fare paid, split by survival outcome (0 = did not survive, 1 = survived).
fare_stats = titanic['Fare'].groupby(titanic['Survived']).median()
print("\nMedian fare for survivors vs. nonsurvivors:\n", fare_stats)



Median fare for survivors vs. nonsurvivors:
 Survived
0    10.5
1    26.0
Name: Fare, dtype: float64


In [58]:
# Correlation between the fare paid and the 0/1 survival flag
# (Series.corr with its default method).
fare_col, survived_col = titanic['Fare'], titanic['Survived']
correlation = fare_col.corr(survived_col)
print("\nCorrelation between Fare and Survived:", correlation)



Correlation between Fare and Survived: 0.2409307787340386
