In [1]:
"""
Titanic Survival Prediction – End-to-End Machine Learning Workflow

This script builds a supervised machine learning model to predict passenger
survival on the Titanic dataset. It follows a standard data science pipeline:
data loading, preprocessing, feature selection, model training, validation,
and final prediction generation.

The model is trained using the labelled training dataset (train.csv), where
missing values are handled, categorical variables are encoded, and a
Random Forest classifier is fitted. Model performance is evaluated using a
hold-out validation split to produce an accuracy metric.

After validation, the trained model is applied to the unlabelled test dataset
(test.csv) to generate survival predictions suitable for downstream analysis
or competition submission (e.g., Kaggle).

Key objectives:
- Demonstrate a clean, reproducible ML workflow
- Avoid data leakage between training and test sets
- Provide interpretable survival predictions based on passenger attributes
"""


'\nTitanic Survival Prediction – End-to-End Machine Learning Workflow\n\nThis script builds a supervised machine learning model to predict passenger\nsurvival on the Titanic dataset. It follows a standard data science pipeline:\ndata loading, preprocessing, feature selection, model training, validation,\nand final prediction generation.\n\nThe model is trained using the labelled training dataset (train.csv), where\nmissing values are handled, categorical variables are encoded, and a\nRandom Forest classifier is fitted. Model performance is evaluated using a\nhold-out validation split to produce an accuracy metric.\n\nAfter validation, the trained model is applied to the unlabelled test dataset\n(test.csv) to generate survival predictions suitable for downstream analysis\nor competition submission (e.g., Kaggle).\n\nKey objectives:\n- Demonstrate a clean, reproducible ML workflow\n- Avoid data leakage between training and test sets\n- Provide interpretable survival predictions based on 

In [2]:
import os
from pathlib import Path

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [3]:
# ---------- Load TRAIN data (has Survived) ----------
# The data directory can be overridden with the TITANIC_DATA_DIR environment
# variable so the notebook is runnable on other machines; when the variable is
# unset the original location is used.
DATA_DIR = Path(os.environ.get('TITANIC_DATA_DIR', r'C:\Users\Carlos\Documents\TitanicML'))
train_data = pd.read_csv(DATA_DIR / 'train.csv')


In [4]:
# Basic preprocessing (train)
# Encode Sex only while the column still holds the raw 'male'/'female' labels.
# Re-running this cell on already-encoded values would otherwise map every
# entry to NaN, because 0 and 1 are not keys of the mapping.
if not pd.api.types.is_numeric_dtype(train_data['Sex']):
    train_data['Sex'] = train_data['Sex'].map({'male': 0, 'female': 1})
# Fill missing Age / Fare values with the column mean.
# NOTE(review): these means are computed over every row of train_data, i.e.
# before the train/validation split, so validation rows influence the imputed
# values — consider imputing after the split.
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].mean())
train_data['Fare'] = train_data['Fare'].fillna(train_data['Fare'].mean())

In [5]:
# Columns fed to the model, and the label column it learns to predict.
features = [
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
]
target = 'Survived'

In [6]:

# Feature matrix and label vector taken from the preprocessed training frame.
X, y = train_data[features], train_data[target]

In [7]:
# Hold out 20% of the rows for evaluation; stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, stratify=y, random_state=42, test_size=0.2)
X_train, X_valid, y_train, y_valid = split_parts


In [8]:
# Fit a 100-tree random forest on the training part of the split.
forest_params = {'n_estimators': 100, 'random_state': 42}
model = RandomForestClassifier(**forest_params)
model.fit(X_train, y_train)

In [9]:
# Tally how many passengers fall into each outcome class.
survival_counts = train_data['Survived'].value_counts()

print("Survival counts:")
print(survival_counts)

print("\nReadable format:")
# .get(..., 0) keeps the report working even if one class is absent.
for outcome_label, outcome_code in (("Did not survive", 0), ("Survived", 1)):
    print(f"{outcome_label} ({outcome_code}): {survival_counts.get(outcome_code, 0)}")


Survival counts:
Survived
0    549
1    342
Name: count, dtype: int64

Readable format:
Did not survive (0): 549
Survived (1): 342


In [10]:
# Share of each outcome class, expressed as a percentage of all rows.
class_shares = train_data['Survived'].value_counts(normalize=True)
survival_percent = class_shares.mul(100)

print("\nSurvival percentages:")
print(survival_percent.round(decimals=2))




Survival percentages:
Survived
0    61.62
1    38.38
Name: proportion, dtype: float64


In [11]:
# Score the fitted model on the held-out validation rows, which were not used
# for fitting, so the figure is an out-of-sample accuracy.
y_pred_valid = model.predict(X_valid)
accuracy = accuracy_score(y_true=y_valid, y_pred=y_pred_valid)
print(f'Validation Accuracy: {accuracy:.4f}')

Validation Accuracy: 0.8101
