"""
Titanic Survival Prediction Portfolio Project
Author: YI LUO
Date: 20250315

Introduction:
This portfolio project demonstrates an end-to-end data analysis and machine learning workflow 
using the Titanic dataset from Kaggle (https://www.kaggle.com/competitions/titanic/data). 
The main objective is to predict whether a passenger survived the Titanic disaster.

Key steps and findings:
1. Data Preprocessing:
   - Handled missing values: Age filled with median values (grouped by Pclass and Sex), Fare with median, and Embarked with mode.
   - Encoded categorical variables: Sex mapped to a binary value (male=1, female=0) and Embarked converted into one-hot encoded features.
   - Extracted new features from raw data: 
     • Title: Extracted from Name to capture social status and gender cues.
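     • Deck: Derived from the first character of Ticket (letter prefixes are kept, all other tickets are grouped as 'X'); this ticket-prefix category is used as a rough stand-in for the deck level.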
     • FamilySize: Constructed as a weighted combination of SibSp and Parch, then log-transformed for smoothing.
     • FareBin and AgeBin: Discretized versions of Fare and Age to capture non-linear effects.
   
2. Exploratory Data Analysis (EDA):
   - Visualized distributions of key features (e.g., AgeBin, FareBin) and examined survival rates across different passenger classes.
   - Insights revealed that gender, class, and title are strong predictors of survival.

3. Model Building:
   - Built a 6-layer Deep Neural Network (DNN) using TensorFlow/Keras.
   - Integrated BatchNormalization and Dropout layers to stabilize training and prevent overfitting.
   - Employed EarlyStopping and ModelCheckpoint callbacks to capture the best model.
   - The model was trained using normalized data and an 80/20 train-validation split.

4. Evaluation and Submission:
   - The DNN achieved competitive validation accuracy (around 81.5% on the validation set).
   - Final predictions on the test set were generated and formatted as per Kaggle requirements for submission.

This project highlights a comprehensive data science approach—from data cleaning and feature engineering to deep learning model training—demonstrating my ability to tackle real-world predictive modeling challenges. This work is an integral part of my portfolio, showcasing the techniques and insights that I can bring to a data science role.
"""

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')

# Read the Kaggle train/test CSV files into DataFrames (train first, then test)
df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/Users/luoyi/Desktop/10_kaggle/01_titanic/train.csv",
        "/Users/luoyi/Desktop/10_kaggle/01_titanic/test.csv",
    )
)

# Keep the test PassengerId column aside: it is needed to build the submission
# file, while the preprocessing step removes it from the feature table
test_passenger_ids = test_df['PassengerId']

# Sanity check: dimensions and first rows of the training data
print("Training data shape:", df.shape)
print(df.head())

In [None]:
# ---------------------------
# 1. Fill Missing Values
# ---------------------------
# Age: replace missing values with the median age of passengers that share the
# same Pclass and Sex. Train and test are imputed separately, each from its
# own group medians.
df['Age'] = df.groupby(['Pclass', 'Sex'])['Age'].transform(lambda x: x.fillna(x.median()))
test_df['Age'] = test_df.groupby(['Pclass', 'Sex'])['Age'].transform(lambda x: x.fillna(x.median()))

# NOTE: the filled column is assigned back explicitly below. Calling
# `frame[col].fillna(..., inplace=True)` operates on a temporary selection
# (chained assignment); pandas 2.x warns about it and under Copy-on-Write the
# original frame is left unchanged. The blanket warnings filter at the top of
# the file would hide that warning.

# Fare: fill missing values in the test data with the test-set median
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].median())

# Embarked: fill missing values with the mode (most common value) of each set
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
test_df['Embarked'] = test_df['Embarked'].fillna(test_df['Embarked'].mode()[0])

# ---------------------------
# 2. Feature Extraction & Transformation
# ---------------------------
# 2.1 Extract Title from Name and map rare titles
# The title is the run of letters that follows a space and precedes a period
# (e.g. " Mr."). A raw string is used so that `\.` reaches the regex engine
# unchanged instead of being an invalid Python string escape.
title_pattern = r' ([A-Za-z]+)\.'
df['Title'] = df['Name'].str.extract(title_pattern, expand=False)
test_df['Title'] = test_df['Name'].str.extract(title_pattern, expand=False)

# Collapse spelling variants into the four common titles and everything
# uncommon into a single 'Rare' bucket.
title_mapping = {
    'Mr': 'Mr', 'Miss': 'Miss', 'Mrs': 'Mrs', 'Master': 'Master',
    'Dr': 'Rare', 'Rev': 'Rare', 'Col': 'Rare', 'Major': 'Rare',
    'Mlle': 'Miss', 'Countess': 'Rare', 'Ms': 'Miss', 'Lady': 'Rare',
    'Jonkheer': 'Rare', 'Don': 'Rare', 'Dona': 'Rare', 'Mme': 'Mrs',
    'Capt': 'Rare', 'Sir': 'Rare'
}
# A title missing from the mapping (or a name without a title) would map to
# NaN and silently receive an all-zero encoding; treat it as 'Rare' instead.
df['Title'] = df['Title'].map(title_mapping).fillna('Rare')
test_df['Title'] = test_df['Title'].map(title_mapping).fillna('Rare')

# One-Hot encode Title for both train and test data.
# Both frames share the category list taken from the training data, so
# `drop_first=True` removes the same baseline column in each and the test
# frame gets exactly the training dummy columns. Encoding each frame from its
# own values could drop a different baseline when the observed titles differ.
title_categories = sorted(df['Title'].unique())
df['Title'] = pd.Categorical(df['Title'], categories=title_categories)
test_df['Title'] = pd.Categorical(test_df['Title'], categories=title_categories)
df = pd.get_dummies(df, columns=['Title'], drop_first=True)
test_df = pd.get_dummies(test_df, columns=['Title'], drop_first=True)

# 2.2 Encode Sex as a number: 'male' -> 1, 'female' -> 0 (same mapping for both frames)
sex_codes = {'male': 1, 'female': 0}
for frame in (df, test_df):
    frame['Sex'] = frame['Sex'].map(sex_codes)

# 2.3 Extract Deck from Ticket
# NOTE: despite its name, this feature is the first character of the ticket
# string: letters are kept as-is, anything else (e.g. purely numeric tickets)
# is grouped under 'X'.
def _ticket_prefix(ticket):
    """Return the first character of the ticket if it is a letter, else 'X'."""
    # Slicing (instead of indexing) keeps an empty ticket string from raising.
    first_char = str(ticket)[:1]
    return first_char if first_char.isalpha() else 'X'


df['Deck'] = df['Ticket'].apply(_ticket_prefix)
test_df['Deck'] = test_df['Ticket'].apply(_ticket_prefix)

# One-hot encode with the category list observed in the training data so that
# `drop_first=True` removes the same baseline column in both frames. A prefix
# that occurs only in the test data has no training column and is encoded as
# all zeros.
deck_categories = sorted(df['Deck'].unique())
df['Deck'] = pd.Categorical(df['Deck'], categories=deck_categories)
test_df['Deck'] = pd.Categorical(test_df['Deck'], categories=deck_categories)
df = pd.get_dummies(df, columns=['Deck'], drop_first=True)
test_df = pd.get_dummies(test_df, columns=['Deck'], drop_first=True)

# 2.4 FamilySize: weighted head-count (3 per sibling/spouse, 2 per parent/child,
# plus 1), passed through log1p to compress large families
for frame in (df, test_df):
    weighted_size = frame['SibSp'] * 3 + frame['Parch'] * 2 + 1
    frame['FamilySize'] = np.log1p(weighted_size)

# 2.5 One-Hot encode Embarked
# The category list comes from the training data and is shared by both
# frames, so `drop_first=True` removes the same baseline port in each and the
# test frame gets exactly the training dummy columns. Encoding each frame from
# its own values could drop a different baseline when the observed ports differ.
embarked_categories = sorted(df['Embarked'].dropna().unique())
df['Embarked'] = pd.Categorical(df['Embarked'], categories=embarked_categories)
test_df['Embarked'] = pd.Categorical(test_df['Embarked'], categories=embarked_categories)
df = pd.get_dummies(df, columns=['Embarked'], drop_first=True)
test_df = pd.get_dummies(test_df, columns=['Embarked'], drop_first=True)

# 2.6 Create FareBin: Discretize Fare into 5 quantile bins
# The quantile edges are learned from the training fares only and then reused
# for the test fares, so a given FareBin value covers the same fare range in
# both sets. Binning the test set with its own quantiles would shift the bin
# boundaries between train and test.
fare_labels = [1, 2, 3, 4, 5]
train_fare_bins, fare_edges = pd.qcut(df['Fare'], 5, labels=fare_labels, retbins=True)
df['FareBin'] = train_fare_bins.astype(int)

# Open the outer edges so test fares below the training minimum or above the
# training maximum still fall into the first/last bin instead of becoming NaN.
fare_edges[0], fare_edges[-1] = -np.inf, np.inf
test_df['FareBin'] = pd.cut(test_df['Fare'], bins=fare_edges, labels=fare_labels).astype(int)

# 2.7 AgeBin: bucket Age into 5 fixed ranges, labelled 1..5
# (edges 0-12-20-40-60-100; the same edges are applied to train and test)
age_edges = [0, 12, 20, 40, 60, 100]
age_labels = [1, 2, 3, 4, 5]
for frame in (df, test_df):
    age_bucket = pd.cut(frame['Age'], bins=age_edges, labels=age_labels)
    frame['AgeBin'] = age_bucket.astype(int)

# ---------------------------
# 3. Drop Unnecessary Columns
# ---------------------------
# Columns removed from both frames, and why:
#   Name        -> Title has already been extracted from it
#   Ticket      -> already used for the Deck feature
#   Cabin       -> too many missing values
#   PassengerId -> not predictive
#   SibSp/Parch -> replaced by FamilySize
#   Fare/Age    -> replaced by FareBin / AgeBin
drop_cols = [
    'Name', 'Ticket', 'Cabin', 'PassengerId',
    'SibSp', 'Parch', 'Fare', 'Age',
]
for frame in (df, test_df):
    frame.drop(columns=drop_cols, inplace=True)

# ---------------------------
# 4. Ensure Consistency between Train and Test
# ---------------------------
# The test frame must expose exactly the training feature columns
# (everything except the Survived target), in the same order.
feature_columns = df.columns.drop('Survived')

# Any training feature absent from the test frame is added as a constant 0
for absent_col in feature_columns.difference(test_df.columns):
    test_df[absent_col] = 0

# Reorder to the training layout; columns that exist only in test are dropped here
test_df = test_df[feature_columns]

# Report remaining missing values per column in the processed data
print("Train missing values:\n", df.drop(columns=['Survived']).isnull().sum())
print("Test missing values:\n", test_df.isnull().sum())

In [None]:
# Import visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Side-by-side count plots for the two binned features
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
bin_panels = [
    ('AgeBin', "Distribution of Age Bins", "Age Bin"),
    ('FareBin', "Distribution of Fare Bins", "Fare Bin"),
]
for ax, (column, panel_title, x_label) in zip(axes, bin_panels):
    sns.countplot(x=column, data=df, ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel(x_label)
    ax.set_ylabel("Count")

plt.tight_layout()
plt.show()

# Bar plot of Survived against Pclass (seaborn's barplot aggregates the
# Survived values per class, which yields the survival rate)
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x='Pclass', y='Survived', data=df, ax=ax)
ax.set_title("Survival Rate by Pclass")
ax.set_ylabel("Survival Rate")
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Separate features and target
X = df.drop(columns=['Survived'])
y = df['Survived']

# Split the training data into train and validation sets (80% train, 20% validation).
# The split happens BEFORE scaling so that the validation rows do not
# contribute to the scaler's mean/variance; fitting the scaler on all rows
# first would leak validation statistics into preprocessing.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Apply standard scaling (DNN performs better with normalized data).
# The scaler is fitted on the training portion only and then applied
# unchanged to the validation rows and to the Kaggle test set.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)
X_test_scaled = scaler.transform(test_df)

# Scaled copy of the full training feature table, kept so that code expecting
# the `X_scaled` name continues to work.
X_scaled = scaler.transform(X)

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Build a 6-layer DNN model (five hidden Dense layers + one sigmoid output layer)
# Each entry: (units, add BatchNormalization after the layer?, dropout rate or None)
hidden_spec = [
    (256, True, 0.3),
    (128, True, 0.3),
    (64, True, None),
    (32, False, 0.2),
    (16, False, 0.2),
]

# The input width equals the number of feature columns in the training matrix
network_layers = [keras.Input(shape=(X_train.shape[1],))]
for units, use_batch_norm, dropout_rate in hidden_spec:
    network_layers.append(layers.Dense(units, activation='relu'))
    if use_batch_norm:
        network_layers.append(layers.BatchNormalization())
    if dropout_rate is not None:
        network_layers.append(layers.Dropout(dropout_rate))

# Single sigmoid unit: output layer for binary classification
network_layers.append(layers.Dense(1, activation='sigmoid'))

model = keras.Sequential(network_layers)

# Adam optimizer with an explicit learning rate
optimizer = keras.optimizers.Adam(learning_rate=0.001)

# Compile with binary cross-entropy loss and accuracy as the reported metric
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Callbacks used during training.
# EarlyStopping watches the validation loss (patience of 15 epochs, best
# weights restored); ModelCheckpoint writes the model to 'best_dnn_model.keras'
# whenever the validation accuracy improves. Note the two callbacks monitor
# different metrics.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=15,
    restore_best_weights=True,
)
checkpoint = keras.callbacks.ModelCheckpoint(
    'best_dnn_model.keras',
    monitor='val_accuracy',
    save_best_only=True,
)
callbacks = [early_stopping, checkpoint]

# Fit for at most 150 epochs with mini-batches of 16, validating on the held-out split
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=150,
    batch_size=16,
    callbacks=callbacks,
    verbose=1,
)

# Report accuracy on the training and validation splits
train_loss, train_acc = model.evaluate(X_train, y_train, verbose=0)
val_loss, val_acc = model.evaluate(X_val, y_val, verbose=0)
print(f"Train Accuracy: {train_acc:.4f}")
print(f"Validation Accuracy: {val_acc:.4f}")

# Learning curves: loss on the left panel, accuracy on the right panel,
# each showing the training and validation series recorded by model.fit
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
curve_panels = [
    ('loss', 'val_loss', 'Train Loss', 'Validation Loss', "Loss over Epochs", "Loss"),
    ('accuracy', 'val_accuracy', 'Train Accuracy', 'Validation Accuracy', "Accuracy over Epochs", "Accuracy"),
]
for ax, (train_key, val_key, train_label, val_label, panel_title, y_label) in zip(axes, curve_panels):
    ax.plot(history.history[train_key], label=train_label)
    ax.plot(history.history[val_key], label=val_label)
    ax.set_title(panel_title)
    ax.set_xlabel("Epoch")
    ax.set_ylabel(y_label)
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Score the scaled test features, then threshold the sigmoid outputs at 0.5
# to obtain hard 0/1 survival labels
survival_prob = model.predict(X_test_scaled)
test_preds = (survival_prob > 0.5).astype(int)

# Assemble the two-column submission table: the saved passenger ids next to
# the flattened predictions
submission = pd.DataFrame(
    {
        'PassengerId': test_passenger_ids,
        'Survived': test_preds.flatten(),
    }
)

# Write the CSV without the index column and show a preview
submission.to_csv('/Users/luoyi/Desktop/10_kaggle/01_titanic/submission_dnn.csv', index=False)
print("Submission file generated!")
print(submission.head())