# 🚢 Titanic - Complete Solution

**Goal**: Predict survival on the Titanic (Binary Classification)

**Metric**: Accuracy

---

## 📦 Setup

In [None]:
import sys
sys.path.append('../../..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

# Our shared utilities
from shared.utils import set_seed
from shared.auto_eda import quick_eda

# Fix the random seed through the shared helper
# (presumably seeds Python/NumPy RNGs — confirm in shared.utils).
set_seed(42)
# Apply the 'seaborn-v0_8-whitegrid' style sheet to every figure in this notebook.
plt.style.use('seaborn-v0_8-whitegrid')

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

: 

## 📂 Load Data

In [None]:
# Read the raw competition files from the local data directory.
test = pd.read_csv('data/raw/test.csv')
train = pd.read_csv('data/raw/train.csv')

# Keep the test passenger IDs; the submission file is keyed on them.
test_ids = test['PassengerId']

# Report (rows, columns) for both frames.
print(f"Train: {train.shape}")
print(f"Test: {test.shape}")

In [None]:
# Show the first rows of the training frame (rendered as the cell's output).
train.head()

In [None]:
# Print the training frame's column summary to spot columns with missing values.
train.info()

## 🔍 Quick Automated EDA

In [None]:
# Run the shared automated-EDA helper on the training frame, with 'Survived'
# as the target column. The result is kept in `report`; its structure is
# defined by shared.auto_eda.quick_eda and is not used elsewhere in this notebook.
report = quick_eda(train, target_col='Survived')

## 📊 Survival Analysis

In [None]:
# 2x3 overview grid: class balance, survival rate by sex / class / port,
# and the age and fare distributions.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# Survival count.
# value_counts() orders by frequency, not by label, so the hard-coded
# 'Died'/'Survived' tick labels and red/green colors would be swapped if
# survivors were ever the majority. Pin the order to class 0 then class 1.
survival_counts = train['Survived'].value_counts().reindex([0, 1], fill_value=0)
survival_counts.plot(kind='bar', ax=axes[0,0], color=['#ff6b6b', '#51cf66'])
axes[0,0].set_title('Survival Count')
axes[0,0].set_xticklabels(['Died', 'Survived'], rotation=0)

# By Sex (groupby sorts the keys, so the bar order follows the sorted Sex values)
train.groupby('Sex')['Survived'].mean().plot(kind='bar', ax=axes[0,1], color=['#ff6b6b', '#51cf66'])
axes[0,1].set_title('Survival Rate by Sex')
axes[0,1].set_ylabel('Survival Rate')
axes[0,1].tick_params(rotation=0)

# By Pclass
train.groupby('Pclass')['Survived'].mean().plot(kind='bar', ax=axes[0,2], color='steelblue')
axes[0,2].set_title('Survival Rate by Class')
axes[0,2].set_ylabel('Survival Rate')
axes[0,2].tick_params(rotation=0)

# Age distribution, survivors and non-survivors overlaid on the same axes
train[train['Survived']==1]['Age'].hist(ax=axes[1,0], bins=30, alpha=0.7, label='Survived', color='#51cf66')
train[train['Survived']==0]['Age'].hist(ax=axes[1,0], bins=30, alpha=0.7, label='Died', color='#ff6b6b')
axes[1,0].set_title('Age Distribution by Survival')
axes[1,0].legend()

# By Embarked (rows with a missing port are dropped by groupby)
train.groupby('Embarked')['Survived'].mean().plot(kind='bar', ax=axes[1,1], color='steelblue')
axes[1,1].set_title('Survival Rate by Embarked')
axes[1,1].tick_params(rotation=0)

# Fare distribution
train['Fare'].hist(ax=axes[1,2], bins=50, color='steelblue')
axes[1,2].set_title('Fare Distribution')

plt.tight_layout()
plt.show()

## 🧹 Feature Engineering

In [None]:
def engineer_features(df):
    """Build model features for a Titanic passenger frame.

    Works on a copy, so the caller's frame is left untouched.

    Args:
        df: DataFrame containing the raw columns ``Name``, ``SibSp``,
            ``Parch``, ``Age``, ``Fare``, ``Cabin``, ``Embarked`` and ``Sex``.

    Returns:
        A new DataFrame with the input columns (``Age``, ``Fare`` and
        ``Embarked`` imputed; ``Sex`` recoded to 1 for ``'male'`` and 0
        otherwise) plus the derived columns ``Title``, ``FamilySize``,
        ``IsAlone``, ``AgeGroup``, ``IsChild``, ``FarePerPerson``,
        ``FareBin``, ``HasCabin`` and ``Deck``.

    Note:
        Medians and fare quantiles are computed from ``df`` itself, so calling
        this separately on two frames uses different statistics for each.
    """
    df = df.copy()

    # --- Extract Title from Name ---
    # expand=False returns a Series for the single capture group instead of
    # a one-column DataFrame.
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

    # Fold spelling variants into the four common titles. Every other title
    # (Rev, Dr, Col, ...) and any name without a title becomes 'Rare'.
    common_titles = {
        'Mr': 'Mr', 'Miss': 'Miss', 'Mrs': 'Mrs', 'Master': 'Master',
        'Mlle': 'Miss', 'Mme': 'Mrs', 'Ms': 'Miss',
    }
    df['Title'] = df['Title'].map(common_titles).fillna('Rare')

    # --- Family Features ---
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1  # +1 counts the passenger
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

    # --- Age Groups ---
    # Impute missing ages with the median age of passengers sharing the title;
    # fall back to the overall median when a title group has no known age.
    df['Age'] = df.groupby('Title')['Age'].transform(lambda x: x.fillna(x.median()))
    df['Age'] = df['Age'].fillna(df['Age'].median())

    df['AgeGroup'] = pd.cut(df['Age'], bins=[0, 12, 18, 35, 60, 100],
                            labels=['Child', 'Teen', 'Adult', 'Middle', 'Senior'])
    # NOTE: strict '<' here, while the 'Child' bin above is (0, 12]; a
    # 12-year-old is AgeGroup 'Child' but IsChild == 0.
    df['IsChild'] = (df['Age'] < 12).astype(int)

    # --- Fare ---
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    df['FarePerPerson'] = df['Fare'] / df['FamilySize']

    # duplicates='drop' can leave fewer than four bins when quantile edges
    # coincide (many identical fares); passing all four labels then raises
    # ValueError. Probe the surviving edges first and trim the labels to match.
    fare_labels = ['Low', 'Mid', 'High', 'VeryHigh']
    _, fare_edges = pd.qcut(df['Fare'], 4, labels=False, retbins=True, duplicates='drop')
    df['FareBin'] = pd.qcut(df['Fare'], 4, labels=fare_labels[:len(fare_edges) - 1],
                            duplicates='drop')

    # --- Cabin ---
    df['HasCabin'] = df['Cabin'].notna().astype(int)
    df['Deck'] = df['Cabin'].str[0].fillna('Unknown')  # first character of the cabin code

    # --- Embarked ---
    df['Embarked'] = df['Embarked'].fillna('S')  # Most common

    # --- Sex ---
    df['Sex'] = (df['Sex'] == 'male').astype(int)

    return df

In [None]:
# Run the same feature pipeline over both splits.
# NOTE: each call derives its own medians and fare quantiles from the frame
# it receives, so train and test are processed independently.
train_fe, test_fe = map(engineer_features, (train, test))

# Report the resulting shape and the columns the pipeline added.
print(f"Train shape after FE: {train_fe.shape}")
print(f"New features: {[c for c in train_fe.columns if c not in train.columns]}")

## 🔧 Prepare Final Features

In [None]:
# Columns fed to the models unchanged
FEATURES = [
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
    'FamilySize', 'IsAlone', 'IsChild', 'HasCabin', 'FarePerPerson'
]

# Columns that are label-encoded to integer codes first
CAT_FEATURES = ['Title', 'Embarked', 'AgeGroup', 'FareBin', 'Deck']

# Fit each encoder on train + test together so a category that appears in
# only one split still gets a code, then encode both splits in place.
all_data = pd.concat([train_fe, test_fe], ignore_index=True)

for col in CAT_FEATURES:
    le = LabelEncoder()
    all_data[col] = le.fit_transform(all_data[col].astype(str))
    for frame in (train_fe, test_fe):
        frame[col] = le.transform(frame[col].astype(str))

ALL_FEATURES = FEATURES + CAT_FEATURES

# Design matrices and target
y = train_fe['Survived']
X = train_fe[ALL_FEATURES]
X_test = test_fe[ALL_FEATURES]

print(f"Final features: {len(ALL_FEATURES)}")
print(f"X: {X.shape}, y: {y.shape}, X_test: {X_test.shape}")

## 🤖 Model Training

In [None]:
# Shared CV splitter: 5 stratified folds, shuffled with a fixed seed so every
# candidate is scored on the same splits.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Candidate models
models = {
    'Logistic Regression': LogisticRegression(
        max_iter=1000, random_state=42,
    ),
    'Random Forest': RandomForestClassifier(
        n_estimators=200, max_depth=5, random_state=42,
    ),
    'Gradient Boosting': GradientBoostingClassifier(
        n_estimators=100, max_depth=4, random_state=42,
    ),
}

# Score each candidate and keep the per-fold accuracies for plotting.
results = {}
for name, model in models.items():
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv)
    results[name] = scores
    print(f"{name:25s} | CV Accuracy: {scores.mean():.4f} (+/- {scores.std():.4f})")

In [None]:
# Box plot of the per-fold accuracies, one box per model.
cv_scores = list(results.values())
model_names = list(results.keys())

plt.figure(figsize=(10, 5))
plt.boxplot(cv_scores, labels=model_names)
plt.ylabel('Accuracy')
plt.title('Model Comparison (5-Fold CV)')
plt.ylim(0.75, 0.90)  # fixed window; scores outside it are not shown
plt.show()

## 🎯 Train Final Model & Predict

In [None]:
# Final model: gradient boosting, refit on the full training set.
# NOTE: this uses n_estimators=150, whereas the cross-validated Gradient
# Boosting candidate used 100, so the CV score is only indicative.
final_model = GradientBoostingClassifier(
    n_estimators=150,
    max_depth=4,
    random_state=42,
)
final_model.fit(X, y)

# Rank features by the fitted model's importances, largest first.
importance = pd.DataFrame(
    {'feature': ALL_FEATURES, 'importance': final_model.feature_importances_}
)
importance = importance.sort_values(by='importance', ascending=False)

# Horizontal bars; the y-axis is flipped so the top-ranked feature is on top.
plt.figure(figsize=(10, 6))
plt.barh(importance['feature'], importance['importance'], color='steelblue')
plt.title('Feature Importance')
plt.xlabel('Importance')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

In [None]:
# Predict a survival label for every row of the test feature matrix
# using the model fitted on the full training set.
predictions = final_model.predict(X_test)

# Sanity output: number of predictions and the share predicted as survivors
# (the mean of the predicted labels, formatted as a percentage).
print(f"Predictions shape: {predictions.shape}")
print(f"Survival rate: {predictions.mean():.2%}")

## 📤 Create Submission

In [None]:
# Pair each saved test PassengerId with its predicted label.
submission = pd.DataFrame({'PassengerId': test_ids, 'Survived': predictions})

# Check the frame against the expected layout before writing it out.
print(f"Submission shape: {submission.shape}")
print(f"Expected: (418, 2)")
print(f"\nColumns: {submission.columns.tolist()}")

# Preview the first ten rows as the cell output.
submission.head(10)

In [None]:
# Save submission
import os

# Single source of truth for the output location (previously repeated in
# three string literals).
submission_path = 'submissions/submission.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)

submission.to_csv(submission_path, index=False)
# The status glyphs below were garbled by a wrong text encoding; restored
# to the intended characters.
print(f"✅ Submission saved to {submission_path}")

# Quick sanity check: read the file back and summarise what was written.
check = pd.read_csv(submission_path)
print("\n📊 Sanity Check:")
print(f"   Rows: {len(check)} (expected 418)")
print(f"   Columns: {check.columns.tolist()}")
print(f"   Survived 0s: {(check['Survived']==0).sum()}")
print(f"   Survived 1s: {(check['Survived']==1).sum()}")

---

## 🎉 Done!

Your submission file is ready at `submissions/submission.csv`.

**To submit:**
1. Go to [kaggle.com/c/titanic](https://www.kaggle.com/c/titanic)
2. Click "Submit Predictions"
3. Upload `submissions/submission.csv`

**Expected accuracy: ~78-80%** (Top 20% on leaderboard)

### 💡 Ideas to Improve
- Try ensemble of multiple models
- Add more feature engineering (ticket prefix, name length, etc.)
- Use LightGBM or XGBoost
- Tune hyperparameters with GridSearchCV