# Titanic: Machine Learning from Disaster

## Competition Goal
Predict which passengers survived the Titanic shipwreck using machine learning.

## Approach
1. Data Loading and Exploration
2. Missing Value Analysis and Imputation
3. Feature Engineering
4. Model Training with Cross-Validation
5. Final Predictions

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
# NOTE: this seeds only NumPy's global RNG; the CV splitter and the estimators
# created later in the notebook are given their own explicit random_state.
np.random.seed(42)

# Plotting settings
# NOTE(review): the 'seaborn-v0_8-*' style names presumably require
# matplotlib >= 3.6 -- confirm against the pinned environment.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

## 1. Data Loading and Exploration

In [None]:
# Load datasets
# Both CSV files are read via relative paths, i.e. from the current working
# directory of the kernel.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

print("Train shape:", train.shape)
print("Test shape:", test.shape)
print("\nFirst few rows:")
# Last expression of the cell: the notebook renders it as the cell output.
train.head()

In [None]:
# Data info
print("\n=== Train Data Info ===")
# info() prints its summary itself (column names, non-null counts, dtypes).
train.info()
print("\n=== Basic Statistics ===")
# describe() is the cell's last expression, so the notebook displays its result.
train.describe()

In [None]:
# Check survival rate
print("\nSurvival Rate:")
print(train['Survived'].value_counts(normalize=True))

# Visualize survival
# value_counts() orders its result by frequency, not by class label. The tick
# labels and colours below are hard-coded in class order (0 first, then 1), so
# sort by index to guarantee the bars/wedges line up with them whatever the
# class balance happens to be.
survival_counts = train['Survived'].value_counts().sort_index()
survival_share = train['Survived'].value_counts(normalize=True).sort_index()
class_colors = ['salmon', 'lightblue']  # index 0 -> did not survive, 1 -> survived

fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: absolute counts per class
survival_counts.plot(kind='bar', ax=axes[0], color=class_colors)
axes[0].set_title('Survival Counts')
axes[0].set_xlabel('Survived (0=No, 1=Yes)')
axes[0].set_ylabel('Count')
axes[0].set_xticklabels(['Did not survive', 'Survived'], rotation=0)

# Right panel: class shares as a pie chart
survival_share.plot(kind='pie', ax=axes[1], autopct='%1.1f%%', colors=class_colors)
axes[1].set_ylabel('')
axes[1].set_title('Survival Rate')
plt.tight_layout()
plt.show()

## 2. Missing Value Analysis

In [None]:
# Check missing values
def check_missing(df, name):
    """Print and return a per-column summary of missing values in ``df``.

    Args:
        df: DataFrame to inspect.
        name: Label used in the printed header.

    Returns:
        DataFrame indexed by column name with a ``Total`` column (number of
        nulls) and a ``Percent`` column (nulls as a percentage of the row
        count). Only columns with at least one null are kept, sorted by
        ``Total`` in descending order.
    """
    print(f"\n=== Missing Values in {name} ===")
    null_counts = df.isnull().sum()
    summary = pd.DataFrame({
        'Total': null_counts,
        'Percent': 100 * null_counts / len(df),
    })
    has_nulls = summary['Total'] > 0
    summary = summary.loc[has_nulls].sort_values('Total', ascending=False)
    print(summary)
    return summary

# Summarise missing values for both datasets; the returned tables are kept
# because the next cell plots their 'Percent' column.
train_missing = check_missing(train, 'Train')
test_missing = check_missing(test, 'Test')

In [None]:
# Visualize missing data patterns: one horizontal bar chart per dataset
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (summary table, target axis, bar colour, panel title)
missing_panels = [
    (train_missing, axes[0], 'coral', 'Missing Values in Train Set (%)'),
    (test_missing, axes[1], 'skyblue', 'Missing Values in Test Set (%)'),
]

for table, panel_ax, bar_color, panel_title in missing_panels:
    # Nothing to draw when a dataset has no missing values; its axis stays blank
    if table.empty:
        continue
    table['Percent'].plot(kind='barh', ax=panel_ax, color=bar_color)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Percentage')

plt.tight_layout()
plt.show()

## 3. Feature Engineering

We'll create new features to improve model performance:
- **Title**: Extract from Name (Mr, Mrs, Miss, Master, etc.)
- **FamilySize**: SibSp + Parch + 1
- **IsAlone**: Whether traveling alone
- **Deck**: Extract from Cabin
- **AgeGroup**: Binned age categories
- **FareGroup**: Binned fare categories

In [None]:
# Apply the same feature engineering to train and test.
# The two frames are not concatenated: each one is modified in place through
# this list, so later cells can keep using `train` and `test` directly.
full_data = [train, test]

for dataset in full_data:
    # Extract Title from Name: the run of letters that follows a space and is
    # terminated by a '.', e.g. "Surname, Mr. First" -> "Mr".
    # The pattern is a raw string so that '\.' reaches the regex engine as an
    # escaped dot instead of being an invalid Python string escape.
    dataset['Title'] = dataset['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

    # Create FamilySize: siblings/spouses + parents/children + the passenger
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

    # Create IsAlone: 1 when the passenger has no family aboard, else 0
    dataset['IsAlone'] = (dataset['FamilySize'] == 1).astype('int64')

    # Extract Deck from Cabin: first character of the cabin string,
    # 'Unknown' when no cabin is recorded
    dataset['Deck'] = dataset['Cabin'].str[0].fillna('Unknown')

print("Title distribution in train:")
print(train['Title'].value_counts())

In [None]:
# Group rare titles and normalise spelling variants with a single lookup table
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
title_map = {rare: 'Rare' for rare in rare_titles}
# Variants are folded into the common title they stand for
title_map.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

for dataset in full_data:
    dataset['Title'] = dataset['Title'].replace(title_map)

print("\nTitle distribution after grouping:")
print(train['Title'].value_counts())

# Visualize survival by title (bar height = mean of Survived per title)
plt.figure(figsize=(10, 5))
sns.barplot(data=train, x='Title', y='Survived', errorbar=None)
plt.title('Survival Rate by Title')
plt.ylabel('Survival Rate')
plt.show()

In [None]:
# Imputation statistics are computed once, from the training set only, and
# before any value is filled in. Both datasets then receive exactly the same
# fill values, independent of the order in which they are processed, and the
# statistics are not recomputed on every loop iteration.
age_medians = {
    (pclass, sex): train.loc[(train['Pclass'] == pclass) & (train['Sex'] == sex), 'Age'].median()
    for pclass in [1, 2, 3]
    for sex in ['male', 'female']
}
embarked_mode = train['Embarked'].mode()[0]
fare_median = train['Fare'].median()

for dataset in full_data:
    # Fill missing Age with the training median of the passenger's
    # (Pclass, Sex) group
    for (pclass, sex), median_age in age_medians.items():
        in_group = (dataset['Pclass'] == pclass) & (dataset['Sex'] == sex)
        dataset.loc[dataset['Age'].isnull() & in_group, 'Age'] = median_age

    # Fill missing Embarked with the most frequent training value
    dataset['Embarked'] = dataset['Embarked'].fillna(embarked_mode)

    # Fill missing Fare with the training median
    dataset['Fare'] = dataset['Fare'].fillna(fare_median)

print("\nMissing values after imputation:")
print(train[['Age', 'Embarked', 'Fare']].isnull().sum())

In [None]:
# Create Age and Fare bins
age_bins = [0, 12, 18, 35, 60, 100]
age_labels = ['Child', 'Teen', 'Adult', 'Middle', 'Senior']
fare_labels = ['Low', 'Medium', 'High', 'VeryHigh']

# Fare quartile edges are learned from the training set only and then applied
# to every dataset. Running qcut separately on train and test would give each
# set its own edges, so the same label (e.g. 'Low') would cover different fare
# ranges at training and prediction time.
_, fare_bins = pd.qcut(train['Fare'], q=4, retbins=True, duplicates='drop')
# Open the outer edges so fares outside the training range still get a bin
fare_bins[0], fare_bins[-1] = -np.inf, np.inf

for dataset in full_data:
    # Age groups (right-closed intervals: (0, 12], (12, 18], ...)
    dataset['AgeGroup'] = pd.cut(dataset['Age'], bins=age_bins, labels=age_labels)

    # Fare groups (training-set quartiles)
    dataset['FareGroup'] = pd.cut(dataset['Fare'], bins=fare_bins, labels=fare_labels)

print("\nAge Group distribution:")
print(train['AgeGroup'].value_counts())
print("\nFare Group distribution:")
print(train['FareGroup'].value_counts())

## 4. Exploratory Data Analysis

In [None]:
# Survival by key features: a 2x3 grid, one bar chart of mean survival per feature
fig, axes = plt.subplots(2, 3, figsize=(16, 10))

# (column to group by, panel title), listed in row-major grid order
survival_panels = [
    ('Sex', 'Survival by Sex'),
    ('Pclass', 'Survival by Passenger Class'),
    ('Embarked', 'Survival by Embarkation Port'),
    ('FamilySize', 'Survival by Family Size'),
    ('AgeGroup', 'Survival by Age Group'),
    ('IsAlone', 'Survival by Alone Status'),
]

for panel_ax, (column, panel_title) in zip(axes.flat, survival_panels):
    sns.barplot(data=train, x=column, y='Survived', ax=panel_ax, errorbar=None)
    panel_ax.set_title(panel_title)

# Panel-specific tweaks
axes[1, 1].tick_params(axis='x', rotation=45)          # AgeGroup
axes[1, 2].set_xticklabels(['With family', 'Alone'])   # IsAlone: 0 / 1

plt.tight_layout()
plt.show()

In [None]:
# Correlation heatmap
# Encode categorical variables for correlation analysis
# Work on a copy so the integer codes do not overwrite the real columns of `train`.
train_encoded = train.copy()
cat_cols = ['Sex', 'Embarked', 'Title', 'AgeGroup', 'FareGroup']
for col in cat_cols:
    # .codes replaces each category with an integer code.
    # NOTE(review): for unordered columns such as Embarked or Title the codes
    # have no numeric meaning, so their correlation values are only indicative.
    train_encoded[col] = pd.Categorical(train_encoded[col]).codes

# Select numeric columns
# (includes the categorical columns that were integer-encoded just above)
numeric_cols = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'FamilySize', 'IsAlone', 'Title', 'AgeGroup', 'FareGroup']
correlation_matrix = train_encoded[numeric_cols].corr()

plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0, square=True)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

# Rank every column by its correlation with the target (the target itself is included)
print("\nCorrelation with Survival (sorted):")
print(correlation_matrix['Survived'].sort_values(ascending=False))

## 5. Data Preprocessing

In [None]:
# Select features for modeling
feature_cols = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'Title', 'FamilySize', 'IsAlone', 'AgeGroup', 'FareGroup']

# One-hot encode categorical variables.
# Train and test are encoded together and split again afterwards. Encoding
# them separately with drop_first=True lets each frame pick its own dropped
# baseline category, so a category missing from one frame would silently give
# the same dummy column a different meaning in train and test.
n_train = len(train)
combined = pd.concat([train[feature_cols], test[feature_cols]], axis=0)
encoded = pd.get_dummies(combined, drop_first=True)

# Rows were stacked train-first, so a positional split restores both sets
# with identical columns in identical order.
X_train = encoded.iloc[:n_train]
X_test = encoded.iloc[n_train:]
y_train = train['Survived']

print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)
print("\nFeatures:")
print(X_train.columns.tolist())

## 6. Model Training and Cross-Validation

We'll compare three models:
1. Logistic Regression (baseline)
2. Random Forest
3. Gradient Boosting

Using StratifiedKFold cross-validation to ensure robust performance estimates.

In [None]:
# Setup cross-validation
# Shuffled, stratified 5-fold split with a fixed seed; the same `cv` object is
# also passed to the grid searches later in the notebook.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define models
# Hyperparameters are library defaults here, apart from the fixed seed and
# LogisticRegression's max_iter=1000.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}

# Evaluate models
# `results` maps model name -> array of per-fold accuracy scores; it is reused
# by the boxplot and best-model cells below.
results = {}
for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')
    results[name] = scores
    print(f"{name}:")
    print(f"  Mean CV Accuracy: {scores.mean():.4f} (+/- {scores.std():.4f})")
    print(f"  Individual fold scores: {scores}")
    print()

In [None]:
# Visualize cross-validation results: one box of per-fold accuracies per model
fig, ax = plt.subplots(figsize=(10, 6))
model_names = list(models)
fold_scores = [results[model_name] for model_name in model_names]
bp = ax.boxplot(fold_scores, labels=model_names, patch_artist=True)

# patch_artist=True makes the boxes fillable; give each model its own colour
box_colors = ['lightblue', 'lightgreen', 'lightcoral']
for box, fill in zip(bp['boxes'], box_colors):
    box.set_facecolor(fill)

ax.set_ylabel('Accuracy')
ax.set_title('Model Comparison - Cross-Validation Scores')
ax.grid(axis='y', alpha=0.3)
plt.xticks(rotation=15)
plt.tight_layout()
plt.show()

## 7. Hyperparameter Tuning for Best Model

In [None]:
# Select best performing model for hyperparameter tuning
# NOTE: best_model_name is only reported here; the code below tunes Random
# Forest regardless of which model scored highest.
best_model_name = max(results, key=lambda k: results[k].mean())
print(f"Best base model: {best_model_name} with accuracy: {results[best_model_name].mean():.4f}")

# Hyperparameter tuning for Random Forest (typically best performer)
# The grid has 3 * 4 * 3 * 3 * 2 = 216 parameter combinations, each of which
# is cross-validated with `cv`.
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

print("\nPerforming Grid Search for Random Forest...")
# Exhaustive search over rf_param_grid, scored by accuracy on the same folds
# as the baseline comparison.
rf_grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    rf_param_grid,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

rf_grid.fit(X_train, y_train)

print(f"\nBest parameters: {rf_grid.best_params_}")
print(f"Best CV score: {rf_grid.best_score_:.4f}")

In [None]:
# Hyperparameter tuning for Gradient Boosting
# The grid has 2 * 3 * 3 * 2 * 2 = 72 parameter combinations, each of which
# is cross-validated with `cv`.
gb_param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

print("Performing Grid Search for Gradient Boosting...")
# Same search setup as the Random Forest search: accuracy scoring on the
# shared `cv` splitter.
gb_grid = GridSearchCV(
    GradientBoostingClassifier(random_state=42),
    gb_param_grid,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

gb_grid.fit(X_train, y_train)

print(f"\nBest parameters: {gb_grid.best_params_}")
print(f"Best CV score: {gb_grid.best_score_:.4f}")

In [None]:
# Select final model: keep whichever tuned search has the higher CV score.
# A tie goes to Random Forest.
rf_wins = rf_grid.best_score_ >= gb_grid.best_score_
winning_search = rf_grid if rf_wins else gb_grid

model_name = "Random Forest" if rf_wins else "Gradient Boosting"
final_model = winning_search.best_estimator_
final_score = winning_search.best_score_

banner = f"{'='*50}"
print(f"\n{banner}")
print(f"Final Model: {model_name}")
print(f"Cross-Validation Accuracy: {final_score:.4f}")
print(banner)

## 8. Feature Importance Analysis

In [None]:
# Get feature importance
# Pairs each column of X_train with the fitted model's feature_importances_
# value and sorts from most to least important.
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': final_model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 15 Most Important Features:")
print(feature_importance.head(15))

# Plot feature importance
# Horizontal bars for the same top-15 rows that were printed above.
plt.figure(figsize=(10, 8))
sns.barplot(data=feature_importance.head(15), x='importance', y='feature', palette='viridis')
plt.title(f'Top 15 Feature Importances - {model_name}')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

## 9. Final Predictions

In [None]:
# Train on full training data
# NOTE(review): final_model comes from GridSearchCV.best_estimator_, which is
# presumably already refit on the full training set (refit is left at its
# default) -- this fit looks redundant but harmless; confirm before removing.
final_model.fit(X_train, y_train)

# Make predictions on test set
predictions = final_model.predict(X_test)

# The counts below rely on predictions being 0/1 labels: sum() counts the 1s
# and (1 - predictions).sum() counts the 0s.
print(f"Total predictions: {len(predictions)}")
print(f"Predicted survivors: {predictions.sum()} ({100*predictions.sum()/len(predictions):.1f}%)")
print(f"Predicted deaths: {(1-predictions).sum()} ({100*(1-predictions).sum()/len(predictions):.1f}%)")

## 10. Create Submission File

In [None]:
# Create submission dataframe
# Two columns: the test set's PassengerId and the predicted Survived label.
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': predictions
})

# Save to CSV
# index=False keeps the DataFrame index out of the file.
submission.to_csv('submission.csv', index=False)

print("Submission file created: submission.csv")
print(f"\nFirst few predictions:")
print(submission.head(10))
print(f"\nSubmission shape: {submission.shape}")
# The expected shape is hard-coded: 418 rows by 2 columns.
print(f"Expected shape: (418, 2)")
print(f"\nValidation: Shape matches requirement: {submission.shape == (418, 2)}")

## Summary

### Key Findings:
1. **Most Important Features**: Sex, Title, Fare, Age, and Pclass were the strongest predictors
2. **Survival Patterns**:
   - Women had much higher survival rates than men
   - First class passengers survived more often
   - Children and young adults had better survival chances
   - Passengers with smaller families had better odds

### Model Performance:
- The final model's cross-validation accuracy is the `final_score` value printed in the "Select final model" cell of Section 7
- Used ensemble methods (Random Forest / Gradient Boosting) for robust predictions
- Hyperparameter tuning improved performance over baseline

### Next Steps:
1. Submit predictions to Kaggle
2. Analyze leaderboard feedback
3. Consider additional feature engineering or model ensembling for improvement