# In[None]:
"""
Titanic Survival Prediction - Kaggle Competition
=================================================
This notebook builds a machine learning model to predict passenger survival 
on the Titanic using Random Forest and other classification algorithms.

Dataset: Kaggle Titanic Competition
Author: Achal Agarwal
"""

# ============================================================================
# 1. IMPORT LIBRARIES
# ============================================================================

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

# ============================================================================
# 2. LOAD DATA
# ============================================================================

# Resolve the project directory.
# The TITANIC_DATA_DIR environment variable, when set, overrides the location
# so the script is not tied to one machine; when it is unset the original
# hard-coded directory is used unchanged.
# Joining with '' guarantees a trailing separator, so code that builds file
# names by concatenating onto `path` keeps working with either source.
path = os.path.join(
    os.environ.get(
        'TITANIC_DATA_DIR',
        'C:/Users/Achal Agarwal/OneDrive/Kaggle/Titanic/',
    ),
    '',
)

# Load training data, test data, and sample submission from <path>/data/
data_dir = os.path.join(path, 'data')
train_data = pd.read_csv(os.path.join(data_dir, 'train.csv'))
test_data = pd.read_csv(os.path.join(data_dir, 'test.csv'))
gender_submission = pd.read_csv(os.path.join(data_dir, 'gender_submission.csv'))

# Preview the first 50 rows to understand the data structure.
# NOTE: as a bare expression this only renders in a notebook when it is the
# last statement of a cell; in a plain script it produces no output.
train_data.head(50)

# ============================================================================
# 3. EXPLORATORY DATA ANALYSIS (EDA)
# ============================================================================

# Share of female passengers in the training set who survived, as a percentage.
# 'Survived' is summed and divided by the number of selected rows.
is_female = train_data['Sex'] == 'female'
women = train_data.loc[is_female, 'Survived']
rate_women = sum(women) / len(women) * 100
print(f"Women survival rate: {rate_women:.2f}%")

# ============================================================================
# 4. DATA CLEANING
# ============================================================================

print("Missing values before cleaning:")
print(train_data[["Age", "Fare", "Embarked"]].isnull().sum())

# Impute missing values separately in each dataset, using that dataset's own
# statistics.
#
# The filled column is assigned back to the frame instead of calling
# `frame[col].fillna(..., inplace=True)`: the in-place form operates on an
# intermediate object obtained through chained indexing, which pandas warns
# about and which leaves the frame unmodified under Copy-on-Write.
for frame in (train_data, test_data):
    # Age and Fare: median (more robust to outliers than the mean)
    for column in ("Age", "Fare"):
        frame[column] = frame[column].fillna(frame[column].median())

    # Embarked: mode (most common port); mode() returns a Series, take the
    # first entry
    frame['Embarked'] = frame['Embarked'].fillna(frame['Embarked'].mode()[0])

print("\nMissing values after cleaning:")
print(train_data[["Age", "Fare", "Embarked"]].isnull().sum())

# ============================================================================
# 5. FEATURE ENGINEERING
# ============================================================================

# Titles that are grouped into a single 'Rare' category
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
               'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']

# Spelling variants that are folded into their standard title
title_synonyms = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}

# A title is the run of letters that follows a space and is terminated by a
# period (e.g. "Mr", "Mrs", "Miss", "Master"). Written as a raw string so
# that `\.` reaches the regex engine as-is instead of being an invalid string
# escape sequence.
title_pattern = r' ([A-Za-z]+)\.'

# Derive the same engineered columns on both datasets so they stay in sync.
for frame in (train_data, test_data):
    # FamilySize: siblings/spouses + parents/children + the passenger
    frame['FamilySize'] = frame['SibSp'] + frame['Parch'] + 1

    # IsAlone: 1 if traveling alone, 0 otherwise
    frame['IsAlone'] = (frame['FamilySize'] == 1).astype(int)

    # Title: extracted from the name, rare titles grouped, variants unified
    extracted = frame['Name'].str.extract(title_pattern, expand=False)
    frame['Title'] = (
        extracted
        .replace(rare_titles, 'Rare')
        .replace(title_synonyms)
    )

# Define feature list for model training
features = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare",
            "Embarked", "FamilySize", "IsAlone", "Title"]

# ============================================================================
# 6. PREPARE DATA FOR MODELING
# ============================================================================

# Target vector: the 'Survived' column of the training set
y = train_data["Survived"]

# One-hot encode the selected feature columns of the training set
X = pd.get_dummies(train_data[features])

# Encode the test set the same way, then force it onto the training column
# layout: columns absent from the test encoding are created and filled with
# 0, columns that exist only in the test encoding are discarded.
X_test = (
    pd.get_dummies(test_data[features])
    .reindex(columns=X.columns, fill_value=0)
)

# ============================================================================
# 7. MODEL TRAINING - RANDOM FOREST
# ============================================================================

# Build the Random Forest classifier with fixed hyperparameters
model = RandomForestClassifier(
    n_estimators=200,       # Number of trees in the forest
    max_depth=7,            # Maximum depth of each tree
    min_samples_split=4,    # Minimum samples required to split a node
    random_state=1          # For reproducibility
)

# Fit on the full training set
model.fit(X, y)

# Predict survival for every row of the encoded test set
predictions = model.predict(X_test)

# Write the submission file: one row per test passenger, with its
# PassengerId and the predicted 'Survived' value. The file is created in the
# current working directory.
output = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': predictions
})
output.to_csv('my_submission.csv', index=False)
# The leading check mark was previously stored as mis-decoded bytes ("âœ…");
# it is restored to the intended character here.
print("✅ Submission file created: my_submission.csv")

# ============================================================================
# 8. MODEL EVALUATION
# ============================================================================

# Accuracy of `model` under 5-fold cross-validation on the training features;
# `scores` holds one accuracy value per fold.
scores = cross_val_score(model, X, y, scoring='accuracy', cv=5)

# Report the per-fold scores followed by their mean and spread
print("\n=== Cross-Validation Results ===")
print(f"All 5 fold scores: {scores}")
fold_mean = scores.mean()
fold_std = scores.std()
print(f"Average accuracy: {fold_mean:.4f}")
print(f"Standard deviation: {fold_std:.4f}")

# ============================================================================
# 9. COMPARE MULTIPLE MODELS
# ============================================================================

print("\n=== Comparing Different Models ===")

# Candidate classifiers, keyed by the label used in the printed report
models = {
    'RandomForest': RandomForestClassifier(n_estimators=200, max_depth=7, random_state=1),
    'GradientBoosting': GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=1),
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=1)
}

# Score each candidate with 5-fold cross-validated accuracy.
# The loop variable is deliberately not named `model`: rebinding `model` here
# would replace the Random Forest fitted earlier in this script with the last
# entry of `models`, and every later use of `model` would silently evaluate
# that estimator instead.
for name, candidate in models.items():
    scores = cross_val_score(candidate, X, y, cv=5, scoring='accuracy')
    print(f"{name}: {scores.mean():.4f} (+/- {scores.std():.4f})")

# ============================================================================
# 10. STRATIFIED K-FOLD VALIDATION
# ============================================================================

# Stratified 5-fold splitter: shuffles with a fixed seed and keeps the class
# distribution in each fold. `cv` is reused by later sections.
cv = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)

# Cross-validated accuracy of whichever estimator `model` refers to at this
# point in the script, using the stratified splitter above.
scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv)

print("\n=== Stratified K-Fold Results ===")
stratified_mean = scores.mean()
stratified_std = scores.std()
print(f"Mean accuracy: {stratified_mean:.4f}")
print(f"Standard deviation: {stratified_std:.4f}")

# ============================================================================
# 11. HYPERPARAMETER TUNING (OPTIONAL)
# ============================================================================

print("\n=== Hyperparameter Tuning ===")

# Candidate values for each Random Forest hyperparameter; the randomized
# search draws combinations from these lists.
param_dist = {
    'n_estimators': [200, 400, 600],
    'max_depth': [5, 7, 10, None],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 3],
    'max_features': ['sqrt', 0.5, 1.0],
}

# Randomized search: 20 sampled parameter combinations, each scored by
# accuracy using the `cv` splitter, run on all CPU cores, seeded for
# repeatable sampling.
base_forest = RandomForestClassifier(random_state=1)
search = RandomizedSearchCV(
    estimator=base_forest,
    param_distributions=param_dist,
    n_iter=20,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    random_state=1,
)

search.fit(X, y)

# Report the best mean cross-validation score and the parameters behind it
best_score = search.best_score_
best_params = search.best_params_
print(f"Best cross-validation score: {best_score:.4f}")
print(f"Best parameters: {best_params}")

# ============================================================================
# END OF NOTEBOOK
# ============================================================================