# Titanic machine learning insights from disaster

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier


import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

## Step 1: Load and Explore Data

In [None]:
def load_data(train_file, test_file):
    """
    Read the Titanic training and test CSV files and report their shapes.

    Args:
    train_file: path to the training dataset
    test_file: path to the test dataset

    Returns:
    df_train: Training DataFrame
    df_test: Test DataFrame
    """
    # Both files are read before anything is printed.
    df_train, df_test = (pd.read_csv(path) for path in (train_file, test_file))

    train_shape, test_shape = df_train.shape, df_test.shape
    print(f"Training Data Shape: {train_shape}")
    print(f"Test Data Shape: {test_shape}")

    return df_train, df_test

## Step 2: Data Preprocessing

In [None]:
def preprocess_data(df_train, df_test):
    """
    Preprocess the Titanic dataset by handling missing values, feature engineering, and one-hot encoding.

    The two frames are concatenated so that imputation and one-hot encoding
    are applied uniformly, then split back apart. Neither input frame is
    modified.

    Args:
    df_train: DataFrame containing the training data (must include 'Survived')
    df_test: DataFrame containing the test data

    Returns:
    processed_train: Preprocessed training dataset ('Survived' cast to int)
    processed_test: Preprocessed test dataset with PassengerId preserved
    """
    # Preserve PassengerId for the test dataset (it is dropped from the features below)
    passenger_ids = df_test['PassengerId']

    # Combine the datasets for uniform preprocessing.
    # assign() returns a new frame, so the caller's df_test does not gain a
    # 'Survived' column as a side effect.
    df_combined = pd.concat([df_train, df_test.assign(Survived=np.nan)], axis=0)

    # Drop unnecessary columns
    df_combined = df_combined.drop(['PassengerId', 'Name', 'Ticket'], axis=1)

    # Handle missing values
    df_combined['Age'] = df_combined['Age'].fillna(df_combined['Age'].mean())
    df_combined['Fare'] = df_combined['Fare'].fillna(df_combined['Fare'].mean())
    df_combined['Embarked'] = df_combined['Embarked'].fillna('S')
    df_combined['Cabin'] = df_combined['Cabin'].fillna('X000')

    # Feature Engineering: Cabin letter and cabin number
    df_combined['cabin_letter'] = df_combined['Cabin'].str.extract(r'([a-zA-Z]+)', expand=False)
    # Convert the extracted digits to numbers first, so a cabin without any
    # digits becomes NaN and is filled with a numeric 0 rather than mixing an
    # int into a column of strings.
    cabin_digits = df_combined['Cabin'].str.extract(r'(\d+)', expand=False)
    df_combined['cabin_number'] = pd.to_numeric(cabin_digits, errors='coerce').fillna(0).astype(int)
    df_combined = df_combined.drop('Cabin', axis=1)

    # One-hot encoding of categorical variables
    df_combined = pd.get_dummies(df_combined, columns=['Sex', 'Embarked', 'cabin_letter'], drop_first=True)

    # Interaction features
    df_combined['Pclass_bin_Fare'] = df_combined['Fare'] // df_combined['Pclass']
    df_combined['Pclass_bin_sex'] = df_combined['Pclass'] - df_combined['Sex_male']

    # Split the combined data back into train and test sets:
    # rows with a known label are training rows, rows with NaN are test rows.
    processed_train = df_combined[df_combined['Survived'].notna()].copy()
    processed_test = df_combined[df_combined['Survived'].isna()].copy()

    # Concatenating with NaN made 'Survived' float; restore integer labels
    processed_train['Survived'] = processed_train['Survived'].astype(int)
    # Drop the placeholder 'Survived' column from the test set
    processed_test = processed_test.drop('Survived', axis=1)

    # Restore PassengerId for the test set (aligned on the test frame's index)
    processed_test['PassengerId'] = passenger_ids

    return processed_train, processed_test

## Step 3: Exploratory Data Analysis (EDA) - Key Visualizations

In [None]:
def create_key_visualizations(df_train):
    """
    Display four exploratory plots for the Titanic training data.

    The plots are shown in this order: survival rate by gender, survival rate
    by passenger class, age histograms split by outcome, and survival rate by
    embarkation point.

    Args:
    df_train: DataFrame containing the training data; the columns 'Sex',
        'Pclass', 'Age', 'Embarked' and 'Survived' are read

    Returns:
    None (displays plots)
    """
    def survival_rate_bars(column, title, axis_label):
        # One bar per category of `column`, with 'Survived' on the y-axis.
        plt.figure(figsize=(8, 6))
        sns.barplot(x=column, y='Survived', data=df_train)
        plt.title(title)
        plt.ylabel('Survival Rate')
        plt.xlabel(axis_label)
        plt.show()

    # Set plot style
    sns.set(style="whitegrid")

    # 1. Survival Rate by Gender
    survival_rate_bars('Sex', 'Survival Rate by Gender', 'Gender')

    # 2. Survival Rate by Passenger Class
    survival_rate_bars('Pclass', 'Survival Rate by Passenger Class', 'Passenger Class')

    # 3. Age Distribution of Survivors and Non-Survivors (overlaid on one figure)
    plt.figure(figsize=(10, 8))
    outcome_styles = (
        (1, 'Survived', 'green'),
        (0, 'Did not survive', 'red'),
    )
    for outcome, legend_label, colour in outcome_styles:
        ages = df_train.loc[df_train['Survived'] == outcome, 'Age']
        sns.histplot(ages, bins=20, label=legend_label, kde=True, color=colour)
    plt.title('Age Distribution of Survivors and Non-Survivors')
    plt.xlabel('Age')
    plt.legend()
    plt.show()

    # 4. Survival Rate by Embarkation Point
    survival_rate_bars('Embarked', 'Survival Rate by Embarkation Point', 'Embarkation Point')

## Step 4: Split the Data and Apply Feature Scaling

In [None]:
def split_and_scale_data(processed_train):
    """
    Hold out 20% of the training data for validation and standardize the features.

    The scaler is fitted on the training portion only and then applied to
    both portions.

    Args:
    processed_train: Preprocessed training dataset (must contain 'Survived')

    Returns:
    X_train_scaled, X_val_scaled, y_train, y_val: Scaled training and validation features and labels
    scaler: Fitted scaler object
    """
    target = processed_train['Survived']
    features = processed_train.drop(columns=['Survived'])

    # Fixed random_state keeps the split reproducible between runs
    X_train, X_val, y_train, y_val = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    # Fit on the training rows only, then transform both sets with those statistics
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    return X_train_scaled, X_val_scaled, y_train, y_val, scaler

## Step 5: Model Training and Tuning

In [None]:
# Logistic Regression Model
def train_logistic_regression(X_train_scaled, y_train):
    """
    Fit a Logistic Regression classifier on the scaled training data.

    Args:
    X_train_scaled: Scaled training features
    y_train: Training labels

    Returns:
    log_model: Trained Logistic Regression model
    """
    classifier = LogisticRegression(random_state=42, max_iter=1000)
    # fit() returns the fitted estimator itself
    return classifier.fit(X_train_scaled, y_train)

# Random Forest Model with Hyperparameter Tuning
def tune_random_forest(X_train_scaled, y_train):
    """
    Grid-search Random Forest hyperparameters with 10-fold cross-validation.

    Candidates are ranked by accuracy and the best parameter set is printed.

    Args:
    X_train_scaled: Scaled training features
    y_train: Training labels

    Returns:
    best_rf_model: Tuned Random Forest model (the search's best estimator)
    """
    # NOTE(review): 14 in 'min_samples_split' breaks the 2, 3, ... pattern of
    # the other lists - possibly a typo for 4; kept as-is, confirm intent.
    search_space = {
        'n_estimators': [100, 200, 300],
        'max_depth': [20, 30, 40],
        'min_samples_split': [2, 3, 14],
        'min_samples_leaf': [2, 3, 4],
    }
    searcher = GridSearchCV(
        RandomForestClassifier(random_state=42),
        search_space,
        cv=10,
        scoring='accuracy',
        n_jobs=-1,  # use all available cores
        verbose=1,
    )
    searcher.fit(X_train_scaled, y_train)

    winning_params = searcher.best_params_
    print(f"Best params for Random Forest: {winning_params}")
    return searcher.best_estimator_

# XGBoost Model with Hyperparameter Tuning
def tune_xgboost(X_train_scaled, y_train):
    """
    Grid-search XGBoost hyperparameters with 10-fold cross-validation.

    Candidates are ranked by accuracy and the best parameter set is printed.

    Args:
    X_train_scaled: Scaled training features
    y_train: Training labels

    Returns:
    best_xgb_model: Tuned XGBoost model (the search's best estimator)
    """
    search_space = {
        'n_estimators': [50, 100, 150],
        'max_depth': [4, 5, 6],
        'learning_rate': [0.01, 0.05, 0.1],
    }
    searcher = GridSearchCV(
        XGBClassifier(objective='binary:logistic', random_state=42),
        search_space,
        cv=10,
        scoring='accuracy',
        n_jobs=-1,  # use all available cores
        verbose=1,
    )
    searcher.fit(X_train_scaled, y_train)

    winning_params = searcher.best_params_
    print(f"Best params for XGBoost: {winning_params}")
    return searcher.best_estimator_

## Step 6: Feature Importance (Random Forest)

In [None]:
def plot_feature_importance(rf_model, X_train):
    """
    Show a horizontal bar chart of the Random Forest's feature importances.

    Args:
    rf_model: Trained Random Forest model (its feature_importances_ is read)
    X_train: Training dataset (features only, not including target); only
        its column names are used, to label the bars

    Returns:
    None (displays the plot)
    """
    # Pair each column name with its importance, most important first
    ranking = pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': rf_model.feature_importances_,
    })
    ranking = ranking.sort_values(by='Importance', ascending=False)

    plt.figure(figsize=(10, 6))
    sns.barplot(x='Importance', y='Feature', data=ranking)
    plt.title('Feature Importance from Random Forest')
    plt.show()

## Step 7: Model Evaluation and Best Model Selection

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report

def evaluate_models(X_val_scaled, y_val, log_model, rf_model, xgb_model):
    """
    Evaluate the performance of Logistic Regression, Random Forest, and XGBoost models on the validation set.

    For each model the accuracy, precision, recall, F1, ROC-AUC, confusion
    matrix and classification report are printed. ROC-AUC is computed from
    continuous scores (positive-class probabilities when the model offers
    predict_proba) rather than from the thresholded 0/1 predictions.

    Args:
    X_val_scaled: Scaled validation features
    y_val: Validation labels
    log_model: Trained Logistic Regression model
    rf_model: Trained Random Forest model
    xgb_model: Trained XGBoost model

    Returns:
    best_model: Model with the highest validation accuracy (on a tie, the
        earlier model in the order above wins)
    """
    def positive_class_scores(model, y_pred):
        """
        Return continuous scores for ROC-AUC, falling back to hard labels.
        """
        if hasattr(model, "predict_proba"):
            # Column 1 holds the probability of the second class in classes_
            return model.predict_proba(X_val_scaled)[:, 1]
        if hasattr(model, "decision_function"):
            return model.decision_function(X_val_scaled)
        return y_pred

    def print_metrics(y_true, y_pred, y_score, model_name):
        """
        Print evaluation metrics for a model and return its accuracy.
        """
        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)
        roc_auc = roc_auc_score(y_true, y_score)

        print(f"\n{model_name} Metrics:")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1 Score: {f1:.4f}")
        print(f"ROC-AUC: {roc_auc:.4f}")
        print("Confusion Matrix:")
        print(confusion_matrix(y_true, y_pred))
        print(classification_report(y_true, y_pred))
        return accuracy

    candidates = [
        ("Logistic Regression", log_model),
        ("Random Forest", rf_model),
        ("XGBoost", xgb_model),
    ]

    # Select the best model based on accuracy; strict '>' keeps the first
    # model among equals.
    best_model = None
    best_accuracy = float("-inf")
    for model_name, model in candidates:
        y_pred = model.predict(X_val_scaled)
        y_score = positive_class_scores(model, y_pred)
        accuracy = print_metrics(y_val, y_pred, y_score, model_name)
        if accuracy > best_accuracy:
            best_model, best_accuracy = model, accuracy

    print(f"\nBest model selected: {best_model}")

    return best_model


## Step 8: Make Predictions and Save to CSV

In [None]:
def make_predictions_and_save_to_csv(model, processed_test, scaler, output_file='submission.csv'):
    """
    Predict survival for the preprocessed test data and write a two-column submission CSV.

    Args:
    model: Trained model to use for predictions
    processed_test: Preprocessed test dataset (must contain 'PassengerId')
    scaler: Fitted scaler object for scaling test features
    output_file: Filename to save the final predictions (default: 'submission.csv')

    Returns:
    None (saves CSV file)
    """
    ids = processed_test['PassengerId']

    # PassengerId is an identifier, not a feature, so it is excluded before
    # scaling with the scaler that was fitted on the training data.
    features = processed_test.drop(columns=['PassengerId'])
    predictions = model.predict(scaler.transform(features))

    # Write the submission without the DataFrame index
    submission = pd.DataFrame({'PassengerId': ids, 'Survived': predictions})
    submission.to_csv(output_file, index=False)
    print(f"Submission file saved as {output_file}")

# Usage Example:
# make_predictions_and_save_to_csv(best_model, processed_test, scaler, 'submission.csv')

## Final Main Function

To put everything together, here's the final main function that runs the complete process.

In [None]:
def main(train_file="/kaggle/input/titanic/train.csv",
         test_file="/kaggle/input/titanic/test.csv",
         output_file='submission.csv'):
    """
    Run the complete pipeline: load, preprocess, explore, train, evaluate, predict.

    Args:
    train_file: path to the training CSV (default: the Kaggle input path)
    test_file: path to the test CSV (default: the Kaggle input path)
    output_file: filename for the submission CSV (default: 'submission.csv')

    Returns:
    None (displays plots, prints metrics and saves the submission CSV)
    """
    # Step 1: Load and Explore Data
    print("Step 1: Loading and exploring data...")
    df_train, df_test = load_data(train_file, test_file)

    # Step 2: Preprocess the Data
    print("Step 2: Preprocessing the data...")
    processed_train, processed_test = preprocess_data(df_train, df_test)

    # Step 3: Exploratory Data Analysis (EDA) on the raw training data
    print("Step 3: Performing EDA...")
    create_key_visualizations(df_train)

    # Step 4: Split and Scale Data
    print("Step 4: Splitting and scaling data...")
    X_train_scaled, X_val_scaled, y_train, y_val, scaler = split_and_scale_data(processed_train)

    # Step 5: Train and Tune Models
    print("Step 5: Training and tuning models...")
    log_model = train_logistic_regression(X_train_scaled, y_train)
    rf_model = tune_random_forest(X_train_scaled, y_train)
    xgb_model = tune_xgboost(X_train_scaled, y_train)

    # Step 6: Plot Feature Importance (Random Forest)
    # The scaled arrays carry no column names, so the feature frame is passed
    # to label the bars.
    print("Step 6: Plotting feature importance...")
    plot_feature_importance(rf_model, processed_train.drop(['Survived'], axis=1))

    # Step 7: Evaluate Models and Select the Best
    print("Step 7: Evaluating models and selecting the best model...")
    best_model = evaluate_models(X_val_scaled, y_val, log_model, rf_model, xgb_model)

    # Step 8: Make Predictions and Save Submission File
    print("Step 8: Making predictions and saving to CSV...")
    make_predictions_and_save_to_csv(best_model, processed_test, scaler, output_file)

# Run the main function
if __name__ == '__main__':
    main()