In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning Libraries
import sklearn
from sklearn.model_selection import train_test_split, RandomizedSearchCV, RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

# Ensemble and Tree-based Models
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

In [2]:
# XGBoost and LightGBM (if installed)
XGBOOST_AVAILABLE = False
try:
    from xgboost import XGBClassifier
    XGBOOST_AVAILABLE = True
except ImportError:
    pass

# Define create_ensemble_model function here
def create_ensemble_model():
    base_models = [
        ('rf', RandomForestClassifier(random_state=42)),
        ('gb', GradientBoostingClassifier(random_state=42)),
        ('xgb', XGBClassifier(random_state=42)) if XGBOOST_AVAILABLE else ('svm', SVC(probability=True, random_state=42))
    ]
    
    final_estimator = LogisticRegression(random_state=42)
    
    model = StackingClassifier(
        estimators=base_models,
        final_estimator=final_estimator,
        cv=5,
        stack_method='predict_proba'
    )
    
    return model

Data Exploration

Load the train data

In [None]:
# Load the training data
train_data = pd.read_csv("train.csv")
train_data.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Load the test data

In [6]:
test_data = pd.read_csv("test.csv")
test_data.head()


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [None]:
women = train_data.loc[train_data.Sex == 'female']["Survived"]
rate_women = sum(women)/len(women)

print("Percentage of women who survived:", rate_women)

Percentage of women who survived: 0.7420382165605095


In [None]:
men = train_data.loc[train_data.Sex == 'male']["Survived"]
rate_men = sum(men)/len(men)

print("Percentage of men who survived:", rate_men)

Percentage of men who survived: 0.18890814558058924


Probability of survival for children, teenagers, Women and Men

In [None]:
# Compute survival probabilities for different demographic groups
def compute_survival_probabilities(df):
    children = df[df["Age"] < 13]  # Age < 13 considered as children
    teenagers = df[(df["Age"] >= 13) & (df["Age"] < 20)]  # Age 13-19 as teenagers
    women = df[df["Sex"] == "female"]  # All females
    men = df[df["Sex"] == "male"]  # All males
    
    groups = {"Children": children, "Teenagers": teenagers, "Women": women, "Men": men}
    survival_rates = {}
    
    for group_name, group_data in groups.items():
        survival_rate = group_data["Survived"].mean()
        survival_rates[group_name] = survival_rate
    
    return survival_rates

In [None]:
# Calculate survival probabilities by Group: Children, Teenagers, Women & Men
survival_probs = compute_survival_probabilities(train_data)
print("\nSurvival Probabilities by Group:")
for group, prob in survival_probs.items():
    print(f"{group}: {prob:.2%}")


Survival Probabilities by Group:
Children: 57.97%
Teenagers: 41.05%
Women: 74.20%
Men: 18.89%


Data Preprocessing

In [None]:
# Define function for feature engineering
def feature_engineering(data):
    data = data.copy()
    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1  # Including the passenger
    data['IsAlone'] = (data['FamilySize'] == 1).astype(int)
    data['Title'] = data['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())
    data['Title'] = data['Title'].replace(['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    data['Title'] = data['Title'].replace('Mlle', 'Miss')
    data['Title'] = data['Title'].replace('Ms', 'Miss')
    data['Title'] = data['Title'].replace('Mme', 'Mrs')
    
    # Convert Age and Fare into categorical bins
    data['AgeBand'] = pd.cut(data['Age'], bins=[0, 12, 20, 40, 60, 80], labels=['Child', 'Teenager', 'Adult', 'Middle_Aged', 'Senior'])
    data['FareBand'] = pd.qcut(data['Fare'], 4, labels=['Low', 'Medium', 'High', 'Very_High'])
    
    # Convert intervals to strings for encoding later
    data['AgeBand'] = data['AgeBand'].astype(str)
    data['FareBand'] = data['FareBand'].astype(str)

    return data

# Define preprocessing pipeline
def create_preprocessing_pipeline():
    numerical_features = ['Age', 'Fare', 'FamilySize']
    categorical_features = ['Sex', 'Embarked', 'Title', 'AgeBand', 'FareBand']

    numerical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', RobustScaler())
    ])

    categorical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer([
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ])

    return preprocessor

Define the Features and Model

Random Forest Model Training

In [None]:
# Function for ensemble model
def create_ensemble_model():
    base_models = [
        ('rf', RandomForestClassifier(random_state=42)),
        ('gb', GradientBoostingClassifier(random_state=42)),
        ('svm', SVC(probability=True, random_state=42))
    ]
    
    final_estimator = LogisticRegression(random_state=42)
    
    model = StackingClassifier(
        estimators=base_models,
        final_estimator=final_estimator,
        cv=5,
        stack_method='predict_proba'
    )
    
    return model

Model Training

In [None]:
# Required imports
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix  # Ensure this is included
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold

# Main function to train the model and generate predictions
def main():
    train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
    test_data = pd.read_csv("/kaggle/input/titanic/test.csv")

    train_df = feature_engineering(train_data)
    test_df = feature_engineering(test_data)

    X = train_df.drop(['Survived', 'PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
    y = train_df['Survived']

    preprocessor = create_preprocessing_pipeline()
    model = create_ensemble_model()

    full_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])

    # Hyperparameter tuning with RandomizedSearchCV
    param_distributions = {
        'classifier__rf__n_estimators': [50, 100, 150],
        'classifier__rf__max_depth': [None, 5, 10],
        'classifier__gb__n_estimators': [50, 100, 150],
        'classifier__gb__learning_rate': [0.01, 0.1, 0.5],
        'classifier__svm__C': [0.1, 1, 10]
    }

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    search = RandomizedSearchCV(full_pipeline, param_distributions, cv=cv, n_iter=10,
                                scoring='accuracy', n_jobs=-1, random_state=42)
    
    search.fit(X, y)

    best_model = search.best_estimator_
    
    print("Best Parameters:", search.best_params_)
    print("Best Cross-Validation Score:", search.best_score_)
    
    # Train set performance
    y_pred_train = best_model.predict(X)
    
    print("Confusion Matrix:")
    print(confusion_matrix(y, y_pred_train))
    
    print(f"Model Accuracy: {accuracy_score(y, y_pred_train):.4f}")
    
    print("Classification Report:")
    print(classification_report(y, y_pred_train))
    
    # Predict on test set
    X_test = test_df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
    predictions = best_model.predict(X_test)

    submission = pd.DataFrame({
        'PassengerId': test_df['PassengerId'],
        'Survived': predictions
    })
    
    submission.to_csv('submission.csv', index=False)
    
    print("Your submission was successfully saved!")

# Run the main function
if __name__ == "__main__":
    main()

Best Parameters: {'classifier__svm__C': 1, 'classifier__rf__n_estimators': 100, 'classifier__rf__max_depth': 5, 'classifier__gb__n_estimators': 50, 'classifier__gb__learning_rate': 0.5}
Best Cross-Validation Score: 0.8417362375243236
Confusion Matrix:
[[524  25]
 [ 61 281]]
Model Accuracy: 0.9035
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.95      0.92       549
           1       0.92      0.82      0.87       342

    accuracy                           0.90       891
   macro avg       0.91      0.89      0.90       891
weighted avg       0.90      0.90      0.90       891

Your submission was successfully saved!
