In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pandas.plotting import scatter_matrix 
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler
from sklearn.impute import KNNImputer, SimpleImputer
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn import set_config
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from warnings import filterwarnings
from sklearn.exceptions import ConvergenceWarning
from scipy.stats import mode
from scipy.linalg import LinAlgWarning
import joblib

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file in it.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the Titanic training split from the Kaggle input directory.
df = pd.read_csv("/kaggle/input/titanic/train.csv")
# Column groups for the exploratory plots. Note that num_feats also lists the
# PassengerId identifier and the Survived target, not only predictors.
num_feats = ["PassengerId", "Survived", "Pclass", "Age", "SibSp", "Parch", "Fare"]
cat_feats = ["Name", "Sex", "Ticket", "Cabin", "Embarked"]
# Preview the first rows.
df.head()

## EDA

In [None]:
# Count missing values per column and display only the columns that have any.
missing_counts = df.isna().sum()
dfna = missing_counts.rename("Missing_Vals").reset_index()
display(dfna.loc[dfna["Missing_Vals"] != 0])

In [None]:
# Share of missing entries per column, as a percentage of all rows.
n_rows = len(df)
missing_percentage = df.isnull().sum().div(n_rows).mul(100)

# Two-column table (feature name, percentage missing) for display.
missing_data_df = (
    missing_percentage
    .rename_axis('Feature')
    .reset_index(name='Missing Percentage')
)
display(missing_data_df)

In [None]:
# Summary statistics of the training data (pandas describes the numeric columns by default).
df.describe()

In [None]:
# Pairwise relationships between the numeric columns, coloured by survival
# (red = 0, blue = 1), with histograms on the diagonal.
survival_palette = {0: "red", 1: "blue"}
pair_grid = sns.pairplot(
    df[num_feats],
    hue="Survived",
    diag_kind='hist',
    palette=survival_palette,
)
plt.tight_layout()
plt.show()

In [None]:

# Box plot of every numerical feature, split by survival outcome.
# num_feats also contains the target itself; a box plot of "Survived" grouped
# by "Survived" carries no information, so that column is skipped.
for feat in num_feats:
    if feat == "Survived":
        continue
    fig, ax = plt.subplots()
    sns.boxplot(data=df, y=feat, x="Survived", palette={0: "red", 1: "blue"}, ax=ax)
    ax.set_title(f'Boxplot of {feat} by Survived')
    plt.show()
    # Release the figure so looping over many features does not accumulate open figures.
    plt.close(fig)

In [None]:
# Survival counts for each categorical feature, one figure per feature
# (red = did not survive, blue = survived).
survived_colors = {0: "red", 1: "blue"}
for feat in cat_feats:
    count_ax = sns.countplot(data=df, x=feat, hue="Survived", palette=survived_colors)
    count_ax.set_title(f"Hist by class of {feat}")
    plt.show()

An initial look at the data shows that some features will definitely be more helpful than others. Many of the categorical features, such as Name and Ticket, are nearly unique for each passenger. We can try extracting information from these columns through feature engineering to see whether the derived features help with prediction.

## Feature Engineering

Further Set-up

In [None]:
# Ask scikit-learn transformers to return pandas DataFrames instead of NumPy arrays,
# so column names remain available to the steps that select columns by name.
set_config(transform_output="pandas")

In [None]:
# Add preprocessor

# Honorifics searched for in the passenger name. Order matters: the first entry
# found in a name wins, so 'Mrs' has to stay ahead of 'Mr' (which it contains).
title_list = ['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
                    'Dr', 'Ms', 'Mlle','Col', 'Capt', 'Mme', 'Countess',
                    'Don', 'Jonkheer']

def get_substring(str, substrings):
    """Return the first entry of ``substrings`` that occurs inside ``str``.

    Parameters
    ----------
    str : the text to search (the parameter name shadows the builtin; it is
        kept unchanged so existing callers continue to work).
    substrings : iterable of candidate substrings, tried in order.

    Returns
    -------
    The first matching candidate, or the literal string "None" when no
    candidate occurs in ``str``.
    """
    for substr in substrings:
        if substr in str:
            return substr
    # Give up only after every candidate has been tried; returning inside the
    # loop would stop after testing the first candidate alone.
    return "None"

def replace_titles(title, sex=None):
    """Collapse rare honorifics into 'Mr', 'Mrs' or 'Miss'.

    Parameters
    ----------
    title : honorific as produced by ``get_substring``.
    sex : optional sex of the passenger. It is only consulted for 'Dr', which
        does not imply a sex by itself; compared case-insensitively to 'male'.

    Returns
    -------
    The normalised title. 'Dr' maps to 'Mr' for males and 'Mrs' otherwise;
    when ``sex`` is missing, 'Dr' is returned unchanged. Titles that are not
    listed in any rule are returned as they are.
    """
    if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
        return 'Mr'
    elif title in ['Countess', 'Mme']:
        return 'Mrs'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title == 'Dr':
        if sex is None or pd.isna(sex):
            return title
        return 'Mr' if f"{sex}".strip().lower() == 'male' else 'Mrs'
    else:
        return title

def feature_engineering(df):
    """Add the engineered columns Deck, Title and AgeClass to a copy of ``df``.

    Reads the columns Cabin, Name, Sex, Age, Pclass and Embarked.

    * Deck     - first character of Cabin, or "Unknown" when Cabin is missing.
    * Title    - honorific found in Name, normalised by ``replace_titles``.
    * AgeClass - Age multiplied by Pclass.
    * Embarked - missing values are replaced by "Unknown".

    Returns the new DataFrame; the input frame is left untouched.
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    df = df.copy()
    df["Deck"] = df["Cabin"].apply(lambda cabin: cabin[0] if not pd.isna(cabin) else "Unknown")
    # Get titles from name (source: https://triangleinequality.wordpress.com/2013/09/08/basic-feature-engineering-with-the-titanic-data/)
    df["Title"] = df["Name"].apply(lambda name: get_substring(name, title_list))
    # The sex is passed along because the 'Dr' rule depends on it.
    df["Title"] = [replace_titles(title, sex) for title, sex in zip(df["Title"], df["Sex"])]
    df["AgeClass"] = df['Age']*df['Pclass']
    # Add "Unknown" if Embarked is NaN
    df["Embarked"] = df["Embarked"].fillna("Unknown")
    return df

# Columns handed to the models, in the order they are selected.
feats_to_use = [
    "Pclass", "Sex", "Age", "SibSp", "Parch",
    "Fare", "Embarked", "Deck", "Title", "AgeClass",
]

def select_features(df):
    """Return only the columns named in ``feats_to_use``, in that order."""
    selected = df.loc[:, feats_to_use]
    return selected

# Step 1: impute missing Age values with a KNN imputer; all other columns pass
# through unchanged. This runs before feature engineering, so the AgeClass
# feature is computed from already-imputed ages.
# NOTE(review): the imputer only sees the "Age" column, so there are no other
# features to measure neighbour distance with; this presumably degenerates to
# filling with the column mean - confirm against the KNNImputer documentation.
age_imputer_transformer = ColumnTransformer(transformers=[
    ('age_imputer', Pipeline(steps=[('imputer', KNNImputer(n_neighbors=5))]), ["Age"])],
    remainder='passthrough',
    verbose_feature_names_out=False,
    sparse_threshold=0)

# Step 2: add the engineered columns (Deck, Title, AgeClass) and fill missing
# Embarked values via feature_engineering.
custom_feat_engineer_transformer = Pipeline(steps=[
    ('feature_engineer', FunctionTransformer(feature_engineering, validate=False))
])

# Step 3: keep only the columns listed in feats_to_use (no imputation happens here).
custom_feat_selector_transformer = Pipeline(steps=[
    ('feature_selector', FunctionTransformer(select_features, validate=False))
])

# Column groups after feature selection.
categorical_features = ["Sex", "Embarked", "Deck", "Title"]
numerical_features = ["Pclass", "Age", "SibSp", "Parch", "Fare", "AgeClass"]

# Step 4: catch any other missing values not yet handled. Numerical columns are
# KNN-imputed and then standardised; categorical columns get the most frequent value.
final_imputer_transformer = ColumnTransformer(
    transformers=[
        ('numerical_imputer', Pipeline(steps=[('imputer', KNNImputer(n_neighbors=5)),  ('scaler', StandardScaler())]), numerical_features),
        ('categorical_imputer', Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent'))]), categorical_features)
    ],
    remainder='passthrough',
    sparse_threshold=0,
    verbose_feature_names_out=False
)

# Encoding plan: one-hot for the multi-valued categoricals, binary encoding for Sex.
ohe_feats = ["Embarked", "Deck", "Title"]
bin_feats = ["Sex"]

# handle_unknown="ignore" lets categories that were not present during fit be
# encoded as all zeros instead of raising an error.
onehot_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown = "ignore")) 
])

binary_transformer = Pipeline(steps=[
    ('binary', ce.BinaryEncoder())
])

# Step 5: do encoding for categorical features; the (already scaled) numerical
# columns pass through untouched.
cat_transformer = ColumnTransformer(
    transformers=[
        ('onehot', onehot_transformer, ohe_feats),
        ('binary', binary_transformer, bin_feats)
    ],
    remainder='passthrough',
    sparse_threshold=0,
    verbose_feature_names_out=False
)

# Full preprocessing chain; the order of the steps matters because later steps
# rely on columns created or cleaned by earlier ones.
preprocessor = Pipeline(steps=[
    ('age_imputer_transformer', age_imputer_transformer),
    ('custom_feature_engineer', custom_feat_engineer_transformer),
    ('custom_feature_selector', custom_feat_selector_transformer),
    ('final_imputer_transformer', final_imputer_transformer),
    ('categorical_transformer', cat_transformer)
])

def make_pipeline(model):
    """Build a two-step pipeline: a fresh copy of ``preprocessor``, then ``model``.

    Parameters
    ----------
    model : estimator used as the final 'classifier' step.

    Returns
    -------
    A ``Pipeline`` with the steps 'preprocessor' and 'classifier'.

    The module-level ``preprocessor`` is cloned for every pipeline. Without the
    clone all pipelines would share one transformer instance, and fitting any
    pipeline directly would silently refit the transformers of the others.
    """
    # Local import keeps this function self-contained; sklearn.base is already
    # a dependency of this notebook.
    from sklearn.base import clone

    pipeline = [
        ('preprocessor', clone(preprocessor)),
        ('classifier', model)]
    return Pipeline(pipeline)

In [None]:
# Split the training frame into the target vector and the predictor columns.
y = df["Survived"]
X = df.drop(columns="Survived")

In [None]:
def predict_and_score(model, X, y, **kwargs):
    """Predict on ``X`` with ``model`` and evaluate the result with ``score``.

    Extra keyword arguments are forwarded unchanged to ``score``; the return
    value is whatever ``score`` returns.
    """
    predictions = model.predict(X)
    outcome = score(y, predictions, **kwargs)
    return outcome

def score(y,  y_pred, output='all', additional_print="", ret_data=False):
    """Compute accuracy, confusion matrix and classification report.

    Parameters
    ----------
    y, y_pred : true and predicted labels.
    output : 'all' prints all three metrics, 'acc' prints only the accuracy
        line, any other value prints nothing.
    additional_print : text placed in front of "Accuracy=" in the printed line.
    ret_data : when true, return ``(accuracy, confusion_matrix, report)``;
        otherwise the function returns None.
    """
    acc = accuracy_score(y, y_pred)
    conf_mat = confusion_matrix(y, y_pred)
    clf_rep = classification_report(y, y_pred)

    headline = f"{additional_print} Accuracy={acc}"
    if output in ('all', 'acc'):
        print(headline)
    if output == 'all':
        print("Confusion Matrix:\n", conf_mat)
        print("Classification Report:\n", clf_rep)

    return (acc, conf_mat, clf_rep) if ret_data else None
    
def multi_score(model, X, y, runs=10, progress=False, output='all', additional_print="", ret_data=False):
    """Score ``model`` on ``(X, y)`` ``runs`` times and report the mean accuracy.

    Parameters
    ----------
    model : fitted estimator with a ``predict`` method.
    X, y : data and true labels to evaluate on.
    runs : number of evaluation rounds; must be at least 1.
    progress : when true, the accuracy of each individual run is printed.
    output : 'all' prints mean accuracy, confusion matrix and report,
        'acc' prints only the mean accuracy, any other value prints nothing.
    additional_print : text inserted in front of "Accuracy=" in the summary.
    ret_data : when true, return ``(mean_accuracy, confusion_matrix, report)``,
        where matrix and report come from the last run.

    Raises
    ------
    ValueError : if ``runs`` is smaller than 1.
    """
    if runs < 1:
        raise ValueError("runs must be at least 1")

    # Per-run printing does not change between iterations, so decide it once.
    ps_output = 'acc' if progress else None
    total_acc = 0
    for _ in range(runs):
        acc, conf_mat, clf_rep = predict_and_score(model, X, y, output=ps_output, ret_data=True)
        total_acc += acc

    avg_acc = total_acc / runs
    # Report the average over all runs, not the accuracy of the last run.
    acc_print = f"For {runs} runs: Avg {additional_print} Accuracy={avg_acc}"
    if output == 'all':
        print(acc_print)
        print("Confusion Matrix:\n", conf_mat)
        print("Classification Report:\n", clf_rep)
    elif output == 'acc':
        print(acc_print)
    if ret_data:
        return avg_acc, conf_mat, clf_rep
    
def transform_clf_params(params):
    """Prefix every key with ``classifier__`` so it addresses the pipeline's 'classifier' step."""
    prefixed = {}
    for key, val in params.items():
        prefixed[f"classifier__{key}"] = val
    return prefixed
    
def grid_search_fit(model, X, y, params={}, output=True):
    """Tune the 'classifier' step of ``model`` with 5-fold grid search on accuracy.

    Parameters
    ----------
    model : pipeline whose final step is named 'classifier'.
    X, y : training data and labels.
    params : grid keyed by plain classifier parameter names; the keys are
        prefixed with ``classifier__`` before the search. The default dict is
        only read, never modified.
    output : when true, print the best parameters and the best CV score.

    Returns
    -------
    The best estimator found by the search.
    """
    search = GridSearchCV(model, transform_clf_params(params), cv=5, scoring='accuracy')
    search.fit(X, y)
    if output:
        print(f"Grid Search with CV found best params={search.best_params_}, best score={search.best_score_}")
    return search.best_estimator_


In [None]:
def save_model(model, model_name):
    """Serialise ``model`` with joblib to ``<model_name>.joblib`` in the working directory."""
    target_path = f"{model_name}.joblib"
    joblib.dump(model, target_path)
    
def load_model(model_name):
    """Load and return the object stored in ``<model_name>.joblib``."""
    source_path = f"{model_name}.joblib"
    return joblib.load(source_path)

## Models

### Logistic Regression

In [None]:
def perform_lr(X, y, penalty=None):
    """Grid-search a logistic-regression pipeline for the given penalty.

    Parameters
    ----------
    X, y : training data and labels.
    penalty : penalty passed to ``LogisticRegression`` (None, 'l1' or 'l2'
        have dedicated solver lists; any other value searches 'saga' only).

    Returns
    -------
    The best pipeline found by ``grid_search_fit``.
    """
    # 'saga' is always tried; these solvers are added depending on the penalty.
    extra_solvers = {
        None: ['lbfgs', 'newton-cholesky', 'sag'],
        'l1': ['liblinear'],
        'l2': ['lbfgs', 'liblinear', 'newton-cholesky', 'sag'],
    }
    params = {'solver': ['saga'] + extra_solvers.get(penalty, [])}

    lr_pipe = make_pipeline(LogisticRegression(penalty=penalty))
    print(f"Logistic Regression-{penalty}", "-"*30)
    # The regularisation strength is only searched when a penalty is applied.
    if penalty is not None:
        params['C'] = [0.001, 0.01, 0.1, 1, 10]
    return grid_search_fit(lr_pipe, X, y, params=params)

In [None]:
# Silence the solver warnings raised while the grid search tries every solver.
for noisy_warning in (ConvergenceWarning, LinAlgWarning):
    filterwarnings("ignore", category=noisy_warning)

# Tune one logistic regression per penalty and report its accuracy on the training data.
lr_models = {}
for penalty in (None, 'l1', 'l2'):
    lr_models[penalty] = perform_lr(X, y, penalty=penalty)
    multi_score(lr_models[penalty], X, y, additional_print="Train")
lr, lr_l1, lr_l2 = lr_models[None], lr_models['l1'], lr_models['l2']

In [None]:
# Persist the three tuned logistic-regression pipelines.
for fitted_lr, file_stem in ((lr, "LR"), (lr_l1, "LR-L1"), (lr_l2, "LR-L2")):
    save_model(fitted_lr, file_stem)

## Random Forest

In [None]:
# Tune a random forest over number of trees, depth and features per split.
# The forest is seeded so that re-running the notebook reproduces the same
# search result and the same downstream predictions.
rf = make_pipeline(RandomForestClassifier(random_state=42))
params = {'n_estimators': range(50, 301, 25),
          'max_depth': (list(range(10, 41, 10)) + [None]),
          'max_features': ['sqrt', 'log2']}
best_rf = grid_search_fit(rf, X, y, params=params)
# Accuracy on the data the model was trained on (not a held-out estimate).
multi_score(best_rf, X, y, additional_print="Train")

In [None]:
# Persist the tuned random-forest pipeline (save_model writes "RF.joblib").
save_model(best_rf, "RF")

## SVC

In [None]:
def perform_svc(X, y):
    """Grid-search an SVC pipeline over kernel and C; return the best pipeline."""
    # The 'sigmoid' kernel and the C values 0.01 and 100 are not part of the search.
    search_space = {
        'kernel': ['linear', 'poly', 'rbf'],
        'C': [0.1, 1, 10],
    }
    return grid_search_fit(make_pipeline(SVC()), X, y, params=search_space)

In [None]:
# Grid-search the SVC pipeline on the full training data, then persist the
# best pipeline (save_model writes "SVC.joblib").
svc = perform_svc(X, y)
save_model(svc, "SVC")

In [None]:
# Accuracy on the training data; labelled "Train" like the other model sections.
multi_score(svc, X, y, additional_print="Train")

## KNN

In [None]:
# Tune the number of neighbours (2 to 10) of a k-nearest-neighbours pipeline
# and report its accuracy on the training data.
knn_params = {'n_neighbors': range(2, 11)}
knn = make_pipeline(KNeighborsClassifier())
best_knn = grid_search_fit(knn, X, y, params=knn_params)
multi_score(best_knn, X, y, additional_print="Train")

In [None]:
# Persist the tuned KNN pipeline (save_model writes "KNN.joblib").
save_model(best_knn, "KNN")

## Naive Bayes

In [None]:
# Gaussian naive Bayes is fitted directly, without a grid search; the score is
# the accuracy on the training data.
nb = make_pipeline(GaussianNB()).fit(X, y)
multi_score(nb, X, y, additional_print="Train")

In [None]:
# Persist the fitted naive-Bayes pipeline (save_model writes "NB.joblib").
save_model(nb, "NB")

## Gradient Boosting

In [None]:
# Tune gradient boosting. Row subsampling (subsample < 1) makes training
# stochastic, so the estimator is seeded to keep reruns reproducible.
gb = make_pipeline(GradientBoostingClassifier(random_state=42))
params = {
    'n_estimators': range(100, 161, 10),
    'learning_rate': [0.1, 1],
    'max_depth': range(1, 10, 2),
    'max_features': ['sqrt', 'log2'],
    'subsample': np.linspace(0.1, 1.0, 4)
}
best_gb = grid_search_fit(gb, X, y, params=params)
# Accuracy on the training data; labelled "Train" like the other model sections.
multi_score(best_gb, X, y, additional_print="Train")

In [None]:
# Persist the tuned gradient-boosting pipeline (save_model writes "GB.joblib").
save_model(best_gb, "GB")

## ADABoost

In [None]:
# Tune AdaBoost over ensemble size, learning rate and three kinds of weak
# learner. The booster is seeded so reruns give the same result.
ab = make_pipeline(AdaBoostClassifier(random_state=42))
est_dt = DecisionTreeClassifier(max_depth=1)  # decision stump
est_lr = LogisticRegression()
est_nb = GaussianNB()
params = {
    'n_estimators': range(50, 151, 10),
    'learning_rate': [0.01, 0.1, 1, 10],
    'estimator': [est_dt,
                  est_lr,
                  est_nb
                 ]
         }
best_ab = grid_search_fit(ab, X, y, params=params)
# Accuracy on the training data; labelled "Train" like the other model sections.
multi_score(best_ab, X, y, additional_print="Train")

In [None]:
# Persist the tuned AdaBoost pipeline (save_model writes "ADA.joblib").
save_model(best_ab, "ADA")

## Ensembles

In [None]:
class Ensemble:
    """Hard-voting ensemble over already-fitted models.

    Every model predicts independently and the most frequent label per sample
    is returned. With an even number of models ties are possible; they are
    resolved by ``scipy.stats.mode``.
    """

    def __init__(self, models):
        """Store the fitted models; raises ValueError when ``models`` is empty."""
        self.models = list(models)
        if not self.models:
            raise ValueError("Ensemble needs at least one model")

    def predict(self, X):
        """Return the majority-vote label for every row of ``X`` as a 1-D array."""
        preds = [model.predict(X) for model in self.models]
        votes = mode(preds, axis=0).mode
        # Depending on the SciPy version the reduced axis may be kept
        # (shape (1, n)); flatten so callers always receive shape (n,).
        return np.asarray(votes).ravel()

    def multi_score(self, X, y, runs=10, progress=False, output='all', additional_print="", ret_data=False):
        """Score the ensemble ``runs`` times and report the mean accuracy.

        Parameters
        ----------
        X, y : data and true labels to evaluate on.
        runs : number of evaluation rounds; must be at least 1.
        progress : when true, the accuracy of each run is printed.
        output : 'all' prints mean accuracy, confusion matrix and report,
            'acc' prints only the mean accuracy, anything else prints nothing.
        additional_print : text inserted in front of "Accuracy=".
        ret_data : when true, return ``(mean_accuracy, confusion_matrix,
            report)``; matrix and report come from the last run.

        Raises
        ------
        ValueError : if ``runs`` is smaller than 1.
        """
        if runs < 1:
            raise ValueError("runs must be at least 1")

        ps_output = 'acc' if progress else None
        total_acc = 0
        for _ in range(runs):
            y_pred = self.predict(X)
            acc, conf_mat, clf_rep = score(y, y_pred, output=ps_output, ret_data=True)
            total_acc += acc

        avg_acc = total_acc / runs
        # Report the average over all runs, not the accuracy of the last run.
        acc_print = f"For {runs} runs: Avg {additional_print} Accuracy={avg_acc}"
        if output == 'all':
            print(acc_print)
            print("Confusion Matrix:", f"Num FP={conf_mat[0, 1]}\n", conf_mat)
            print("Classification Report:\n", clf_rep)
        elif output == 'acc':
            print(acc_print)
        if ret_data:
            return avg_acc, conf_mat, clf_rep


In [None]:
# First voting ensemble: L1 logistic regression, random forest, naive Bayes,
# KNN and AdaBoost (lr_l2, svc and best_gb are left out of this one).
ensemble1_models = [lr_l1, best_rf, nb, best_knn, best_ab]
ensemble1 = Ensemble(ensemble1_models)
ensemble1.multi_score(X, y)

In [None]:
# Second voting ensemble: L1 logistic regression, random forest, SVC, naive
# Bayes and gradient boosting (lr_l2, best_knn and best_ab are left out).
ensemble2_models = [lr_l1, best_rf, svc, nb, best_gb]
ensemble2 = Ensemble(ensemble2_models)
ensemble2.multi_score(X, y)

## Stacked Classifier

In [None]:
class StackedClassifier:
    """Stacking classifier: a meta-model trained on the predictions of other models.

    Parameters
    ----------
    model : meta-estimator trained on the stacked predictions.
    input_models : already-fitted models exposing ``predict``.
    params : optional parameter grid for the meta-model, keyed by plain
        parameter names; when given, ``fit`` tunes the meta-model with
        ``grid_search_fit``.

    NOTE(review): ``fit`` builds the meta-features by calling ``predict`` on the
    same ``X`` it receives. If the input models were trained on those rows the
    meta-model learns from in-sample predictions; consider out-of-fold
    predictions for an unbiased stack.
    """

    def __init__(self, model, input_models, params=None):
        self.model = model
        self.input_models = input_models
        self.params = params

    def _make_stacked_X(self, X):
        """Return an array with one column of predictions per input model."""
        return np.column_stack([model.predict(X) for model in self.input_models])

    def fit(self, X, y):
        """Train (or grid-search) the meta-model on the stacked predictions; returns self."""
        X_stacked = self._make_stacked_X(X)
        if self.params is None:
            self.model.fit(X_stacked, y)
        else:
            # grid_search_fit prefixes every grid key with "classifier__", so
            # the bare meta-model is wrapped in a one-step pipeline under that
            # name; otherwise the prefixed keys would not match any parameter.
            wrapped = Pipeline([('classifier', self.model)])
            self.model = grid_search_fit(wrapped, X_stacked, y, params=self.params)
        return self

    def predict(self, X):
        """Predict labels for ``X`` by feeding the input models' predictions to the meta-model."""
        X_stacked = self._make_stacked_X(X)
        y_pred = self.model.predict(X_stacked)
        return y_pred

    def multi_score(self, X, y, runs=10, progress=False, output='all', additional_print="", ret_data=False):
        """Score the stack ``runs`` times and report the mean accuracy.

        Parameters
        ----------
        X, y : data and true labels to evaluate on.
        runs : number of evaluation rounds; must be at least 1.
        progress : when true, the accuracy of each run is printed.
        output : 'all' prints mean accuracy, confusion matrix and report,
            'acc' prints only the mean accuracy, anything else prints nothing.
        additional_print : text inserted in front of "Accuracy=".
        ret_data : when true, return ``(mean_accuracy, confusion_matrix,
            report)``; matrix and report come from the last run.

        Raises
        ------
        ValueError : if ``runs`` is smaller than 1.
        """
        if runs < 1:
            raise ValueError("runs must be at least 1")

        ps_output = 'acc' if progress else None
        total_acc = 0
        for _ in range(runs):
            y_pred = self.predict(X)
            acc, conf_mat, clf_rep = score(y, y_pred, output=ps_output, ret_data=True)
            total_acc += acc

        avg_acc = total_acc / runs
        # Report the average over all runs, not the accuracy of the last run.
        acc_print = f"For {runs} runs: Avg {additional_print} Accuracy={avg_acc}"
        if output == 'all':
            print(acc_print)
            print("Confusion Matrix:", f"Num FP={conf_mat[0, 1]}\n", conf_mat)
            print("Classification Report:\n", clf_rep)
        elif output == 'acc':
            print(acc_print)
        if ret_data:
            return avg_acc, conf_mat, clf_rep

In [None]:
# First stack: a random-forest meta-model on top of L1 logistic regression,
# random forest, naive Bayes, KNN and AdaBoost (lr_l2, svc and best_gb are
# left out). The meta-model is seeded so reruns give the same stack.
stacked_models = [lr_l1, best_rf, nb, best_knn, best_ab]
model = RandomForestClassifier(random_state=42)
stacked_clf = StackedClassifier(model, stacked_models)
stacked_clf.fit(X, y)
stacked_clf.multi_score(X, y)

In [None]:
# Second stack: an XGBoost meta-model on top of L1 logistic regression, random
# forest, SVC, naive Bayes and gradient boosting (lr_l2, best_knn and best_ab
# are left out).
stacked_models2 = [lr_l1, best_rf, svc, nb, best_gb]
stacked_clf2 = StackedClassifier(XGBClassifier(), stacked_models2)
stacked_clf2.fit(X, y)
stacked_clf2.multi_score(X, y)

In [None]:
# Third stack: a logistic-regression meta-model on top of L1 logistic
# regression, random forest, SVC and KNN (lr_l2, nb, best_gb and best_ab are
# left out).
stacked_models3 = [lr_l1, best_rf, svc, best_knn]
stacked_clf3 = StackedClassifier(LogisticRegression(), stacked_models3)
stacked_clf3.fit(X, y)
stacked_clf3.multi_score(X, y)

In [None]:
# Stack of stacks: a random-forest meta-model trained on the predictions of
# the three stacked classifiers. Seeded so reruns give the same result.
multi_stack = [stacked_clf, stacked_clf2, stacked_clf3]
model = RandomForestClassifier(random_state=42)

stacked_stacked_clf = StackedClassifier(model, multi_stack)
stacked_stacked_clf.fit(X, y)
stacked_stacked_clf.multi_score(X, y)

## Submitting

In [None]:
# Load the Titanic test split that the submission predictions are made for.
X_test = pd.read_csv("/kaggle/input/titanic/test.csv")
# Preview the first rows.
X_test.head()

In [None]:
# Count missing values per column of the test split and show only the columns that have any.
test_missing_counts = X_test.isna().sum()
X_test_na = test_missing_counts.rename("Missing_Vals").reset_index()
display(X_test_na.loc[X_test_na["Missing_Vals"] != 0])

In [None]:
def format_submission(X, y_pred):
    """Pair each PassengerId in ``X`` with its predicted label.

    Parameters
    ----------
    X : DataFrame with a 'PassengerId' column.
    y_pred : predicted labels, one per row of ``X``, in row order. Any
        array-like is accepted; a 2-D array with a single row or column is
        flattened.

    Returns
    -------
    DataFrame with the columns 'PassengerId' and 'Survived', indexed like ``X``.

    Predictions are matched to passengers by position. Plain arrays are used on
    purpose: if ``y_pred`` were a Series with a different index, pandas would
    align on index labels and produce missing values and extra rows.
    """
    labels = np.asarray(y_pred).ravel()
    return pd.DataFrame(
        {'PassengerId': X['PassengerId'].to_numpy(), 'Survived': labels},
        index=X.index,
    )

In [None]:
def make_submission(model, model_name, print_model=False, X=None):
    """Predict with ``model`` and write a submission CSV.

    Parameters
    ----------
    model : fitted object with a ``predict`` method.
    model_name : used in the output file name
        ``Titanic-Prediction-<model_name>.csv``.
    print_model : when true, print the model before predicting.
    X : data to predict on; defaults to the notebook-level ``X_test`` when omitted.

    The file is written to the current working directory without the index
    column, and its name is printed afterwards.
    """
    if X is None:
        # Fall back to the test split loaded earlier in the notebook.
        X = X_test
    if print_model:
        print(model)
    y_test_pred = model.predict(X)
    submission = format_submission(X, y_test_pred)
    fname = f"Titanic-Prediction-{model_name}.csv"
    submission.to_csv(fname, index=False)
    print(f"Saved file: {fname}")
    

In [None]:
# Write one submission file per fitted model, in the same order as before.
# The dictionary key becomes part of the output file name.
submission_models = {
    "LR-L2": lr_l2,
    "RF": best_rf,
    "SVC": svc,
    "KNN": best_knn,
    "NB": nb,
    "GB": best_gb,
    "AB": best_ab,
    "Ensemble1": ensemble1,
    "Ensemble2": ensemble2,
    "Stacked": stacked_clf,
    "Stacked2": stacked_clf2,
    "Stacked3": stacked_clf3,
    "Stacked_multi": stacked_stacked_clf,
}
for submission_name, fitted_model in submission_models.items():
    make_submission(fitted_model, submission_name)