In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px

In [None]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:

# Age cut-offs that clean_titanic() compares against the "Age" column when
# splitting males into master/mister and females into miss/misses.
age_mister = 20
age_misses = 20


def read_raw_titanic(path):
    """Load the CSV at ``path`` into a DataFrame with no cleaning applied."""
    raw_frame = pd.read_csv(path)
    return raw_frame

def read_clean_titanic(path, use_dummies=True):
    """Read the CSV at ``path`` and run it through ``clean_titanic``.

    ``use_dummies`` is forwarded unchanged to ``clean_titanic``.
    """
    raw_frame = read_raw_titanic(path)
    return clean_titanic(raw_frame, use_dummies)

def clean_titanic(df, use_dummies=True, adult_male_age=None, adult_female_age=None):
    """Engineer features on a raw Titanic passenger frame and return the result.

    The input frame is not modified: all work happens on a copy, so calling
    this function twice on the same raw frame gives the same output.

    Steps, in order:
      1. Add sex / family flags (``sex_cat``, ``is_female``, ``is_family``,
         ``is_parent``, ``is_child``).
      2. Add age-bin flags computed from the recorded ``Age`` (rows with no
         recorded age get 0 in every bin).
      3. Add the four sex/age group flags ``is_age_master`` / ``is_age_mister``
         / ``is_age_miss`` / ``is_age_misses`` and the ``age_cat`` label.
         A group is decided by the recorded age when there is one, and by
         the title found in ``Name`` only when the age is missing.
      4. Fill missing ``Age`` values with the mean recorded age of the row's
         group.
      5. When a ``Survived`` column exists, add the text labels
         ``Survival Status`` and ``survival_cat``.
      6. Drop rows without ``Embarked`` and drop the free-text / helper
         columns.
      7. Optionally one-hot encode the remaining columns with
         ``pd.get_dummies``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw passenger data. The columns ``Sex``, ``Name``, ``Age``, ``SibSp``,
        ``Parch``, ``Embarked``, ``Ticket`` and ``Cabin`` are required.
    use_dummies : bool, default True
        When true, the result is passed through ``pd.get_dummies``.
    adult_male_age : number, optional
        Males with a recorded age below this are "master", otherwise
        "mister". Defaults to the module-level ``age_mister``.
    adult_female_age : number, optional
        Females with a recorded age below this are "miss", otherwise
        "misses". Defaults to the module-level ``age_misses``.

    Returns
    -------
    pandas.DataFrame
        The cleaned (and optionally one-hot encoded) frame.
    """
    # Fall back to the notebook-level cut-offs when no override is given.
    male_cutoff = age_mister if adult_male_age is None else adult_male_age
    female_cutoff = age_misses if adult_female_age is None else adult_female_age

    # Work on a copy so the caller's frame does not pick up helper columns or
    # imputed ages (which would change the outcome of a second call).
    df = df.copy()

    def name_has_any(name, titles):
        # A non-string name (e.g. NaN) simply matches no title.
        return isinstance(name, str) and any(title in name for title in titles)

    def age_known(row):
        # Missing ages are stored as NaN, so an ``is not None`` check is not enough.
        return pd.notna(row["Age"])

    def is_male(row):
        return row["Sex"] == "male"

    def is_female(row):
        return row["Sex"] == "female"

    def is_master(row):
        if not is_male(row):
            return False
        if age_known(row):
            return row["Age"] < male_cutoff
        # No recorded age: fall back to the title in the name.
        return name_has_any(row["Name"], ["Master."])

    def is_mister(row):
        if not is_male(row):
            return False
        if age_known(row):
            return row["Age"] >= male_cutoff
        # No recorded age: fall back to the title in the name.
        return name_has_any(
            row["Name"],
            ["Mr.", "Mister.", "Dr.", "Don.", "Rev.", "Col.", "Capt.", "Major.", "Sir.", "Jonkheer."],
        )

    def is_miss(row):
        if not is_female(row):
            return False
        if age_known(row):
            # "miss" is the younger female group, mirroring "master" for males.
            return row["Age"] < female_cutoff
        # No recorded age: fall back to the title in the name.
        return name_has_any(row["Name"], ["Miss.", "Ms.", "Mlle."])

    def is_misses(row):
        if not is_female(row):
            return False
        if age_known(row):
            # "misses" is the adult female group, mirroring "mister" for males.
            return row["Age"] >= female_cutoff
        # No recorded age: fall back to the title in the name.
        return name_has_any(row["Name"], ["Mrs.", "Mme.", "Countess.", "Lady."])

    def is_parent(row):
        # An adult travelling with at least one parent/child aboard.
        return row["Parch"] > 0 and (is_mister(row) or is_misses(row))

    def is_child(row):
        # A minor travelling with at least one parent/child aboard.
        return row["Parch"] > 0 and (is_master(row) or is_miss(row))

    def age_cat(row):
        if is_master(row):
            return "master"
        if is_miss(row):
            return "miss"
        if is_misses(row):
            return "misses"
        # Everything else, including rows no rule could classify, is "mister".
        return "mister"

    def flag(predicate):
        # Evaluate a row predicate over the whole frame as a 0/1 column.
        return df.apply(lambda row: 1 if predicate(row) else 0, axis=1)

    # Sex / family flags
    df["sex_cat"] = df["Sex"]
    df["is_female"] = (df["Sex"] == "female").astype("int64")
    df["is_family"] = ((df["SibSp"] > 0) | (df["Parch"] > 0)).astype("int64")
    df["is_parent"] = flag(is_parent)
    df["is_child"] = flag(is_child)

    # Age bins, based on the recorded age only (NaN compares False everywhere).
    age = df["Age"]
    df["has_raw_age"] = age.notna().astype("int64")
    df["is_age_estimated"] = age.map(lambda x: 1 if ".5" in str(x) else 0)
    df["is_age_infant"] = (age < 1).astype("int64")
    age_bins = [
        ("is_age_child", 1, 13),
        ("is_age_teen", 13, 20),
        ("is_age_young_adult", 20, 27),
        ("is_age_adult", 27, 45),
        ("is_age_old_adult", 45, 65),
    ]
    for column, low, high in age_bins:
        df[column] = ((age >= low) & (age < high)).astype("int64")
    df["is_age_elderly"] = (age >= 65).astype("int64")

    # Sex/age groups
    group_flags = {
        "is_age_master": is_master,
        "is_age_mister": is_mister,
        "is_age_miss": is_miss,
        "is_age_misses": is_misses,
    }
    for column, predicate in group_flags.items():
        df[column] = flag(predicate)
    df["raw_age"] = df["Age"]

    # Mean recorded age per group, taken before any value is imputed.
    recorded = df["has_raw_age"] == 1
    group_means = {
        column: df.loc[(df[column] == 1) & recorded, "Age"].mean()
        for column in group_flags
    }

    df["age_cat"] = df.apply(age_cat, axis=1)

    # Fill each missing age with the mean of the group the row belongs to.
    for column in group_flags:
        df["Age"] = np.where(
            (df[column] == 1) & (df["Age"].isna()), group_means[column], df["Age"]
        )

    if "Survived" in df.columns:
        df["Survival Status"] = df["Survived"].map({1: "Survived", 0: "Died"})
        df["survival_cat"] = df["Survival Status"]

    ##
    ## CLEAN-UP
    # Passengers with no Embarked value are hard to match with other values, drop them.
    df = df[df["Embarked"].notna()]

    # Trimming down ('PassengerId' is deliberately kept).
    df = df.drop(columns=['Name', 'Ticket',
                          'Cabin', 'raw_age',
                          'has_raw_age', 'is_age_estimated',
                          'SibSp', 'Parch'])

    ## DUMMIES
    final_df = df
    if use_dummies:
        final_df = pd.get_dummies(final_df)

    return final_df

In [None]:
def survival_in_feature_group(df, field):
    """Summarise survival per distinct value of ``field``.

    Returns one row per group with the columns ``field``, ``total`` (number
    of non-null ``PassengerId`` values), ``survived_count`` (sum of
    ``Survived``) and ``survived_perc`` (percentage, rounded to 2 decimals).
    """
    summary = (
        df.groupby(field)
        .agg(total=("PassengerId", "count"), survived_count=("Survived", "sum"))
        .reset_index()
    )
    ratio = summary["survived_count"] / summary["total"]
    summary["survived_perc"] = round(ratio * 100, 2)
    return summary[[field, "total", "survived_count", "survived_perc"]]

In [None]:
import pandas as pd
import seaborn as sns

def plotting_percentages(df, col, target):
    """Bar-plot, for every value of ``col``, the percentage split of ``target``.

    Each bar is annotated with its percentage (one decimal). Returns the
    seaborn grid object produced by ``sns.catplot``.
    """
    # Share of each `target` value inside every `col` group, scaled to 0-100.
    shares = df.groupby(col)[target].value_counts(normalize=True)
    percent_df = (shares * 100).rename('percent').reset_index()

    # Categories are drawn in sorted order along the x axis.
    category_order = sorted(df[col].unique())

    sns.set(font_scale=1.5)
    grid = sns.catplot(
        data=percent_df,
        x=col,
        y='percent',
        hue=target,
        kind='bar',
        order=category_order,
        height=8,
        aspect=2,
        legend_out=False,
    )
    axis = grid.ax
    axis.set_ylim(0, 100)

    # Write the percentage at the top-left corner of every bar.
    for bar in axis.patches:
        label = str(bar.get_height().round(1)) + '%'
        axis.text(bar.get_x(), bar.get_height(), label)

    # Title, axis labels and tick rotation.
    plt.title(f'{col.title()} By Percent {target.title()}',
              fontdict={'fontsize': 30})
    plt.xlabel(f'{col.title()}', fontdict={'fontsize': 20})
    plt.ylabel(f'{target.title()} Percentage', fontdict={'fontsize': 20})
    plt.xticks(rotation=75)
    return grid

In [None]:
def confusion_heat_map(confusion_matrix):
    """Draw a confusion matrix as an annotated seaborn heatmap.

    Every cell is labelled with its name, its count and its share of the
    grand total. The four names are applied in flattened order (True Neg,
    False Pos, False Neg, True Pos) and reshaped to 2x2, so a 2x2 array with
    a ``flatten`` method is expected.
    """
    cell_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
    cell_counts = confusion_matrix.flatten()
    cell_shares = confusion_matrix.flatten() / np.sum(confusion_matrix)

    annotations = []
    for name, count, share in zip(cell_names, cell_counts, cell_shares):
        count_text = "{0:0.0f}".format(count)
        share_text = "{0:.2%}".format(share)
        annotations.append(f"{name}\n{count_text}\n{share_text}")

    annotation_grid = np.asarray(annotations).reshape(2, 2)
    sns.heatmap(confusion_matrix, annot=annotation_grid, fmt='', cmap='Blues')