In [None]:
# Importing usual suspects
import pandas as pd # Library to help load and explore data
import numpy as np # Library for mathematical functions and support for arrays and matrices

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity="all"

In [None]:
training_data = pd.read_csv("datasets/titanic/train.csv") # Load the training data into a dataframe
training_data.head(10) # Preview the first 10 rows; with empty brackets head() shows 5 rows by default

In [None]:
training_data.describe() # Summary statistics for the data - note this only covers the numeric columns by default

In [None]:
training_data.info() # Column datatypes and non-null counts of the dataframe

**Sometimes we will have null values in data sets, let's find these and then work out what to do with them**

In [None]:
training_data.isnull().sum() # Number of null values in each column

**So Age, Cabin and Embarked contain null values, so we should do something with these:**
1. **Age - let's replace the nulls with the average or median age**
2. **Cabin - not sure what to do here, but rather than deleting or dropping the column let's keep it and replace the nulls with a dummy value**
3. **Embarked - very small number of nulls, so we could probably just take the most common entry**

**The above options are probably simple, naive replacements, but let's do it and see what happens**

**_Note that we need to do this to the test set too when we load that - we need to ensure that the test set resembles the training set_**

In [None]:
median_age_val = training_data["Age"].median() # Median of the Age column
f"Median Age: {median_age_val}" # Bare f-string so the notebook displays the value


In [None]:
mean_age_val = training_data["Age"].mean() # Mean (average) of the Age column
f"Mean Age: {mean_age_val}" # Bare f-string so the notebook displays the value

**Probably OK to pick either - let's go with the mean**

In [None]:
# Fill missing ages with the mean age. The result goes into a new variable rather than
# using inplace=True, so the original column stays available just in case.
age_fill_value = training_data["Age"].mean()
training_data_age_cleaned = training_data["Age"].fillna(age_fill_value)
# This is the age column used later; the count below should be 0 (isna is the same as isnull)
training_data_age_cleaned.isna().sum()

**Looking at Cabin, we might think that where the cabin was located could possibly be correlated with the survival rate or something like that. Since we have no such information, let's just replace the missing values with dummy values**

In [None]:
# Cabin: replace nulls with a placeholder value (blanks would also be fine)
training_data_cabin_cleaned = training_data["Cabin"].fillna('DUMMY')

# Embarked: replace nulls with the most common port of embarkation
most_common_port = training_data["Embarked"].mode()[0]
training_data_embarked_cleaned = training_data["Embarked"].fillna(most_common_port)
training_data_embarked_cleaned.isna().sum()

**Now we are going to combine the above for a clean training dataset**

In [None]:
# There are a few ways to do this; here we use filter followed by assign.
# PassengerId, Cabin, Name and Ticket are left out because it is a reasonable bet that
# they had no real bearing on mortality.
# Age and Embarked are not taken from the raw data: the cleaned (null-free) versions
# built in the earlier cells are attached instead.
feature_columns = ['Survived', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Fare']
clean_training_dataset = training_data.filter(items=feature_columns).assign(
    Age=training_data_age_cleaned,
    Embarked=training_data_embarked_cleaned,
)

clean_training_dataset.head()

In [None]:
import matplotlib.pyplot as plt

fig, ax1 = plt.subplots(2, 2, figsize=(15,15)) # Create a figure with a 2x2 grid of axes

# One pie chart per feature showing the share of each value.
# Every panel gets an explicit title so it can be identified without relying on the
# axis label that pandas derives from the name of the value_counts() result.
survived_df = clean_training_dataset["Survived"]
survived_df.value_counts().plot(kind="pie", ax=ax1[0,0], title="Survived")

class_df = clean_training_dataset["Pclass"]
class_df.value_counts().plot(kind="pie", ax=ax1[0,1], title="Pclass")

gender_df = clean_training_dataset["Sex"]
gender_df.value_counts().plot(kind="pie", ax=ax1[1,0], title="Sex")

sibsp_df = clean_training_dataset["SibSp"]
sibsp_df.value_counts().plot(kind="pie", ax=ax1[1,1], title="SibSp")



In [None]:
import seaborn as sns

# Number of passengers in each Survived class
sns.countplot(data=clean_training_dataset, x="Survived")


In [None]:
# Passenger counts per sex, one panel per Survived value
sns.catplot(data=clean_training_dataset, x="Sex", col="Survived", kind="count")

In [None]:
# Passenger counts per ticket class, one panel per Survived value
sns.catplot(data=clean_training_dataset, x="Pclass", col="Survived", kind="count")

In [None]:
# Passenger counts per port of embarkation, one panel per Survived value
sns.catplot(data=clean_training_dataset, x="Embarked", col="Survived", kind="count")

In [None]:
# Range and average of the cleaned Age column (used to pick the age bucket boundaries)
age_column = clean_training_dataset["Age"]
age_column.max()
age_column.min()
age_column.mean()

In [None]:
def put_in_age_bucket(age):
    """Map an age in years to an ordinal 20-year age bucket.

    The buckets are half-open so that every non-negative age falls into exactly
    one of them (no gaps such as 19.99-20 or 59-60):

        0 -> 0 <= age < 20
        1 -> 20 <= age < 40
        2 -> 40 <= age < 60
        3 -> age >= 60 (open-ended, so ages above 80 are bucketed too)

    Args:
        age: Age in years as a number.

    Returns:
        The bucket code 0-3, or None when the age is negative or NaN.
    """
    # `not age >= 0` is True for negative numbers and also for NaN,
    # because every comparison with NaN is False.
    if not age >= 0:
        return None
    if age < 20:
        return 0
    if age < 40:
        return 1
    if age < 60:
        return 2
    return 3


def create_age_buckets(age_df):
    """Return the age-bucket code for every age in ``age_df``.

    Each element is passed through ``put_in_age_bucket`` via ``apply``.
    The notebook calls this with a single DataFrame column of ages.
    """
    return age_df.apply(put_in_age_bucket)


# Derive the bucket code for every passenger's age and attach it as a new column
age_buckets = create_age_buckets(clean_training_dataset["Age"])
clean_training_dataset["Age_Bucket"] = age_buckets
clean_training_dataset.head()

In [None]:
# Passenger counts per age bucket, one panel per Survived value
sns.catplot(data=clean_training_dataset, x="Age_Bucket", col="Survived", kind="count")

In [None]:
# Range and average of the Fare column
fare_column = clean_training_dataset["Fare"]
fare_column.max()
fare_column.min()
fare_column.mean()


In [None]:
# Split the fares into 4 quantile-based bands labelled 1 (cheapest) to 4 (most expensive)
fare_band_labels = [1, 2, 3, 4]
clean_training_dataset["Fare_Band"] = pd.qcut(clean_training_dataset["Fare"], q=4, labels=fare_band_labels)
clean_training_dataset.head()

# How many passengers ended up in each band
clean_training_dataset["Fare_Band"].value_counts()

In [None]:
# Encode the categorical columns as integers.
# The codes are mapped from the untouched source columns (raw Sex, null-filled Embarked)
# rather than from clean_training_dataset itself. Mapping a column onto itself turns every
# value into NaN if this cell is run a second time, because 0/1/2 are not keys of the dicts.
clean_training_dataset["Sex"] = training_data["Sex"].map({"male":0, "female":1})
clean_training_dataset["Embarked"] = training_data_embarked_cleaned.map({"S":0, "C":1, "Q":2})

clean_training_dataset.head()

In [None]:
# Passenger counts per fare band, one panel per Survived value
sns.catplot(data=clean_training_dataset, x="Fare_Band", col="Survived", kind="count")

In [None]:
# The raw Age column is no longer needed now that Age_Bucket exists
clean_training_dataset = clean_training_dataset.drop(columns=["Age"])

clean_training_dataset.head()

In [None]:
# The raw Fare column is no longer needed now that Fare_Band exists
clean_training_dataset = clean_training_dataset.drop(columns=["Fare"])


In [None]:
# Bare expression instead of print() so the notebook renders the frame as a rich table
clean_training_dataset.head()

In [None]:
unseen_data_df = pd.read_csv("datasets/titanic/test.csv") # Load the test (unseen) data into a dataframe


In [None]:
clean_unseen_df = unseen_data_df.filter(['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Age', 'Embarked'])

# Missing ages must be filled exactly as for the training set (with the training mean)
# before bucketing; otherwise those rows end up without an Age_Bucket.
clean_unseen_df["Age"] = clean_unseen_df["Age"].fillna(training_data["Age"].mean())

clean_unseen_df["Age_Bucket"] = create_age_buckets(clean_unseen_df["Age"])


In [None]:
# Map from the untouched raw column so that re-running this cell does not turn the
# already-encoded 0/1 values into NaN.
clean_unseen_df["Sex"] = unseen_data_df["Sex"].map({"male":0, "female":1})


In [None]:

# Same treatment as the training set: fill nulls with the most common training port, then
# encode. Mapping from the raw column keeps the cell safe to re-run.
clean_unseen_df["Embarked"] = (
    unseen_data_df["Embarked"]
    .fillna(training_data["Embarked"].mode()[0])
    .map({"S":0, "C":1, "Q":2})
)


In [None]:
# Band the unseen fares with the quartile edges of the TRAINING fares, so that a band
# means the same fare range in both sets. Running qcut on the unseen data alone would
# derive different edges from the unseen fares.
fare_bin_edges = np.array(pd.qcut(training_data["Fare"], 4, retbins=True)[1], dtype=float)
# Open the outer edges so fares outside the training range still fall into band 1 or 4
fare_bin_edges[0], fare_bin_edges[-1] = -np.inf, np.inf

# A missing fare would get no band, so fill it with the training median first
unseen_fare_filled = unseen_data_df["Fare"].fillna(training_data["Fare"].median())
clean_unseen_df["Fare_Band"] = pd.cut(unseen_fare_filled, bins=fare_bin_edges, labels=[1,2,3,4])

In [None]:
# The raw Fare column is replaced by Fare_Band
clean_unseen_df = clean_unseen_df.drop(columns=["Fare"])

In [None]:
# The raw Age column is replaced by Age_Bucket
clean_unseen_df = clean_unseen_df.drop(columns=["Age"])
clean_unseen_df.head()