# Titanic exploration
Taken from https://www.kaggle.com/competitions/titanic

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

## Load dataset

In [2]:
# Read the three Kaggle Titanic CSVs (paths are relative to the working
# directory): the labelled training set, the test set, and
# gender_submission.csv, which is kept as an example of the submission layout.
titanic_train, titanic_test, titanic_submission_ex = (
    pd.read_csv(csv_path)
    for csv_path in (
        "titanic/train.csv",
        "titanic/test.csv",
        "titanic/gender_submission.csv",
    )
)

## Layout

In [3]:
# Schema overview of the training frame: column labels as a NumPy array,
# followed by the dtype pandas inferred for each column.
print("Columns:\n", titanic_train.columns.to_numpy())
print("Types:\n", titanic_train.dtypes)
# Last expression of the cell -> rich display of the first five rows.
titanic_train.head(n=5)

Columns:
 ['PassengerId' 'Survived' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch'
 'Ticket' 'Fare' 'Cabin' 'Embarked']
Types:
 PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Peek at the example submission to see which columns a submission carries.
titanic_submission_ex.head(n=5)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [5]:
# Summary statistics for every column; include="all" adds the non-numeric
# columns (count / unique / top / freq) next to the numeric ones.
titanic_train.describe(include="all")

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Braund, Mr. Owen Harris",male,,,,347082.0,,B96 B98,S
freq,,,,1,577,,,,7.0,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


## Findings:
* PassengerId is just a running counter (row ID)
* Survived is a 2-class label (0 = No, 1 = Yes)
* Pclass is a 3 class label (socio-economic status,  1 = 1st, 2 = 2nd, 3 = 3rd)
* Name is a string -> pretty unusable unless we use name distance
* Sex is a 2 class label
* Age is a continuous property in range $[0.42, 80]$
* SibSp is a discrete sibling/spouse counter
* Parch is a discrete parent/children counter
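* Ticket is a ticket number (string); it is not unique per passenger (681 distinct values over 891 rows, the most common one occurring 7 times)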
* Fare is the continuous passenger fare
* Cabin is the cabin identifier (a string such as `C85`), not a continuous value
* Embarked is a 3-class label for the port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)

### Problems
* __Unbalanced with regards to the survival labels__
* Age has 177 missing values (only 714 of 891 rows have an age)
* Cabin is unknown for the majority of our data points
* Unbalanced on Sex -> there might be a bias in the data

In [6]:
# Number of rows that HAVE an age (non-missing values); the number of
# missing ages is the row count of the frame minus this value.
titanic_train['Age'].count()

714

In [7]:
# Passengers per sex, to check how balanced the training set is on this feature.
titanic_train.groupby(by="Sex")["Sex"].count()

Sex
female    314
male      577
Name: Sex, dtype: int64

In [8]:
# Rows per survival label (0 / 1), to check the class balance of the target.
titanic_train.groupby(by="Survived")["Survived"].count()

Survived
0    549
1    342
Name: Survived, dtype: int64

In [9]:
# Keep only the rows whose Age is present.
# NOTE: despite the "test_" prefix, this frame is derived from the TRAINING set.
has_age = titanic_train['Age'].notna()
test_age_nonnull = titanic_train[has_age]
# Survival label balance within the rows that have an age.
test_age_nonnull.groupby(by="Survived")["Survived"].count()

Survived
0    424
1    290
Name: Survived, dtype: int64

In [10]:
# Remove Name, Ticket, Fare, Cabin and Embarked from the age-complete subset,
# leaving PassengerId, Survived, Pclass, Sex, Age, SibSp and Parch.
test_age_nonnull_clean = test_age_nonnull.drop(
    labels=['Name', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
    axis="columns",
)
# Summary statistics of the reduced frame (numeric and non-numeric columns).
test_age_nonnull_clean.describe(include="all")

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch
count,714.0,714.0,714.0,714,714.0,714.0,714.0
unique,,,,2,,,
top,,,,male,,,
freq,,,,453,,,
mean,448.582633,0.406162,2.236695,,29.699118,0.512605,0.431373
std,259.119524,0.49146,0.83825,,14.526497,0.929783,0.853289
min,1.0,0.0,1.0,,0.42,0.0,0.0
25%,222.25,0.0,1.0,,20.125,0.0,0.0
50%,445.0,0.0,2.0,,28.0,0.0,0.0
75%,677.75,1.0,3.0,,38.0,1.0,1.0
