
## Titanic Dataset Problem

### Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
import re

## Dataset

In [None]:
# Locations of the CSV files, relative to the notebook's working directory.
DATA_DIR = "data"
train_data_path = f"{DATA_DIR}/train.csv"
test_data_path = f"{DATA_DIR}/test.csv"
submission_data_path = f"{DATA_DIR}/gender_submission.csv"

In [None]:
# Read the training CSV into a DataFrame and preview its first rows.
df_train = pd.read_csv(train_data_path)
df_train.head()

### Dataset Stats

In [None]:
# Column names, dtypes and non-null counts of the training frame.
df_train.info()

In [None]:
# Summary statistics for the numeric columns.
df_train.describe()

In [None]:
# Summary statistics (count, unique, top, freq) for the object-typed columns.
df_train.describe(include='object')

In [None]:
# m: number of rows in the training frame.
m = len(df_train)
print(m)
# Non-missing entries per column, to compare against m.
df_train.count()

In [None]:
# Number of distinct values in each column.
df_train.nunique()

In [None]:
# Bucket Age into ten-year-wide intervals with edges 0, 10, ..., 80.
# include_lowest=True closes the first interval on the left so the lowest edge is included.
# Alternative, uneven edges that were tried:
# age_bins = [0, 5, 12, 17, 22, 35, 45, 60, 80]
age_bins = np.arange(start=0, stop=81, step=10)
df_train["Age_Range"] = pd.cut(df_train["Age"], bins=age_bins, include_lowest=True)

df_train["Age_Range"].unique()

### Dataset Visualization

In [None]:
# Setting up visualisations.
# The base style is passed directly to sns.set(): a separate sns.set_style('white')
# call placed before sns.set() has no lasting effect, because sns.set() re-applies
# its own default style ("darkgrid"). The rc entries below still override the
# corresponding values of the chosen style.
sns.set(
    style='white',
    font_scale=1,
    rc={
        'figure.figsize': (10, 6),
        'axes.facecolor': '#eee',
        'axes.grid': True,
        'grid.color': '.9',
        'axes.linewidth': 1.0,
        'grid.linestyle': u'-',
    },
)
# Default colour cycle used by the plots that follow ("C0", "C1", ... refer to it).
custom_colors = ["#3498db", "#95a5a6", "#34495e", "#2ecc71", "#e74c3c"]
sns.set_palette(custom_colors)

In [None]:
# Heatmap of the boolean notna() mask: each cell is True where a value is present,
# so missing entries show up as the contrasting colour.
missing_val_heatmap = sns.heatmap(df_train.notna(), cbar=False, cmap="Blues")
missing_val_heatmap.set_title("Missing Val Heatmap")


In [None]:
# Share of rows per Survived value, drawn as horizontal bars.
survival_ratio = df_train["Survived"].value_counts(normalize=True)
survival_ratio.plot(kind="barh", color=["black", "lightblue"])
plt.title("Training Data - Ratio of Survival and Death")

In [None]:
# Passenger counts per class and the sorted class labels (reused by later cells).
pclass_dist = df_train.Pclass.value_counts()
pclass_vals = np.sort(pclass_dist.index)
# Normalise by the counted total instead of the notebook-level `m`: `m` is
# reassigned to small constants by later cells, so re-running this cell after
# them would silently produce wrong ratios.
pclass_ratio = pclass_dist / pclass_dist.sum()
pclass_ratio.plot.pie(autopct="%1.0f%%")

plt.legend(labels="Class " + pclass_ratio.index.astype(str))
plt.title("Training Data - People Traveling in different Classes")

In [None]:
# Survivors and deaths per class, derived from the per-class totals in pclass_dist.
pclass_survival_dist = df_train[df_train.Survived == 1].Pclass.value_counts()
pclass_death_dist = pclass_dist - pclass_survival_dist

# Per-class survival share and its complement.
pclass_survival_ratio = pclass_survival_dist / pclass_dist
pclass_death_ratio = 1 - pclass_survival_ratio

# The Series names become the column names / legend entries of the bar chart.
pclass_survival_ratio.name = "Survival Ratio"
pclass_death_ratio.name = "Death Ratio"

pclass_death_ratio_df = pd.concat(
    (pclass_survival_ratio, pclass_death_ratio),
    axis=1,
)
pclass_death_ratio_df.plot.bar()

# Title without the stray embedded quote characters that were rendered on the figure.
plt.title("Training Data - Ratio of people survived as per class")

In [None]:
# One Age density curve per passenger class, all drawn on the same axes.
for pclass_val in pclass_vals:
    ages_in_class = df_train.loc[df_train.Pclass == pclass_val, "Age"]
    ages_in_class.plot(kind="density")

plt.title("Age Density in Classes")
plt.legend(pclass_vals)
plt.show()

In [None]:
# Totals, survivors and deaths per sex.
survived_mask = df_train.Survived == 1
sex_dist = df_train.Sex.value_counts()
sex_survival_dist = df_train.loc[survived_mask, "Sex"].value_counts()
sex_death_dist = sex_dist - sex_survival_dist

# Two-row table of shares per sex: row 1 = survived, row 0 = died.
sex_survival_share = sex_survival_dist / sex_dist
sex_death_share = sex_death_dist / sex_dist
sex_survival_status_dist = pd.DataFrame([sex_survival_share, sex_death_share], index=[1, 0])
sex_survival_status_dist

In [None]:
# Transpose so each sex is one bar group; the columns are then 1 and 0,
# which the legend relabels as "Survived" and "Dead" in that order.
sex_survival_status_dist.T.plot.barh()
plt.legend(labels = ["Survived", "Dead"])

plt.show()

In [None]:
# td['Fare_Category'] = pd.cut(td['Fare'], bins=[0,7.90,14.45,31.28,120], labels=['Low','Mid',
# sns.countplot(x = "Age_Range", hue = "Survived", data = td, palette=["C1", "C0"]).legend(labels = ["Deceased", "Survived"])

In [None]:
# Passenger counts per age bucket, split by survival outcome.
# Hue level 0 is drawn with "C1" and level 1 with "C0"; the legend labels follow that order.
sns.countplot(x="Age_Range", data=df_train, hue="Survived", palette=["C1", "C0"])
# Give the figure a real title so it stands alone (it was previously set to an empty string).
plt.title("Training Data - Survival Count by Age Range")
plt.legend(labels=["Deceased", "Survived"])


In [None]:
# Passenger totals and survivors per age bucket, both ordered by bucket.
age_range_dist = df_train.Age_Range.value_counts().sort_index()

age_range_survival_dist = df_train[df_train.Survived == 1].Age_Range.value_counts().sort_index()
age_range_death_dist = age_range_dist - age_range_survival_dist

# Survival share per bucket and its complement.
age_range_survival_ratio = age_range_survival_dist / age_range_dist
age_range_death_ratio = 1 - age_range_survival_ratio

# One row per age bucket, one column per outcome (label typo "Deasesed" corrected).
age_range_survival_status_df = pd.DataFrame(
    [age_range_survival_ratio, age_range_death_ratio],
    index=["Survived", "Deceased"],
).T

age_range_survival_status_df.plot.bar()


In [None]:
# NOTE(review): this cell repeats the per-class survival computation and plot
# from an earlier cell verbatim; consider removing one of the two.
pclass_survival_dist = df_train[df_train.Survived == 1].Pclass.value_counts()
pclass_death_dist = pclass_dist - pclass_survival_dist

# Per-class survival share and its complement.
pclass_survival_ratio = pclass_survival_dist / pclass_dist
pclass_death_ratio = 1 - pclass_survival_ratio

# The Series names become the column names / legend entries of the bar chart.
pclass_survival_ratio.name = "Survival Ratio"
pclass_death_ratio.name = "Death Ratio"

pclass_death_ratio_df = pd.concat(
    (pclass_survival_ratio, pclass_death_ratio),
    axis=1,
)
pclass_death_ratio_df.plot.bar()

# Title without the stray embedded quote characters that were rendered on the figure.
plt.title("Training Data - Ratio of people survived as per class")

In [None]:
# Age distribution: density-normalised histogram with a KDE overlay.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its replacement.
a = sns.histplot(df_train.Age, bins=30, kde=True, stat="density")

In [None]:
# Column labels currently present on the training frame.
df_train.columns

In [None]:
# Working copy with the survival flag and both family-count columns,
# plus Fam = SibSp + Parch.
fms = df_train[["Survived", "SibSp", "Parch"]].assign(
    Fam=lambda frame: frame.SibSp + frame.Parch
)

fms.head()

In [None]:
# Passenger counts per SibSp value, split by the Survived flag.
sns.countplot(x="SibSp", data=df_train, hue="Survived", palette=["C1", "C0"])

In [None]:
# Passenger counts per Parch value, split by the Survived flag.
sns.countplot(x="Parch", data=df_train, hue="Survived", palette=["C1", "C0"])

In [None]:
# Passenger counts per combined family size (Fam), split by the Survived flag.
sns.countplot(x="Fam", data=fms, hue="Survived", palette=["C1", "C0"])

In [None]:
# How many passengers fall into each Fam value.
fms.Fam.value_counts()

In [None]:
# Passenger totals per family size, ordered by size.
fam_dist = fms.Fam.value_counts().sort_index()

# Survivors per family size, aligned to every size in fam_dist. Sizes with no
# survivors get an explicit 0; previously they were absent, so the subtraction
# below produced NaN and the fillna(0) recorded 0 deaths for groups in which
# everyone died.
fam_survival_dist = (
    fms[fms.Survived == 1].Fam.value_counts()
    .reindex(fam_dist.index, fill_value=0)
)
fam_death_dist = fam_dist - fam_survival_dist

# Survival share per family size and its complement.
fam_survival_ratio = fam_survival_dist / fam_dist
fam_death_ratio = 1 - fam_survival_ratio

# One row per family size; column 1 = survived share, column 0 = died share.
fam_survival_status_df = pd.DataFrame([fam_survival_ratio, fam_death_ratio], index=[1, 0]).T

fam_survival_status_df

In [None]:
# Survival and death shares per family size, side by side as columns.
pd.concat([fam_survival_ratio, fam_death_ratio], axis=1)

In [None]:
# Passenger counts per embarkation port, split by the Survived flag.
# The category order is pinned so the hard-coded tick labels below always match
# their bars instead of depending on the order values happen to appear in the data.
# NOTE(review): assumes the Embarked codes are "S", "C" and "Q" — confirm against the data.
embarked_order = ["S", "C", "Q"]
emb_plt = sns.countplot(
    x="Embarked",
    data=df_train,
    hue="Survived",
    palette=["C1", "C0"],
    order=embarked_order,
)
emb_plt.set_xticklabels(["SA", "CE", "QT"])

In [None]:
# Rows whose Embarked value is missing.
df_train[df_train.Embarked.isna()]

In [None]:
# Embarked counts among female passengers in class 1.
df_train[(df_train.Pclass==1) & (df_train.Sex=="female")].Embarked.value_counts()

In [None]:
# Most frequent Embarked value (first entry of the mode).
df_train.Embarked.mode()[0]

In [None]:
# Replace missing Embarked values with the most frequent one, then confirm
# that no missing rows remain (the displayed frame should be empty).
most_common_port = df_train["Embarked"].mode()[0]
df_train["Embarked"] = df_train["Embarked"].fillna(most_common_port)
df_train[df_train["Embarked"].isna()]

In [None]:
# Title = the text between the first comma and the following period of Name,
# with surrounding whitespace removed.
df_train["Title"] = df_train["Name"].map(
    lambda full_name: full_name.split(",")[1].partition(".")[0].strip()
)
print(df_train["Title"].unique())

# Title frequencies among female passengers.
df_train.loc[df_train.Sex == "female", "Title"].value_counts()

In [None]:
# Number of distinct titles extracted from Name.
df_train["Title"].nunique()

In [None]:
# Groupings used to impute Age: by (Sex, Pclass), and more finely by (Sex, Pclass, Title).
grp = df_train.groupby(["Sex", "Pclass"])
grp1 = df_train.groupby(["Sex", "Pclass", "Title"])

# Age with missing values replaced by the median of the passenger's group.
# transform() always returns a result aligned to df_train's index, whereas
# groupby.apply() prepends the group keys to the index on pandas >= 2.0.
grp.Age.transform(lambda x: x.fillna(x.median()))

In [None]:
# Sorted imputed ages (group median per Sex/Pclass) for the rows whose Age was missing.
# transform() keeps df_train's index, so the boolean mask below aligns;
# with groupby.apply() on pandas >= 2.0 the result carries the group keys in its
# index and the mask can no longer be applied.
grp.Age.transform(lambda x: x.fillna(x.median()))[df_train.Age.isna()].sort_values().reset_index().Age.plot()

In [None]:
# Sorted imputed ages (group median per Sex/Pclass/Title) for the rows whose Age was missing.
# transform() keeps df_train's index, so the boolean mask below aligns;
# with groupby.apply() on pandas >= 2.0 the result carries the group keys in its
# index and the mask can no longer be applied.
grp1.Age.transform(lambda x: x.fillna(x.median()))[df_train.Age.isna()].sort_values().reset_index().Age.plot()

In [None]:
# Age density curves, one per Survived value.
sns.kdeplot(data=df_train, x="Age", hue="Survived", palette=["C1", "C0"])

In [None]:
# Bar per SibSp value showing the aggregated Survived value (seaborn's bar estimator).
sns.catplot(x = 'SibSp', y = 'Survived', data = df_train, kind = 'bar', palette='mako')


In [None]:
# Column labels after the Age_Range and Title columns were added.
df_train.columns

In [None]:
# Short aliases for frequently used df_train columns.
survive = df_train["Survived"]
pclass = df_train["Pclass"]
sex = df_train["Sex"]
age = df_train["Age"]
fare = df_train["Fare"]
embarked = df_train["Embarked"]


In [None]:
# Age distribution with finer bins: density-normalised histogram with a KDE overlay.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its replacement.
sns.histplot(x=df_train.Age, bins=40, kde=True, stat="density")

In [None]:
# Joint plot of Age against Fare with marginal distributions.
sns.jointplot(x=age, y=fare)

In [None]:
# Same Age/Fare joint plot, drawn with hexagonal bins.
sns.jointplot(x=age, y=fare, kind="hex")

In [None]:
# Fresh read of the training CSV restricted to the original modelling columns
# (unaffected by the columns added to df_train above).
d = pd.read_csv(train_data_path).loc[
    :,
    ["Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"],
]

In [None]:
# Pairwise relationship grid over the columns of d.
sns.pairplot(d)


In [None]:
# Mean of Survived per passenger class.
# The previous y argument was the Pclass column itself, which plots each class
# against its own value and carries no information.
# NOTE(review): y="Survived" is the presumed intent — confirm.
sns.barplot(x="Pclass", y="Survived", data=d)

In [None]:
# Number of passengers per class.
# The column is passed by keyword: recent seaborn versions treat the first
# positional argument as `data`, not as the x variable.
sns.countplot(x="Pclass", data=d)

In [None]:
# Individual Age observations for each passenger class.
sns.swarmplot(data=d, x="Pclass", y="Age")


In [None]:
# Annotated correlation heatmap of the numeric columns of d.
# d also holds string columns (Sex, Embarked); DataFrame.corr() raises on those
# in pandas >= 2.0, so they are excluded explicitly first.
sns.heatmap(d.select_dtypes(include="number").corr(), annot = True, cmap = 'viridis')


In [None]:
# Hierarchically clustered correlation matrix of the numeric columns of d.
# String columns are excluded explicitly because DataFrame.corr() raises on
# them in pandas >= 2.0.
sns.clustermap(d.select_dtypes(include="number").corr(), annot=True, cmap='viridis', figsize=(6,6))


In [None]:
# Boolean upper-triangle mask with the shape of df_train's Pearson correlation matrix.
# - Only numeric columns are correlated: DataFrame.corr() raises on string/categorical
#   columns in pandas >= 2.0.
# - The builtin `bool` replaces the `np.bool` alias, which was removed in NumPy 1.24.
mask = np.triu(
    np.ones_like(
        df_train.select_dtypes(include="number").corr(method="pearson"),
        dtype=bool,
    )
)


In [None]:
# Age distribution in a grid of panels: one column per Pclass, one row per Survived value.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is its replacement.
sns.FacetGrid(df_train, col = 'Pclass', row = 'Survived').map(sns.histplot, 'Age', kde=True, stat='density')


In [None]:
# Fare against Age with a separate regression fit per Survived value.
sns.lmplot(x = 'Age', y = 'Fare', data = d, hue = 'Survived', palette=["pink", "lightblue"])



In [None]:
# Mean of Survived for every (Sex, Age) pair, using only rows with a known Age.
train_with_age = df_train[df_train["Age"].notna()]
survivalpc_by_age = train_with_age.groupby(["Sex", "Age"], as_index=False)["Survived"].mean()

# One 4th-order polynomial fit of survival share against age per sex.
for gender in ["male", "female"]:
    # lmplot is figure-level and creates its own figure, so no plt.figure() call
    # is made here (it would only leave an empty extra figure per iteration).
    grid = sns.lmplot(
        data=survivalpc_by_age[survivalpc_by_age["Sex"] == gender],
        x="Age",
        y="Survived",
        order=4,
    )
    grid.ax.set_title("%s survival by age" % gender)
    grid.set(xlim=(0, 80), ylim=(0, 1))

In [None]:
# NOTE(review): bare literal; it only displays 1 and has no other effect — likely a leftover scratch cell.
1

In [None]:
# Two example metric values combined by the cells below.
# NOTE(review): presumably p = precision and r = recall — confirm.
p = .73
r = .77

In [None]:
# Harmonic mean of p and r (the F1 formula if p and r are precision and recall).
2 * (r * p) / (
     r + p 
)

In [None]:
# Harmonic-mean style combination of p and r after scaling them by m/s and n/s.
# NOTE: this reassigns the notebook-level name `m`, which an earlier cell set to
# the number of rows in df_train.
m = 1           # False Pos
n = 5           # False Neg
s = m+n

# 2 * (wp * wr) / (wp + wr) with wp = (m/s)*p and wr = (n/s)*r
(1) * 2 * ((m/s)*p * (n/s)*r) / (
         (m/s)*p + (n/s)*r 
)

In [None]:
# Weighted arithmetic mean of p and r with weights m and n (equal weights here).
# NOTE: reassigns the notebook-level name `m` again.
m = 1           # False Pos
n = 1           # False Neg
s = m+n

(p * m + r * n) / s

In [None]:
def score(p, r, m = 1, n = 1):
    """Return the weighted arithmetic mean of ``p`` and ``r``.

    Args:
        p: First value; weighted by ``m``.
        r: Second value; weighted by ``n``.
        m: Weight applied to ``p`` (default 1).
        n: Weight applied to ``r`` (default 1).

    Returns:
        ``(p * m + r * n) / (m + n)``.

    Raises:
        ZeroDivisionError: If ``m + n`` equals zero.
    """
    weighted_total = p * m + r * n
    weight_sum = m + n
    return weighted_total / weight_sum
# Score six (p, r) candidates with weight 6 on p and weight 1 on r.
# NOTE: reassigns the notebook-level names `m` and `n`.
m = 6
n = 1

tuple(
    score(p_val, r_val, m, n)
    for p_val, r_val in [(.75, .75), (.73, .75), (.73, .77), (.75, .70), (.74, .72), (.73, .71)]
)

In [None]:
# NOTE(review): bare tuple with no computation; possibly a hand-written ranking of
# the six candidates scored in the neighbouring cells — confirm or remove.
1, 4, 3, 5, 2, 6

In [None]:
# Score the same six (p, r) candidates with weight 4 on p and weight 1 on r.
tuple(
    score(p_val, r_val, 4, 1)
    for p_val, r_val in [(.75, .75), (.73, .75), (.73, .77), (.75, .70), (.74, .72), (.73, .71)]
)


In [None]:
# Title frequencies among the rows whose Age is missing.
df_train[df_train.Age.isna()].Title.value_counts()

In [None]:
# All rows whose extracted Title is "Dr".
df_train[df_train.Title=="Dr"]

In [None]:
# Frequency of every extracted Title across the training frame.
df_train.Title.value_counts()