## Motivation
Well, I decided to start competing in Kaggle competitions, and I need as much practice as possible.
## Disclaimer: 
All the code here is inspired by the following [video](https://www.youtube.com/watch?v=I3FBJdiExcg&t=1s)  

### Step 1: Get a general idea of the problem by reading the corresponding documentation


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

In [None]:
# Load the training data and use the passenger id as the row index.
df = pd.read_csv("train.csv").set_index("PassengerId")
print(df.head())

### Descriptive analysis
Let's try to understand the data.

In [None]:
# Summary statistics of the numeric columns of the training data.
overview = df.describe()
print(overview)

In [None]:
# Lower-case every column label, then give the most used columns short names.
df = df.rename(columns=str.lower)
print(df.columns)
short_names = {"survived": "y", "embarked": "from", "pclass": "c"}
df = df.rename(columns=short_names)

In [None]:
# Column groups: numeric features (target "y" included) and categorical ones.
X_num_names = ["y", "c", "age", "sibsp", "parch", "fare"]
X_cat_names = ["name", "sex", "ticket", "cabin", "from"]
df_num, df_cat = df[X_num_names], df[X_cat_names]

In [None]:
# Summary statistics restricted to the numeric columns; left as a bare
# expression so the notebook displays the resulting table.
df_num.describe()


In [None]:
# One histogram per numeric column, each shown in its own figure.
for column_name, column_values in df_num.items():
    plt.hist(column_values)
    plt.title(column_name)
    plt.show()

In [None]:
# Split the passengers by outcome and summarise each group separately.
survived_mask = df["y"] == 1
dead_mask = df["y"] == 0
df_survive = df[survived_mask]
df_dead = df[dead_mask]

print(df_survive.describe())
print("#" * 50)
print(df_dead.describe())

In [None]:
# Aggregations are given by name: pandas resolves these strings to its own
# group-wise implementations, whereas passing the NumPy callables
# (np.sum, np.mean, ...) to agg is deprecated in recent pandas releases.
fun_list = ["count", "sum", "mean", "std", "max", "min"]

# Fare statistics per passenger class, computed separately for survivors
# and non-survivors.
df_survived_class = df_survive.groupby("c").agg({"fare": fun_list})
df_dead_class = df_dead.groupby("c").agg({"fare": fun_list})
print(df_survived_class)
print(df_dead_class)

def age_division(age):
    """Return the age-group label for ``age``.

    Groups: "kid" (<= 12), "teen" (<= 18), "adult" (<= 40) and "elderly"
    (above 40). A missing age (``None`` or NaN) yields "unknown": every
    comparison with NaN is false, so without this guard a missing age
    would fall through to "elderly".
    """
    # NaN is the only value that is not equal to itself.
    if age is None or age != age:
        return "unknown"
    if age <= 12:
        return "kid"
    if age <= 18:
        return "teen"
    if age <= 40:
        return "adult"
    return "elderly"

# df_survived_age = df_survive.set_index("age").groupby(age_division).agg({"age":fun_list})
# df_dead_age = df_dead.set_index("age").groupby(age_division).agg({"age":fun_list})

# print(df_survived_age)
# print(df_dead_age) 
# print(len(df_survive[df_survive["age"] >= 41]))

In [None]:
# Pairwise correlation matrix of the numeric columns (the target "y" is one
# of them); left as a bare expression so the notebook displays the table.
df_num.corr()

In [None]:
# Per outcome: number of non-missing values and mean of class, age and fare.
# The aggregations are named by string because passing NumPy callables such
# as np.mean to pandas aggregations is deprecated in recent pandas releases.
print(pd.pivot_table(df, index="y", values=["c", "age", "fare"], aggfunc=["count", "mean"]))

In [None]:
# Passenger counts per outcome, broken down by each categorical variable.
for category in ("c", "sex", "from"):
    print(pd.pivot_table(df, index="y", columns=category, values="ticket", aggfunc="count"))
# At first glance passengers who embarked at "C" seem more likely to survive.
# Still, the social class of the people boarding at each station should be
# taken into account.

class_by_station = pd.pivot_table(df, index="c", columns="from", values="name", aggfunc="count")
print(class_by_station)
# This last table does not provide enough evidence to completely rule out a
# positive correlation between the embarkment point and survival; more
# investigation is needed.


In [None]:

# Work on a copy so the encoded columns do not overwrite the frame built so far.
df_better = df.copy()
# 0: male, 1: female
genre_mapper = {"male": 0, "female": 1}
# Series.map keeps the frame's PassengerId index. A pd.Series built from a
# plain list gets a fresh 0..n-1 index instead, and column assignment aligns
# on index labels, so the encoded values would land on the wrong passengers.
df_better["sex"] = df_better["sex"].map(genre_mapper)

from_mapper = {"C": 1, "Q": 2, "S": 3}
# Values without an entry in the mapper (e.g. missing ports) become NaN,
# hence the float dtype.
df_better["from"] = df_better["from"].map(from_mapper).astype(float)
# print(df_better.head())
print(df_better.loc[:, ["y", "from"]].corr())

In [None]:
# Relate the embarkment station to passenger class and fare.
station_groups = df_better.groupby("from")
from_class_ana = station_groups.agg({"c": ["count", "mean"], "fare": "mean"})
print(from_class_ana)
# Author's reading of the table: the embarkment station has little to no
# correlation with the social class.

station_class_counts = pd.pivot_table(
    df_better, values="ticket", index="from", columns="c", aggfunc="count"
)
print(station_class_counts)


In [None]:
# Understanding the cabin column: how many entries are missing / present?
cabin_column = df_better["cabin"]

print(cabin_column.isna().sum())
print(cabin_column.notna().sum())
# Author's note from the output: only 204 passengers have a cabin recorded.

In [None]:
def _cabin_count(cabin):
    """Count the space-separated entries of a cabin value; 0 when it is missing."""
    if pd.isna(cabin):
        return 0
    return len(cabin.split(" "))


# Assumption: several cabins in one field are separated by single spaces.
df_better["num_cabins"] = df_better["cabin"].apply(_cabin_count)
print(df_better["num_cabins"].value_counts())
# Relation between the number of cabins and the social class.
cabins_by_class = pd.pivot_table(
    df_better, values="ticket", index="num_cabins", columns="c", aggfunc="count"
)
print(cabins_by_class)

In [None]:
# Understanding the fare column.
fare_df = df_better[["fare"]]
fare_s = df_better["fare"]
print(fare_df.describe())

# Number of missing fares (the author found none when this was run).
missing_fares = fare_s.isna().sum()
print(missing_fares)

# Number of passengers whose fare is exactly zero.
zero_fares = (fare_s == 0).sum()
print(zero_fares)

In [None]:
# Snapshot of ``df`` as it stands before the rebinding below: columns are
# already lower-cased and renamed, but none of the df_better changes are in it.
df_original = df.copy()
# ``df`` now refers to the very same object as ``df_better`` (no copy is made).
df = df_better
# now df is the dataframe with all the modifications

In [None]:
# Split the passengers by whether their recorded fare is zero.
df_no_fare = df[df["fare"] == 0]
# .copy() makes df_fare an independent frame, so adding a column to it later
# neither touches ``df`` nor triggers pandas' SettingWithCopyWarning.
df_fare = df[df["fare"] != 0].copy()
fare_np = df_fare["fare"].values

# Quantile levels: minimum, the three quartiles and maximum.
quantiles_values = [0, 0.25, 0.5, 0.75, 1]
# Fare boundaries at those levels, computed per passenger class (1, 2, 3)
# from the non-zero fares only.
fare_quantiles = {}
for i in range(1, 4):
    class_fares = df_fare.loc[df_fare["c"] == i, "fare"].values
    fare_quantiles["fare_q_c" + str(i)] = np.quantile(class_fares, quantiles_values)

for key, value in fare_quantiles.items():
    print(f"{key}: {value}")

def quartile_number(value, quantiles):
    """Return the 1-based number of the quantile interval containing ``value``.

    ``quantiles`` is an ascending sequence of boundaries. Interval ``k`` is
    half-open, ``[quantiles[k - 1], quantiles[k])``; a value at or above the
    last boundary is placed in the last interval, ``len(quantiles) - 1``.

    Raises:
        ValueError: if ``value`` is below ``quantiles[0]`` or is NaN.
    """
    # Validate against the boundaries that were passed in. ``not >=`` (rather
    # than ``<``) also rejects NaN, for which every comparison is false.
    if not value >= quantiles[0]:
        raise ValueError(
            "value " + str(value) + " is below the lowest quantile " + str(quantiles[0])
        )
    for number, (low, high) in enumerate(zip(quantiles, quantiles[1:]), start=1):
        if low <= value < high:
            return number
    return len(quantiles) - 1

def classify_passenger(row):
    """Return the fare quartile number of a passenger within their own class.

    ``row`` must provide ``'fare'`` and ``'c'``. The class picks the entry of
    the module-level ``fare_quantiles`` whose boundaries the fare is ranked
    against via ``quartile_number``.
    """
    class_key = f"fare_q_c{int(row['c'])}"
    class_boundaries = fare_quantiles[class_key]
    return quartile_number(row["fare"], class_boundaries)


# Rank every paying passenger's fare within their class and preview the result.
fare_and_class = df_fare[["fare", "c"]]
df_fare["quartile_class"] = fare_and_class.apply(classify_passenger, axis=1)
print(df_fare[["fare", "c", "quartile_class"]].head(15))

# fare_quantile = np.quantile(fare_np, [0, 0.25, 0.5, 0.75, 1])
# df_better["class_fare_quantile"] = df_fare.loc[:, ['fare', 'c']].apply()
# print(fare_quantile)


In [None]:
# Relationship between the per-class fare quartile and survival.
fare_class_quartile_effect = pd.pivot_table(
    df_fare, values="name", index="y", columns=["c", "quartile_class"], aggfunc="count"
)
print(fare_class_quartile_effect)
print("#" * 50)
# Extra row labelled 2: survivors divided by non-survivors for every
# (class, quartile) column.
survivor_counts = fare_class_quartile_effect.loc[1]
non_survivor_counts = fare_class_quartile_effect.loc[0]
fare_class_quartile_effect.loc[2] = survivor_counts / non_survivor_counts
print(fare_class_quartile_effect)


In [61]:

# Mean non-zero fare per passenger class, as a Series indexed by class.
# Selecting the column before aggregating always yields a Series (squeeze()
# would collapse a single-class result to a scalar), and the built-in mean
# avoids passing np.mean to agg, which is deprecated in recent pandas.
fare_by_class_mean = df_fare.groupby("c")["fare"].mean()
print(fare_by_class_mean)
def fill_up_fare(row):
    """Return ``row`` with a zero fare replaced by the mean fare of its class.

    The replacement value is looked up in the module-level
    ``fare_by_class_mean`` using ``row['c']``. Rows whose fare is not zero are
    returned as they are. The row that was passed in is never modified:
    pandas does not support functions given to ``DataFrame.apply`` mutating
    their argument, so the change is made on a copy.
    """
    if row["fare"] != 0:
        return row
    filled = row.copy()
    filled["fare"] = fare_by_class_mean[row["c"]]
    return filled

# The author observed a positive correlation between quartile_class and
# survival, so zero-fare rows need a usable fare: the class mean is taken as
# a reasonable imputation.
df = df.apply(fill_up_fare, axis=1)

# Add the new feature to the main dataframe.
fare_and_class = df[["fare", "c"]]
df["quartile_class"] = fare_and_class.apply(classify_passenger, axis=1)

# Correlation between survival and the quartile class.
print(df[["y", "quartile_class"]].corr())

c
1    86.148874
2    21.358661
3    13.787875
Name: fare, dtype: float64
                       y  quartile_class
y               1.000000        0.103309
quartile_class  0.103309        1.000000


In [None]:
# Correlation between the number of cabins and survival.
print(df[["y", "num_cabins"]].corr())

# Three features might reflect a passenger's social image:
# * class * fare * num_cabins
# Let's consider each individually.

cabins_class_counts = pd.pivot_table(
    df, values="ticket", index="y", columns=["num_cabins", "c"], aggfunc=["count"]
)
print(cabins_class_counts)

In [None]:
# Flag every training row with 1. A scalar is broadcast to all rows; a
# pd.Series built from a list would carry a fresh 0..n-1 index and be aligned
# against the frame's PassengerId index, leaving unmatched rows as NaN.
df['train_test'] = 1

In [None]:
# Load the test set and apply the same column renaming as for the training data.
df_test = pd.read_csv("test.csv")
df_test = df_test.rename(columns=str.lower)
print(df_test.columns)
df_test = df_test.rename(columns={"embarked": "from", "pclass": "c"})
# Flag test rows with 0 so they stay distinguishable from the training rows
# (flagged 1) if the two frames are combined; using 1 for both would make
# the column carry no information.
df_test['train_test'] = 0

In [None]:
# Show the test frame first, then the training frame.
for frame in (df_test, df):
    print(frame)