In [82]:
import numpy as np # linear algebra
import pandas as pd # data processing
import seaborn as sns 
import matplotlib.pyplot as plt
import missingno
from collections import  Counter

In [83]:
# concatenating the training and the testing data so we have more values to work with and therefor we can better approximate the missing values
# the combined data differentiates the training set and the test set by encoding them with 1 and 0 respectively
training = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
training['train_test'] = 1
test['train_test'] = 0
test['Survived'] = np.NaN
all_data = pd.concat([training,test])

# matplotlib inline
all_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'train_test'],
      dtype='object')

In [84]:
all_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,train_test
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1


### 1.2 Feature engineering
The data for the cabin and tickes has many classes and is very messy therefore we will try clean it up by leveraging these classes to create new variables that aren't in the training set, Furthermore we will look at if the persons title is related to that persons survival

#### 1.2.3 Title

In [85]:
training.Name.head()
# splits up the data into the persons title 
all_data['title'] = all_data.Name.apply(lambda x: x.split(',')[1].split('.')[0].strip())
all_data['title'].value_counts()


Mr              757
Miss            260
Mrs             197
Master           61
Rev               8
Dr                8
Col               4
Mlle              2
Major             2
Ms                2
Lady              1
Sir               1
Mme               1
Don               1
Capt              1
the Countess      1
Jonkheer          1
Dona              1
Name: title, dtype: int64

#### 1.2.1 Grouping Ages

In [86]:
# # grouping ages 
# bins = [0,16,21,26,31,36,41,46,51,56,61,66,71,76,81,86]
# #labels = ['0-15','16-20','21-25','26-30','31-35','36-40','41-45','46-50','51-55','56-60','61-65','71-75','76-80','81-85','86-x']
# labels = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
# all_data['Age'] = pd.cut(all_data['Age'], bins=bins, labels=labels, right=False)
# all_data.head()

#### 1.2.2 Family Size

In [87]:
# feature engineering 
all_data['FamilySize']=all_data['Parch']+all_data['SibSp']+1
all_data=all_data.drop(['SibSp'],axis=1)
all_data=all_data.drop(['Parch'],axis=1)
all_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Ticket,Fare,Cabin,Embarked,train_test,title,FamilySize
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,A/5 21171,7.25,,S,1,Mr,2
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,PC 17599,71.2833,C85,C,1,Mrs,2
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,STON/O2. 3101282,7.925,,S,1,Miss,1
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,113803,53.1,C123,S,1,Mrs,2
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,373450,8.05,,S,1,Mr,1


In [88]:
# # Mean of survival by sex

# all_data[['title','Age']].groupby('title', as_index = False).mean().sort_values(by = 'Age', ascending = False)
# # all_data[['Sex', 'Survived']].groupby('Sex', as_index = False).mean().sort_values(by = 'Survived', ascending = False)

In [89]:
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    float64
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1046 non-null   float64
 6   Ticket       1309 non-null   object 
 7   Fare         1308 non-null   float64
 8   Cabin        295 non-null    object 
 9   Embarked     1307 non-null   object 
 10  train_test   1309 non-null   int64  
 11  title        1309 non-null   object 
 12  FamilySize   1309 non-null   int64  
dtypes: float64(3), int64(4), object(6)
memory usage: 143.2+ KB


In [90]:
# all_data.dtypes
# all_data.groupby(['title'])['Age'].mean()
# all_data.groupby(['title'])['Age'].median()

# Apply fillna function within each group
#all_data['Age'] = all_data.groupby(['title']).transform(lambda x: x.fillna(x.mean()))
# all_data.groupby('title', as_index=False)['Age'].median()

# for index, row in all_data.iterrows():
#   title = row['title']
#   age = row['Age']
#   if np.isnan(age):
#     title_median_array = all_data.groupby('title', as_index=False)['Age'].median()
#     if(all_data.loc[index,'title']=='Mr')
    
#     elif(all_data.loc[index,'title']=='Mrs')

#     elif(all_data.loc[index,'title']=='Master')

#     elif(all_data.loc[index,'title']=='Miss')

#     print(f'title : {title_median}')
#     all_data.loc[index, 'Age'] = title_median
#   # else:
#   #   print(age, title)

all_data['Age'].fillna(all_data.groupby('title')['Age'].transform('median'))

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
413    29.0
414    39.0
415    38.5
416    29.0
417     4.0
Name: Age, Length: 1309, dtype: float64

In [91]:
all_data.groupby('title', as_index=False)['Age'].median()


Unnamed: 0,title,Age
0,Capt,70.0
1,Col,54.5
2,Don,40.0
3,Dona,39.0
4,Dr,49.0
5,Jonkheer,38.0
6,Lady,48.0
7,Major,48.5
8,Master,4.0
9,Miss,22.0


In [92]:
# all_data = all_data.sort_values(['title'])
# srs = all_data.dropna().groupby(['title']).agg({'Age':'mean'})['Age']
# srs.index = all_data[all_data['Age'].isnull()].index
# all_data['Age'] = all_data['Age'].fillna(value=srs)