# Get missing ages with linear regression

In [39]:
import os
# Raw string: in a normal literal the "\U" in "C:\Users" starts a
# \UXXXXXXXX unicode escape, which is a SyntaxError on Python 3.
# NOTE(review): this is a machine-specific absolute path; the notebook will
# only run where this directory exists.
os.chdir(r'C:\Users\Lundi\Documents\Programming\Python\Kaggle\Titanic - 2015')

%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the v4 data set and discard the 'Unnamed: 0' column that comes in
# with the CSV (presumably a previously saved row index).
titanic_data_v4 = (
    pd.read_csv('Data/titanic_data_v4.csv')
    .drop(['Unnamed: 0'], axis=1)
)
titanic_data_v4.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Is_male,Embarked Q,Embarked S
0,1,0,3,22,1,0,7.25,1,0,1


## Preparing data

In [27]:
# Number of non-null entries per column; any column below the row count
# has missing values.
titanic_data_v4.notnull().sum()

PassengerId    891
Survived       891
Pclass         891
Age            714
SibSp          891
Parch          891
Fare           891
Is_male        891
Embarked Q     891
Embarked S     891
dtype: int64

I want a design matrix containing only the passengers whose ages are known. It will be used to train a linear regression model, which will then predict the ages of the passengers whose ages are missing.

First, let's make that design matrix:

In [28]:
# Keep only complete rows (dropna() removes any row that contains a NaN).
passengers_with_age = titanic_data_v4.dropna()

# Target: the known ages.
y = passengers_with_age['Age']

# Features: everything except the identifier, the survival label and the target.
X = passengers_with_age.drop(['PassengerId', 'Survived', 'Age'], axis=1)
X.head(2)

Unnamed: 0,Pclass,SibSp,Parch,Fare,Is_male,Embarked Q,Embarked S
0,3,1,0,7.25,1,0,1
1,1,1,0,71.2833,0,0,0


In [29]:
# Peek at the first five target values.
y.iloc[:5]

0    22
1    38
2    26
3    35
4    35
Name: Age, dtype: float64

#### X Data with missing ages (to predict)

In [30]:
# Feature rows for the passengers whose Age is missing (the rows to predict).
# .loc with a boolean mask replaces the removed DataFrame.ix indexer.
age_is_missing = titanic_data_v4['Age'].isnull()
X_without_age = titanic_data_v4.loc[age_is_missing, :].drop(['PassengerId','Survived','Age'], axis=1)
X_without_age.head(2)

Unnamed: 0,Pclass,SibSp,Parch,Fare,Is_male,Embarked Q,Embarked S
5,3,0,0,8.4583,1,1,0
17,2,0,0,13.0,1,0,1


## Using linear regression to get missing ages

In [31]:
import sklearn.linear_model as skl_lm
# sklearn.cross_validation was removed in scikit-learn 0.20 (its contents
# moved to sklearn.model_selection). Fall back so the skl_cv alias still
# resolves on current releases instead of raising ImportError.
try:
    import sklearn.cross_validation as skl_cv
except ImportError:
    import sklearn.model_selection as skl_cv

# Fit ordinary least squares on the passengers with known ages...
lr_reg = skl_lm.LinearRegression()
lr_reg.fit(X, y)

# ...then predict an age for every passenger whose age is missing.
age_predict = lr_reg.predict(X_without_age)

In [32]:
# Display the raw predictions returned by lr_reg.predict(X_without_age).
age_predict

array([ 31.27574649,  35.48942462,  23.10097007,  26.30288574,
        28.08226328,  28.32100301,  31.67130061,  28.0841446 ,
        26.30282458,  26.29311801,  28.31875766,  27.28637915,
        28.0841446 ,  18.31875173,  42.40454057,  40.48993092,
        21.51451006,  28.32100301,  28.31875766,  28.08359855,
        28.31875766,  28.31875766,  28.32100301,  28.32276202,
        23.95850815,  28.31875766,  31.28606026,  18.20902708,
        21.414674  ,  28.32949663,  28.30983886,  -5.24091331,
        38.13392799,  42.54396512,  15.61990345,  -8.44282898,
        33.43168893,  42.19340157,  24.08446348,  31.28606026,
        28.0841446 ,  -5.24091331,  27.39922931,  28.32100301,
        12.41798778,  25.12412265,  19.10887249,  24.08446348,
        28.33040671,  36.53841064,  31.28606026,  28.0841446 ,
        42.47006647,  28.0841446 ,  35.67872165,  42.54287302,
        40.48993092,  42.47734712,  28.0841446 ,  23.28669804,
        35.25990801,  28.31875766,  36.07706027,  -5.24

Many of the predicted ages are less than zero. Let's assign these a value of 0.5, assuming they are all toddlers:

In [33]:
# A negative age is impossible; overwrite those predictions in place with 0.5.
negative_age_mask = age_predict < 0
np.putmask(age_predict, negative_age_mask, 0.5)
age_predict

array([ 31.27574649,  35.48942462,  23.10097007,  26.30288574,
        28.08226328,  28.32100301,  31.67130061,  28.0841446 ,
        26.30282458,  26.29311801,  28.31875766,  27.28637915,
        28.0841446 ,  18.31875173,  42.40454057,  40.48993092,
        21.51451006,  28.32100301,  28.31875766,  28.08359855,
        28.31875766,  28.31875766,  28.32100301,  28.32276202,
        23.95850815,  28.31875766,  31.28606026,  18.20902708,
        21.414674  ,  28.32949663,  28.30983886,   0.5       ,
        38.13392799,  42.54396512,  15.61990345,   0.5       ,
        33.43168893,  42.19340157,  24.08446348,  31.28606026,
        28.0841446 ,   0.5       ,  27.39922931,  28.32100301,
        12.41798778,  25.12412265,  19.10887249,  24.08446348,
        28.33040671,  36.53841064,  31.28606026,  28.0841446 ,
        42.47006647,  28.0841446 ,  35.67872165,  42.54287302,
        40.48993092,  42.47734712,  28.0841446 ,  23.28669804,
        35.25990801,  28.31875766,  36.07706027,   0.5 

Let's insert these ages into the original titanic data:

In [34]:
# Work on a copy so titanic_data_v4 keeps its missing ages for comparison.
titanic_data_v5 = titanic_data_v4.copy()
# Fill the missing Age cells with the predictions, in row order.
# .loc with a boolean mask replaces the removed DataFrame.ix indexer.
titanic_data_v5.loc[titanic_data_v5['Age'].isnull(), 'Age'] = age_predict
titanic_data_v5.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Is_male,Embarked Q,Embarked S
0,1,0,3,22.0,1,0,7.25,1,0,1
1,2,1,1,38.0,1,0,71.2833,0,0,0
2,3,1,3,26.0,0,0,7.925,0,0,1
3,4,1,1,35.0,1,0,53.1,0,0,1
4,5,0,3,35.0,0,0,8.05,1,0,1
5,6,0,3,31.275746,0,0,8.4583,1,1,0
6,7,0,1,54.0,0,0,51.8625,1,0,1
7,8,0,3,2.0,3,1,21.075,1,0,1
8,9,1,3,27.0,0,2,11.1333,0,0,1
9,10,1,2,14.0,1,0,30.0708,0,0,0


Comparing this to the original table...

In [35]:
# First ten rows of the original frame, for side-by-side comparison.
titanic_data_v4.iloc[:10]

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Is_male,Embarked Q,Embarked S
0,1,0,3,22.0,1,0,7.25,1,0,1
1,2,1,1,38.0,1,0,71.2833,0,0,0
2,3,1,3,26.0,0,0,7.925,0,0,1
3,4,1,1,35.0,1,0,53.1,0,0,1
4,5,0,3,35.0,0,0,8.05,1,0,1
5,6,0,3,,0,0,8.4583,1,1,0
6,7,0,1,54.0,0,0,51.8625,1,0,1
7,8,0,3,2.0,3,1,21.075,1,0,1
8,9,1,3,27.0,0,2,11.1333,0,0,1
9,10,1,2,14.0,1,0,30.0708,0,0,0


...I can see that the predicted ages have been inserted:

In [36]:
# Non-null entries per column after filling in the predicted ages.
titanic_data_v5.count(axis=0)

PassengerId    891
Survived       891
Pclass         891
Age            891
SibSp          891
Parch          891
Fare           891
Is_male        891
Embarked Q     891
Embarked S     891
dtype: int64

#### Writing to file

In [38]:
# Persist the completed data set. The row index is written too (pandas'
# default), so the file gets a leading unnamed column.
titanic_data_v5.to_csv('Data/titanic_data_v5.csv', index=True)