# Initial exploration of the Titanic data

In [1]:
import pandas as pd

In [29]:
# Read the training split into a DataFrame, then print a preview of the
# first rows followed by the summary statistics of the frame.
train_path = 'data/train.csv'
titanic = pd.read_csv(train_path)
print(titanic.head())  # head() defaults to the first five rows
print(titanic.describe())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex  Age  SibSp  \
0                            Braund, Mr. Owen Harris    male   22      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female   38      1   
2                             Heikkinen, Miss. Laina  female   26      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female   35      1   
4                           Allen, Mr. William Henry    male   35      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
       P

In [32]:
# Fill the gaps in Age with the column median, then show the non-null
# count to confirm that no missing ages remain.
age_median = titanic['Age'].median()
titanic['Age'] = titanic['Age'].fillna(age_median)
titanic['Age'].count()

891

## Convert the non-numeric data to a numeric series

In [31]:
# Each DataFrame column is a Series, so a dict can be applied to it with
# .map(); the mapped result is then cast explicitly to int.
# NOTE: run this cell once per load of the data. On an already-encoded
# column none of the dict keys match, so .map() yields NaN and the int
# cast fails.

# print() is called as a function so the cell parses on Python 3 as well
# and matches the print(...) calls used in the other cells.
print(type(titanic['Sex']))
titanic['Sex'] = titanic['Sex'].map({'male': 1, 'female': 0}).astype(int)
print(titanic['Sex'].head(5))

<class 'pandas.core.series.Series'>
0    1
1    0
2    0
3    0
4    1
Name: Sex, dtype: int64


In [25]:
# Embarked is next: start by listing the distinct values in the column
titanic.Embarked.unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [30]:
# Passengers with no recorded port are assumed to have boarded at the most
# frequent one. The mode is computed once and reused as the fill value so
# the printed value and the value actually filled in cannot drift apart.
embarked_mode = titanic['Embarked'].dropna().mode()
print(embarked_mode)
# mode() returns a Series; its first entry is used as the fill value.
titanic['Embarked'] = titanic['Embarked'].fillna(embarked_mode.iloc[0])
# Encode the three ports as integers and cast the result explicitly to int.
titanic['Embarked'] = titanic['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)
print(titanic['Embarked'].head(3))

0    0
1    1
2    0
Name: Embarked, dtype: int64


# Now let's try a regression

In [38]:
# Import scikit-learn. The linear_model subpackage is imported explicitly
# because a bare `import sklearn` does not guarantee that
# `sklearn.linear_model` is available as an attribute of the package.
import sklearn
import sklearn.linear_model
# Sklearn also has a helper that makes it easy to do cross validation.
# NOTE(review): sklearn.cross_validation only exists in older scikit-learn
# releases; newer ones expose KFold from sklearn.model_selection with a
# different call signature -- confirm against the installed version.
from sklearn.cross_validation import KFold

In [39]:
# The columns we'll use to predict the target
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# Show the (rows, columns) shape of the frame. print() is called as a
# function so the cell parses on Python 3 as well and matches the
# print(...) calls used in the other cells.
print(titanic.shape)

(891, 12)


In [54]:
# Build three cross-validation folds over the rows of the titanic frame.
# Iterating over kf yields (train, test) pairs of row positions.
# NOTE(review): shuffle is left at its default here, so random_state
# presumably has no effect on the splits -- confirm against the KFold
# documentation of the installed scikit-learn.
row_count = len(titanic)
kf = KFold(row_count, n_folds=3, random_state=1)
# The estimator that is fitted on each training fold.
alg = sklearn.linear_model.LinearRegression()


In [55]:
# Fit the model on each training fold and collect its predictions for the
# matching test fold; predictions ends up with one entry per fold.
predictions = []
# Column selections do not change between folds, so take them once.
features = titanic[predictors]
target = titanic["Survived"]
for train, test in kf:
    # Only the rows of the current training fold are used for fitting.
    train_predictors = features.iloc[train]
    train_target = target.iloc[train]
    alg.fit(train_predictors, train_target)
    # Predict for the rows that were held out of this fold's training set.
    test_predictions = alg.predict(features.iloc[test])
    predictions.append(test_predictions)