In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.cross_validation import KFold, cross_val_score
import numpy as np

# Import Training Dataset

In [2]:
# Load the training set from the path relative to the notebook's working directory.
titanic = pd.read_csv('data/train.csv')

In [3]:
# List the column labels available in the training frame.
titanic.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [4]:
# Show only the first row to see what a raw record looks like.
titanic.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S


In [5]:
# Summary statistics per column; a `count` lower than the row total flags
# a column with missing values.
titanic.describe()



Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,,0.0,0.0,7.9104
50%,446.0,0.0,3.0,,0.0,0.0,14.4542
75%,668.5,1.0,3.0,,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


# Missing Data

Fill NaNs

In [6]:
# Impute the gaps so the models below receive no NaNs:
#   * Embarked -> the literal port code 'S'
#   * Age      -> the median of the observed ages
titanic['Embarked'] = titanic['Embarked'].fillna(value='S')

age_median = titanic['Age'].median()
titanic['Age'] = titanic['Age'].fillna(value=age_median)

Map sex to numeric representation

In [7]:
# Integer code for each value of the Sex column; the same mapping is reused
# for the test set later in the notebook.
sex_map = dict(male=0, female=1)

# NOTE: Series.map yields NaN for values that are not keys of the mapping,
# so this cell should be run once on the raw string column.
titanic['Sex'] = titanic['Sex'].map(sex_map)

Map embarked to numeric representation

In [8]:
# Integer code for each embarkation port: S -> 0, C -> 1, Q -> 2.
# The same mapping is reused for the test set later in the notebook.
embarked_map = {port: code for code, port in enumerate(('S', 'C', 'Q'))}

# NOTE: Series.map yields NaN for values that are not keys of the mapping,
# so this cell should be run once on the raw string column.
titanic['Embarked'] = titanic['Embarked'].map(embarked_map)

# Linear Regression

In [9]:
# Feature columns fed to every model in this notebook.
predictors = [
    "Pclass",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "Embarked",
]

In [10]:
# (rows, columns) of the training frame; the row count sizes the folds below.
titanic.shape

(891, 12)

In [11]:
# Three-fold splitter over the row positions of the training frame.
# Uses the legacy sklearn.cross_validation.KFold signature (n, n_folds).
# NOTE(review): no shuffle is requested, so random_state presumably has no
# effect here; confirm against the installed scikit-learn version.
kf = KFold(len(titanic), n_folds=3, random_state=1)

In [12]:
# Accumulator for the per-fold prediction arrays produced by the loop below.
predictions = []

In [13]:
# Collect out-of-fold predictions from a linear regression.
#
# The accumulator is reset here so that this cell is idempotent: without the
# reset, re-running the cell would append a second copy of every fold, and
# re-running it after `predictions` has been rebound to an ndarray (by the
# concatenate cell) would fail because ndarrays have no `append`.
predictions = []

for train_index, test_index in kf:

    # A fresh, unfitted model for each fold so nothing carries over.
    lr = LinearRegression()

    # Rows used to fit the model for this fold.
    features = titanic[predictors].iloc[train_index, :]
    target = titanic['Survived'].iloc[train_index]

    # Held-out rows for this fold; test_target is kept for inspection only.
    test_features = titanic[predictors].iloc[test_index, :]
    test_target = titanic['Survived'].iloc[test_index]

    lr.fit(features, target)

    # Continuous regression outputs; they are thresholded in a later cell.
    test_predictions = lr.predict(test_features)
    predictions.append(test_predictions)

[ 0.08998778  0.96075621  0.59267628  0.93113873  0.05293431]
[ 1.13774791  0.44173212  0.98551347  0.66915371  0.08254228]
[ 0.17288922  0.01702947  0.78261693 -0.00834789  0.14702227]


# Evaluating Error

In [14]:
# Stitch the per-fold arrays end to end into a single 1-D array
# (np.concatenate joins along the first axis by default).
# NOTE: this rebinds `predictions` from a list of arrays to one ndarray.
predictions = np.concatenate(predictions)

In [15]:
# Turn the continuous regression outputs into class labels, in place:
# strictly above 0.5 -> 1, at or below 0.5 -> 0.
# Both masks are taken from the untouched values before any write happens.
above_cutoff = predictions > .5
at_or_below_cutoff = predictions <= .5

predictions[above_cutoff] = 1
predictions[at_or_below_cutoff] = 0

In [16]:
# Accuracy = fraction of rows whose predicted label equals the true label.
#
# The match mask is averaged directly. Indexing `predictions` with the mask
# and summing the selected values would add up the predicted labels
# themselves, so every correctly predicted 0 would contribute nothing to the
# total. `.values` compares position by position against the ndarray.
accuracy = (predictions == titanic['Survived'].values).mean()
accuracy

  if __name__ == '__main__':


0.78338945005611671

# Logistic Regression

In [17]:
# Unfitted logistic-regression estimator with a pinned random_state.
lr = LogisticRegression(random_state=1)

# One score per fold from 3-fold cross-validation on the training frame.
scores = cross_val_score(
    lr,
    titanic[predictors],
    titanic["Survived"],
    cv=3
)

# Average the fold scores into a single figure.
np.mean(scores)

0.78787878787878773

# Process Test Set

In [18]:
# Load the test set from the path relative to the notebook's working directory.
titanic_test = pd.read_csv('data/test.csv')

In [19]:
# Impute the test set with statistics taken from the TRAINING frame so that
# both sets go through the same preprocessing: the model is fitted on ages
# imputed with the training median, so the test rows are filled with that
# same value rather than with a median derived from the test data itself.
titanic_test['Age'] = titanic_test['Age'].fillna(titanic['Age'].median())

# Same literal port code that was used for the training frame.
titanic_test['Embarked'] = titanic_test['Embarked'].fillna('S')

# Fare gaps in the test set are filled with the training-set median fare.
titanic_test['Fare'] = titanic_test['Fare'].fillna(titanic['Fare'].median())

In [20]:
# Reuse the training-set mapping so both frames share the same Sex encoding.
titanic_test['Sex'] = titanic_test['Sex'].map(sex_map)

In [21]:
# Reuse the training-set mapping so both frames share the same Embarked encoding.
titanic_test['Embarked'] = titanic_test['Embarked'].map(embarked_map)

Initialize the algorithm class

In [22]:
# Fresh, unfitted estimator for the final model; rebinds `lr`.
lr = LogisticRegression(random_state=1)

Train the algorithm using all the training data

In [23]:
# Fit on every row of the training frame (no hold-out at this stage).
lr.fit(titanic[predictors], titanic["Survived"])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

Make predictions using the test set

In [24]:
# Predict labels for the test rows.
# NOTE: this rebinds `predictions`, which earlier held the cross-validation output.
predictions = lr.predict(titanic_test[predictors])

Create a new dataframe with only the columns Kaggle wants from the dataset.

In [25]:
# Two-column frame for the submission file: the passenger id from the test
# set next to the label predicted for that row.
submission_columns = {
    "PassengerId": titanic_test["PassengerId"],
    "Survived": predictions,
}
submission = pd.DataFrame(
    data=submission_columns,
    columns=["PassengerId", "Survived"],
)

# index=False keeps the row index out of the written file.
submission.to_csv('submissions/getting_started_with_kaggle.csv', index=False)