# Exercise 07

## Kaggle competition 

* Overview of how Kaggle works ([slides](https://github.com/justmarkham/DAT8/raw/master/slides/16_kaggle.pdf))
* Kaggle Titanic competition: [Titanic: Machine Learning from Disaster](https://www.kaggle.com/c/titanic)

# Exercise 07.1 (20 points)

Create a submission using the different classification methods, preprocessing strategies and cross-validation techniques discussed in class. The output must be detailed in this notebook.

# Exercise 07.2 (20 points)

The remaining points will be allocated based on the performance of each submission.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the training and test CSVs; the first row of each file holds the column names.
train_df, test_df = (
    pd.read_csv(csv_name, header=0)
    for csv_name in ('titanic_train.csv', 'titanic_test.csv')
)

In [3]:
# Preview the first rows of the training data to check that the columns loaded as expected.
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


## Create features

In [4]:
# Work on a copy so the raw training data in `train_df` is left untouched and
# this cell gives the same result every time it is re-run.
titanic = train_df.copy()

# Impute missing ages with the median age. Assigning the result back (instead of
# calling `fillna(..., inplace=True)` on the attribute) is guaranteed to update
# the frame on every pandas version.
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())

# Binary indicator: 1 for female passengers, 0 otherwise.
# The `np.int` alias was removed from NumPy; the builtin `int` is equivalent.
titanic['Sex_Female'] = (titanic['Sex'] == 'female').astype(int)

# One-hot encode the port of embarkation and drop the first dummy column so the
# remaining indicators are not redundant with each other.
embarked_dummies = pd.get_dummies(titanic['Embarked'], prefix='Embarked')
embarked_dummies = embarked_dummies.drop(embarked_dummies.columns[0], axis=1)
titanic = pd.concat([titanic, embarked_dummies], axis=1)

## Train Model

In [5]:
# define X and y
feature_cols = ['Pclass', 'Parch', 'Age', 'Sex_Female', 'Embarked_Q', 'Embarked_S']
X = titanic[feature_cols]
y = titanic.Survived

# train/test split
# `sklearn.cross_validation` was removed from scikit-learn; `train_test_split`
# now lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# train a logistic regression model
# (a very large C makes the regularisation penalty negligible)
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=1e9)
logreg.fit(X_train, y_train)

# make predictions for testing set
y_pred_class = logreg.predict(X_test)

# calculate testing accuracy
from sklearn import metrics
print(metrics.accuracy_score(y_test, y_pred_class))

0.793721973094


# Apply model to Test set

In [6]:
# Apply the same feature engineering to the Kaggle test set.

# Note use median from training set. Assigning the result back (instead of
# calling `fillna(..., inplace=True)` on the attribute) is guaranteed to update
# the frame on every pandas version.
test_df['Age'] = test_df['Age'].fillna(titanic['Age'].median())

# Binary indicator: 1 for female passengers, 0 otherwise.
# The `np.int` alias was removed from NumPy; the builtin `int` is equivalent.
test_df['Sex_Female'] = (test_df['Sex'] == 'female').astype(int)

# One-hot encode the port of embarkation, dropping the first dummy column.
embarked_dummies = pd.get_dummies(test_df['Embarked'], prefix='Embarked')
embarked_dummies = embarked_dummies.drop(embarked_dummies.columns[0], axis=1)

# Remove dummy columns left over from an earlier run of this cell before
# concatenating; otherwise a re-run would duplicate the `Embarked_*` columns
# and `test_df[feature_cols]` would return too many columns.
test_df = pd.concat(
    [test_df.drop(embarked_dummies.columns, axis=1, errors='ignore'), embarked_dummies],
    axis=1,
)

In [7]:
# Preview the test set to confirm the engineered columns (Sex_Female, Embarked_*) were added.
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_Female,Embarked_Q,Embarked_S
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0,1,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,1,0,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0,1,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0,0,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1,0,1


In [30]:
# Predict the survival class of every test passenger with the single logistic
# regression (`logreg`), using the same feature columns it was trained on.
predictions = logreg.predict(test_df[feature_cols])

In [9]:
# Kaggle needs the submission to have a certain format;
# see https://www.kaggle.com/c/titanic-gettingStarted/download/gendermodel.csv
# for an example of what it's supposed to look like.
# Pair each test passenger id with its predicted label and write the result
# without the DataFrame index.
submission = test_df[['PassengerId']].assign(Survived=predictions)
submission.to_csv("titanic_submission.csv", index=False)

# Using cross-validation

In [11]:
# Create k-folds
# `sklearn.cross_validation` was removed from scikit-learn. With the
# `sklearn.model_selection` API the splitter only takes the number of folds and
# the data is passed to `.split()`. No shuffling is requested (as before), so
# the folds are consecutive blocks of rows and `random_state` is not needed.
from sklearn.model_selection import KFold
kf = KFold(n_splits=10)

results = []  # accuracy on the held-out fold, one entry per fold
models = []   # the model fitted on each fold's training part, in fold order
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # train a logistic regression model
    fold_model = LogisticRegression(C=1e9)
    fold_model.fit(X_train, y_train)
    models.append(fold_model)

    # make predictions for testing set
    y_pred_class = fold_model.predict(X_test)

    # calculate testing accuracy
    results.append(metrics.accuracy_score(y_test, y_pred_class))

In [13]:
# Accuracy on the held-out fold for each cross-validation iteration, in fold order.
results

[0.76666666666666672,
 0.8202247191011236,
 0.7752808988764045,
 0.8202247191011236,
 0.7640449438202247,
 0.7865168539325843,
 0.7640449438202247,
 0.7752808988764045,
 0.84269662921348309,
 0.8314606741573034]

In [25]:
# Placeholder for the predicted survival probabilities: one row per test
# passenger, one column (p0, p1, ...) per cross-validation model.
# `dtype=float` keeps the columns numeric; an empty frame is object dtype by
# default, which would carry over to the values assigned later.
probas = pd.DataFrame(
    index=test_df.index,
    columns=['p' + str(i) for i in range(len(models))],
    dtype=float,
)

In [26]:
# Score the test set with every cross-validation model and store the
# probability of the positive class (second column of `predict_proba`).
# Iterating over `models` avoids repeating the number of folds here, and the
# feature matrix is selected once instead of on every iteration.
test_features = test_df[feature_cols]
for i, fold_model in enumerate(models):
    proba = fold_model.predict_proba(test_features)[:, 1]
    probas.iloc[:, i] = proba

In [27]:
# Preview the predicted survival probabilities (one column per cross-validation model).
probas.head()

Unnamed: 0,p0,p1,p2,p3,p4,p5,p6,p7,p8,p9
0,0.087517,0.098606,0.102348,0.081192,0.093196,0.106704,0.098966,0.108938,0.117301,0.095253
1,0.381119,0.409449,0.439191,0.360096,0.404643,0.393892,0.39987,0.380765,0.413636,0.432706
2,0.101229,0.126403,0.131574,0.09806,0.10524,0.12316,0.11138,0.116369,0.146643,0.116584
3,0.085612,0.085913,0.086607,0.079191,0.086348,0.0869,0.084662,0.082693,0.090955,0.088761
4,0.555966,0.575204,0.574304,0.548026,0.580051,0.56974,0.58187,0.589655,0.556188,0.586308


### From proba to classifier

In [32]:
# Average the probabilities of all models for each passenger and predict
# survival when the mean exceeds 0.5. Cast to int so the labels are written as
# 0/1 (multiplying by 1.0 produced floats, i.e. "0.0"/"1.0" in the CSV),
# matching the integer labels produced by `predict` for the first submission.
predictions = (probas.mean(axis=1) > 0.5).astype(int).values

In [29]:
# Kaggle needs the submission to have a certain format;
# see https://www.kaggle.com/c/titanic-gettingStarted/download/gendermodel.csv
# for an example of what it's supposed to look like.
# Pair each test passenger id with its predicted label and write the result
# without the DataFrame index.
submission = test_df[['PassengerId']].assign(Survived=predictions)
submission.to_csv("titanic_submission_CV.csv", index=False)