In [19]:
import numpy as np
import pandas as pd

# RMS Titanic data visualization code 
from titanic_visualizations import survival_stats
from IPython.display import display
%matplotlib inline

# Load the dataset
in_file = 'titanic_data.csv'
full_data = pd.read_csv(in_file)


# Print the first few entries of the RMS Titanic data
display(full_data.head())

y_all = full_data['Survived']
data = full_data.drop('Survived', axis = 1)
data = data.drop('Name', axis = 1)
data = data.drop('Ticket', axis = 1)
data = data.drop('Cabin', axis = 1)
data.fillna(0, inplace=True)

def preprocess_features(X):
    outX = pd.DataFrame(index=X.index)  # output dataframe, initially empty

    # Check each column
    for col, col_data in X.iteritems():
        # If data type is non-numeric, try to replace all yes/no values with 1/0
        if col_data.dtype == object:
            col_data = col_data.replace(['yes', 'no'], [1, 0])
        # Note: This should change the data type for yes/no columns to int

        # If still non-numeric, convert to one or more dummy variables
        if col_data.dtype == object:
            col_data = pd.get_dummies(col_data, prefix=col)  # e.g. 'school' => 'school_GP', 'school_MS'

        outX = outX.join(col_data)  # collect column(s) in output dataframe

    return outX

X_all = preprocess_features(data)
print "Processed feature columns ({}):-\n{}".format(len(X_all.columns), list(X_all.columns))


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


Processed feature columns (12):-
['PassengerId', 'Pclass', 'Sex_female', 'Sex_male', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_0', 'Embarked_C', 'Embarked_Q', 'Embarked_S']


In [21]:
from sklearn.cross_validation import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(y_all, train_size=0.70, test_size=0.30, random_state=42)

for train_index, test_index in sss:
    X_train, X_test = X_all.iloc[train_index], X_all.iloc[test_index] #keeping it a pandas dataframe
    y_train, y_test = y_all.iloc[train_index], y_all.iloc[test_index]

print "Training set: {} samples".format(X_train.shape[0])
print "Test set: {} samples".format(X_test.shape[0])


Training set: 623 samples
Test set: 268 samples


In [24]:
import time

def train_classifier(clf, X_train, y_train):
    print "Training {}...".format(clf.__class__.__name__)
    start = time.time()
    clf.fit(X_train, y_train)
    end = time.time()
    print "Done!\nTraining time (secs): {:.3f}".format(end - start)

# TODO: Choose a model, import it and instantiate an object

# DecisionTreeClassifier 
from sklearn import tree

clf = tree.DecisionTreeClassifier()


train_classifier(clf, X_train, y_train)


Training DecisionTreeClassifier...
Done!
Training time (secs): 0.004


In [27]:
from sklearn.metrics import f1_score

def predict_labels(clf, features, target):
    print "Predicting labels using {}...".format(clf.__class__.__name__)
    start = time.time()
    y_pred = clf.predict(features)
    end = time.time()
    print "Done!\nPrediction time (secs): {:.3f}".format(end - start)
    return f1_score(target.values, y_pred)

train_f1_score = predict_labels(clf, X_train, y_train)
print "F1 score for training set: {}".format(train_f1_score)

Predicting labels using DecisionTreeClassifier...
Done!
Prediction time (secs): 0.001
F1 score for training set: 1.0


In [28]:
print "F1 score for test set: {}".format(predict_labels(clf, X_test, y_test))

Predicting labels using DecisionTreeClassifier...
Done!
Prediction time (secs): 0.001
F1 score for test set: 0.571428571429


In [30]:
from sklearn import neighbors

def train_predict(clf, X_train, y_train, X_test, y_test):
    print "------------------------------------------"
    print "Training set size: {}".format(len(X_train))
    train_classifier(clf, X_train, y_train)
    print "F1 score for training set: {}".format(predict_labels(clf, X_train, y_train))
    print "F1 score for test set: {}".format(predict_labels(clf, X_test, y_test))

# TODO: Run the helper function above for desired subsets of training data
# Note: Keep the test set constant

#svm_clf = svm.SVC()
dt_clf = tree.DecisionTreeClassifier()
knn_clf = neighbors.KNeighborsClassifier()
#nb_clf = naive_bayes.GaussianNB()

X_train_100 = X_train.sample(n=100, random_state=10) # tested different random states
X_train_200 = X_train.sample(n=200, random_state=10) # finished with random state = 10 to get 
X_train_300 = X_train.sample(n=300, random_state=10) # finished with random state = 10 to get 
X_train_400 = X_train.sample(n=400, random_state=10) # finished with random state = 10 to get 
y_train_100 = y_train.sample(n=100, random_state=10) # consistent answers
y_train_200 = y_train.sample(n=200, random_state=10)
y_train_300 = y_train.sample(n=300, random_state=10)
y_train_400 = y_train.sample(n=400, random_state=10)

train_predict(dt_clf, X_train_100, y_train_100, X_test, y_test)
train_predict(dt_clf, X_train_200, y_train_200, X_test, y_test)
train_predict(dt_clf, X_train_300, y_train_300, X_test, y_test)
train_predict(dt_clf, X_train_400, y_train_400, X_test, y_test)
train_predict(dt_clf, X_train, y_train, X_test, y_test)

train_predict(knn_clf, X_train_100, y_train_100, X_test, y_test)
train_predict(knn_clf, X_train_200, y_train_200, X_test, y_test)
train_predict(knn_clf, X_train_300, y_train_300, X_test, y_test)
train_predict(knn_clf, X_train_400, y_train_400, X_test, y_test)
train_predict(knn_clf, X_train, y_train, X_test, y_test)


------------------------------------------
Training set size: 100
Training DecisionTreeClassifier...
Done!
Training time (secs): 0.001
Predicting labels using DecisionTreeClassifier...
Done!
Prediction time (secs): 0.000
F1 score for training set: 1.0
Predicting labels using DecisionTreeClassifier...
Done!
Prediction time (secs): 0.000
F1 score for test set: 0.623655913978
------------------------------------------
Training set size: 200
Training DecisionTreeClassifier...
Done!
Training time (secs): 0.001
Predicting labels using DecisionTreeClassifier...
Done!
Prediction time (secs): 0.000
F1 score for training set: 1.0
Predicting labels using DecisionTreeClassifier...
Done!
Prediction time (secs): 0.000
F1 score for test set: 0.598039215686
------------------------------------------
Training set size: 300
Training DecisionTreeClassifier...
Done!
Training time (secs): 0.001
Predicting labels using DecisionTreeClassifier...
Done!
Prediction time (secs): 0.000
F1 score for training set: 