In [1]:
import pandas as pd
import numpy as np
import csv as csv
import math
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, accuracy_score
from sklearn.cross_validation import train_test_split
from sklearn import preprocessing

%matplotlib inline

In [66]:
def preProcessData(dataframe):
    """Turn a raw passenger table into a purely numeric feature table.

    The input frame is NOT modified; all work happens on a copy.

    Steps performed:
      * ``Sex`` is encoded as ``Gender`` (female -> 0, male -> 1).
      * Missing ``Embarked`` values are filled with the most frequent value,
        then the column is one-hot encoded (one column per distinct value,
        named after the value itself).
      * ``Pclass`` is one-hot encoded into ``Pclass_<value>`` columns.
      * Missing ``Age`` / ``Fare`` values are filled with the floor of the
        column median.
      * The raw columns that were encoded or are not used as features are
        dropped.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``PassengerId``, ``Name``, ``Sex``, ``Age``,
        ``SibSp``, ``Parch``, ``Ticket``, ``Fare``, ``Cabin``, ``Pclass`` and
        ``Embarked``.

    Returns
    -------
    (pandas.DataFrame, numpy.ndarray)
        The feature frame and the ``PassengerId`` values in row order.

    Notes
    -----
    The one-hot columns depend on the values present in the given frame, so
    two frames with different sets of ``Embarked`` / ``Pclass`` values yield
    different column layouts.
    """
    # Copy first: the original version added/overwrote columns on the
    # caller's frame before rebinding the local name.
    dataframe = dataframe.copy()

    dataframe['Gender'] = dataframe['Sex'].map({'female': 0, 'male': 1}).astype(int)

    # Fill with a scalar (the first mode) so ties between several modes
    # cannot cause a length mismatch on assignment.
    embarked_modes = dataframe['Embarked'].dropna().mode()
    if not embarked_modes.empty:
        dataframe['Embarked'] = dataframe['Embarked'].fillna(embarked_modes.iloc[0])
    dummiesEmbarked = pd.get_dummies(dataframe['Embarked']).astype(int)

    # Stringify so the dummy columns are named Pclass_1, Pclass_2, ...
    dummiesPclass = pd.get_dummies(dataframe['Pclass'].astype('str'),
                                   prefix='Pclass').astype(int)

    # Impute numeric gaps with the floored median of the observed values.
    for column in ('Age', 'Fare'):
        missing = dataframe[column].isnull()
        if missing.any():
            dataframe.loc[missing, column] = math.floor(dataframe[column].dropna().median())

    ids = dataframe['PassengerId'].values

    dataframe = pd.concat([dataframe, dummiesEmbarked, dummiesPclass], axis=1)
    dataframe = dataframe.drop(['Name', 'Sex', 'Ticket', 'Cabin', 'Pclass', 'Parch',
                                'SibSp', 'PassengerId', 'Embarked'], axis=1)

    return dataframe, ids

In [67]:
# Read the labelled passengers and convert them straight into model features.
# `ids` receives the PassengerId values of these training rows.
train_df, ids = preProcessData(pd.read_csv('train.csv', header=0))
train_df.head()

Unnamed: 0,Survived,Age,Fare,Gender,C,Q,S,Pclass_1,Pclass_2,Pclass_3
0,0,22.0,7.25,1,0,0,1,0,0,1
1,1,38.0,71.2833,0,1,0,0,1,0,0
2,1,26.0,7.925,0,0,0,1,0,0,1
3,1,35.0,53.1,0,0,0,1,1,0,0
4,0,35.0,8.05,1,0,0,1,0,0,1


In [68]:
# Read the unlabelled passengers and build the same features for them.
# `ids` is rebound here to the PassengerId values of these rows; the model
# cells pair it with their predictions when writing the submission files.
test_df, ids = preProcessData(pd.read_csv('test.csv', header=0))
test_df.head()

Unnamed: 0,Age,Fare,Gender,C,Q,S,Pclass_1,Pclass_2,Pclass_3
0,34.5,7.8292,1,0,1,0,0,0,1
1,47.0,7.0,0,0,0,1,0,0,1
2,62.0,9.6875,1,0,1,0,0,1,0
3,27.0,8.6625,1,0,0,1,0,0,1
4,22.0,12.2875,0,0,0,1,0,0,1


In [69]:
from sklearn import linear_model

# Column 0 of the training matrix is `Survived`; the remaining columns are the
# features. Hold out 20% of the labelled rows to validate the model.
train_data = train_df.values
x_train, x_test, y_train, y_test = train_test_split(
    train_data[:, 1:], train_data[:, 0],
    test_size=0.2, random_state=123)
test_data = test_df.values
print(x_train.shape)
print(test_data.shape)

logreg = linear_model.LogisticRegression(C=1e5)
logreg.fit(x_train, y_train)
y_true, y_pred = y_test, logreg.predict(x_test)  # predictions on the hold-out rows
print(classification_report(y_true, y_pred))  # per-class precision / recall / f1

# Predict the unlabelled passengers; cast so the file holds 0/1, not 0.0/1.0.
y_pred = logreg.predict(test_data).astype(int)

# 'wb' is the mode the Python 2 csv module expects (this notebook was run on
# Python 2); on Python 3 open with mode 'w' and newline='' instead.
# The `with` block closes the file even if writing fails part-way.
with open("LogisticRegression2.csv", "wb") as predictions_file:
    open_file_object = csv.writer(predictions_file)
    open_file_object.writerow(["PassengerId", "Survived"])
    open_file_object.writerows(zip(ids, y_pred))

(712L, 9L)
(418L, 9L)
             precision    recall  f1-score   support

        0.0       0.86      0.84      0.85       114
        1.0       0.73      0.75      0.74        65

avg / total       0.81      0.81      0.81       179



In [73]:
from nolearn.dbn import DBN


# Column 0 of the training matrix is `Survived`; the remaining columns are the
# features. Hold out 20% of the labelled rows to validate the model.
train_data = np.array(train_df.values)
x_train, x_test, y_train, y_test = train_test_split(
    train_data[:, 1:], train_data[:, 0],
    test_size=0.2, random_state=123)
test_data = np.array(test_df.values)

# Layer sizes: one input unit per feature, 50 hidden units, 2 output classes.
# NOTE(review): the recorded run below predicts class 0 for every hold-out
# row (recall 0.00 for class 1). The features are fed in unscaled, which may
# be the cause - TODO confirm.
dbn_model = DBN([x_train.shape[1], 50, 2],
                learn_rates=0.3,
                learn_rate_decays=0.9,
                epochs=200,
                verbose=0)
dbn_model.fit(x_train, y_train)
y_true, y_pred = y_test, dbn_model.predict(x_test)  # predictions on the hold-out rows
print(classification_report(y_true, y_pred))  # per-class precision / recall / f1

# The labels are floats (0.0 / 1.0) because they come from a float matrix;
# cast to int so the submission holds 0/1 like the logistic-regression file.
y_pred = np.asarray(dbn_model.predict(test_data)).astype(int)

# 'wb' is the mode the Python 2 csv module expects (this notebook was run on
# Python 2); on Python 3 open with mode 'w' and newline='' instead.
# The `with` block closes the file even if writing fails part-way.
with open("DBN.csv", "wb") as predictions_file:
    open_file_object = csv.writer(predictions_file)
    open_file_object.writerow(["PassengerId", "Survived"])
    open_file_object.writerows(zip(ids, y_pred))

             precision    recall  f1-score   support

        0.0       0.64      1.00      0.78       114
        1.0       0.00      0.00      0.00        65

avg / total       0.41      0.64      0.50       179



In [71]:
from sklearn.ensemble import RandomForestClassifier

# Column 0 of the training matrix is `Survived`; the remaining columns are the
# features. Hold out 20% of the labelled rows to validate the model.
train_data = np.array(train_df.values)
x_train, x_test, y_train, y_test = train_test_split(
    train_data[:, 1:], train_data[:, 0],
    test_size=0.2, random_state=1243)
test_data = np.array(test_df.values)

# Seed the forest as well as the split so a re-run reproduces the same model.
rf_model = RandomForestClassifier(n_estimators=100, random_state=1243)
rf_model.fit(x_train, y_train)
y_true, y_pred = y_test, rf_model.predict(x_test)  # predictions on the hold-out rows
print(classification_report(y_true, y_pred))  # per-class precision / recall / f1

# The labels are floats (0.0 / 1.0) because they come from a float matrix;
# cast to int so the submission holds 0/1 like the logistic-regression file.
y_pred = np.asarray(rf_model.predict(test_data)).astype(int)

# 'wb' is the mode the Python 2 csv module expects (this notebook was run on
# Python 2); on Python 3 open with mode 'w' and newline='' instead.
# The `with` block closes the file even if writing fails part-way.
with open("RandomForestClassifier.csv", "wb") as predictions_file:
    open_file_object = csv.writer(predictions_file)
    open_file_object.writerow(["PassengerId", "Survived"])
    open_file_object.writerows(zip(ids, y_pred))

             precision    recall  f1-score   support

        0.0       0.90      0.75      0.82       117
        1.0       0.64      0.84      0.73        62

avg / total       0.81      0.78      0.79       179



In [72]:
from sklearn import svm

# Column 0 of the training matrix is `Survived`; the remaining columns are the
# features. Hold out 20% of the labelled rows to validate the model.
train_data = np.array(train_df.values)
x_train, x_test, y_train, y_test = train_test_split(
    train_data[:, 1:], train_data[:, 0],
    test_size=0.2, random_state=12341)
test_data = np.array(test_df.values)

model = svm.SVC(C=10)
model.fit(x_train, y_train)
y_true, y_pred = y_test, model.predict(x_test)  # predictions on the hold-out rows
print(classification_report(y_true, y_pred))  # per-class precision / recall / f1

# The labels are floats (0.0 / 1.0) because they come from a float matrix;
# cast to int so the submission holds 0/1 like the logistic-regression file.
y_pred = np.asarray(model.predict(test_data)).astype(int)

# 'wb' is the mode the Python 2 csv module expects (this notebook was run on
# Python 2); on Python 3 open with mode 'w' and newline='' instead.
# The `with` block closes the file even if writing fails part-way.
with open("SVC.csv", "wb") as predictions_file:
    open_file_object = csv.writer(predictions_file)
    open_file_object.writerow(["PassengerId", "Survived"])
    open_file_object.writerows(zip(ids, y_pred))

             precision    recall  f1-score   support

        0.0       0.80      0.77      0.78       120
        1.0       0.56      0.61      0.59        59

avg / total       0.72      0.72      0.72       179

