In [131]:
import pandas as pd
import numpy as np
#import scikit_learn 

train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

train_data
train_data.head()
train_data.columns
train_data
nameId=test_data['PassengerId']

train_df=train_df[['Name','Sex','Age','PassengerId','Cabin','Ticket','SibSp','Pclass','Embarked','Fare','Parch','Survived']]
train_df

In [132]:
#Convert Categorical Variables into Categories & One-hot Encode Them; Remove Variables unrelated like 
dummy_fields = ['Sex','Embarked','Pclass']


for each in dummy_fields:
    train_data[each].astype('category')
    test_data[each].astype('category')
    
    dummies_train = pd.get_dummies(train_data[each],prefix = each, drop_first = False)
    dummies_test = pd.get_dummies(test_data[each],prefix = each, drop_first = False)

    train_data = pd.concat([train_data, dummies_train],axis=1)
    test_data = pd.concat([test_data, dummies_test],axis=1)
    
#Drop columns on training & testing datasets
fields_to_drop = ['Sex','Embarked','Ticket','Cabin','PassengerId','Name','Pclass']

train_data=train_data.drop(columns=fields_to_drop, axis =1)
test_data=test_data.drop(columns=fields_to_drop, axis =1)

## Data Preprocessing Steps ##
### Modify Age Column - for NaN, replace with average age & Scale so that it's b/w 0 to 1 ###

In [133]:
#Replace NaNs with average age value; 177 NaNs
avg_age_train = int(round(train_data['Age'].mean()))
avg_age_test = int(round(test_data['Age'].mean()))

numNaNs=train_data['Age'].isnull().sum()
train_data['Age'].fillna(avg_age_train,inplace=True)
test_data['Age'].fillna(avg_age_test,inplace=True)

### Prepare Continuous Input Variables Columns ###

In [134]:
norm_columns = ['Age','SibSp','Parch','Fare'] 
scaled_features_train = {} #store scaling values for conversion back to original values later on
scaled_features_test = {} #store scaling values for conversion back to original values later on

#Continuous Variables are set to be between 0 & 1 # Yielded 90% test accuracy with 5,000 epochs #
for each in norm_columns:
    max_train, min_train =train_data[each].max(), train_data[each].min()
    scaled_features_train[each]= [max_train , min_train]
    train_data.loc[:,each] = (train_data[each])/(max_train-min_train)
    
    max_test, min_test =test_data[each].max(), test_data[each].min()
    scaled_features_test[each]= [max_test , min_test]
    test_data.loc[:,each] = (test_data[each])/(max_test-min_test)



#Continuous Variables are set to be between 0 & 1 and to have zero mean and a standard deviation of 1# Yielded 91% accuracy with 50 epochs
#for each in norm_columns:
#    mean_train, std_train =train_data[each].mean(), train_data[each].std()
#    scaled_features_train[each]= [mean_train , std_train]
#    train_data.loc[:,each] = (train_data[each] - mean_train)/std_train
#    
#    mean_test, std_test =test_data[each].mean(), test_data[each].std()
#    scaled_features_test[each]= [mean_test , std_test]
#    test_data.loc[:,each] = (test_data[each] - mean_test)/std_test


In [135]:
train_data.head()
test_data.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3
0,0.454965,0.0,0.0,0.015282,0,1,0,1,0,0,0,1
1,0.619807,0.125,0.0,0.013663,1,0,0,0,1,0,0,1
2,0.817618,0.0,0.0,0.018909,0,1,0,1,0,0,1,0
3,0.35606,0.0,0.0,0.016908,0,1,0,0,1,0,0,1
4,0.290123,0.125,0.111111,0.023984,1,0,0,0,1,0,0,1


### Separate Inputs & Ouputs for the Training Dataset ###

In [136]:
train_inputs= train_data.drop(columns=['Survived'], axis = 1)
train_targets = train_data['Survived']

In [137]:
train_inputs.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3
0,0.276451,0.125,0.0,0.014151,0,1,0,0,1,0,0,1
1,0.477507,0.125,0.0,0.139136,1,0,1,0,0,1,0,0
2,0.326715,0.0,0.0,0.015469,1,0,0,0,1,0,0,1
3,0.439809,0.125,0.0,0.103644,1,0,0,0,1,1,0,0
4,0.439809,0.0,0.0,0.015713,0,1,0,0,1,0,0,1


### Construct Simple Neural Network for Classification Task ###

In [138]:
import  keras

# Separate data and one-hot encode the output# Separ 
# Note: We're also turning the data into numpy arrays, in order to train the model in Keras

features = np.array(train_inputs)
targets = np.array(keras.utils.to_categorical(train_targets, 2))

features_test = np.array(test_data)
#targets_test = np.array(keras.utils.to_categorical(test_data['admit'], 2))

print(features[:10])
print(targets[:10])
print(features_test[:10])



[[ 0.27645137  0.125       0.          0.01415106  0.          1.          0.
   0.          1.          0.          0.          1.        ]
 [ 0.47750691  0.125       0.          0.13913574  1.          0.          1.
   0.          0.          1.          0.          0.        ]
 [ 0.32671526  0.          0.          0.01546857  1.          0.          0.
   0.          1.          0.          0.          1.        ]
 [ 0.439809    0.125       0.          0.1036443   1.          0.          0.
   0.          1.          1.          0.          0.        ]
 [ 0.439809    0.          0.          0.01571255  0.          1.          0.
   0.          1.          0.          0.          1.        ]
 [ 0.37697914  0.          0.          0.0165095   0.          1.          0.
   1.          0.          0.          0.          1.        ]
 [ 0.67856245  0.          0.          0.10122886  0.          1.          0.
   0.          1.          1.          0.          0.        ]
 [ 0.02513194

### Compile the Model ####

In [156]:

# Imports# Impor 
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import SGD
from keras.utils import np_utils
from keras import backend as K

# Building the model
model = Sequential() #
model.add(Dense(128, activation='relu', input_shape=(12,)))
model.add(Dropout(.3))
model.add(Dense(64, activation='relu'))
model.add(Dropout(.2))
model.add(Dense(32, activation='relu'))
model.add(Dropout(.1))
model.add(Dense(16, activation='relu'))
model.add(Dropout(.05))
model.add(Dense(2, activation='softmax'))


#model = Sequential() 88% with 8 batch size and 50 epochs
#model.add(Dense(16, activation='relu', input_shape=(12,)))
#model.add(Dropout(.1))
#model.add(Dense(8, activation='relu'))
#model.add(Dropout(.05))
#model.add(Dense(4, activation='relu'))
#model.add(Dropout(.025))
#model.add(Dense(2, activation='softmax'))

# Compiling the model
model.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_29 (Dense)             (None, 128)               1664      
_________________________________________________________________
dropout_22 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_30 (Dense)             (None, 64)                8256      
_________________________________________________________________
dropout_23 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_31 (Dense)             (None, 32)                2080      
_________________________________________________________________
dropout_24 (Dropout)         (None, 32)                0         
_________________________________________________________________
dense_32 (Dense)             (None, 16)                528       
__________

In [162]:
### Train the Model ###
from keras.callbacks import ModelCheckpoint  
# features and targets are Numpy arrays --just like in the Scikit-Learn API.

#Set hyperparameters 
epochs = 5000
batch_size = 8

checkpointer = ModelCheckpoint(filepath='saved_models/weights.best.from_scratch.hdf5', 
                               verbose=1, save_best_only=True)

model.fit(features, targets,
          epochs=epochs, batch_size=batch_size, callbacks=[checkpointer], verbose=1)

Epoch 1/5000
Epoch 2/5000
Epoch 3/5000
  8/891 [..............................] - ETA: 0s - loss: 0.1468 - acc: 0.8750



Epoch 4/5000
Epoch 5/5000
Epoch 6/5000
Epoch 7/5000
Epoch 8/5000
Epoch 9/5000
Epoch 10/5000
Epoch 11/5000
Epoch 12/5000
Epoch 13/5000
Epoch 14/5000
Epoch 15/5000
Epoch 16/5000
Epoch 17/5000
Epoch 18/5000
Epoch 19/5000
Epoch 20/5000

KeyboardInterrupt: 

### Score the Model ###

In [163]:
# Evaluating the model on the training and testing set
score = model.evaluate(features, targets)
print("\n Training Accuracy:", score[1])


prediction=(model.predict(features_test))


 32/891 [>.............................] - ETA: 0s
 Training Accuracy: 0.906846240849


In [164]:
pred=pd.DataFrame(prediction)
pred = pd.concat([nameId, pred],axis=1)
pred['Survived'] = np.where(pred[0] > pred[1], 0, 1)
drop = [0,1]
submission=pred.drop(columns=drop, axis =1)

### Export Dataframe as a csv file ###

In [165]:
submission.to_csv('Adrian_Lievano_Titanic_Submission', sep=',', index=False)

In [None]:
### Notes: AGE & CLASS are probably most important variables. Rethink how you're assigning NaNs to mean values and how that might throw everything off. Your data is messed up!!!!