In [1]:
import pandas as pd
import numpy as np
import re as re

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
validate = pd.read_csv('gender_submission.csv')

In [2]:
# Begin to Look at the Features. This is based on https://www.kaggle.com/sinakhorami/titanic-best-working-classifier
# Not a solution, just a guide on how to engineer and clean up data to boost accuracy without overfitting

# Mapping Sex
train['Sex_binary'] = train['Sex'].map({'male': 1, 'female': 0})
test['Sex_binary'] = test['Sex'].map({'male': 1, 'female': 0})

In [3]:
# Building the size of the family
train['FamilySize'] = train['SibSp'] + train['Parch'] + 1
test['FamilySize'] = test['SibSp'] + test['Parch'] + 1

In [4]:
# Now what matters, if the person was alone or not
train['IsAlone'] = 0
train.loc[train['FamilySize'] == 1, 'IsAlone'] = 1
test['IsAlone'] = 0
test.loc[test['FamilySize'] == 1, 'IsAlone'] = 1

In [5]:
# Filling empty cells with the most repeated shore
train['Embarked'] = train['Embarked'].fillna('S')
test['Embarked'] = test['Embarked'].fillna('S')

# Mapping shores
train['Embarked'] = train['Embarked'].map({'S':2,'Q':1,'C':0})
test['Embarked'] = test['Embarked'].map({'S':2,'Q':1,'C':0})

In [6]:
# Filling empty cells with median fare
train['Fare'] = train['Fare'].fillna(train['Fare'].median())
test['Fare'] = train['Fare'].fillna(train['Fare'].median())

# Splitting into 4 categories to generalize
train['CategoricalFare'] = pd.qcut(train['Fare'], 4)
test['CategoricalFare'] = pd.qcut(train['Fare'], 4)

# Mapping the categories 
train.loc[train['Fare'] <= 7.91, 'Fare'] = 0
train.loc[(train['Fare'] > 7.91) & (train['Fare'] <= 14.454), 'Fare'] = 1
train.loc[(train['Fare'] > 14.454) & (train['Fare'] <= 31), 'Fare'] = 2
train.loc[train['Fare'] > 31, 'Fare'] = 3
train['Fare'] = train['Fare'].astype(int)

test.loc[train['Fare'] <= 7.91, 'Fare'] = 0
test.loc[(train['Fare'] > 7.91) & (test['Fare'] <= 14.454), 'Fare'] = 1
test.loc[(train['Fare'] > 14.454) & (test['Fare'] <= 31), 'Fare'] = 2
test.loc[train['Fare'] > 31, 'Fare'] = 3
test['Fare'] = test['Fare'].astype(int)

In [7]:
# Filling (a lot) of empty age cells with a random number between the mean - std and mean + std
ageMean = train['Age'].mean()
ageStd = train['Age'].std()

train['Age'] = train['Age'].fillna(np.random.randint(ageMean - ageStd, ageMean + ageStd))
test['Age'] = test['Age'].fillna(np.random.randint(ageMean - ageStd, ageMean + ageStd))

# Splitting into 5 categories to generalize
train['CategoricalAge'] = pd.cut(train['Age'], 5)

# Mapping the categories
train.loc[ train['Age'] <= 16, 'Age'] = 0
train.loc[(train['Age'] > 16) & (train['Age'] <= 32), 'Age'] = 1
train.loc[(train['Age'] > 32) & (train['Age'] <= 48), 'Age'] = 2
train.loc[(train['Age'] > 48) & (train['Age'] <= 64), 'Age'] = 3
train.loc[ train['Age'] > 64, 'Age'] 

test.loc[ test['Age'] <= 16, 'Age'] = 0
test.loc[(test['Age'] > 16) & (test['Age'] <= 32), 'Age'] = 1
test.loc[(test['Age'] > 32) & (test['Age'] <= 48), 'Age'] = 2
test.loc[(test['Age'] > 48) & (test['Age'] <= 64), 'Age'] = 3
test.loc[test['Age'] > 64, 'Age'] 

81    67.0
96    76.0
Name: Age, dtype: float64

In [8]:
# Getting titles of the people aboard (func from the same source listed above)
def get_title(name):
    title_search = re.search(' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

train['Title'] = train['Name'].apply(get_title)
test['Title'] = test['Name'].apply(get_title)
    
train['Title'] = train['Title'].replace(['Lady','Countess','Capt', 'Col','Don','Dr','Major','Rev','Sir','Jonkheer','Dona'],'Rare')
train['Title'] = train['Title'].replace('Mlle', 'Miss')
train['Title'] = train['Title'].replace('Ms', 'Miss')
train['Title'] = train['Title'].replace('Mme', 'Mrs')

test['Title'] = test['Title'].replace(['Lady','Countess','Capt', 'Col','Don','Dr','Major','Rev','Sir','Jonkheer','Dona'],'Rare')
test['Title'] = test['Title'].replace('Mlle', 'Miss')
test['Title'] = test['Title'].replace('Ms', 'Miss')
test['Title'] = test['Title'].replace('Mme', 'Mrs')

# Mapping titles
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
train['Title'] = train['Title'].map(title_mapping)
train['Title'] = train['Title'].fillna(0)

test['Title'] = test['Title'].map(title_mapping)
test['Title'] = test['Title'].fillna(0)
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_binary,FamilySize,IsAlone,CategoricalFare,CategoricalAge,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,1.0,1,0,A/5 21171,0,,2,1,2,0,"(-0.001, 7.91]","(16.336, 32.252]",1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,2.0,1,0,PC 17599,3,C85,0,0,2,0,"(31.0, 512.329]","(32.252, 48.168]",3
2,3,1,3,"Heikkinen, Miss. Laina",female,1.0,0,0,STON/O2. 3101282,1,,2,0,1,1,"(7.91, 14.454]","(16.336, 32.252]",2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,2.0,1,0,113803,3,C123,2,0,2,0,"(31.0, 512.329]","(32.252, 48.168]",3
4,5,0,3,"Allen, Mr. William Henry",male,2.0,0,0,373450,1,,2,1,1,1,"(7.91, 14.454]","(32.252, 48.168]",1
5,6,0,3,"Moran, Mr. James",male,1.0,0,0,330877,1,,1,1,1,1,"(7.91, 14.454]","(16.336, 32.252]",1
6,7,0,1,"McCarthy, Mr. Timothy J",male,3.0,0,0,17463,3,E46,2,1,1,1,"(31.0, 512.329]","(48.168, 64.084]",1
7,8,0,3,"Palsson, Master. Gosta Leonard",male,0.0,3,1,349909,2,,2,1,5,0,"(14.454, 31.0]","(0.34, 16.336]",4
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,1.0,0,2,347742,1,,2,0,3,0,"(7.91, 14.454]","(16.336, 32.252]",3
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,0.0,1,0,237736,2,,0,0,2,0,"(14.454, 31.0]","(0.34, 16.336]",3


In [9]:
dropElements = ['PassengerId','Name','Sex','SibSp','Parch','Ticket','Cabin','FamilySize','CategoricalFare','CategoricalAge']
train = train.drop(dropElements, axis=1)
train

Unnamed: 0,Survived,Pclass,Age,Fare,Embarked,Sex_binary,IsAlone,Title
0,0,3,1.0,0,2,1,0,1
1,1,1,2.0,3,0,0,0,3
2,1,3,1.0,1,2,0,1,2
3,1,1,2.0,3,2,0,0,3
4,0,3,2.0,1,2,1,1,1
5,0,3,1.0,1,1,1,1,1
6,0,1,3.0,3,2,1,1,1
7,0,3,0.0,2,2,1,0,4
8,1,3,1.0,1,2,0,0,3
9,1,2,0.0,2,0,0,0,3


In [10]:
features = ['Pclass','Age','Fare','Embarked','Sex_binary','IsAlone','Title']
target = 'Survived'
test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_binary,FamilySize,IsAlone,CategoricalFare,Title
0,892,3,"Kelly, Mr. James",male,2.0,0,0,330911,0,,1,1,1,1,"(-0.001, 7.91]",1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,2.0,1,0,363272,0,,2,0,2,0,"(31.0, 512.329]",3
2,894,2,"Myles, Mr. Thomas Francis",male,3.0,0,0,240276,0,,1,1,1,1,"(7.91, 14.454]",1
3,895,3,"Wirz, Mr. Albert",male,1.0,0,0,315154,0,,2,1,1,1,"(31.0, 512.329]",1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,1.0,1,1,3101298,0,,2,0,3,0,"(7.91, 14.454]",3
5,897,3,"Svensson, Mr. Johan Cervin",male,0.0,0,0,7538,0,,2,1,1,1,"(7.91, 14.454]",1
6,898,3,"Connolly, Miss. Kate",female,1.0,0,0,330972,0,,1,0,1,1,"(31.0, 512.329]",2
7,899,2,"Caldwell, Mr. Albert Francis",male,1.0,1,1,248738,0,,2,1,3,0,"(14.454, 31.0]",1
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,1.0,0,0,2657,0,,0,0,1,1,"(7.91, 14.454]",3
9,901,3,"Davies, Mr. John Samuel",male,1.0,2,0,A/4 48871,0,,2,1,3,0,"(14.454, 31.0]",1


In [11]:
X_train = np.array(train[features])
X_train = X_train/np.amax(X_train, axis=0)
y_train = np.array(train[target])
y_train = y_train.reshape(-1,1)

In [12]:
X_test = np.array(test[features])

In [13]:
import numpy as np
import tensorflow as tf
from tensorflow.contrib.layers import fully_connected
print("Using TensorFlow Version %s" %tf.__version__)

  from ._conv import register_converters as _register_converters


Using TensorFlow Version 1.10.0


In [14]:
class NeuronalNet(object):
    def __init__(self, sess, X, Y, n_hidden=4, learning_rate=1e-2):
        self.sess = sess
        self.X = X
        self.Y = Y.reshape(-1,1)
        self.n_inputs = X.shape[0]
        self.n_input_dim = X.shape[1]
        self.n_output = 1
        self.learning_rate = learning_rate 
        self.n_hidden = n_hidden
        self.X_input, self.y, self.logits, self.cost = self.createNeuralNet()
        self.optimizer = tf.train.GradientDescentOptimizer(self.learning_rate) # self.optimizer = tf.train.AdamOptimizer(self.learning_rate) #optimizador
        self.train_op = self.optimizer.minimize(self.cost)

    def createNeuralNet(self):
        initializer = tf.contrib.layers.xavier_initializer()
        X_input = tf.placeholder(tf.float32, [None, self.n_input_dim], name='input')
        y = tf.placeholder(tf.float32, [None, self.n_output], name='output')
        # Layer
        hidden1 = fully_connected(X_input, self.n_hidden,\
                                 activation_fn=tf.nn.elu, weights_initializer=initializer)
        logits = fully_connected(hidden1, self.n_output,\
                                 activation_fn=tf.nn.sigmoid, weights_initializer=initializer)
        loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, \
                                                       logits=logits)
        cost = tf.reduce_mean(loss)
        return X_input, y, logits, cost
    
    def train(self):
        _, cost = self.sess.run([self.train_op,self.cost],\
                                feed_dict={self.X_input: self.X,
                                           self.y: self.Y})        
        return cost
    
    def predict(self, X_test):
        pred = self.sess.run([self.logits], feed_dict={ self.X_input: X_test})[0]        
        return pred

In [15]:
def getTrainCost(model, n_iters=1000):
    model.sess.run(tf.global_variables_initializer())
    cost = []
    for i in range(n_iters):
        cost.append(model.train())
    return cost

In [16]:
neurons = 4
n_iters = 500
tf.reset_default_graph()
sess = tf.Session()
net = NeuronalNet(sess, X_train, y_train, n_hidden=neurons, learning_rate=0.05)
cost = getTrainCost(net, n_iters)

In [17]:
predicciones = net.predict(X_test)
print(predicciones)

[[0.2635342 ]
 [0.6390393 ]
 [0.38742906]
 [0.06354225]
 [0.47280747]
 [0.03330268]
 [0.51166797]
 [0.07139536]
 [0.8030346 ]
 [0.07957859]
 [0.06354225]
 [0.09798343]
 [0.42292127]
 [0.23275654]
 [0.59213847]
 [0.8075509 ]
 [0.23884392]
 [0.32008627]
 [0.3373002 ]
 [0.8514789 ]
 [0.60800636]
 [0.19599667]
 [0.36147276]
 [0.29861206]
 [0.831605  ]
 [0.25690714]
 [0.7080616 ]
 [0.32008627]
 [0.09798343]
 [0.37641004]
 [0.23275654]
 [0.07139536]
 [0.6390393 ]
 [0.47280747]
 [0.29861206]
 [0.32008627]
 [0.280828  ]
 [0.280828  ]
 [0.06354225]
 [0.06354225]
 [0.550364  ]
 [0.05235042]
 [0.11852023]
 [0.38523266]
 [0.59213847]
 [0.06354225]
 [0.41301885]
 [0.15081386]
 [0.8510909 ]
 [0.6390393 ]
 [0.06331504]
 [0.28860742]
 [0.3149242 ]
 [0.290188  ]
 [0.28860742]
 [0.3936727 ]
 [0.11852023]
 [0.06354225]
 [0.07957859]
 [0.7577174 ]
 [0.06354225]
 [0.05796884]
 [0.06354225]
 [0.51166797]
 [0.5635431 ]
 [0.38523266]
 [0.51166797]
 [0.09798343]
 [0.2531958 ]
 [0.70906717]
 [0.51166797]
 [0.06

In [18]:
predicciones = predicciones.tolist()

In [19]:
pre = pd.Series(predicciones)
validate['prediccion'] = pre
validate['prediccion'] = validate['prediccion'].str.get(0)
validate

Unnamed: 0,PassengerId,Survived,prediccion
0,892,0,0.263534
1,893,0,0.639039
2,894,0,0.387429
3,895,0,0.063542
4,896,1,0.472807
5,897,0,0.033303
6,898,1,0.511668
7,899,0,0.071395
8,900,1,0.803035
9,901,0,0.079579


In [20]:
coincidencias = []
for dato in validate.prediccion:
    if dato >= 0.5:
        coincidencias.append(1)
    else:
        coincidencias.append(0)
validate['final'] = coincidencias
validate

Unnamed: 0,PassengerId,Survived,prediccion,final
0,892,0,0.263534,0
1,893,0,0.639039,1
2,894,0,0.387429,0
3,895,0,0.063542,0
4,896,1,0.472807,0
5,897,0,0.033303,0
6,898,1,0.511668,1
7,899,0,0.071395,0
8,900,1,0.803035,1
9,901,0,0.079579,0


In [21]:
coincide = 0
coincide = sum(validate['Survived'] == validate['final'])
print(coincide)
print(float(coincide)/ float(len(validate)))

match = 0
nomatch = 0
for val in validate.values:
    if val[1] == val[3]:
        match = match +1
    else:
        nomatch = nomatch +1
print(float(match)/float(len(validate)))

330
0.7894736842105263
0.7894736842105263


In [22]:
toKaggle = pd.DataFrame({'PassengerId':validate['PassengerId'],
                         'Survived':validate['final']})

toKaggle.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,0


# output file with your prediction

In [23]:
from datetime import datetime
archivo = 'TitanicPred.csv'

toKaggle.to_csv(archivo,index=False)

print('Creado: ' + archivo)

Creado: TitanicPred.csv
