In [24]:
import pandas as pd
import numpy as np
import re as re

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
validate = pd.read_csv('gender_submission.csv')

In [25]:
# Begin to Look at the Features. This is based on https://www.kaggle.com/sinakhorami/titanic-best-working-classifier
# Not a solution, just a guide on how to engineer and clean up data to boost accuracy without overfitting

# Mapping Sex
train['Sex_binary'] = train['Sex'].map({'male': 1, 'female': 0})
test['Sex_binary'] = test['Sex'].map({'male': 1, 'female': 0})

In [26]:
# Building the size of the family
train['FamilySize'] = train['SibSp'] + train['Parch'] + 1
test['FamilySize'] = test['SibSp'] + test['Parch'] + 1

In [27]:
# Now what matters, if the person was alone or not
train['IsAlone'] = 0
train.loc[train['FamilySize'] == 1, 'IsAlone'] = 1
test['IsAlone'] = 0
test.loc[test['FamilySize'] == 1, 'IsAlone'] = 1

In [28]:
# Filling empty cells with the most repeated shore
train['Embarked'] = train['Embarked'].fillna('S')
test['Embarked'] = test['Embarked'].fillna('S')

# Mapping shores
train['Embarked'] = train['Embarked'].map({'S':2,'Q':1,'C':0})
test['Embarked'] = test['Embarked'].map({'S':2,'Q':1,'C':0})

In [29]:
# Filling empty cells with median fare
train['Fare'] = train['Fare'].fillna(train['Fare'].median())
test['Fare'] = train['Fare'].fillna(train['Fare'].median())

# Splitting into 4 categories to generalize
train['CategoricalFare'] = pd.qcut(train['Fare'], 4)
test['CategoricalFare'] = pd.qcut(train['Fare'], 4)

# Mapping the categories 
train.loc[train['Fare'] <= 7.91, 'Fare'] = 0
train.loc[(train['Fare'] > 7.91) & (train['Fare'] <= 14.454), 'Fare'] = 1
train.loc[(train['Fare'] > 14.454) & (train['Fare'] <= 31), 'Fare'] = 2
train.loc[train['Fare'] > 31, 'Fare'] = 3
train['Fare'] = train['Fare'].astype(int)

test.loc[train['Fare'] <= 7.91, 'Fare'] = 0
test.loc[(train['Fare'] > 7.91) & (test['Fare'] <= 14.454), 'Fare'] = 1
test.loc[(train['Fare'] > 14.454) & (test['Fare'] <= 31), 'Fare'] = 2
test.loc[train['Fare'] > 31, 'Fare'] = 3
test['Fare'] = test['Fare'].astype(int)

In [30]:
# Filling (a lot) of empty age cells with a random number between the mean - std and mean + std
ageMean = train['Age'].mean()
ageStd = train['Age'].std()

train['Age'] = train['Age'].fillna(np.random.randint(ageMean - ageStd, ageMean + ageStd))
test['Age'] = test['Age'].fillna(np.random.randint(ageMean - ageStd, ageMean + ageStd))

# Splitting into 5 categories to generalize
train['CategoricalAge'] = pd.cut(train['Age'], 5)

# Mapping the categories
train.loc[ train['Age'] <= 16, 'Age'] = 0
train.loc[(train['Age'] > 16) & (train['Age'] <= 32), 'Age'] = 1
train.loc[(train['Age'] > 32) & (train['Age'] <= 48), 'Age'] = 2
train.loc[(train['Age'] > 48) & (train['Age'] <= 64), 'Age'] = 3
train.loc[ train['Age'] > 64, 'Age'] 

test.loc[ test['Age'] <= 16, 'Age'] = 0
test.loc[(test['Age'] > 16) & (test['Age'] <= 32), 'Age'] = 1
test.loc[(test['Age'] > 32) & (test['Age'] <= 48), 'Age'] = 2
test.loc[(test['Age'] > 48) & (test['Age'] <= 64), 'Age'] = 3
test.loc[test['Age'] > 64, 'Age'] 

81    67.0
96    76.0
Name: Age, dtype: float64

In [31]:
# Getting titles of the people aboard (func from the same source listed above)
def get_title(name):
    title_search = re.search(' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

train['Title'] = train['Name'].apply(get_title)
test['Title'] = test['Name'].apply(get_title)
    
train['Title'] = train['Title'].replace(['Lady','Countess','Capt', 'Col','Don','Dr','Major','Rev','Sir','Jonkheer','Dona'],'Rare')
train['Title'] = train['Title'].replace('Mlle', 'Miss')
train['Title'] = train['Title'].replace('Ms', 'Miss')
train['Title'] = train['Title'].replace('Mme', 'Mrs')

test['Title'] = test['Title'].replace(['Lady','Countess','Capt', 'Col','Don','Dr','Major','Rev','Sir','Jonkheer','Dona'],'Rare')
test['Title'] = test['Title'].replace('Mlle', 'Miss')
test['Title'] = test['Title'].replace('Ms', 'Miss')
test['Title'] = test['Title'].replace('Mme', 'Mrs')

# Mapping titles
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
train['Title'] = train['Title'].map(title_mapping)
train['Title'] = train['Title'].fillna(0)

test['Title'] = test['Title'].map(title_mapping)
test['Title'] = test['Title'].fillna(0)
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_binary,FamilySize,IsAlone,CategoricalFare,CategoricalAge,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,1.0,1,0,A/5 21171,0,,2,1,2,0,"(-0.001, 7.91]","(16.336, 32.252]",1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,2.0,1,0,PC 17599,3,C85,0,0,2,0,"(31.0, 512.329]","(32.252, 48.168]",3
2,3,1,3,"Heikkinen, Miss. Laina",female,1.0,0,0,STON/O2. 3101282,1,,2,0,1,1,"(7.91, 14.454]","(16.336, 32.252]",2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,2.0,1,0,113803,3,C123,2,0,2,0,"(31.0, 512.329]","(32.252, 48.168]",3
4,5,0,3,"Allen, Mr. William Henry",male,2.0,0,0,373450,1,,2,1,1,1,"(7.91, 14.454]","(32.252, 48.168]",1
5,6,0,3,"Moran, Mr. James",male,1.0,0,0,330877,1,,1,1,1,1,"(7.91, 14.454]","(16.336, 32.252]",1
6,7,0,1,"McCarthy, Mr. Timothy J",male,3.0,0,0,17463,3,E46,2,1,1,1,"(31.0, 512.329]","(48.168, 64.084]",1
7,8,0,3,"Palsson, Master. Gosta Leonard",male,0.0,3,1,349909,2,,2,1,5,0,"(14.454, 31.0]","(0.34, 16.336]",4
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,1.0,0,2,347742,1,,2,0,3,0,"(7.91, 14.454]","(16.336, 32.252]",3
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,0.0,1,0,237736,2,,0,0,2,0,"(14.454, 31.0]","(0.34, 16.336]",3


In [32]:
dropElements = ['PassengerId','Name','Sex','SibSp','Parch','Ticket','Cabin','FamilySize','CategoricalFare','CategoricalAge']
train = train.drop(dropElements, axis=1)

In [33]:
features = ['Pclass','Age','Fare','Embarked','Sex_binary','IsAlone','Title']
target = 'Survived'
train[features].head(3)

Unnamed: 0,Pclass,Age,Fare,Embarked,Sex_binary,IsAlone,Title
0,3,1.0,0,2,1,0,1
1,1,2.0,3,0,0,0,3
2,3,1.0,1,2,0,1,2


In [34]:
X_train = np.array(train[features])

In [35]:
y_train = np.array(train[target])
y_train = y_train.reshape(-1,1)

In [36]:
# Import keras modules
from keras.models import Sequential
from keras.layers import Dense
from keras.initializers import glorot_normal
from keras.optimizers import SGD

In [37]:
model = Sequential()
numNeurons = 4
model.add(Dense(numNeurons, input_dim=len(features), activation='relu', kernel_initializer="uniform")) #Si cambiamos esto dos, tenemos 76 
model.add(Dense(1, activation='sigmoid', kernel_initializer="uniform"))

sgd = SGD(lr=0.05, decay=1e-6, momentum=0.6, nesterov=True)

model.compile(loss='mean_squared_logarithmic_error', optimizer=sgd, metrics=['mae'])

In [38]:
model.fit(X_train, y_train, epochs=2000, batch_size=100)

Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
Epoch 34/2000
Epoch 35/2000
Epoch 36/2000
Epoch 37/2000
Epoch 38/2000
Epoch 39/2000
Epoch 40/2000
Epoch 41/2000
Epoch 42/2000
Epoch 43/2000
Epoch 44/2000
Epoch 45/2000
Epoch 46/2000
Epoch 47/2000
Epoch 48/2000
Epoch 49/2000
Epoch 50/2000
Epoch 51/2000
Epoch 52/2000
Epoch 53/2000
Epoch 54/2000
Epoch 55/2000
Epoch 56/2000
Epoch 57/2000
Epoch 58/2000
Epoch 59/2000
Epoch 60/2000
Epoch 61/2000
Epoch 62/2000
Epoch 63/2000
Epoch 64/2000
Epoch 65/2000
Epoch 66/2000
Epoch 67/2000
Epoch 68/2000
Epoch 69/2000
Epoch 70/2000
Epoch 71/2000
Epoch 72/2000
E

<keras.callbacks.History at 0xb2b7f14d0>

In [39]:
X_test = np.array(test[features])

In [40]:
predicciones = model.predict(X_test)

In [41]:
predicciones = predicciones.tolist()

In [42]:
pre = pd.Series(predicciones)
validate['prediccion'] = pre
validate['prediccion'] = validate['prediccion'].str.get(0)
validate

Unnamed: 0,PassengerId,Survived,prediccion
0,892,0,0.021731
1,893,1,0.374404
2,894,0,0.043549
3,895,0,0.038236
4,896,1,0.656423
5,897,0,0.120841
6,898,1,0.657395
7,899,0,0.140647
8,900,1,0.717271
9,901,0,0.037306


In [43]:
coincidencias = []
for dato in validate.prediccion:
    if dato >= 0.5:
        coincidencias.append(1)
    else:
        coincidencias.append(0)
validate['final'] = coincidencias
validate

Unnamed: 0,PassengerId,Survived,prediccion,final
0,892,0,0.021731,0
1,893,1,0.374404,0
2,894,0,0.043549,0
3,895,0,0.038236,0
4,896,1,0.656423,1
5,897,0,0.120841,0
6,898,1,0.657395,1
7,899,0,0.140647,0
8,900,1,0.717271,1
9,901,0,0.037306,0


In [44]:
coincide = 0
coincide = sum(validate['Survived'] == validate['final'])
print(coincide)
print(float(coincide)/ float(len(validate)))


match = 0
nomatch = 0
for val in validate.values:
    if val[1] == val[3]:
        match = match +1
    else:
        nomatch = nomatch +1
print(float(match)/float(len(validate)))

349
0.834928229665
0.834928229665


In [45]:
toKaggle = pd.DataFrame({'PassengerId':validate['PassengerId'],
                         'Survived':validate['final']})

toKaggle.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


# output file with your prediction

In [46]:
from datetime import datetime
archivo = 'TitanicPred.csv'

toKaggle.to_csv(archivo,index=False)

print('Creado: ' + archivo)

Creado: TitanicPred.csv
