In [1]:
import pandas as pd
import numpy as np
import re as re

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
validate = pd.read_csv('gender_submission.csv')

In [2]:
# Begin to Look at the Features. This is based on https://www.kaggle.com/sinakhorami/titanic-best-working-classifier
# Not a solution, just a guide on how to engineer and clean up data to boost accuracy without overfitting

# Mapping Sex
train['Sex_binary'] = train['Sex'].map({'male': 1, 'female': 0})
test['Sex_binary'] = test['Sex'].map({'male': 1, 'female': 0})

In [3]:
# Building the size of the family
train['FamilySize'] = train['SibSp'] + train['Parch'] + 1
test['FamilySize'] = test['SibSp'] + test['Parch'] + 1
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_binary,FamilySize
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,1,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0,2
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,0,2
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,1,1
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,1,1
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,1,1
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S,1,5
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,0,3
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,0,2


In [4]:
# Now what matters, if the person was alone or not
train['IsAlone'] = 0
train.loc[train['FamilySize'] == 1, 'IsAlone'] = 1
test['IsAlone'] = 0
test.loc[test['FamilySize'] == 1, 'IsAlone'] = 1

In [5]:
# Filling empty cells with the most repeated shore
train['Embarked'] = train['Embarked'].fillna('S')
test['Embarked'] = test['Embarked'].fillna('S')

# Mapping shores
train['Embarked'] = train['Embarked'].map({'S':2,'Q':1,'C':0})
test['Embarked'] = test['Embarked'].map({'S':2,'Q':1,'C':0})

In [6]:
# Filling empty cells with median fare
train['Fare'] = train['Fare'].fillna(train['Fare'].median())
test['Fare'] = train['Fare'].fillna(train['Fare'].median())

# Splitting into 4 categories to generalize
train['CategoricalFare'] = pd.qcut(train['Fare'], 4)

# Mapping the categories 
train.loc[train['Fare'] <= 7.91, 'Fare'] = 0
train.loc[(train['Fare'] > 7.91) & (train['Fare'] <= 14.454), 'Fare'] = 1
train.loc[(train['Fare'] > 14.454) & (train['Fare'] <= 31), 'Fare'] = 2
train.loc[train['Fare'] > 31, 'Fare'] = 3
train['Fare'] = train['Fare'].astype(int)

test.loc[train['Fare'] <= 7.91, 'Fare'] = 0
test.loc[(train['Fare'] > 7.91) & (test['Fare'] <= 14.454), 'Fare'] = 1
test.loc[(train['Fare'] > 14.454) & (test['Fare'] <= 31), 'Fare'] = 2
test.loc[train['Fare'] > 31, 'Fare'] = 3
test['Fare'] = test['Fare'].astype(int)

In [7]:
# Filling (a lot) of empty age cells with a random number between the mean - std and mean + std
ageMean = train['Age'].mean()
ageStd = train['Age'].std()

train['Age'] = train['Age'].fillna(np.random.randint(ageMean - ageStd, ageMean + ageStd))
test['Age'] = test['Age'].fillna(np.random.randint(ageMean - ageStd, ageMean + ageStd))

# Splitting into 5 categories to generalize
train['CategoricalAge'] = pd.cut(train['Age'], 5)

# Mapping the categories
train.loc[ train['Age'] <= 16, 'Age'] = 0
train.loc[(train['Age'] > 16) & (train['Age'] <= 32), 'Age'] = 1
train.loc[(train['Age'] > 32) & (train['Age'] <= 48), 'Age'] = 2
train.loc[(train['Age'] > 48) & (train['Age'] <= 64), 'Age'] = 3
train.loc[ train['Age'] > 64, 'Age'] 

test.loc[ test['Age'] <= 16, 'Age'] = 0
test.loc[(test['Age'] > 16) & (test['Age'] <= 32), 'Age'] = 1
test.loc[(test['Age'] > 32) & (test['Age'] <= 48), 'Age'] = 2
test.loc[(test['Age'] > 48) & (test['Age'] <= 64), 'Age'] = 3
test.loc[test['Age'] > 64, 'Age'] 

81    67.0
96    76.0
Name: Age, dtype: float64

In [8]:
# Getting titles of the people aboard (func from the same source listed above)
def get_title(name):
    title_search = re.search(' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

train['Title'] = train['Name'].apply(get_title)
test['Title'] = test['Name'].apply(get_title)
    
train['Title'] = train['Title'].replace(['Lady','Countess','Capt', 'Col','Don','Dr','Major','Rev','Sir','Jonkheer','Dona'],'Rare')
train['Title'] = train['Title'].replace('Mlle', 'Miss')
train['Title'] = train['Title'].replace('Ms', 'Miss')
train['Title'] = train['Title'].replace('Mme', 'Mrs')

test['Title'] = test['Title'].replace(['Lady','Countess','Capt', 'Col','Don','Dr','Major','Rev','Sir','Jonkheer','Dona'],'Rare')
test['Title'] = test['Title'].replace('Mlle', 'Miss')
test['Title'] = test['Title'].replace('Ms', 'Miss')
test['Title'] = test['Title'].replace('Mme', 'Mrs')

# Mapping titles
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
train['Title'] = train['Title'].map(title_mapping)
train['Title'] = train['Title'].fillna(0)

test['Title'] = test['Title'].map(title_mapping)
test['Title'] = test['Title'].fillna(0)

In [9]:
dropElements = ['PassengerId','Name','Sex','SibSp','Parch','Ticket','Cabin','FamilySize','CategoricalFare','CategoricalAge']
train = train.drop(dropElements, axis=1)

In [10]:
features = ['Pclass','Age','Fare','Embarked','Sex_binary','IsAlone','Title']
target = 'Survived'
train[features].head(3)

Unnamed: 0,Pclass,Age,Fare,Embarked,Sex_binary,IsAlone,Title
0,3,1.0,0,2,1,0,1
1,1,2.0,3,0,0,0,3
2,3,1.0,1,2,0,1,2


In [11]:
X_train = np.array(train[features])

In [12]:
y_train = np.array(train[target])
y_train = y_train.reshape(-1,1)

In [13]:
# Import keras modules
from keras.models import Sequential
from keras.layers import Dense
from keras.initializers import glorot_normal
from keras.optimizers import SGD

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [14]:
model = Sequential()
numNeurons = 2
model.add(Dense(3, input_dim=len(features), activation='relu', kernel_initializer="uniform")) #Si cambiamos esto dos, tenemos 76 
model.add(Dense(numNeurons, activation='sigmoid', kernel_initializer="uniform"))
model.add(Dense(1, activation='sigmoid', kernel_initializer="uniform"))
#model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['mae']) old

sgd = SGD(lr=0.05, decay=1e-6, momentum=0.6, nesterov=True)

model.compile(loss='mean_squared_logarithmic_error', optimizer="adamax", metrics=['mae'])

In [15]:
model.fit(X_train, y_train, epochs=1000, batch_size=100)
#score = model.evaluate(X_test, y_test, batch_size=25)
#score

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

<keras.callbacks.History at 0xb29441890>

In [16]:
X_test = np.array(test[features])

In [17]:
predicciones = model.predict(X_test)

In [18]:
predicciones = predicciones.tolist()

In [19]:
pre = pd.Series(predicciones)
validate['prediccion'] = pre
validate['prediccion'] = validate['prediccion'].str.get(0)
validate

Unnamed: 0,PassengerId,Survived,prediccion
0,892,0,0.087519
1,893,1,0.555785
2,894,0,0.119137
3,895,0,0.086374
4,896,1,0.626329
5,897,0,0.088979
6,898,1,0.590387
7,899,0,0.099736
8,900,1,0.810036
9,901,0,0.085224


In [20]:
coincidencias = []
for dato in validate.prediccion:
    if dato >= 0.5:
        coincidencias.append(1)
    else:
        coincidencias.append(0)
validate['final'] = coincidencias
validate

Unnamed: 0,PassengerId,Survived,prediccion,final
0,892,0,0.087519,0
1,893,1,0.555785,1
2,894,0,0.119137,0
3,895,0,0.086374,0
4,896,1,0.626329,1
5,897,0,0.088979,0
6,898,1,0.590387,1
7,899,0,0.099736,0
8,900,1,0.810036,1
9,901,0,0.085224,0


In [21]:
coincide = 0
coincide = sum(validate['Survived'] == validate['final'])
print(coincide)
print(float(coincide)/ float(len(validate)))


match = 0
nomatch = 0
for val in validate.values:
    if val[1] == val[3]:
        match = match +1
    else:
        nomatch = nomatch +1
print(float(match)/float(len(validate)))

366
0.875598086124
0.875598086124


In [22]:
toKaggle = pd.DataFrame({'PassengerId':validate['PassengerId'],
                         'Survived':validate['final']})

toKaggle.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


# output file with your prediction

In [23]:
from datetime import datetime
archivo = 'TitanicPred.csv'

toKaggle.to_csv(archivo,index=False)

print('Creado: ' + archivo)

Creado: TitanicPred.csv
