In [1]:
import pandas as pd
import numpy as np
import sklearn
import sklearn.preprocessing as preprocessing
from sklearn.svm import SVC
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline

In [2]:
train_titanic = pd.read_csv('train.csv')
test_titanic = pd.read_csv('test.csv')
PassangersId = test_titanic['PassengerId']

In [3]:
train_titanic.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [4]:
test_titanic.head(3)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q


In [5]:
# Descarte y relleno de 'Features'

In [6]:
# De primeras, podemos concluir que tanto 'PassengerId', 'Ticket' y el 'Name'
# no serán de gran utilidad por lo que las descartamos (También en test_titanic).
train_titanic = train_titanic.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
test_titanic = test_titanic.drop(['Name', 'Ticket', 'Cabin'], axis=1)

train_titanic.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S


In [7]:
print(train_titanic.info())
print ('*'*30)
print (test_titanic.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Embarked    889 non-null object
dtypes: float64(2), int64(4), object(2)
memory usage: 48.8+ KB
None
******************************
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 8 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Fare           417 non-null float64
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(2)
memory usage: 22.9+ KB
None


In [8]:
# Observamos que tenemos 891 índices. En la feature 'Embarked' tenemos solo 889.
# Tenemos dos índices con valor 'nan', los cuales habrá que rellenar y lo haremos
# con el valor mas repetido que es 'S'
print (train_titanic['Embarked'].value_counts())
print('-'*30)
train_titanic['Embarked'] = train_titanic['Embarked'].fillna('S')
test_titanic['Embarked'] = test_titanic['Embarked'].fillna('S')
print (train_titanic['Embarked'].value_counts())

S    644
C    168
Q     77
Name: Embarked, dtype: int64
------------------------------
S    646
C    168
Q     77
Name: Embarked, dtype: int64


In [9]:
#Mapeo de 'Embarked'

# Para el dataFrame train_titanic
train_titanic['Embarked'] = train_titanic['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)

# Para el dataFrame test_titanic
test_titanic['Embarked'] = test_titanic['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)

In [10]:
# Feature 'Fare'
# Solo tenemos datos vacios en test_titanic
test_titanic['Fare'] = test_titanic['Fare'].fillna(test_titanic['Fare'].mean())
print (test_titanic['Fare'].describe())

count    418.000000
mean      35.627188
std       55.840500
min        0.000000
25%        7.895800
50%       14.454200
75%       31.500000
max      512.329200
Name: Fare, dtype: float64


In [11]:
# Mapping Fare

# Para el dataFrame train_titanic
train_titanic.loc[ train_titanic['Fare'] <= 7.91, 'Fare'] = 0
train_titanic.loc[(train_titanic['Fare'] > 7.91) & (train_titanic['Fare'] <= 14.454), 'Fare'] = 1
train_titanic.loc[(train_titanic['Fare'] > 14.454) & (train_titanic['Fare'] <= 31), 'Fare'] = 2
train_titanic.loc[train_titanic['Fare'] > 31, 'Fare'] = 3
train_titanic['Fare'] = train_titanic['Fare'].astype(int)

# Para el dataFrame test_titanic
test_titanic.loc[ test_titanic['Fare'] <= 7.91, 'Fare'] = 0
test_titanic.loc[(test_titanic['Fare'] > 7.91) & (test_titanic['Fare'] <= 14.454), 'Fare'] = 1
test_titanic.loc[(test_titanic['Fare'] > 14.454) & (test_titanic['Fare'] <= 31), 'Fare'] = 2
test_titanic.loc[test_titanic['Fare'] > 31, 'Fare'] = 3
test_titanic['Fare'] = test_titanic['Fare'].astype(int)


In [12]:
# Completamos los índices vacios para 'Age' en ambos dataFrames

train_titanic['Age'] = train_titanic['Age'].fillna(train_titanic['Age'].median())

test_titanic['Age'] = test_titanic['Age'].fillna(test_titanic['Age'].median());

In [13]:
# Mapeo de 'Age'

# Para el dataFrame train_titanic
train_titanic.loc[ train_titanic['Age'] <= 16, 'Age'] = 0
train_titanic.loc[(train_titanic['Age'] > 16) & (train_titanic['Age'] <= 32), 'Age'] = 1
train_titanic.loc[(train_titanic['Age'] > 32) & (train_titanic['Age'] <= 48), 'Age'] = 2
train_titanic.loc[(train_titanic['Age'] > 48) & (train_titanic['Age'] <= 64), 'Age'] = 3
train_titanic.loc[ train_titanic['Age'] > 64, 'Age'] = 4 ;
train_titanic['Age'] = train_titanic['Age'].astype(int)

# Para el dataFrame test_titanic
test_titanic.loc[ train_titanic['Age'] <= 16, 'Age'] = 0
test_titanic.loc[(train_titanic['Age'] > 16) & (test_titanic['Age'] <= 32), 'Age'] = 1
test_titanic.loc[(train_titanic['Age'] > 32) & (test_titanic['Age'] <= 48), 'Age'] = 2
test_titanic.loc[(train_titanic['Age'] > 48) & (test_titanic['Age'] <= 64), 'Age'] = 3
test_titanic.loc[ train_titanic['Age'] > 64, 'Age'] = 4 ;
test_titanic['Age'] = test_titanic['Age'].astype(int)


In [14]:
#Mapeo 'Sex'

# Para el dataFrame train_titanic
train_titanic['Sex'] = train_titanic['Sex'].map({'female': 0, 'male': 1}).astype(int)

# Para el dataFrame test_titanic
test_titanic['Sex'] = test_titanic['Sex'].map( {'female': 0, 'male': 1} ).astype(int)

In [15]:
train_titanic.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,1,1,0,0,0
1,1,1,0,2,1,0,3,1
2,1,3,0,1,0,0,1,0
3,1,1,0,2,1,0,3,0
4,0,3,1,2,0,0,1,0


In [16]:
test_titanic.tail()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
413,1305,3,1,0,0,0,1,0
414,1306,1,0,0,0,0,3,1
415,1307,3,1,0,0,0,0,0
416,1308,3,1,0,0,0,1,0
417,1309,3,1,0,1,1,2,1


In [17]:
# Definimos los training sets
X_train = train_titanic.drop("Survived",axis=1)
Y_train = train_titanic["Survived"]
X_test = test_titanic.drop('PassengerId', axis = 1).copy()

In [18]:
#Random Forest

clf = RandomForestClassifier(n_estimators = 100)
clf.fit(X_train, Y_train)
Y_pred = clf.predict(X_test)

clf.score(X_train, Y_train)


0.88552188552188549

In [24]:
# otra opcion de random forest
from sklearn.model_selection import cross_val_score
clf = RandomForestClassifier(random_state=120, n_estimators=600, min_samples_split=4, min_samples_leaf=2)
clf.fit(X_train, Y_train)
scores = cross_val_score(clf, X_train, Y_train, cv = 5)
scores.mean()


0.81600447125970865

In [20]:
#SVM

clf = SVC()
clf.fit(X_train, Y_train)
clf.score(X_train, Y_train)

0.82940516273849607

In [21]:
#Ada Boost
from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier(n_estimators=100)
clf.fit(X_train, Y_train)
scores = cross_val_score(clf, X_train, Y_train, cv = 5)
scores.mean()

0.79015507199657287

In [22]:
# Escribimos los resultados
submission = pd.DataFrame({ 'PassengerId': PassangersId,
                            'Survived': Y_pred })
submission.to_csv("resultados5.csv", index=False)

In [23]:
# Decision Tree
clf = tree.DecisionTreeClassifier()
clf.fit(X_train, Y_train)
scores = cross_val_score(clf, X_train, Y_train, cv = 5)
scores.mean()


0.78345123230939073