In [1]:
import numpy as np
import pandas as pd

In [2]:
# Importando el dataset
data = pd.read_csv('titanic.csv')
data.shape

(891, 12)

In [3]:
print(data.columns, '\n')
print(data.head(), '\n')
print(data['Fare'].describe())

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object') 

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2     

In [4]:
# Limpieza de datos
def simplify_ages(df):
    df['Age'] = df['Age'].fillna(-0.5)
    bins = (-1, 0, 5, 12, 18, 25, 35, 60, 120)
    group_names = ['Unknown', 'Baby', 'Child', 'Teenager', 'Student','Young Adult', 'Adult', 'Senior']
    # La función CUT de Pandas nos regresa el índice 
    # del bin al que pertenece el valor x
    categories = pd.cut(df['Age'], bins, labels=group_names)
    df['Age'] = categories
    return df
    #
def simplify_cabins(df):
    df['Cabin'] = df['Cabin'].fillna('N')
    df['Cabin'] = df['Cabin'].apply(lambda xx: xx[0])
    return df
    #
def simplify_fares(df):
    df['Fare'] = df['Fare'].fillna(-0.5)
    bins = (-1, 0, 8, 15, 31, 1000) #titanic_train.Fare.describe()
    group_names = ['Unknown', '1_quartile', '2_quartile', '3_quartile', '4_quartile']
    # Lo mismo que vimos anteriormente con la función CUT
    categories = pd.cut(df['Fare'], bins, labels=group_names)
    df['Fare'] = categories
    return df
    #
def format_name(df):
    df['Lname'] = df['Name'].apply(lambda x: x.split(' ')[0])
    df['NamePrefix'] = df['Name'].apply(lambda x: x.split(' ')[1])
    return df
    #
def drop_features(df):
    return df.drop(['Ticket', 'Name', 'Embarked'], axis=1)
    #
def transform_features(df):
    df = simplify_ages(df)
    df = simplify_cabins(df)
    df = simplify_fares(df)
    df = format_name(df)
    df = drop_features(df)
    return df

In [6]:
transformed_data = transform_features(data)
transformed_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Lname,NamePrefix
0,1,0,3,male,Student,1,0,1_quartile,N,"Braund,",Mr.
1,2,1,1,female,Adult,1,0,4_quartile,C,"Cumings,",Mrs.
2,3,1,3,female,Young Adult,0,0,1_quartile,N,"Heikkinen,",Miss.
3,4,1,1,female,Young Adult,1,0,4_quartile,C,"Futrelle,",Mrs.
4,5,0,3,male,Young Adult,0,0,2_quartile,N,"Allen,",Mr.


In [7]:
from sklearn import preprocessing
def encode_features(datos):
    features = ['Fare', 'Cabin', 'Age', 'Sex', 'Lname', 'NamePrefix']
    #df_combined = pd.concat([df_train[features], df_test[features]])
    #
    for feature in features:
        le = preprocessing.LabelEncoder()
        le = le.fit(datos[feature])
        datos[feature] = le.transform(datos[feature])
    #
    return datos

In [8]:
d_encode = encode_features(transformed_data)
d_encode.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Lname,NamePrefix
0,1,0,3,1,4,1,0,0,7,73,17
1,2,1,1,0,0,1,0,3,2,136,18
2,3,1,3,0,7,0,0,0,7,251,14
3,4,1,1,0,7,1,0,3,2,198,18
4,5,0,3,1,7,0,0,1,7,11,17


In [9]:
# Separando en training
from sklearn.model_selection import train_test_split
X_all = d_encode.drop(['Survived', 'PassengerId'], axis=1)

In [10]:
#Borramos las columnas Survived y PassengerID
y_all = d_encode['Survived']

In [11]:
num_test = 0.20 # aquí se está especificando 20% para test
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=num_test)

In [12]:
print(X_train, '\n')
print(X_test, '\n')
print(y_train, '\n')
print(y_test)

     Pclass  Sex  Age  SibSp  Parch  Fare  Cabin  Lname  NamePrefix
832       3    1    6      0      0     0      7    529          17
506       2    0    7      0      2     2      7    496          18
637       2    1    7      1      1     2      7    122          17
700       1    0    5      1      0     3      2     26          18
266       3    1    5      4      1     3      7    464          17
..      ...  ...  ...    ...    ...   ...    ...    ...         ...
723       2    1    0      0      0     1      7    261          17
332       1    1    0      0      1     3      2    221          17
349       3    1    0      0      0     1      7    155          17
822       1    1    0      0      0     4      7    503           9
850       3    1    1      4      2     3      7     16          11

[712 rows x 9 columns] 

     Pclass  Sex  Age  SibSp  Parch  Fare  Cabin  Lname  NamePrefix
481       2    1    6      0      0     4      7    195          17
568       3    1    6 

In [13]:
# Seleccionamos el tipo de clasificador, en este caso RandomForest. 
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=10)
clf.fit(X_train, y_train)

RandomForestClassifier(n_estimators=10)

In [14]:
from sklearn.metrics import make_scorer, accuracy_score, f1_score
predictions = clf.predict(X_test)
# se imprime el accuracy
print("Accuracy: ", accuracy_score(y_test, predictions))
#Imprimimos la f-score
print("F1-Score: ", f1_score(y_test, predictions))

Accuracy:  0.7541899441340782
F1-Score:  0.7027027027027027


In [16]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(clf, X_train, y_train, cv=6)
print(scores)

[0.83193277 0.79831933 0.79831933 0.84033613 0.77966102 0.80508475]


In [18]:
#Podemos cambiar la métrica de evaluación con el parámetro scoring
scores = cross_val_score(clf, X_test, y_test, cv=6, scoring='f1_macro')
print(scores)

[0.6996663  0.79166667 0.75323149 0.72222222 0.65277778 0.75862069]
