In [1]:
import pandas as pd

train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')

#Drop features we are not going to use
train = train.drop(['Name','SibSp','Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],axis=1)
test = test.drop(['Name','SibSp','Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],axis=1)

#Look at the first 3 rows of our training data
train.head(3)
test.head(3)

Unnamed: 0,PassengerId,Pclass,Sex,Age
0,892,3,male,34.5
1,893,3,female,47.0
2,894,2,male,62.0


In [2]:
features = ['Pclass','Age','Sex']
target = 'Survived'

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names]

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
num_pipeline = Pipeline([
        ("select_numeric", DataFrameSelector(["Age"])),
        ("imputer", SimpleImputer( strategy="constant")),
        ("normalizer" ,MinMaxScaler()),
    ])

In [5]:
num_pipeline.fit_transform(train)

array([[0.275   ],
       [0.475   ],
       [0.325   ],
       [0.4375  ],
       [0.4375  ],
       [0.      ],
       [0.675   ],
       [0.025   ],
       [0.3375  ],
       [0.175   ],
       [0.05    ],
       [0.725   ],
       [0.25    ],
       [0.4875  ],
       [0.175   ],
       [0.6875  ],
       [0.025   ],
       [0.      ],
       [0.3875  ],
       [0.      ],
       [0.4375  ],
       [0.425   ],
       [0.1875  ],
       [0.35    ],
       [0.1     ],
       [0.475   ],
       [0.      ],
       [0.2375  ],
       [0.      ],
       [0.      ],
       [0.5     ],
       [0.      ],
       [0.      ],
       [0.825   ],
       [0.35    ],
       [0.525   ],
       [0.      ],
       [0.2625  ],
       [0.225   ],
       [0.175   ],
       [0.5     ],
       [0.3375  ],
       [0.      ],
       [0.0375  ],
       [0.2375  ],
       [0.      ],
       [0.      ],
       [0.      ],
       [0.      ],
       [0.225   ],
       [0.0875  ],
       [0.2625  ],
       [0.61

In [6]:
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X],
                                        index=X.columns)
        return self
    def transform(self, X, y=None):
        return X.fillna(self.most_frequent_)

In [7]:
from sklearn.preprocessing import OneHotEncoder

In [8]:
cat_pipeline = Pipeline([
        ("select_cat", DataFrameSelector(["Pclass", "Sex"])),
        ("imputer", MostFrequentImputer()),
        ("cat_encoder", OneHotEncoder(sparse=False)),
    ])

In [9]:
cat_pipeline.fit_transform(train)

array([[0., 0., 1., 0., 1.],
       [1., 0., 0., 1., 0.],
       [0., 0., 1., 1., 0.],
       ...,
       [0., 0., 1., 1., 0.],
       [1., 0., 0., 0., 1.],
       [0., 0., 1., 0., 1.]])

In [10]:
from sklearn.pipeline import FeatureUnion
preprocess_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])

In [11]:
X_train = preprocess_pipeline.fit_transform(train[features])
X_train

array([[0.275, 0.   , 0.   , 1.   , 0.   , 1.   ],
       [0.475, 1.   , 0.   , 0.   , 1.   , 0.   ],
       [0.325, 0.   , 0.   , 1.   , 1.   , 0.   ],
       ...,
       [0.   , 0.   , 0.   , 1.   , 1.   , 0.   ],
       [0.325, 1.   , 0.   , 0.   , 0.   , 1.   ],
       [0.4  , 0.   , 0.   , 1.   , 0.   , 1.   ]])

In [12]:
from sklearn.model_selection import cross_val_score

In [13]:
from sklearn.ensemble import RandomForestClassifier
forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
forest_scores = cross_val_score(forest_clf, X_train, train[target], cv=10)
forest_scores.mean()

0.8170911360799001

In [14]:
forest_clf.fit(X_train, train[target])

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [15]:
test

Unnamed: 0,PassengerId,Pclass,Sex,Age
0,892,3,male,34.5
1,893,3,female,47.0
2,894,2,male,62.0
3,895,3,male,27.0
4,896,3,female,22.0
...,...,...,...,...
413,1305,3,male,
414,1306,1,female,39.0
415,1307,3,male,38.5
416,1308,3,male,


In [16]:
X_test = preprocess_pipeline.fit_transform(test)

In [17]:
predictions = forest_clf.predict(X_test)

In [18]:
#Create a  DataFrame with the passengers ids and our prediction regarding whether they survived or not
submission = pd.DataFrame({'PassengerId':test['PassengerId'],'Survived':predictions})

#Visualize the first 5 rows
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [19]:
filename = 'Titanic Predictions 1.csv'

submission.to_csv(filename,index=False)

print('Saved file: ' + filename)

Saved file: Titanic Predictions 1.csv
