In [1]:
%matplotlib inline
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [153]:
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_selection import SelectFromModel

In [155]:
# Load the training set. `DataFrame.from_csv` was deprecated in pandas 0.21 and
# removed in 1.0; `read_csv` with `index_col=0, parse_dates=True` reproduces the
# defaults that `from_csv` applied (first column as index, date parsing on the index).
raw_data = pd.read_csv('data/train.csv', index_col=0, parse_dates=True)

In [156]:
# Drop the columns that are not used as features. `axis` is passed by keyword:
# the positional form (`drop(labels, 1)`) is rejected by pandas 2.x.
data_wo_cabin = raw_data.drop(['Cabin', 'Name', 'Ticket'], axis=1)

In [157]:
# Keep only the rows that have a value in every remaining column, then show
# the per-column non-null counts as a sanity check.
clean_data = data_wo_cabin.dropna(axis=0, how='any')
clean_data.count()

Survived    712
Pclass      712
Sex         712
Age         712
SibSp       712
Parch       712
Fare        712
Embarked    712
dtype: int64

In [158]:
# one-hot encoding of Embarked

In [181]:
def get_columns(X, columns):
    """Return ``X[columns]``.

    Used as the ``func`` of a ``FunctionTransformer`` so that a pipeline
    branch can pick a subset of columns out of the input.

    Parameters
    ----------
    X : object supporting ``X[columns]`` (the pipelines pass a DataFrame)
    columns : key or list of keys forwarded unchanged to ``X[...]``

    Returns
    -------
    Whatever ``X[columns]`` evaluates to.
    """
    subset = X[columns]
    return subset

In [221]:
# Numeric branch: pick the Age and Fare columns, then fill missing values
# with an Imputer left at its default settings.
imputer = Imputer()
age_fare_pipe = Pipeline(steps=[
    ('select', FunctionTransformer(func=get_columns,
                                   validate=False,
                                   kw_args=dict(columns=['Age', 'Fare']))),
    ('inpute', imputer),
])

In [222]:
# Categorical branch for Sex: pick the column, then binarize its labels.
sex_pipe = Pipeline(steps=[
    ('select', FunctionTransformer(func=get_columns,
                                   validate=False,
                                   kw_args=dict(columns=['Sex']))),
    ('encode', LabelBinarizer()),
])

In [223]:
# Categorical branch for Embarked: pick the column, then binarize its labels.
embark_pipe = Pipeline(steps=[
    ('select', FunctionTransformer(func=get_columns,
                                   validate=False,
                                   kw_args=dict(columns=['Embarked']))),
    ('encode', LabelBinarizer()),
])

In [224]:
def everything_except(X, y=None):
    """Return the Pclass, Age, SibSp, Parch and Fare columns of ``X``.

    ``y`` is accepted but ignored.
    """
    kept_columns = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
    return X[kept_columns]

In [225]:
# Combine the per-column branches side by side into one feature matrix:
# encoded Sex, encoded Embarked, imputed Age/Fare, and the remaining
# columns (Pclass, SibSp, Parch) passed through unchanged.
all_features_pipe = FeatureUnion(transformer_list=[
    ('sex', sex_pipe),
    ('embark', embark_pipe),
    ('age_and_fare', age_fare_pipe),
    ('others', FunctionTransformer(func=get_columns,
                                   validate=False,
                                   kw_args=dict(columns=['Pclass', 'SibSp', 'Parch']))),
])

In [226]:
# Fit the feature union on the cleaned training rows and build the feature matrix.
transformed_data = all_features_pipe.fit_transform(clean_data)

In [227]:
# Feature matrix and target vector used by the model cells below.
X, y = transformed_data, clean_data['Survived']

In [228]:
from sklearn.ensemble import RandomForestClassifier

In [229]:
# Seed the forest so that scores, the grid-search winner and the predictions
# are reproducible on a fresh Restart & Run All (an unseeded forest gives
# different results on every run).
rf = RandomForestClassifier(random_state=42)

In [230]:
# Single-step pipeline; the step name 'rf' is what the grid-search
# parameter keys ('rf__...') refer to.
pipeine = Pipeline(steps=[('rf', rf)])

In [231]:
# Fit the pipeline on the training features/labels (X and y are the same
# objects as transformed_data and clean_data['Survived']).
# NOTE(review): fit_transform requires the final estimator to expose
# transform(); newer scikit-learn releases no longer provide that on
# RandomForestClassifier -- confirm the installed version or use .fit().
pipeine.fit_transform(X, y)



array([[  1.    ,  22.    ,   7.25  ],
       [  0.    ,  38.    ,  71.2833],
       [  0.    ,  26.    ,   7.925 ],
       ..., 
       [  0.    ,  19.    ,  30.    ],
       [  1.    ,  26.    ,  30.    ],
       [  1.    ,  32.    ,   7.75  ]])

In [232]:
# Score on the same rows the pipeline was fitted on, i.e. a training-set
# score rather than a held-out estimate.
pipeine.score(X, y)

0.9705056179775281

In [233]:
from sklearn.model_selection import GridSearchCV

In [234]:
# Hyper-parameter grid for the 'rf' pipeline step.
# 'auto' was removed from max_features: for RandomForestClassifier it is an
# alias of 'sqrt', so it only repeated the same fits, and newer scikit-learn
# releases no longer accept it.
param_grid = {
    'rf__n_estimators': [200, 700],
    'rf__max_features': ['sqrt', 'log2'],
}

In [235]:
# 5-fold grid search over param_grid; fit() returns the search object itself,
# so construction and fitting are chained.
CV_rfc = GridSearchCV(pipeine, param_grid, cv=5).fit(X, y)
print(CV_rfc.best_params_)

{'rf__n_estimators': 700, 'rf__max_features': 'sqrt'}


In [236]:
# Score of the fitted search on X, y -- the same data the search was fitted
# on, so this is not a held-out estimate.
CV_rfc.score(X,y)

0.9859550561797753

In [237]:
# Load the test set. `DataFrame.from_csv` was deprecated in pandas 0.21 and
# removed in 1.0; `read_csv` with `index_col=0, parse_dates=True` reproduces the
# defaults that `from_csv` applied (first column as index, date parsing on the index).
test_data = pd.read_csv('data/test.csv', index_col=0, parse_dates=True)

In [238]:
# Non-null count per column of the test set (shows which columns have gaps).
test_data.count()

Pclass      418
Name        418
Sex         418
Age         332
SibSp       418
Parch       418
Ticket      418
Fare        417
Cabin        91
Embarked    418
dtype: int64

In [240]:
# Test frame without the Cabin column.
test_data_wo_cabin = test_data.drop(labels=['Cabin'], axis=1)

In [243]:
# Reuse the already-fitted feature union (transform only, no refit) on the
# test rows, then predict with the fitted grid search.
test_features = all_features_pipe.transform(test_data_wo_cabin)
predictions = CV_rfc.predict(test_features)

In [246]:
# Save the predicted labels, one per line. ndarray has no `savetext` method
# (the earlier attempt raised AttributeError); the writer is `np.savetxt`.
# fmt='%d' writes the integer labels as 0/1 instead of the default
# '%.18e' scientific-notation floats.
np.savetxt('predict.csv', predictions, fmt='%d')

In [248]:
# Display the predicted labels for the test rows.
predictions

array([0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0,

In [250]:
# Attach the predictions as a 'Survived' column, aligned by row position.
test_data_wo_cabin = test_data_wo_cabin.assign(Survived=predictions)

In [254]:
# Write the index and the predicted 'Survived' column to CSV.
# header=True is explicit because the default of Series.to_csv differs
# between pandas versions (False before 0.24, True afterwards); without it
# the file may be written with no header row.
test_data_wo_cabin['Survived'].to_csv('atmp.csv', header=True)