In [79]:
import pandas as pd
from sklearn_pandas import DataFrameMapper
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelBinarizer, Imputer

In [80]:
df = pd.read_csv('https://gist.githubusercontent.com/braingineer/5d15057ac482ee0130b6d0e6f9cc9311/raw/d4eefaecc98b342ec578cf3512184556e8856750/titanic.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [81]:
# fill in missing values (need to try move this into the pipeline)
df['Age'].fillna(0, inplace = True)

In [82]:
# use pandas sklearn to do some preprocessing
full_mapper = DataFrameMapper([
    ('Name', Pipeline([ ('name_vect', CountVectorizer()) , ('name_tfidf', TfidfTransformer()) ]) ),
    ('Ticket', Pipeline([ ('ticket_vect', CountVectorizer()) , ('ticket_tfidf', TfidfTransformer()) ]) ),
    ('Sex', LabelBinarizer()),
    (['Age', 'Fare'], None), # i tried to use Impute() but got an error
    ])

In [83]:
# build full pipeline
full_pipeline  = Pipeline([
    ('mapper',full_mapper),
    ('clf', SGDClassifier(n_iter=15, warm_start=True))
])



In [84]:
# determine full param search space (need to get the params for the mapper parts in here somehow)
full_params = {'clf__alpha': [1e-2,1e-3,1e-4],
                   'clf__loss':['modified_huber','hinge'],
                   'clf__penalty':['l2','l1']}

In [85]:
# set up grid search
gs_clf = GridSearchCV(full_pipeline, full_params, n_jobs=-1)

In [86]:
# do the fit
gs_clf.fit(df,df['Survived'])

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('mapper', DataFrameMapper(default=False, df_out=False,
        features=[('Name', Pipeline(memory=None,
     steps=[('name_vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=Tr...y='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=True))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'clf__alpha': [0.01, 0.001, 0.0001], 'clf__loss': ['modified_huber', 'hinge'], 'clf__penalty': ['l2', 'l1']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [87]:
# look at f1
y = gs_clf.predict(df)
print(classification_report(y, df['Survived']))

             precision    recall  f1-score   support

          0       0.43      0.84      0.57       281
          1       0.87      0.49      0.62       610

avg / total       0.73      0.60      0.61       891

