In [28]:
import pandas as pd
import numpy as np
import eli5
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import metrics

from IPython.display import display

In [29]:
eli5.__version__

'0.6.3'

In [30]:
# Load data with Pandas
data = pd.read_csv('data/train.csv')

In [31]:
col_to_del = ['Embarked', 'Cabin'] # Don't want to use this features here

data.drop(col_to_del, axis=1, inplace=True)

In [32]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05


In [33]:
data.shape

(891, 10)

In [34]:
def builder(data):
    """
    Build Feature & dummies
    return :
     - data - [DataFrame] Features
     - y - [Series] Target (0/1)
    """
    
    # target
    target = data.Survived
    
    # Pclass
    mes_dummy_Pclass = pd.get_dummies(data.Pclass, prefix="split_Pclass")
    data = data.join(mes_dummy_Pclass)
    data = data.drop('Pclass', axis=1)
    
    # Sex
    mes_dummy_Sex = pd.get_dummies(data.Sex, prefix="split_Sex")
    data = data.join(mes_dummy_Sex)
    data = data.drop('Sex', axis=1)
    
    # Child
    data['child'] = 0
    data.loc[data.Age <= 10, 'child'] = 1
    
    # Ticket
    data['Ticket'] = data['Ticket'].astype('str')

    return data, target

In [35]:
X, y = builder(data)

In [36]:
X.head()

Unnamed: 0,PassengerId,Survived,Name,Age,SibSp,Parch,Ticket,Fare,split_Pclass_1,split_Pclass_2,split_Pclass_3,split_Sex_female,split_Sex_male,child
0,1,0,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,0,0,1,0,1,0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,1,0,0,1,0,0
2,3,1,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,0,0,1,1,0,0
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,1,0,0,1,0,0
4,5,0,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,0,0,1,0,1,0


In [37]:
X.shape

(891, 14)

In [38]:
y.shape

(891,)

In [39]:
# We split our X in 2 DataFrame (X_train, X_test)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2017)

In [40]:
X_train.head()

Unnamed: 0,PassengerId,Survived,Name,Age,SibSp,Parch,Ticket,Fare,split_Pclass_1,split_Pclass_2,split_Pclass_3,split_Sex_female,split_Sex_male,child
57,58,0,"Novel, Mr. Mansouer",28.5,0,0,2697,7.2292,0,0,1,0,1,0
32,33,1,"Glynn, Miss. Mary Agatha",,0,0,335677,7.75,0,0,1,1,0,0
662,663,0,"Colley, Mr. Edward Pomeroy",47.0,0,0,5727,25.5875,1,0,0,0,1,0
555,556,0,"Wright, Mr. George",62.0,0,0,113807,26.55,1,0,0,0,1,0
196,197,0,"Mernagh, Mr. Robert",,0,0,368703,7.75,0,0,1,0,1,0


In [41]:
data.Ticket.dtype

dtype('O')

In [42]:
data.columns.tolist()

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare']

In [43]:
# Col to use for NLP
col_nlp = ['Name', 'Ticket'] # Ticket is a bit overkill here but it's for the exemple.


count_vec_name = CountVectorizer(analyzer='char_wb',
                                 ngram_range=(3, 4),
                                 max_features=100)
count_vec_ticket = CountVectorizer(analyzer='char_wb',
                                   ngram_range=(1, 3),
                                   max_features=35)



class cust_regression_vals(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self
    def transform(self, x):
        x = x.drop(col_nlp, axis=1)
        return x
    #def get_feature_names(self):
    #    return x.columns.tolist()
        

class cust_txt_col(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
    def fit(self, x, y=None):
        return self
    def transform(self, data_dict):
        return data_dict[self.key]

In [44]:
xgb_model = XGBClassifier()

In [45]:
## My pipeline
clf = Pipeline([
        ('union', FeatureUnion(
                    transformer_list = [
                        ('cst',  cust_regression_vals()),
                        ('name', Pipeline([
                            ('Name', cust_txt_col(key='Name')), # Selector
                            ('count_vec_name', count_vec_name)
                        ])),
                        ('ticket', Pipeline([
                            ('Ticket', cust_txt_col(key='Ticket')), # Selector
                            ('count_vec_ticket', count_vec_ticket)
                        ])),
                    ]
        )),
        ('algo', xgb_model)
    ])

In [46]:
# Learning
clf.fit(X_train, y_train)

Pipeline(steps=[('union', FeatureUnion(n_jobs=1,
       transformer_list=[('cst', cust_regression_vals()), ('name', Pipeline(steps=[('Name', cust_txt_col(key='Name')), ('count_vec_name', CountVectorizer(analyzer='char_wb', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'u...logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1))])

In [47]:
X_train.head()

Unnamed: 0,PassengerId,Survived,Name,Age,SibSp,Parch,Ticket,Fare,split_Pclass_1,split_Pclass_2,split_Pclass_3,split_Sex_female,split_Sex_male,child
57,58,0,"Novel, Mr. Mansouer",28.5,0,0,2697,7.2292,0,0,1,0,1,0
32,33,1,"Glynn, Miss. Mary Agatha",,0,0,335677,7.75,0,0,1,1,0,0
662,663,0,"Colley, Mr. Edward Pomeroy",47.0,0,0,5727,25.5875,1,0,0,0,1,0
555,556,0,"Wright, Mr. George",62.0,0,0,113807,26.55,1,0,0,0,1,0
196,197,0,"Mernagh, Mr. Robert",,0,0,368703,7.75,0,0,1,0,1,0


In [48]:
clf.predict(X_test)

array([1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0])

In [49]:
metrics.accuracy_score(y_test, clf.predict(X_test))

1.0

In [50]:
## My Goal now is to get all my features names (no get_feature_names() yet)
# get feature name with text transformer :
features =  X_train.columns.tolist()

# Remove feature with Text processing
for col in col_nlp:
    features.remove(col)

count_vec_name.fit(X_train['Name'])
features.extend(count_vec_name.get_feature_names())

count_vec_ticket.fit(X_train['Ticket'])
features.extend(count_vec_ticket.get_feature_names())



In [51]:
len(features)
# X_train : 14 features
# We Remove our 2 col_nlp feature --> 12 Features
# Add 100 features with count_vec_name --> 112
# add 35 features with count_vec_ticket --> 147

147

In [52]:
features

['PassengerId',
 'Survived',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'split_Pclass_1',
 'split_Pclass_2',
 'split_Pclass_3',
 'split_Sex_female',
 'split_Sex_male',
 'child',
 u' al',
 u' an',
 u' ca',
 u' ch',
 u' ed',
 u' fr',
 u' ha',
 u' he',
 u' jo',
 u' joh',
 u' le',
 u' ma',
 u' mar',
 u' mi',
 u' mis',
 u' mr',
 u' mr.',
 u' mrs',
 u' pe',
 u' sa',
 u' wi',
 u' wil',
 u'am ',
 u'an ',
 u'and',
 u'ann',
 u'ard',
 u'ard ',
 u'arl',
 u'art',
 u'ast',
 u'ber',
 u'cha',
 u'char',
 u'd, ',
 u'der',
 u'e, ',
 u'eli',
 u'ell',
 u'en,',
 u'en, ',
 u'er ',
 u'er,',
 u'er, ',
 u'ert',
 u'es ',
 u'han',
 u'har',
 u'hen',
 u'iam',
 u'iam ',
 u'ie ',
 u'ill',
 u'illi',
 u'iss',
 u'iss.',
 u'joh',
 u'john',
 u'l, ',
 u'lia',
 u'liam',
 u'lli',
 u'llia',
 u'man',
 u'mar',
 u'mas',
 u'mis',
 u'miss',
 u'mr.',
 u'mr. ',
 u'mrs',
 u'mrs.',
 u'n, ',
 u'nde',
 u'ne ',
 u'ohn',
 u'on ',
 u'on,',
 u'on, ',
 u'r, ',
 u'r. ',
 u'rd ',
 u'ric',
 u'rs.',
 u'rs. ',
 u'ry ',
 u's, ',
 u's. ',
 u'son',
 u'so

In [53]:
# Our test dataset
X_test.head()

Unnamed: 0,PassengerId,Survived,Name,Age,SibSp,Parch,Ticket,Fare,split_Pclass_1,split_Pclass_2,split_Pclass_3,split_Sex_female,split_Sex_male,child
809,810,1,"Chambers, Mrs. Norman Campbell (Bertha Griggs)",33.0,1,0,113806,53.1,1,0,0,1,0,0
5,6,0,"Moran, Mr. James",,0,0,330877,8.4583,0,0,1,0,1,0
232,233,0,"Sjostedt, Mr. Ernst Adolf",59.0,0,0,237442,13.5,0,1,0,0,1,0
682,683,0,"Olsvigen, Mr. Thor Anderson",20.0,0,0,6563,9.225,0,0,1,0,1,0
666,667,0,"Butler, Mr. Reginald Fenton",25.0,0,0,234686,13.0,0,1,0,0,1,0


In [54]:
eli5.show_prediction(clf.named_steps['algo'], 
                     X_test[X_test.index == 809], 
                     feature_names=features,
                     vec=clf.named_steps['union'])

AttributeError: 'list' object has no attribute 'drop'