# sklearn-pandas: don't be `pd.get_dummies()`

Today we're talking about [`sklearn-pandas`](https://github.com/scikit-learn-contrib/sklearn-pandas#sklearn-pandas)

1. Prevents data leakage
2. Works with new data!

Pair Programmed by Miles Erickson, Brian Mcgarry, and Cristian Nuno
Date: May 16, 2019

In [17]:
!pip install sklearn-pandas

Collecting sklearn-pandas
  Downloading https://files.pythonhosted.org/packages/1f/48/4e1461d828baf41d609efaa720d20090ac6ec346b5daad3c88e243e2207e/sklearn_pandas-1.8.0-py2.py3-none-any.whl
Installing collected packages: sklearn-pandas
Successfully installed sklearn-pandas-1.8.0
[33mYou are using pip version 10.0.1, however version 19.1.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [18]:
import sklearn_pandas 

In [1]:
import pandas as pd

In [2]:
!wget https://gist.github.com/michhar/2dfd2de0d4f8727f873422c5d959fff5/raw/ff414a1bcfcba32481e4d4e8db578e55872a2ca1/titanic.csv

--2019-05-16 14:03:47--  https://gist.github.com/michhar/2dfd2de0d4f8727f873422c5d959fff5/raw/ff414a1bcfcba32481e4d4e8db578e55872a2ca1/titanic.csv
Resolving gist.github.com (gist.github.com)... 192.30.255.118
Connecting to gist.github.com (gist.github.com)|192.30.255.118|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://gist.githubusercontent.com/michhar/2dfd2de0d4f8727f873422c5d959fff5/raw/ff414a1bcfcba32481e4d4e8db578e55872a2ca1/titanic.csv [following]
--2019-05-16 14:03:47--  https://gist.githubusercontent.com/michhar/2dfd2de0d4f8727f873422c5d959fff5/raw/ff414a1bcfcba32481e4d4e8db578e55872a2ca1/titanic.csv
Resolving gist.githubusercontent.com (gist.githubusercontent.com)... 151.101.64.133, 151.101.128.133, 151.101.192.133, ...
Connecting to gist.githubusercontent.com (gist.githubusercontent.com)|151.101.64.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10305 (10K) [text/plain]
Saving to: ‘titanic.csv’




In [32]:
titanic = pd.read_csv("titanic.csv", delimiter="\t")

In [33]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [34]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 12 columns):
PassengerId    156 non-null int64
Survived       156 non-null int64
Pclass         156 non-null int64
Name           156 non-null object
Sex            156 non-null object
Age            126 non-null float64
SibSp          156 non-null int64
Parch          156 non-null int64
Ticket         156 non-null object
Fare           156 non-null float64
Cabin          31 non-null object
Embarked       155 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 14.7+ KB


Right off the bat, we need to clean `Age` because we have missing values!

In [35]:
import numpy as np
from sklearn.impute import SimpleImputer

```python
>>> mapper = DataFrameMapper([
...     ('pet', sklearn.preprocessing.LabelBinarizer()),
...     (['children'], sklearn.preprocessing.StandardScaler())
... ])
```

In [36]:
from sklearn.compose import ColumnTransformer

In [37]:
from sklearn_pandas import FunctionTransformer

In [84]:
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')

def is_female(x):
    """Assigns 1 if female; 0 if else"""
    if x == "female":
        return 1
    else:
        return 0

mapper = sklearn_pandas.DataFrameMapper([
    (["Age"], imp_mean),
    ("Sex", FunctionTransformer(is_female)),
    ("Fare", None),
])

In [85]:
mapper.fit(titanic)

DataFrameMapper(default=False, df_out=False,
        features=[(['Age'], SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
       verbose=0)), ('Sex', FunctionTransformer(func=None)), ('Fare', None)],
        input_df=False, sparse=False)

In [86]:
mapper.transform(titanic)

array([[ 22.        ,   0.        ,   7.25      ],
       [ 38.        ,   1.        ,  71.2833    ],
       [ 26.        ,   1.        ,   7.925     ],
       [ 35.        ,   1.        ,  53.1       ],
       [ 35.        ,   0.        ,   8.05      ],
       [ 28.14150794,   0.        ,   8.4583    ],
       [ 54.        ,   0.        ,  51.8625    ],
       [  2.        ,   0.        ,  21.075     ],
       [ 27.        ,   1.        ,  11.1333    ],
       [ 14.        ,   1.        ,  30.0708    ],
       [  4.        ,   1.        ,  16.7       ],
       [ 58.        ,   1.        ,  26.55      ],
       [ 20.        ,   0.        ,   8.05      ],
       [ 39.        ,   0.        ,  31.275     ],
       [ 14.        ,   1.        ,   7.8542    ],
       [ 55.        ,   1.        ,  16.        ],
       [  2.        ,   0.        ,  29.125     ],
       [ 28.14150794,   0.        ,  13.        ],
       [ 31.        ,   1.        ,  18.        ],
       [ 28.14150794,   1.     

In [41]:
from sklearn.model_selection import train_test_split

In [42]:
X_train, X_test, y_train, y_test = train_test_split(titanic.drop("Survived", axis=1),
                                                    titanic["Survived"])

In [43]:
mapper.fit(X_train)

DataFrameMapper(default=False, df_out=False,
        features=[(['Age'], SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
       verbose=0)), ('Sex', FunctionTransformer(func=None))],
        input_df=False, sparse=False)

In [44]:
mapper.transform(X_train)

array([[28.32797872,  0.        ],
       [34.        ,  0.        ],
       [29.        ,  0.        ],
       [27.        ,  1.        ],
       [28.32797872,  0.        ],
       [28.32797872,  1.        ],
       [38.        ,  0.        ],
       [27.        ,  0.        ],
       [28.32797872,  0.        ],
       [28.32797872,  1.        ],
       [28.32797872,  0.        ],
       [28.32797872,  0.        ],
       [47.        ,  1.        ],
       [42.        ,  0.        ],
       [26.        ,  1.        ],
       [25.        ,  0.        ],
       [22.        ,  0.        ],
       [15.        ,  1.        ],
       [46.        ,  0.        ],
       [28.5       ,  0.        ],
       [11.        ,  0.        ],
       [51.        ,  0.        ],
       [19.        ,  0.        ],
       [ 4.        ,  1.        ],
       [26.        ,  0.        ],
       [28.32797872,  1.        ],
       [29.        ,  0.        ],
       [21.        ,  0.        ],
       [32.        ,

In [45]:
mapper.transform(X_test)

array([[30.        ,  1.        ],
       [40.5       ,  0.        ],
       [22.        ,  0.        ],
       [ 2.        ,  1.        ],
       [29.        ,  1.        ],
       [19.        ,  1.        ],
       [54.        ,  0.        ],
       [26.        ,  0.        ],
       [32.5       ,  0.        ],
       [28.32797872,  1.        ],
       [54.        ,  0.        ],
       [17.        ,  1.        ],
       [ 7.        ,  0.        ],
       [28.32797872,  1.        ],
       [21.        ,  0.        ],
       [16.        ,  0.        ],
       [37.        ,  0.        ],
       [16.        ,  1.        ],
       [65.        ,  0.        ],
       [ 3.        ,  1.        ],
       [49.        ,  1.        ],
       [28.32797872,  0.        ],
       [28.32797872,  0.        ],
       [28.32797872,  0.        ],
       [33.        ,  0.        ],
       [29.        ,  1.        ],
       [ 2.        ,  0.        ],
       [ 4.        ,  0.        ],
       [22.        ,

In [49]:
from sklearn.tree import DecisionTreeClassifier

In [62]:
model = DecisionTreeClassifier(max_depth=4)

In [63]:
pipe = sklearn_pandas.pipeline.Pipeline(steps=[
    ("dataprep", mapper),
    ("model", model)
])

In [64]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('dataprep', DataFrameMapper(default=False, df_out=False,
        features=[(['Age'], SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
       verbose=0)), ('Sex', FunctionTransformer(func=None))],
        input_df=False, sparse=False)), ('model', DecisionTreeClas...      min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))])

In [65]:
y_test

79     1
153    0
80     0
119    0
53     1
136    1
6      0
69     0
122    0
140    0
124    0
84     1
50     0
32     1
120    0
86     0
104    0
71     0
54     0
43     1
52     1
48     0
77     0
65     1
130    0
66     1
16     0
63     0
60     0
47     1
24     0
51     0
61     1
128    1
13     0
75     0
151    1
33     0
21     1
Name: Survived, dtype: int64

In [66]:
from sklearn.metrics import log_loss

In [67]:
y_pred = pipe.predict_proba(X_test)

In [68]:
y_pred

array([[0.1875    , 0.8125    ],
       [1.        , 0.        ],
       [0.94444444, 0.05555556],
       [0.        , 1.        ],
       [0.1875    , 0.8125    ],
       [0.54545455, 0.45454545],
       [1.        , 0.        ],
       [0.73684211, 0.26315789],
       [0.73684211, 0.26315789],
       [0.1875    , 0.8125    ],
       [1.        , 0.        ],
       [0.54545455, 0.45454545],
       [0.94444444, 0.05555556],
       [0.1875    , 0.8125    ],
       [0.94444444, 0.05555556],
       [0.94444444, 0.05555556],
       [1.        , 0.        ],
       [0.54545455, 0.45454545],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [0.73684211, 0.26315789],
       [0.73684211, 0.26315789],
       [0.73684211, 0.26315789],
       [1.        , 0.        ],
       [0.1875    , 0.8125    ],
       [0.94444444, 0.05555556],
       [0.94444444, 0.05555556],
       [0.94444444, 0.05555556],
       [0.1875    , 0.8125    ],
       [1.

In [70]:
log_loss(y_test, y_pred)

2.8781027774681576

In [71]:
from sklearn.linear_model import LogisticRegression

In [75]:
model = LogisticRegression(solver="lbfgs")
pipe = sklearn_pandas.pipeline.Pipeline(steps=[
    ("dataprep", mapper),
    ("model", model)
])

In [76]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('dataprep', DataFrameMapper(default=False, df_out=False,
        features=[(['Age'], SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
       verbose=0)), ('Sex', FunctionTransformer(func=None))],
        input_df=False, sparse=False)), ('model', LogisticRegressi...enalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False))])

In [79]:
y_pred = pipe.predict_proba(X_test)

In [81]:
log_loss(y_test, y_pred)

0.45633493530627095