In [17]:
from utils import css_from_file
css_from_file('style/style.css')

Data preprocessing
----------------------

Data preprocessing is the most important step in the model preparation. It takes 90% of the time to prepare and clean the data so it can be processed by a predictive algorithm.

Here we have the data from Rossmann competition https://www.kaggle.com/c/rossmann-store-sales.

It is a good example of a dataset with many different types of data.

In [18]:
import pandas as pd 
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [19]:
training_data = pd.read_csv("data/rossmann/train.csv")
store_data = pd.read_csv("data/rossmann/store.csv")

There are information about the Sales (our target).

In [20]:
print("Training data shape", training_data.shape)
training_data.head()

Training data shape (1017209, 9)


Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,2015-07-31,5263,555,1,1,0,1
1,2,5,2015-07-31,6064,625,1,1,0,1
2,3,5,2015-07-31,8314,821,1,1,0,1
3,4,5,2015-07-31,13995,1498,1,1,0,1
4,5,5,2015-07-31,4822,559,1,1,0,1


And the stores themselves

In [21]:
print("Store data shape", store_data.shape)
store_data.head()

Store data shape (1115, 10)


Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,c,a,1270.0,9.0,2008.0,0,,,
1,2,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
3,4,c,c,620.0,9.0,2009.0,0,,,
4,5,a,a,29910.0,4.0,2015.0,0,,,


Let's join the data

In [22]:
combined_data = pd.merge(training_data, store_data, on="Store")

# sample the data
combined_data = combined_data.sample(frac=0.1).reset_index()

print("Combined data shape", combined_data.shape)
combined_data.head()

Combined data shape (101721, 19)


Unnamed: 0,index,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,811239,890,5,2013-10-04,5636,457,1,0,0,0,a,a,4450.0,,,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
1,447099,490,6,2013-11-09,9567,1196,1,0,0,0,a,a,660.0,4.0,2013.0,1,40.0,2014.0,"Jan,Apr,Jul,Oct"
2,410567,450,2,2013-03-26,12535,1400,1,1,0,1,c,a,5780.0,11.0,1994.0,1,10.0,2014.0,"Mar,Jun,Sept,Dec"
3,856266,940,7,2014-05-25,0,0,0,0,0,0,d,c,6470.0,9.0,2012.0,0,,,
4,815857,895,4,2013-07-04,9598,915,1,1,0,0,a,c,4150.0,,,0,,,


In [23]:
combined_data.PromoInterval.value_counts()

Jan,Apr,Jul,Oct     29299
Feb,May,Aug,Nov     11865
Mar,Jun,Sept,Dec     9699
Name: PromoInterval, dtype: int64

In [24]:
combined_data.describe()

Unnamed: 0,index,Store,DayOfWeek,Sales,Customers,Open,Promo,SchoolHoliday,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear
count,101721.0,101721.0,101721.0,101721.0,101721.0,101721.0,101721.0,101721.0,101449.0,69305.0,69305.0,101721.0,50863.0,50863.0
mean,508688.0,558.525781,3.997208,5772.431533,632.434866,0.829435,0.382881,0.179747,5391.505683,7.221124,2008.695145,0.500025,23.321629,2011.744293
std,293057.0,321.266963,2.000072,3855.870685,463.796343,0.37613,0.486092,0.383978,7639.43557,3.212361,5.98405,0.500002,14.128172,1.665646
min,20.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,20.0,1.0,1900.0,0.0,1.0,2009.0
25%,254140.0,280.0,2.0,3720.0,404.0,1.0,0.0,0.0,700.0,4.0,2006.0,0.0,13.0,2011.0
50%,508696.0,558.0,4.0,5746.0,609.0,1.0,0.0,0.0,2320.0,8.0,2010.0,1.0,22.0,2012.0
75%,762356.0,837.0,6.0,7856.0,836.0,1.0,1.0,0.0,6870.0,10.0,2013.0,1.0,37.0,2013.0
max,1017207.0,1115.0,7.0,38037.0,5106.0,1.0,1.0,1.0,75860.0,12.0,2015.0,1.0,50.0,2015.0


In [25]:
#combined_data.summary()

In [26]:
combined_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101721 entries, 0 to 101720
Data columns (total 19 columns):
index                        101721 non-null int64
Store                        101721 non-null int64
DayOfWeek                    101721 non-null int64
Date                         101721 non-null object
Sales                        101721 non-null int64
Customers                    101721 non-null int64
Open                         101721 non-null int64
Promo                        101721 non-null int64
StateHoliday                 101721 non-null object
SchoolHoliday                101721 non-null int64
StoreType                    101721 non-null object
Assortment                   101721 non-null object
CompetitionDistance          101449 non-null float64
CompetitionOpenSinceMonth    69305 non-null float64
CompetitionOpenSinceYear     69305 non-null float64
Promo2                       101721 non-null int64
Promo2SinceWeek              50863 non-null float64
Promo2SinceYe

Exercise
----------------------

1. Identify types of data present in the dataset:
    - what would you do with each type of data?
    - are there missing values?
2. Write transformer `PandasSelector` which can select subsets of columns from the dataset.
3. Write transformers for each type of data that convert selected columns to numerical values.
4. Combine it all into 1 pipeline using `make_pipeline` and `make_union` functions.

Hint: you will need those imports
```python
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline, make_union
```

Explanation:
`BaseEstimator` and `TransformerMixin` are the classes from which you need to inherit in the your transformer class. They are needed for proper pipeline serialization (saving).

`DictVectorizer` is a transformer that can create a matrix from a dictionary of values - it is helpful to convert categorical variables. 

For example:
Let's say you have 2 columns which you want to convert to a matrix: `StoreType` and `Assortment`

DO NOT USE pandas.get_dummies to create binary features
-----------------

In [27]:
df = combined_data[0:5]
print(df.columns)
#df
df[['StoreType','Assortment']]

Index(['index', 'Store', 'DayOfWeek', 'Date', 'Sales', 'Customers', 'Open',
       'Promo', 'StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment',
       'CompetitionDistance', 'CompetitionOpenSinceMonth',
       'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek',
       'Promo2SinceYear', 'PromoInterval'],
      dtype='object')


Unnamed: 0,StoreType,Assortment
0,a,a
1,a,a
2,c,a
3,d,c
4,a,c


In [28]:
# let's convert 2 columns to a list of dictionaries
data_as_dict = list(store_data.loc[:, ["StoreType","Assortment"]].to_dict(orient='records'))[:10]
data_as_dict

[{'Assortment': 'a', 'StoreType': 'c'},
 {'Assortment': 'a', 'StoreType': 'a'},
 {'Assortment': 'a', 'StoreType': 'a'},
 {'Assortment': 'c', 'StoreType': 'c'},
 {'Assortment': 'a', 'StoreType': 'a'},
 {'Assortment': 'a', 'StoreType': 'a'},
 {'Assortment': 'c', 'StoreType': 'a'},
 {'Assortment': 'a', 'StoreType': 'a'},
 {'Assortment': 'c', 'StoreType': 'a'},
 {'Assortment': 'a', 'StoreType': 'a'}]

In [29]:
from sklearn.feature_extraction import DictVectorizer
categorical_transformer = DictVectorizer()
print(categorical_transformer.fit_transform(data_as_dict).todense()) # by default DictVectorizer returns sparse matrix
print("NOTE: The first 2 binary digits represent Assortment, the second 2 rep StoreType.")
print("10 means 'a'. 01 means 'c'. 00 means anything else. 11 means nothing. ")

[[1. 0. 0. 1.]
 [1. 0. 1. 0.]
 [1. 0. 1. 0.]
 [0. 1. 0. 1.]
 [1. 0. 1. 0.]
 [1. 0. 1. 0.]
 [0. 1. 1. 0.]
 [1. 0. 1. 0.]
 [0. 1. 1. 0.]
 [1. 0. 1. 0.]]
NOTE: The first 2 binary digits represent Assortment, the second 2 rep StoreType.
10 means 'a'. 01 means 'c'. 00 means anything else. 11 means nothing. 


In [30]:
from sklearn.feature_extraction import DictVectorizer
data_as_dict[0]['Assortment'] = 'b'
print(categorical_transformer.transform(data_as_dict).todense()) # by default DictVectorizer returns sparse matrix

[[0. 0. 0. 1.]
 [1. 0. 1. 0.]
 [1. 0. 1. 0.]
 [0. 1. 0. 1.]
 [1. 0. 1. 0.]
 [1. 0. 1. 0.]
 [0. 1. 1. 0.]
 [1. 0. 1. 0.]
 [0. 1. 1. 0.]
 [1. 0. 1. 0.]]


Alternative way (or maybe the main way)
------------------

Use a Python package that does it for you. http://contrib.scikit-learn.org/categorical-encoding/index.html

In [31]:
#!pip install category_encoders
from category_encoders.one_hot import OneHotEncoder
one_hot = OneHotEncoder()
one_hot.fit_transform(store_data[["StoreType","Assortment"]].values);

Exercise template
-----------------

Your final process should like like this:
    
```python
from sklearn.preprocessing import Imputer

processing_pipeline = make_pipeline(
    # combine features
    make_union(
        make_pipeline(
            PandasSelector(["StoreType","Assortment"]),
            one_hot,
            DictVectorizer(),
            # select categorical data
            # do something with categorical data
        ),
        make_pipeline(
            PandasSelector(["Date"]),
            TransformDates(),
            # select date
            # do something with dates
            # first convert text to date using pd.to_datetime
            # use .dt attribute of pandas column
        ),
        make_pipeline(
            PandasSelector([list of numerical columns]),
            Imputer(),
            # select numerical data
            # do something with numerical data
        ),
        make_pipeline(
            # make some feature engineering
        )
    )
)
```

In [32]:
from sklearn.preprocessing import Imputer

processing_pipeline = make_pipeline(
    # combine features
    make_union(
        make_pipeline(
            PandasSelector(["StoreType","Assortment"]),
            one_hot,
            DictVectorizer(),
            # select categorical data
            # do something with categorical data
        ),
        make_pipeline(
            PandasSelector(["Date"]),
            TransformDates(),
            # select date
            # do something with dates
            # first convert text to date using pd.to_datetime
            # use .dt attribute of pandas column
        ),
        make_pipeline(
            PandasSelector(['index' 'Store' 'DayOfWeek' 'Sales' 'Customers' 'Open' 'Promo'
 'SchoolHoliday' 'Promo2']),
            Imputer(),
            # select numerical data
            # do something with numerical data
        ),
        make_pipeline(
            # make some feature engineering\
            #pass
        )
    )
)

NameError: name 'make_pipeline' is not defined

In [33]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import Imputer

In [34]:
class PandasSelector(TransformerMixin):
    def __init__(self,col_list):
        self.col_sel = col_list
        print("Init PandasSelector with", self.col_sel)
    
    def fit(self,df):
        
        self.df = df
        print("Fit PandasSelector wtih",self.df.shape )
        return self
    
    def transform(self):
        print("PandasSelector transform", self.df.shape, self.col_sel)
        return self.df[self.col_sel]

In [35]:
sample_df = combined_data[0:5]
psel = PandasSelector(["StoreType","Assortment"])
psel.fit(sample_df)
psel.transform()

Init PandasSelector with ['StoreType', 'Assortment']
Fit PandasSelector wtih (5, 19)
PandasSelector transform (5, 19) ['StoreType', 'Assortment']


Unnamed: 0,StoreType,Assortment
0,a,a
1,a,a
2,c,a
3,d,c
4,a,c


In [36]:
pd.to_datetime(sample_df['Date'])

0   2013-10-04
1   2013-11-09
2   2013-03-26
3   2014-05-25
4   2013-07-04
Name: Date, dtype: datetime64[ns]

In [37]:
class TransformDates(TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, df):
        self.df = df
        print("Fit TransformDates to df", self.df.shape)
        return self
    
    def transform(self):
        #print(self.df)
        #print(type(self.df))
        stacked_df = self.df.stack()
        #print(type(stacked_df))
        dtstacked = pd.to_datetime(stacked_df)
        #print(dtstacked)
        dtunstacked = dtstacked.unstack()
        #print(dtunstacked)
        #raise
        #pd.to_datetime(self.df.stack()).unstack()
        #raise
        
        return dtunstacked

In [38]:
pd.to_datetime(sample_df['Date'])

0   2013-10-04
1   2013-11-09
2   2013-03-26
3   2014-05-25
4   2013-07-04
Name: Date, dtype: datetime64[ns]

In [39]:
this_trans_dates=TransformDates()
print(this_trans_dates)
this_ps = PandasSelector(['Date'])
this_ps.fit(sample_df)
this_ps2 = this_ps.transform()
this_transformer = TransformDates()
this_transformer.fit(this_ps2)
this_transformer.transform()

<__main__.TransformDates object at 0x7f65b23c92b0>
Init PandasSelector with ['Date']
Fit PandasSelector wtih (5, 19)
PandasSelector transform (5, 19) ['Date']
Fit TransformDates to df (5, 1)


Unnamed: 0,Date
0,2013-10-04
1,2013-11-09
2,2013-03-26
3,2014-05-25
4,2013-07-04


In [40]:
sample_df.dtypes

index                          int64
Store                          int64
DayOfWeek                      int64
Date                          object
Sales                          int64
Customers                      int64
Open                           int64
Promo                          int64
StateHoliday                  object
SchoolHoliday                  int64
StoreType                     object
Assortment                    object
CompetitionDistance          float64
CompetitionOpenSinceMonth    float64
CompetitionOpenSinceYear     float64
Promo2                         int64
Promo2SinceWeek              float64
Promo2SinceYear              float64
PromoInterval                 object
dtype: object

In [41]:
column_subset1 = sample_df.dtypes == 'int64'
column_subset2 = sample_df.dtypes == 'float64'
column_subset = column_subset1 | column_subset2
column_subset

index                         True
Store                         True
DayOfWeek                     True
Date                         False
Sales                         True
Customers                     True
Open                          True
Promo                         True
StateHoliday                 False
SchoolHoliday                 True
StoreType                    False
Assortment                   False
CompetitionDistance           True
CompetitionOpenSinceMonth     True
CompetitionOpenSinceYear      True
Promo2                        True
Promo2SinceWeek               True
Promo2SinceYear               True
PromoInterval                False
dtype: bool

In [42]:
column_subset = sample_df.dtypes == 'int64'

sample_df.loc[:,column_subset]
numerical_col_names = sample_df.columns[column_subset].values

In [43]:
new_psel = PandasSelector(numerical_col_names)

Init PandasSelector with ['index' 'Store' 'DayOfWeek' 'Sales' 'Customers' 'Open' 'Promo'
 'SchoolHoliday' 'Promo2']


In [44]:
my_pipeline.transform(sample_df)

NameError: name 'my_pipeline' is not defined

In [45]:
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
my_pipeline = make_pipeline(StandardScaler(), GaussianNB(priors=None))

In [46]:
my_pipeline.fit(sample_df)

ValueError: could not convert string to float: 'Mar,Jun,Sept,Dec'

**Double click to see the solution**

<div class='spoiler'>

class PandasSelector(BaseEstimator, TransformerMixin):
    
    def __init__(self, columns):
        self.columns = columns
        
    def fit(self, x, y = None):
        return self
    
    def transform(self, x):
        return x.loc[:,self.columns]
    
    
class PandasToDict(BaseEstimator, TransformerMixin):

    def fit(self, x, y = None):
        return self
    
    def transform(self, x):
        return x.T.to_dict().values()

    
class ExtractDateAttributes(BaseEstimator, TransformerMixin):
    
    def __init__(self, date_format=None,
                 attributes=["year","month","day","weekday"]):
        self.date_format = date_format
        self.attributes = attributes
        
    def fit(self, x, y = None):
        return self
    
    def transform(self, x):
        assert x.shape[1] == 1, "This transformer can handle 1 date"
        
        # convert data to date
        dt = pd.to_datetime(x.ix[:,0])
        
        # create an empty DataFrame
        df = pd.DataFrame()
        
        for attr in self.attributes:
            df[attr] = getattr(dt.dt, attr)
            
        return df
    

processing_pipeline = make_pipeline(
    # Select used variables
    PandasSelector(["Open", "Promo", "SchoolHoliday", 
                    "Date", "StoreType", "Assortment",
                    "CompetitionDistance", "CompetitionOpenSinceMonth",
                    "CompetitionOpenSinceYear", "Promo2",
                    "Promo2SinceWeek", "Promo2SinceYear"]),
    
    # combine features
    make_union(
        make_pipeline(
            PandasSelector(["Open", "Promo", "SchoolHoliday", "CompetitionDistance", 
                            "CompetitionOpenSinceMonth", "CompetitionOpenSinceYear", 
                            "Promo2", "Promo2SinceWeek", "Promo2SinceYear"]),
            Imputer(strategy='mean')
        ),
        make_pipeline(
            PandasSelector(["Date"]),
            ExtractDateAttributes()
        ),
        make_pipeline(
            PandasSelector(["StoreType", "Assortment"]),
            PandasToDict(),
            DictVectorizer(sparse=False)
        )
    )
)

</div>

To combine together your data processing pipeline and predictive algorithm you can chain them using `make_pipeline` function.

## Exercise

1. **Run and understand the code below**
   - explain each line <br/><br/>
   
2. **Why the CV is so low?**
   - the problem is to predict future prices is cross validation a good method to check if models learns well? <br/><br/>
   
3. **Use different splitting scheme and compare results** 
   - train on dates < X and test on dates > X, where X is some date
   
4. **Plot the errors of the predictions depending on how far ahead you make the prediction**

In [47]:
import sklearn
sklearn.__version__

'0.19.1'

In [49]:
from sklearn.cross_validation import cross_val_predict
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from metrics import rmspe

est = RandomForestRegressor(verbose=True, n_jobs=-1)

pred = cross_val_predict(make_pipeline(processing_pipeline, est), 
                         combined_data, 
                         np.log1p(combined_data.Sales),
                         cv=5)

print(rmspe(combined_data.Sales, np.expm1(pred)))

NameError: name 'processing_pipeline' is not defined

Click here to see the solution

<div class="spoiler">

from cross_validation import cross_val_predict
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from metrics import rmspe

est = RandomForestRegressor(verbose=True, n_jobs=-1)
training_data = combined_data[combined_data.Date <= '2015-05-30']
test_data = combined_data[combined_data.Date > '2015-05-30']

model = make_pipeline(processing_pipeline, est)

model.fit(training_data, np.log1p(training_data.Sales))

predictions = model.predict(test_data)

print(rmspe(combined_data.Sales, np.expm1(pred)))

</div>