# Model Serving 

## Sklearn Pipeline 

The steps are: 

1. Load Dataset
2. Build Pipeline

### 1. Load Dataset

Import the pandas and numpy packages

In [1]:
import pandas as pd
import numpy as np

Load dataset

In [2]:
# Load the datasets
clean_df = pd.read_csv('/Users/zoebarrett/13964907_adv_mla_at2/adv_mla_at2_model/data/raw/sales_train_10.csv')

events_df = pd.read_csv('/Users/zoebarrett/13964907_adv_mla_at2/adv_mla_at2_model/data/raw/calendar_events.csv')


In [3]:
clean_df.head()

Unnamed: 0,item_id,dept_id,store_id,sales,date
0,HOBBIES_1_001,HOBBIES_1,CA_1,0,2012-01-01
1,HOBBIES_1_002,HOBBIES_1,CA_1,0,2012-01-01
2,HOBBIES_1_003,HOBBIES_1,CA_1,0,2012-01-01
3,HOBBIES_1_004,HOBBIES_1,CA_1,0,2012-01-01
4,HOBBIES_1_005,HOBBIES_1,CA_1,1,2012-01-01


In [4]:
clean_df.shape

(10, 5)

In [5]:
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   item_id   10 non-null     object
 1   dept_id   10 non-null     object
 2   store_id  10 non-null     object
 3   sales     10 non-null     int64 
 4   date      10 non-null     object
dtypes: int64(1), object(4)
memory usage: 528.0+ bytes


### 2. Build Pipeline 

Import `Pipeline` from `sklearn.pipeline`, `OrdinalEncoder` and `OneHotEncoder` from `sklearn.preprocessing` and `SGDClassifier` from `sklearn.linear_model`

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.base import BaseEstimator, TransformerMixin

In [7]:
class EventTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, events_df):
        self.events_df = events_df
    
    def fit(self, clean_df, y=None):
        return self
    
    def transform(self, clean_df):
        clean_df_copy = clean_df.merge(self.events_df, on='date', how='left')
        clean_df_copy['event_name'].fillna('NoEvent', inplace=True)
        clean_df_copy['event_type'].fillna('NoEvent', inplace=True)
        
        # One-hot encode 'event_name' and 'event_type'
        event_name_encoded = pd.get_dummies(clean_df_copy['event_name'], prefix='event_name', dtype=int)
        event_type_encoded = pd.get_dummies(clean_df_copy['event_type'], prefix='event_type', dtype=int)
        
        # Concatenate the one-hot encoded columns with the original DataFrame
        transformed_df = pd.concat([clean_df_copy, event_name_encoded, event_type_encoded], axis=1)
        
        # Drop the original 'date', 'event_name' and 'event_type' columns
        transformed_df.drop(['date', 'event_name', 'event_type'], axis=1, inplace=True)
        
        return transformed_df

In [8]:
# Test EventTransformer works 
events_df_sample = pd.DataFrame({'date': ['2023-09-01', '2023-09-02'], 'event_name': ['EventA', 'EventB'], 'event_type': ['TypeA', 'TypeB']})
clean_df_sample = pd.DataFrame({'date': ['2023-09-01', '2023-09-02']})

event_transformer = EventTransformer(events_df_sample)
transformed_df = event_transformer.transform(clean_df_sample)
print(transformed_df)

   event_name_EventA  event_name_EventB  event_type_TypeA  event_type_TypeB
0                  1                  0                 1                 0
1                  0                  1                 0                 1


In [9]:
class ItemIdTransformer(BaseEstimator, TransformerMixin):
    def fit(self, clean_df, y=None):
        return self
    
    def transform(self, clean_df):
        clean_df_copy = clean_df.copy()
        # extract the last 3 numbers of the item_id e.g. 'HOBBIES_1_001' becomes '001'
        clean_df_copy['transformed_item_id'] = clean_df_copy['item_id'].str.extract('_(\d+)$').astype(str)
        
        # One-hot encode 'transformed_item_id' using pd.get_dummies
        one_hot_encoded = pd.get_dummies(clean_df_copy['transformed_item_id'], prefix='item_id', dtype=int)
        
        # Concatenate the one-hot encoded columns with the original DataFrame
        transformed_df = pd.concat([clean_df_copy, one_hot_encoded], axis=1)
        
        # Drop the original 'transformed_item_id' column
        transformed_df.drop(['transformed_item_id', 'item_id'], axis=1, inplace=True)
        
        return transformed_df

In [10]:
# test ItemIDTransformer
clean_df_sample = pd.DataFrame({'item_id': ['HOBBIES_1_001', 'HOBBIES_1_002', 'HOBBIES_1_003']})

item_id_transformer = ItemIdTransformer()
transformed_df = item_id_transformer.transform(clean_df_sample)
print(transformed_df)

   item_id_001  item_id_002  item_id_003
0            1            0            0
1            0            1            0
2            0            0            1


Create a `Pipeline` called `cat_transformer` with one step that contains `OneHotEncoder`

In [11]:
cat_transformer = Pipeline(
    steps=[
        ('one_hot_encoder', OneHotEncoder())
    ]
)

Create a list called `cat_cols` that will contain the list of columns that are categorical type

In [12]:
cat_cols = ['dept_id', 'store_id']


Import `ColumnTransformer` from `sklearn.compose`

In [13]:
from sklearn.compose import ColumnTransformer

Create a `ColumnTransformer` called `preprocessor` containing the following steps
- `date_transformer` for date related variables
- `cat_transformer` for `cat_cols`
- `ord_transformer` for `ord_cols`

In [14]:
# Instantiate transformers
event_transformer = EventTransformer(events_df)
item_id_transformer = ItemIdTransformer()

# Create preprocessor 
preprocessor = ColumnTransformer(
    transformers=[
        ('event_transformer', event_transformer, ['date']),
        ('item_id_transformer', item_id_transformer, ['item_id']),
        ('cat_transformer', cat_transformer, cat_cols)
    ]
)

In [15]:
# Check how preprocessor is behaving 
transformed_sample = preprocessor.fit_transform(clean_df)

transformed_df = pd.DataFrame(transformed_sample)

transformed_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
2,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
3,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
4,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0


Import XGBRegressor module from xgboost

In [16]:
from xgboost import XGBRegressor

[2.8] Create a `Pipeline` called `xgb_pipe` that contains 2 steps `preprocessor` and another that instantiates an XGBoost Regressor model 

In [17]:
xgb_pipe = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('xgb', XGBRegressor(objective ='reg:linear', n_estimators = 10))
    ]
)

Fit `xgb_pipe` with `clean_df`

In [18]:
target = clean_df['sales']
xgb_pipe.fit(clean_df, target)



  if is_sparse(data):


Make predictions on clean_df

In [19]:
xgb_pipe.predict(clean_df)

array([0.03031158, 0.03031158, 0.03031158, 0.0984372 , 0.9015628 ,
       0.02251833, 0.02251833, 0.02251833, 1.7046884 , 0.10262574],
      dtype=float32)

Transform the first observation of `clean_df` into a dataframe, call it `obs` and make prediction on it

In [20]:
obs = pd.DataFrame(clean_df.iloc[[0]])

In [21]:
print(obs)

         item_id    dept_id store_id  sales        date
0  HOBBIES_1_001  HOBBIES_1     CA_1      0  2012-01-01


In [22]:
xgb_pipe.predict(obs)

ValueError: Feature shape mismatch, expected: 10, got 6

Import `dump` from `joblib` package and save `xgb_pipe` into `models` folder

In [None]:
from joblib import dump

dump(xgb_pipe,  '../models/xgb_pipeline_no_events.joblib')