## Imports

In [1]:
import pandas as pd
import numpy as np
import inflection
import math
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
from IPython.core.display import HTML
from scipy import stats
from sklearn.preprocessing import RobustScaler, MinMaxScaler, LabelEncoder
from boruta import BorutaPy
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression, Lasso
import xgboost as xgb
import random
import pickle

## Class Rossmann

In [22]:
import pickle
import inflection
import pandas as pd
import numpy as np
import math
import datetime

class Rossmann(object):
    """Inference-time preprocessing pipeline for the Rossmann sales model.

    The methods are meant to be chained in this order:
    ``data_cleaning`` -> ``feature_engineering`` -> ``data_preparation`` ->
    ``get_prediction``.

    The scalers and the store-type encoder are unpickled from
    ``<home_path>parameter/``. They are only *applied* here (``transform``),
    never refit, so the values in the pickles are what determines the scaling.
    """

    # Project location used when no ``home_path`` is given to the constructor.
    DEFAULT_HOME_PATH = '/home/pedro/Documentos/repos/Rossman sales project/'

    def __init__(self, home_path=None):
        """Load the pickled scalers/encoder.

        Parameters
        ----------
        home_path : str, optional
            Project root, including the trailing path separator. Defaults to
            ``DEFAULT_HOME_PATH``.
        """
        self.home_path = self.DEFAULT_HOME_PATH if home_path is None else home_path
        self.competition_distance_scaler = self._load_parameter('competition_distance_scaler.pkl')
        self.competition_time_month_scaler = self._load_parameter('competition_time_month_scaler.pkl')
        self.promo_time_week_scaler = self._load_parameter('promo_time_week_scaler.pkl')
        self.year_scaler = self._load_parameter('year_scaler.pkl')
        self.store_type_scaler = self._load_parameter('store_type_scaler.pkl')

    def _load_parameter(self, file_name):
        """Unpickle ``<home_path>parameter/<file_name>`` and close the file."""
        # NOTE: pickle can execute arbitrary code while loading; these files
        # must only ever come from a trusted source.
        with open(self.home_path + 'parameter/' + file_name, 'rb') as handle:
            return pickle.load(handle)

    def data_cleaning(self, df1):
        """Rename columns to snake_case, parse dates and fill missing values.

        ``df1`` is modified in place and the same object is returned.

        Parameters
        ----------
        df1 : pandas.DataFrame
            Raw records whose column names are CamelCase (``Store``,
            ``DayOfWeek``, ``Date``, ..., ``PromoInterval``).

        Returns
        -------
        pandas.DataFrame
            ``df1`` with snake_case columns plus ``month_map`` and ``is_promo``.
        """
        # Derive the new names from the actual columns instead of assigning a
        # fixed list by position, so a different column order in the payload
        # cannot silently mislabel the data.
        df1.columns = [inflection.underscore(column) for column in df1.columns]

        df1['date'] = pd.to_datetime(df1['date'])
        dates = df1['date']

        # A missing distance is replaced by 200000, a value chosen by the
        # original author as the stand-in for "no competitor recorded".
        df1['competition_distance'] = df1['competition_distance'].fillna(200000)

        # Missing "since" fields fall back to the corresponding part of the
        # record's own date.
        df1['competition_open_since_month'] = df1['competition_open_since_month'].fillna(dates.dt.month)
        df1['competition_open_since_year'] = df1['competition_open_since_year'].fillna(dates.dt.year)
        df1['promo2_since_week'] = df1['promo2_since_week'].fillna(dates.dt.isocalendar().week.astype('int64'))
        df1['promo2_since_year'] = df1['promo2_since_year'].fillna(dates.dt.year)

        # promo_interval: 0 marks "no interval"; is_promo is 1 when the month
        # abbreviation of the record's date is listed in the interval.
        month_map = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
                     7: 'Jul', 8: 'Aug', 9: 'Sept', 10: 'Oct', 11: 'Nov', 12: 'Dec'}
        # Plain assignment: a chained ``fillna(..., inplace=True)`` on a column
        # selection is not guaranteed to write back to the frame.
        df1['promo_interval'] = df1['promo_interval'].fillna(0)
        df1['month_map'] = dates.dt.month.map(month_map)
        df1['is_promo'] = [
            int(interval != 0 and month in interval.split(','))
            for interval, month in zip(df1['promo_interval'], df1['month_map'])
        ]

        # The filled columns are floats at this point; store them as integers.
        for column in ('competition_open_since_month', 'competition_open_since_year',
                       'promo2_since_week', 'promo2_since_year'):
            df1[column] = df1[column].astype(int)

        return df1

    def feature_engineering(self, df2):
        """Derive calendar/competition/promo features and drop closed days.

        The new columns are added to ``df2`` in place; the returned frame is a
        new object containing only rows with ``open != 0`` and without the
        ``open``, ``promo_interval`` and ``month_map`` columns.

        Parameters
        ----------
        df2 : pandas.DataFrame
            Output of ``data_cleaning``.

        Returns
        -------
        pandas.DataFrame
        """
        dates = df2['date']

        df2['year'] = dates.dt.year
        df2['month'] = dates.dt.month
        df2['day'] = dates.dt.day
        # ``Series.dt.weekofyear`` no longer exists in pandas 2.x;
        # ``isocalendar().week`` yields the same ISO week number.
        df2['week_of_year'] = dates.dt.isocalendar().week.astype('int64')
        df2['year_week'] = dates.dt.strftime('%Y-%W')

        # Competition since: first day of the competitor's opening month, and
        # the elapsed time since then counted in 30-day blocks.
        df2['competition_since'] = pd.to_datetime(pd.DataFrame({
            'year': df2['competition_open_since_year'],
            'month': df2['competition_open_since_month'],
            'day': 1,
        }))
        df2['competition_time_month'] = ((dates - df2['competition_since']) / 30).dt.days.astype(int)

        # Promo since: Monday of the given year/week (strptime ``%W``
        # numbering) moved back by 7 days, and the elapsed whole weeks.
        promo_week_labels = df2['promo2_since_year'].astype(str) + '-' + df2['promo2_since_week'].astype(str)
        df2['promo_since'] = pd.to_datetime(promo_week_labels.apply(
            lambda label: datetime.datetime.strptime(label + '-1', '%Y-%W-%w') - datetime.timedelta(days=7)
        ))
        df2['promo_time_week'] = ((dates - df2['promo_since']) / 7).dt.days.astype(int)

        # Replace the single-letter codes by descriptive labels; any value not
        # listed falls into the last category.
        assortment_names = {'a': 'basic', 'b': 'extra'}
        df2['assortment'] = df2['assortment'].apply(lambda code: assortment_names.get(code, 'extended'))
        holiday_names = {'a': 'public_holiday', 'b': 'easter_holiday', 'c': 'christmas'}
        df2['state_holiday'] = df2['state_holiday'].apply(lambda code: holiday_names.get(code, 'regular_day'))

        # Variable filtering: keep open days only, then drop helper columns.
        open_days = df2[df2['open'] != 0]
        return open_days.drop(columns=['open', 'promo_interval', 'month_map'])

    def data_preparation(self, df5):
        """Scale, encode and cyclically transform features for the model.

        Works on a copy, so ``df5`` itself is left untouched.

        Parameters
        ----------
        df5 : pandas.DataFrame
            Output of ``feature_engineering``.

        Returns
        -------
        pandas.DataFrame
            The 20 model input columns, in the order listed in
            ``cols_selected`` below.

        Notes
        -----
        The loaded scalers/encoder are applied with ``transform``. Refitting
        them on the request data (``fit_transform``) would discard the fitted
        parameters stored in the pickles and make the scaling depend on the
        rows sent in each request.
        """
        df5 = df5.copy()

        # Rescaling
        df5['competition_distance'] = self.competition_distance_scaler.transform(df5[['competition_distance']].values)
        df5['competition_time_month'] = self.competition_time_month_scaler.transform(df5[['competition_time_month']].values)
        df5['promo_time_week'] = self.promo_time_week_scaler.transform(df5[['promo_time_week']].values)
        # NOTE: 'year' and the state_holiday dummies are computed but are not
        # part of cols_selected below.
        df5['year'] = self.year_scaler.transform(df5[['year']].values)

        # Encoding
        # state_holiday - one hot
        df5 = pd.get_dummies(df5, prefix=['state_holiday'], columns=['state_holiday'])
        # store_type - encoder loaded from store_type_scaler.pkl
        df5['store_type'] = self.store_type_scaler.transform(df5['store_type'])
        # assortment - ordinal
        assortment_dict = {'basic': 1, 'extra': 2, 'extended': 3}
        df5['assortment'] = df5['assortment'].map(assortment_dict)

        # Nature transformation: map each periodic feature onto sin/cos of its
        # position in the cycle (period given per feature).
        for column, period in (('day_of_week', 7), ('month', 12), ('day', 30), ('week_of_year', 52)):
            angle = df5[column] * (2 * np.pi / period)
            df5[column + '_sin'] = np.sin(angle)
            df5[column + '_cos'] = np.cos(angle)

        cols_selected = ['store',
                         'promo',
                         'store_type',
                         'assortment',
                         'competition_distance',
                         'competition_open_since_month',
                         'competition_open_since_year',
                         'promo2',
                         'promo2_since_week',
                         'promo2_since_year',
                         'competition_time_month',
                         'promo_time_week',
                         'day_of_week_sin',
                         'day_of_week_cos',
                         'month_sin',
                         'month_cos',
                         'day_sin',
                         'day_cos',
                         'week_of_year_sin',
                         'week_of_year_cos']

        return df5[cols_selected]

    def get_prediction(self, model, original_data, test_data):
        """Predict with ``model`` and return ``original_data`` as JSON records.

        Parameters
        ----------
        model : object
            Anything exposing ``predict(test_data)``; its output is passed
            through ``numpy.expm1`` before being stored.
        original_data : pandas.DataFrame
            Frame that receives the new ``prediction`` column (in place).
        test_data : pandas.DataFrame
            Prepared features, as returned by ``data_preparation``.

        Returns
        -------
        str
            ``original_data.to_json(orient='records', date_format='iso')``.
        """
        pred = np.expm1(model.predict(test_data))

        # feature_engineering drops closed days, so there can be fewer
        # predictions than original rows. In that case align on the index of
        # the prepared data; rows without a prediction get NaN (JSON null).
        if len(pred) != len(original_data) and hasattr(test_data, 'index'):
            pred = pd.Series(pred, index=test_data.index)

        original_data['prediction'] = pred

        return original_data.to_json(orient='records', date_format='iso')

## API handler

In [37]:
import pickle
import pandas as pd
from flask import Flask, request, Response
from rossmann.Rossmann import Rossmann

# Load the trained model once, when the module is imported. The context
# manager closes the file handle as soon as the model has been unpickled.
# NOTE: pickle can execute arbitrary code; only load trusted model files.
with open('/home/pedro/Documentos/repos/Rossman sales project/model_rossman.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# initialize API
app = Flask(__name__)

@app.route('/rossmann/predict', methods=['POST'])
def rossmann_predict():
    """Handle POST /rossmann/predict.

    Accepts either a single JSON object (one record) or a JSON array of
    objects (several records), runs them through the Rossmann pipeline and
    returns the records with an added ``prediction`` field as JSON.
    An empty/missing payload is answered with ``{}``.
    """
    test_json = request.get_json()

    if not test_json:  # no data: reply with an empty JSON object
        return Response('{}', status=200, mimetype='application/json')

    if isinstance(test_json, dict):  # unique example
        test_raw = pd.DataFrame(test_json, index=[0])
    else:  # multiple examples
        test_raw = pd.DataFrame(test_json, columns=test_json[0].keys())

    # Instantiate Rossmann class
    pipeline = Rossmann()

    # data cleaning (renames the columns of test_raw in place)
    df1 = pipeline.data_cleaning(test_raw)

    # feature engineering
    df2 = pipeline.feature_engineering(df1)

    # data preparation
    df3 = pipeline.data_preparation(df2)

    # prediction: get_prediction returns a JSON string
    df_response = pipeline.get_prediction(model, test_raw, df3)

    # Declare the JSON content type explicitly, matching the empty-payload branch.
    return Response(df_response, status=200, mimetype='application/json')
    

# Start the Flask development server, listening on every network interface,
# only when this file is executed as a script.
if __name__ == '__main__':
    app.run(host='0.0.0.0')

ModuleNotFoundError: No module named 'rossmann'

## API Tester

In [3]:
import requests

In [4]:
# Read the test records and the store table from the Ross_data folder
df10, df_store_raw = pd.read_csv('Ross_data/test.csv'), pd.read_csv('Ross_data/store.csv')

In [20]:
# Attach the store attributes to every test record
df_test = df10.merge(df_store_raw, how='left', on='Store')

# Rows to keep: the stores chosen for prediction ...
chosen_stores = df_test['Store'].isin([20, 23, 22])
# ... on days where 'Open' is known and the store is not closed
known_open = df_test['Open'].notnull() & (df_test['Open'] != 0)

# Apply both filters and discard the 'Id' column
df_test = df_test[chosen_stores & known_open].drop(columns='Id')

In [21]:
# convert DataFrame to json
# json is imported here because none of the import cells bring it into scope.
import json

# One JSON object per row; missing values are serialised as NaN tokens.
data = json.dumps(df_test.to_dict(orient='records'))

In [22]:
# API Call
# Local alternative for testing against a server started on this machine:
#url = 'http://0.0.0.0:5000/rossmann/predict'
url = 'https://prediction-rossmann-model-test.herokuapp.com/rossmann/predict'
header = {'Content-type': 'application/json' }
# requests waits indefinitely by default; the timeout (seconds) makes an
# unresponsive endpoint raise instead of hanging the notebook.
r = requests.post( url, data=data, headers=header, timeout=60 )
print( 'Status Code {}'.format( r.status_code ) )

Status Code 200


In [23]:
# Decode the response body once; the first record's keys define the columns
records = r.json()
d1 = pd.DataFrame(records, columns=records[0].keys())

In [24]:
# Total predicted sales per store, one row per store
d2 = d1.groupby('store', as_index=False)['prediction'].sum()

# Report each store's total
for store_id, total_sales in zip(d2['store'], d2['prediction']):
    print('Store number {} will sell R${:,.2f} in the next 6 weeks'.format(store_id, total_sales))

Store number 20 will sell R$295,864.54 in the next 6 weeks
Store number 22 will sell R$217,901.74 in the next 6 weeks
Store number 23 will sell R$227,984.94 in the next 6 weeks
