In [None]:
# This cell is NOT editable. Overwrite variables on your own discretion.
# Any changes other than the script code will NOT BE SAVED!
# All cells are assumed to be script code cells, unless explictly tagged as 'o9_ignore'

In [1]:
_sales = "select ([Time].[Day] * [Version].[Version Name].[CurrentWorkingView]  * [Department].[Department_ID] * [Store].[Store_ID].[1] * {Measure.[Weekly Sales]}  ) on row, () on column;"
_features = "select( [Time].[Day] * [Version].[Version Name].[CurrentWorkingView] * [Store].[Store_ID].[1] * { Measure.[Temperature], Measure.[Fuel Price], Measure.[MarkDown1] , Measure.[MarkDown2], Measure.[MarkDown3], Measure.[MarkDown4] , Measure.[MarkDown5] , Measure.[CPI] , Measure.[Unemployment] } ) on row, () on column;"
_stores = "select([Store].[Store_ID].[1] * [Store].[Type] ) on row, () on column;"


# Initialize the O9DataLake with the input parameters and dataframes
# Data can be accessed with O9DataLake.get(<Input Name>)
# Overwritten values will not be reflected in the O9DataLake after initialization

from o9_common_utils.O9DataLake import O9DataLake, ResourceType, DataSource
O9DataLake.register("sales",DataSource.LS, ResourceType.IBPL, _sales)
O9DataLake.register("features",DataSource.LS, ResourceType.IBPL, _features)
O9DataLake.register("stores",DataSource.LS, ResourceType.IBPL, _stores)

In [2]:
"""
Exec plugin instance [DemoPlugin] for measures {[PredictedSales]} using scope ([Version].[Version Name].[CurrentWorkingView]*Store.[Store_ID]) using arguments {(NumExecutors,1), (ExecutorMemory, "1G"), (DriverMemory, "2G")};
"""
import numpy as np
import pandas as pd
import logging
logger = logging.getLogger('o9_logger')

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor

KNN = "KNN"
LINRRG = "LINREG"
DECISIONTREE = "DECISIONTREE"
RANDOMFOREST = "RANDOMFOREST"
EXTRATREES = "EXTRATREES"
available_models = [KNN, LINRRG, DECISIONTREE, RANDOMFOREST] # EXTRATREES]

class o9Models:
    def __init__(self):
        pass

    def get_availabe_models(self):
        return available_models

    def get_model(self, model_name):
        model_name = model_name.upper()
        assert (model_name.upper() in available_models), "can not get [{}] model".format(model_name)
        if model_name == KNN:
            return self.get_knn()
        if model_name == LINRRG:
            return self.get_lin_reg()
        if model_name == DECISIONTREE:
            return self.get_dtree()
        if model_name == RANDOMFOREST:
            return self.get_randomForest()

    def get_lin_reg(self):
        lin_reg_model = LinearRegression()
        return lin_reg_model

    def get_knn(self):
        knn_model = KNeighborsRegressor(n_neighbors=10,n_jobs=4)
        return knn_model

    def get_dtree(self):
        dtree_model = DecisionTreeRegressor(random_state=0)
        return dtree_model

    def get_randomForest(self):
        randomForest_model = RandomForestRegressor(n_estimators = 400,max_depth=15,n_jobs=5)
        return randomForest_model

    def get_xgb(self):
        #xgb_model = XGBRegressor(objective='reg:linear', nthread=4, n_estimators=500, max_depth=6, learning_rate=0.5)
        #return xgb_model
        pass

    def get_arima(self):
        #from statsmodel.tsa.arima_model import ARIMA
        #arima = ARIMA()
        pass

    def get_extratrees(self):
        etr = ExtraTreesRegressor(n_estimators=30, n_jobs=4)
        return etr


def get_dataset(sales, features, stores):
    """
    merges teh dataframe
    fills the dataframe nulls with 0
    """
    dataset = sales.merge(stores, how='left').merge(features, how='left')
    from statistics import mean
    dataset['CPI'] = dataset['CPI'].fillna(mean(dataset['CPI']))
    dataset['Unemployment'] = dataset['Unemployment'].fillna(mean(dataset['Unemployment']))
    dataset[['Temperature','Fuel Price','MarkDown3']] \
        = dataset[['Temperature','Fuel Price','MarkDown3']].fillna(0)
    dataset[['CPI', 'Unemployment']] = dataset[['CPI', 'Unemployment']].fillna(0)
    date = pd.to_datetime(dataset["Time.[Day]"], format="%m/%d/%Y")
    dataset['Year'] = date.dt.year
    dataset['Day'] = date.dt.day
    dataset['Month'] = date.dt.month
    dataset["Days to Next Christmas"] = (
                pd.to_datetime("12/31/" + dataset["Year"].astype(str), format="%m/%d/%Y") -
                date).dt.days.astype(int)
    dataset = dataset.drop(columns=['MarkDown1','MarkDown2', 'MarkDown4','MarkDown5'])
    return dataset

def create_x_y(dataset):
    """"
    weekly sales is the predicted output
    rest of the columns are input features
    """
    X = dataset.loc[:, dataset.columns != 'Weekly Sales']
    X = pd.get_dummies(X, columns=["Store.[Type]"])
    y = dataset[['Weekly Sales']]
    return (X, y)

def drop_columns(X):
    return X.drop(columns =['Time.[Day]'])

def scale_x_y(X, y):
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)
    from sklearn import preprocessing
    sc_X = preprocessing.StandardScaler()
    X_train = sc_X.fit_transform(X_train)
    X_test = sc_X.transform(X_test)
    return (X_train, X_test, y_train, y_test)

def build_model_and_get_metrics(model_name, X_train, X_test, y_train, y_test):
    models = o9Models()
    model = models.get_model(model_name)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    from sklearn import metrics
    mae = metrics.mean_absolute_error(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    accuracy = model.score(X_test, y_test)
    model_metrics = {
        "MeanAbsoluteError" : mae,
        "MeanSquaredError" : mse,
        "RootMeanSquaredError" : rmse,
        "Accuracy" : accuracy
    }
    return model, model_metrics

def run_a_model(model_name, X_train, X_test, y_train, y_test):
    logger.info("building {} model...".format(model_name))
    model, metrics = build_model_and_get_metrics(model_name, X_train, X_test, y_train, y_test)
    details = {"model": model, "metrics": metrics}
    logger.info(details)
    return details


def run_models(X_train, X_test, y_train, y_test):
    out = {}
    logger.info("started model building")
    for model_name in available_models:
        model_details = run_a_model(model_name, X_train, X_test, y_train, y_test)
        out[model_name] = model_details
    logger.info(out)


from o9_common_utils.O9DataLake import O9DataLake
sales = O9DataLake.get("sales")
features = O9DataLake.get("features")
stores = O9DataLake.get("stores")

sales_df = sales[['Time.[Day]', 'Department.[Department_ID]','Store.[Store_ID]','Weekly Sales']]
features_df = features[['Store.[Store_ID]','Time.[Day]', 'Temperature','Fuel Price','MarkDown1','MarkDown2','MarkDown3', 'MarkDown4','MarkDown5', 'CPI', 'Unemployment']]
stores_df = stores[['Store.[Store_ID]', 'Store.[Type]']]

dataset = get_dataset(sales_df, features_df, stores_df)
(X, y) = create_x_y(dataset)
X = drop_columns(X)
X_train, X_test, y_train, y_test = scale_x_y(X, y)
#run_models(X_train, X_test, y_train, y_test)
run_a_model(DECISIONTREE, X_train, X_test, y_train, y_test)
logger.info("output df")
Output = sales.head(5)
O9DataLake.put("Output", Output)
logger.info(Output)

StatisticsError: mean requires at least one data point