# Data

In [None]:
import pandas as pd
df_od_raw = pd.read_csv('orders.csv')
df_ps_raw = pd.read_csv('product-supplier.csv')

: 

In [16]:
df_od_raw.head(3)

Unnamed: 0,Customer ID,Customer Status,Date Order was placed,Delivery Date,Order ID,Product ID,Quantity Ordered,Total Retail Price for This Order,Cost Price Per Unit
0,579,Silver,01-Jan-17,07-Jan-17,123002578,220101400106,2,92.6,20.7
1,7574,SILVER,01-Jan-17,05-Jan-17,123004074,210201000009,1,21.7,9.95
2,28861,Gold,01-Jan-17,04-Jan-17,123000871,230100500068,1,1.7,0.8


In [17]:
df_ps_raw.head(3)

Unnamed: 0,Product ID,Product Line,Product Category,Product Group,Product Name,Supplier Country,Supplier Name,Supplier ID
0,210100100001,Children,Children Outdoors,"Outdoor things, Kids",Boy's and Girl's Ski Pants with Braces,NO,Scandinavian Clothing A/S,50
1,210100100002,Children,Children Outdoors,"Outdoor things, Kids",Children's Jacket,ES,Luna sastreria S.A.,4742
2,210100100003,Children,Children Outdoors,"Outdoor things, Kids",Children's Jacket Sidney,NO,Scandinavian Clothing A/S,50


# EDA & Data Engineering

In [18]:
# Copy
df_od = df_od_raw.copy()
df_ps = df_ps_raw.copy()

# Replace customer status to upper cases
df_od['Customer Status'] = df_od['Customer Status'].str.upper()
# Add Item Retail Value features
df_od['Item Retail Value'] = df_od['Total Retail Price for This Order'] / df_od['Quantity Ordered']
# Convert dates to datetime objects
df_od['date'] = pd.to_datetime(df_od['Date Order was placed'])
df_od['Delivery Date'] = pd.to_datetime(df_od['Delivery Date'])
# Segment to year, month, and day
df_od['date_year'] = pd.to_datetime(df_od.date).dt.year
df_od['date_month'] = pd.to_datetime(df_od.date).dt.month
df_od['date_day'] = pd.to_datetime(df_od.date).dt.day
df_od['date_day_of_week'] = pd.to_datetime(df_od.date).dt.dayofweek + 1

# Merge
df_merge = df_od.merge(df_ps, how='left', left_on='Product ID', right_on='Product ID')

# Filter features
print(f'Initial features: {list(df_merge.columns)}')
features2keep = ['date_month', 'date_day', 'date_day_of_week',
                 'Cost Price Per Unit','Item Retail Value', #'Product Line', 'Product Group',
                 'Product Category','Supplier Country']
print(f'Features used: {features2keep}')
# Target Variable
target = 'Quantity Ordered'
print(f'target: {target}')

X = df_merge[features2keep]
y = df_merge[target]

  df_od['date'] = pd.to_datetime(df_od['Date Order was placed'])
  df_od['Delivery Date'] = pd.to_datetime(df_od['Delivery Date'])


Initial features: ['Customer ID', 'Customer Status', 'Date Order was placed', 'Delivery Date', 'Order ID', 'Product ID', 'Quantity Ordered', 'Total Retail Price for This Order', 'Cost Price Per Unit', 'Item Retail Value', 'date', 'date_year', 'date_month', 'date_day', 'date_day_of_week', 'Product Line', 'Product Category', 'Product Group', 'Product Name', 'Supplier Country', 'Supplier Name', 'Supplier ID']
Features used: ['date_month', 'date_day', 'date_day_of_week', 'Cost Price Per Unit', 'Item Retail Value', 'Product Category', 'Supplier Country']
target: Quantity Ordered


# Modeling

In [19]:
# Define loss function to calculate metrics
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, mean_squared_error, r2_score
def loss(data, X, y_true, model):
    y_pred = model.predict(X)
    print(data, "MAPE: {0:.2e}".format(mean_absolute_percentage_error(y_true, y_pred)))
    print(data, "MAE: {0:.2e}".format(mean_absolute_error(y_true, y_pred)))
    print(data, "MSE: {0:.2e}".format(mean_squared_error(y_true, y_pred)))
    print(data, "R2 score : {0:.3f}".format(r2_score(y_true, y_pred)))

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

In [20]:
# Preprocessing and model pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression


feat_num = list(X_train.select_dtypes(exclude='object').columns)
feat_cat = list(X_train.select_dtypes(include='object').columns)

# To rescale numerical and categorical data separately
colTransformer = ColumnTransformer([
    ('cat_cols', OneHotEncoder(handle_unknown='ignore'), feat_cat),
    ('num_cols', MinMaxScaler(), feat_num)
])

# Define the pipeline
steps = [
    ("col_tf", colTransformer),
    ("lr", LinearRegression())
]

model_lr = Pipeline(steps)
model_lr.fit(X_train, y_train)

# Model Performance
loss("Train", X_train, y_train, model_lr)
loss("Test", X_test, y_test, model_lr)


Train MAPE: 4.59e-01
Train MAE: 6.84e-01
Train MSE: 7.83e-01
Train R2 score : 0.025
Test MAPE: 4.58e-01
Test MAE: 6.89e-01
Test MSE: 7.99e-01
Test R2 score : 0.026


In [21]:
from xgboost.sklearn import XGBRegressor
steps = [("preprocess", colTransformer),
         ('xgbr', XGBRegressor())]
model_xgb = Pipeline(steps)
model_xgb.fit(X_train, y_train)

# Model Performance
loss("Train", X_train, y_train, model_xgb)
loss("Test", X_test, y_test, model_xgb)

Train MAPE: 2.77e-01
Train MAE: 4.74e-01
Train MSE: 4.65e-01
Train R2 score : 0.421
Test MAPE: 2.88e-01
Test MAE: 4.97e-01
Test MSE: 5.13e-01
Test R2 score : 0.374


In [22]:
import joblib
# save the model to disk
joblib.dump(model_xgb, "xgb_model.sav")

['xgb_model.sav']

# APP

In [23]:
!npm install localtunnel
!pip install -q streamlit

[K[?25h
up to date, audited 23 packages in 2s

3 packages are looking for funding
  run `npm fund` for details

2 [33m[1mmoderate[22m[39m severity vulnerabilities

To address all issues (including breaking changes), run:
  npm audit fix --force

Run `npm audit` for details.


In [24]:
%%writefile app.py
import streamlit as st

st.title('Online Sporting Goods Forecastor')
st.markdown('A model to predict the ordered quantity of a certain type of sports goods.')
st.header("Type of Products")
col1, col2 = st.columns(2)
with col1:
  st.text("Product Details")
  pc = st.selectbox('Product Category', ['Clothes',
                    'Children Sports',
                    'Outdoors',
                    'Shoes',
                    'Golf',
                    'Assorted Sports Articles',
                    'Swim Sports',
                    'Winter Sports',
                    'Running - Jogging',
                    'Indoor Sports',
                    'Team Sports',
                    'Racket Sports'])
  cpu = st.slider('Cost Price Per Unit', 0, 800, 100)
  irv = st.slider('Item Retail Value', 0, 1600, 100)
  cou = st.selectbox('Supplier Country', ['US', 'GB', 'NL', 'PT', 'NO', 'ES', 'BE', 'CA', 'AU', 'FR', 'SE', 'DK', 'DE'])
with col2:
  st.text("Time")
  mon = st.slider('Month',1,12,1)
  day = st.slider('Day',1,31,1)
  day_week = st.slider('Day of Week',1,7,1)


# Model prediction
import pandas as pd
data_dict = {'date_month': mon,
             'date_day': day,
             'date_day_of_week': day_week,
             'Cost Price Per Unit':cpu,
             'Item Retail Value':irv,
             'Product Category': pc,
             'Supplier Country': cou}
X_input = pd.DataFrame(data_dict, index=[0])

import joblib
def predict(data):
  clf = joblib.load("xgb_model.sav")
  return clf.predict(data)


pred_b = st.button("Predict Order Quantity")

if pred_b:
  result = predict(X_input)
  st.text(result[0])

# a = st.sidebar.radio('Choose:',[1,2])
# st.dataframe(X)

Overwriting app.py


In [25]:
!streamlit run app.py &>gdrive/MyDrive/OnlineSportingGoodsForecastor/logs.txt & npx localtunnel -- port 8501 & curl ipv4.icanhazip.com

34.42.177.213
Usage: lt --port [num] <options>

Options:
  -p, --port                Internal HTTP server port                 [required]
  -h, --host                Upstream server providing forwarding
                                             [default: "https://localtunnel.me"]
  -s, --subdomain           Request this subdomain
  -l, --local-host          Tunnel traffic to this host instead of localhost,
                            override Host header to this host
      --local-https         Tunnel traffic to a local HTTPS server     [boolean]
      --local-cert          Path to certificate PEM file for local HTTPS server
      --local-key           Path to certificate key file for local HTTPS server
      --local-ca            Path to certificate authority file for self-signed
                            certificates
      --allow-invalid-cert  Disable certificate checks for your local HTTPS
                            server (ignore cert/key/ca options)        [boolean]
  -o, -

In [None]:
!npx localtunnel --port 8501

your url is: https://ten-results-behave.loca.lt
