In [46]:
import pandas as pd
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.arima.model import ARIMA

In [47]:
# Load the data into a pandas dataframe
df = pd.read_csv("../Queries/order_history.csv")

In [48]:
# Convert the order_date column to datetime datatype and set it as the index
df.head()

Unnamed: 0,customer_id,order_id,order_date,product_type,quantity
0,1,455,2021-04-04,Jacket,2
1,1,455,2021-04-04,Shirt,3
2,1,670,2021-03-06,Jacket,2
3,1,670,2021-03-06,Jacket,2
4,1,670,2021-03-06,Shirt,1


In [50]:
df['order_date'] = pd.to_datetime(df['order_date'])


In [51]:
df.head()

Unnamed: 0,customer_id,order_id,order_date,product_type,quantity
0,1,455,2021-04-04,Jacket,2
1,1,455,2021-04-04,Shirt,3
2,1,670,2021-03-06,Jacket,2
3,1,670,2021-03-06,Jacket,2
4,1,670,2021-03-06,Shirt,1


In [53]:
print(df["order_date"].dtypes)


datetime64[ns]


In [55]:
df = df.set_index("order_date")


In [56]:
grouped = df.groupby("product_type").resample("M")
monthly_totals = grouped["quantity"].sum()


In [57]:
monthly_totals.head()

product_type  order_date
Jacket        2021-01-31    395
              2021-02-28    316
              2021-03-31    427
              2021-04-30    318
              2021-05-31    289
Name: quantity, dtype: int64

In [61]:
def create_product_df(group):
    product_type = group["product_type"].iloc[0]
    data = group[["quantity"]].sum()
    return pd.DataFrame(data, columns=["quantity"])

products = {}
for product_type, group in df.groupby("product_type"):
    resampled = group.resample("M")
    for month, data in resampled:
        if product_type not in products:
            products[product_type] = create_product_df(data)
        else:
            products[product_type] = pd.concat([products[product_type], create_product_df(data)])


In [67]:
import pandas as pd

# define the data and train_size
data = [0, 1, 2, 3, 4, 5, 6, 7, 8]
train_size = 7

# create a dictionary for each product
products = {
    "Jacket": {},
    "Shirt": {},
    "Pants": {}
}

# set the train_data and test_data arrays as values in each product dictionary
for name in products.keys():
    train_data, test_data = data[:train_size], data[train_size:]
    products[name]["train_data"] = train_data
    products[name]["test_data"] = test_data

    print(f"{name}: data ({len(data)}): {pd.Series(data).index.values}, train_data ({len(train_data)}): {pd.Series(train_data).index.values}, test_data ({len(test_data)}): {pd.Series(test_data).index.values}")


Jacket: data (9): [0 1 2 3 4 5 6 7 8], train_data (7): [0 1 2 3 4 5 6], test_data (2): [0 1]
Shirt: data (9): [0 1 2 3 4 5 6 7 8], train_data (7): [0 1 2 3 4 5 6], test_data (2): [0 1]
Pants: data (9): [0 1 2 3 4 5 6 7 8], train_data (7): [0 1 2 3 4 5 6], test_data (2): [0 1]


In [69]:
import numpy as np
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA

# define the data and train_size
data = [0, 1, 2, 3, 4, 5, 6, 7, 8]
train_size = 7

# create a dictionary for each product
products = {
    "Jacket": {},
    "Shirt": {},
    "Pants": {}
}

# set the train_data and test_data arrays as values in each product dictionary
for name in products.keys():
    train_data, test_data = data[:train_size], data[train_size:]
    products[name]["train_data"] = train_data
    products[name]["test_data"] = test_data

    # fit an ARIMA model to the training data and forecast the next month's sales
    model = ARIMA(train_data, order=(1, 1, 1))
    model_fit = model.fit()
    forecast = model_fit.forecast()[0]
    products[name]["forecast"] = forecast

    # print the forecast for the next month
    print(f"{name}: {forecast}")


  warn('Non-stationary starting autoregressive parameters'


Jacket: 6.999855657205232
Shirt: 6.999855657205232
Pants: 6.999855657205232




In [74]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

# create a dictionary to hold the trained model and test data for each product
products = {}

# loop through each product and fit an ARIMA model
for name in df["product_type"].unique():
    # filter the data for the current product
    product_data = df[df["product_type"] == name]["quantity"]
    
    # create a differenced series
    diff_data = product_data.diff().dropna()
    
    # fit a seasonal ARIMA model with auto-arima
    model = SARIMAX(diff_data, order=(1, 1, 1), seasonal_order=(1, 1, 0, 12), enforce_stationarity=False, enforce_invertibility=False)
    model_fit = model.fit()
    
    # store the model and test data for the current product
    products[name] = {"model": model_fit, "test_data": product_data[-12:].values}

# forecast sales for the next month for each product
predictions = []
for name in products:
    # use the last 12 months of sales data as input to the model
    input_data = products[name]["test_data"]
    
    if len(input_data) > 0:
    forecast = products[name]["model"].forecast(steps=1, exog=[input_data[-1]])[-1]
else:
    forecast = None


# print the forecast for the next month for each product
for i, name in enumerate(products):
    print(f"Next month's forecast for {name}: {predictions[i]}")


IndentationError: expected an indented block (544215252.py, line 28)

In [71]:
print(df.columns)


Index(['customer_id', 'order_id', 'product_type', 'quantity'], dtype='object')
