# Creating Data Subsets
Here we will create subsets of data that can be passed into a multivariate time series regression

In [2]:
import pandas as pd
from tqdm import tqdm
import json

In [3]:
sales = pd.read_csv("../data/raw/sales.csv", header=0, delimiter=",")
sku = pd.read_csv("../data/raw/sku.csv", header=0, delimiter=",")
geoParams = pd.read_csv("../data/raw/geo_params.csv", header=0, delimiter=",")

Merge the geo_params and sales data files. A different subset will be generated for each item (SKU) for every geographic cluster location. The data will be stored in a dictionary with:
- key: `geoCluster_ID` and `SKU`
- value: list of `date`, `price`, and `sales`

The dictionary will be written to a json file for convenience.

In [4]:
sales.sort_values("date", inplace=True)
sales = sales.join(geoParams.set_index("geoCluster"),on="geoCluster", rsuffix="gC")

In [5]:
SKUset = set(sales["SKU"])
salesSubsets = {}
for geoCluster in tqdm(enumerate(geoParams["geoCluster"]), total=len(geoParams["geoCluster"])):
    for sku in SKUset:
        salesSKU = sales[sales["SKU"]==sku]
        salesSKUgeoCluster = salesSKU[salesSKU["geoCluster"]==geoCluster[1]]
        key = f'{geoCluster[1]}_{sku}'
        salesSubsets[key] = salesSKUgeoCluster[["date", "price", "sales"]].values.tolist()

100%|██████████| 515/515 [05:29<00:00,  1.56it/s]


In [None]:
salesSubsets_csv = "../data/processed/sales_subsets.csv"
try:
    with open(salesSubsets_csv, 'w') as file:
        json.dump(salesSubsets, file)

except IOError:
    print("I/O error")

# Reload processed subsets

In [None]:
with open(salesSubsets_csv) as file:
    salesSubsets = json.load(file)