From d6539896b6ebec1e78d8fc3e15e7f800be42a0d5 Mon Sep 17 00:00:00 2001 From: Noah Sayre Date: Thu, 14 Jan 2021 15:15:57 -0500 Subject: [PATCH 1/5] create summing matrix using labels/ support grouped time series --- hts/functions.py | 61 +++++++++------ hts/hierarchy/__init__.py | 25 +++++++ tests/unit/test_functions.py | 139 +++++++++++++++++++++++++++++++++++ 3 files changed, 204 insertions(+), 21 deletions(-) diff --git a/hts/functions.py b/hts/functions.py index 0e44ee3..ff2f43c 100644 --- a/hts/functions.py +++ b/hts/functions.py @@ -21,34 +21,53 @@ def to_sum_mat(ntree: NAryTreeT): Returns ------- + numpy.ndarray + Summing matrix. """ nodes = ntree.level_order_traversal() + node_labels = ntree.get_level_order_labels() num_at_level = list(map(sum, nodes)) columns = num_at_level[-1] - bl_mat = np.identity(columns) + + # Initialize summing matrix with bottom level rows + sum_mat = np.identity(columns) + + # Names of each row in summing matrix. + sum_mat_labels = [] + + # Bottom level matrix labels, with indices correspoding to column in summing matrix + bl_mat_idx_ref = node_labels[-1] + + # Skip total and bottom level of tree. Rows added outside of loop. 
+ for level in node_labels[1:-1]: + for label in level: + # Exclude duplicates specified in tree + if label not in sum_mat_labels: + row = [] + for bl_element in bl_mat_idx_ref: + # Check if the bottom level element is part of label + is_component = all([True if l in bl_element else False for l in label.split("_")]) + if is_component: + row.append(1) + else: + row.append(0) + + # Add row correspoding to label to top of summing matrix + row = np.array(row) + sum_mat = np.vstack((row, sum_mat)) + sum_mat_labels.append(label) + + # Add top as first row in summing matrix top = np.ones(columns) - final_mat = bl_mat - num_levels = len(num_at_level) - - for lev in range(num_levels - 1): - summing = nodes[-(lev + 1)] - count = 0 - num2sum_ind = 0 - B = np.zeros([num_at_level[-1]]) - for num2sum in summing: - num2sum_ind += num2sum - a = bl_mat[count:num2sum_ind, :] - count += num2sum - if np.all(B == 0): - B = a.sum(axis=0) - else: - B = np.vstack((B, a.sum(axis=0))) - final_mat = np.vstack((B, final_mat)) - bl_mat = B + sum_mat = np.vstack((top, sum_mat)) + + # Reverse list of labels to match summing matrix, since vstack and append worked in the opposite order. + # Not currently returned, but could be for information or matrix alignment. + sum_mat_labels.reverse() + sum_mat_labels = ["total"] + sum_mat_labels + bl_mat_idx_ref - final_mat = np.vstack((top, final_mat)) - return final_mat + return sum_mat def project( diff --git a/hts/hierarchy/__init__.py b/hts/hierarchy/__init__.py index b994134..3f83ee5 100644 --- a/hts/hierarchy/__init__.py +++ b/hts/hierarchy/__init__.py @@ -316,6 +316,31 @@ def level_order_traversal(self: NAryTreeT) -> List[List[int]]: res[li].append(len(n.children)) return res[:-1] + def get_level_order_labels(self: NAryTreeT) -> List[List[str]]: + """ + Get the associated node labels from the NAryTreeT level_order_traversal(). + + Parameters + ---------- + self: NAryTreeT + Tree being searched. 
+ + Returns + ------- + List[List[str]] + Node labels corresponding to level order traversal. + """ + labels = [] + q = deque([(self, 0)]) + while q: + n, li = q.popleft() + if len(labels) < li + 1: + labels.append([]) + for i in n.children: + q.append((i, li + 1)) + labels[li].append(n.key) + return labels + def add_child(self, key=None, item=None, exogenous=None) -> NAryTreeT: child = HierarchyTree(key=key, item=item, exogenous=exogenous, parent=self) self.children.append(child) diff --git a/tests/unit/test_functions.py b/tests/unit/test_functions.py index 827074c..9d26554 100644 --- a/tests/unit/test_functions.py +++ b/tests/unit/test_functions.py @@ -1,6 +1,8 @@ import numpy +import pandas from hts.functions import to_sum_mat +import hts.hierarchy def test_sum_mat_uv(uv_tree): @@ -17,3 +19,140 @@ def test_sum_mat_mv(mv_tree): shp = mat.shape assert shp[0] == mv_tree.num_nodes() + 1 assert shp[1] == mv_tree.leaf_sum() + +def test_sum_mat_hierarchical(): + hierarchy = {'total': ['A', 'B'], + 'A': ['A_X', 'A_Y', 'A_Z'], + 'B': ['B_X', 'B_Y']} + hier_df = pandas.DataFrame(data={'total': [], + 'A': [], + 'B': [], + 'A_X': [], + 'A_Y': [], + 'A_Z': [], + 'B_X': [], + 'B_Y': []}) + + tree = hts.hierarchy.HierarchyTree.from_nodes(hierarchy, hier_df) + sum_mat = to_sum_mat(tree) + + expected_sum_mat = numpy.array([[1, 1, 1, 1, 1], # total + [0, 0, 0, 1, 1], # B + [1, 1, 1, 0, 0], # A + [1, 0, 0, 0, 0], # A_X + [0, 1, 0, 0, 0], # A_Y + [0, 0, 1, 0, 0], # A_Z + [0, 0, 0, 1, 0], # B_X + [0, 0, 0, 0, 1]]) # B_Y + + numpy.testing.assert_array_equal(sum_mat, expected_sum_mat) + + +def test_sum_mat_grouped(): + hierarchy = {'total': ['A', 'B', 'X', 'Y'], + 'A': ['A_X', 'A_Y'], + 'B': ['B_X', 'B_Y']} + grouped_df = pandas.DataFrame(data={'total': [], + 'A': [], + 'B': [], + 'X': [], + 'Y': [], + 'A_X': [], + 'A_Y': [], + 'B_X': [], + 'B_Y': []}) + + tree = hts.hierarchy.HierarchyTree.from_nodes(hierarchy, grouped_df) + sum_mat = to_sum_mat(tree) + + expected_sum_mat = 
numpy.array([[1, 1, 1, 1], # total + [0, 1, 0, 1], # Y + [1, 0, 1, 0], # X + [0, 0, 1, 1], # B + [1, 1, 0, 0], # A + [1, 0, 0, 0], # A_X + [0, 1, 0, 0], # A_Y + [0, 0, 1, 0], # B_X + [0, 0, 0, 1]]) # B_Y + + numpy.testing.assert_array_equal(sum_mat, expected_sum_mat) + + +def test_sum_mat_visnights_hier(visnights_hier): + hier_df = pandas.DataFrame(data={'total': [], + 'VIC': [], + 'QLD': [], + 'SAU': [], + 'WAU': [], + 'OTH': [], + 'NSW': [], + 'NSW_Metro': [], 'NSW_NthCo': [], 'NSW_NthIn': [], 'NSW_SthCo': [], 'NSW_SthIn': [], + 'OTH_Metro': [], 'OTH_NoMet': [], + 'QLD_Cntrl': [], 'QLD_Metro': [], 'QLD_NthCo': [], + 'SAU_Coast': [], 'SAU_Inner': [], 'SAU_Metro': [], + 'VIC_EstCo': [], 'VIC_Inner': [], 'VIC_Metro': [], 'VIC_WstCo': [], + 'WAU_Coast': [], 'WAU_Inner': [], 'WAU_Metro': []}) + + tree = hts.hierarchy.HierarchyTree.from_nodes(visnights_hier, hier_df) + sum_mat = to_sum_mat(tree) + + expected_sum_mat = numpy.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], # total + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1], # VIC + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0], # QLD + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], # SAU + [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # WAU + [0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # OTH + [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # NSW + [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # NSW_Metro + [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # NSW_NthCo + [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # NSW_NthIn + [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # NSW_SthCo + [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # NSW_SthIn + [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # OTH_Metro + [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # OTH_NoMet + [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0], # WAU_Coast + [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # WAU_Inner + [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # WAU_Metro + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], # SAU_Coast + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], # SAU_Inner + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], # SAU_Metro + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], # QLD_Cntrl + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], # QLD_Metro + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0], # QLD_NthCo + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], # VIC_EstCo + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], # VIC_Inner + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], # VIC_Metro + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]]) # VIC_WstCo + + numpy.testing.assert_array_equal(sum_mat, expected_sum_mat) + + +def test_demo_unique_constraint(): + # Example https://otexts.com/fpp2/hts.html + # Does not work when you have elements that are named the same, but represent + # different levels in the hierarchy. See expected_sum_mat below for example. 
+ hierarchy = {'total': ['A', 'B'], + 'A': ['AA', 'AB', 'AC'], + 'B': ['BA', 'BB']} + hier_df = pandas.DataFrame(data={'total': [], + 'A': [], + 'B': [], + 'AA': [], + 'AB': [], + 'AC': [], + 'BA': [], + 'BB': []}) + + tree = hts.hierarchy.HierarchyTree.from_nodes(hierarchy, hier_df) + sum_mat = to_sum_mat(tree) + + expected_sum_mat = numpy.array([[1, 1, 1, 1, 1], # total + [0, 1, 0, 1, 1], # B, Incorrectly finds B in AB + [1, 1, 1, 1, 0], # A, Incorrectly finds A in BA + [1, 0, 0, 0, 0], # AA + [0, 1, 0, 0, 0], # AB + [0, 0, 1, 0, 0], # AC + [0, 0, 0, 1, 0], # BA + [0, 0, 0, 0, 1]]) # BB + + numpy.testing.assert_array_equal(sum_mat, expected_sum_mat) From 26657f2c87f99ff9d029e231f9713b5f4eff2215 Mon Sep 17 00:00:00 2001 From: Noah Sayre Date: Thu, 28 Jan 2021 16:47:39 -0500 Subject: [PATCH 2/5] ran linter --- hts/functions.py | 4 +- tests/unit/test_functions.py | 243 +++++++++++++++++++++-------------- 2 files changed, 147 insertions(+), 100 deletions(-) diff --git a/hts/functions.py b/hts/functions.py index ff2f43c..1b558ae 100644 --- a/hts/functions.py +++ b/hts/functions.py @@ -47,7 +47,9 @@ def to_sum_mat(ntree: NAryTreeT): row = [] for bl_element in bl_mat_idx_ref: # Check if the bottom level element is part of label - is_component = all([True if l in bl_element else False for l in label.split("_")]) + is_component = all( + [True if l in bl_element else False for l in label.split("_")] + ) if is_component: row.append(1) else: diff --git a/tests/unit/test_functions.py b/tests/unit/test_functions.py index 9d26554..6d85dc2 100644 --- a/tests/unit/test_functions.py +++ b/tests/unit/test_functions.py @@ -20,109 +20,148 @@ def test_sum_mat_mv(mv_tree): assert shp[0] == mv_tree.num_nodes() + 1 assert shp[1] == mv_tree.leaf_sum() + def test_sum_mat_hierarchical(): - hierarchy = {'total': ['A', 'B'], - 'A': ['A_X', 'A_Y', 'A_Z'], - 'B': ['B_X', 'B_Y']} - hier_df = pandas.DataFrame(data={'total': [], - 'A': [], - 'B': [], - 'A_X': [], - 'A_Y': [], - 'A_Z': [], - 'B_X': 
[], - 'B_Y': []}) + hierarchy = {"total": ["A", "B"], "A": ["A_X", "A_Y", "A_Z"], "B": ["B_X", "B_Y"]} + hier_df = pandas.DataFrame( + data={ + "total": [], + "A": [], + "B": [], + "A_X": [], + "A_Y": [], + "A_Z": [], + "B_X": [], + "B_Y": [], + } + ) tree = hts.hierarchy.HierarchyTree.from_nodes(hierarchy, hier_df) sum_mat = to_sum_mat(tree) - expected_sum_mat = numpy.array([[1, 1, 1, 1, 1], # total - [0, 0, 0, 1, 1], # B - [1, 1, 1, 0, 0], # A - [1, 0, 0, 0, 0], # A_X - [0, 1, 0, 0, 0], # A_Y - [0, 0, 1, 0, 0], # A_Z - [0, 0, 0, 1, 0], # B_X - [0, 0, 0, 0, 1]]) # B_Y + expected_sum_mat = numpy.array( + [ + [1, 1, 1, 1, 1], # total + [0, 0, 0, 1, 1], # B + [1, 1, 1, 0, 0], # A + [1, 0, 0, 0, 0], # A_X + [0, 1, 0, 0, 0], # A_Y + [0, 0, 1, 0, 0], # A_Z + [0, 0, 0, 1, 0], # B_X + [0, 0, 0, 0, 1], + ] + ) # B_Y numpy.testing.assert_array_equal(sum_mat, expected_sum_mat) def test_sum_mat_grouped(): - hierarchy = {'total': ['A', 'B', 'X', 'Y'], - 'A': ['A_X', 'A_Y'], - 'B': ['B_X', 'B_Y']} - grouped_df = pandas.DataFrame(data={'total': [], - 'A': [], - 'B': [], - 'X': [], - 'Y': [], - 'A_X': [], - 'A_Y': [], - 'B_X': [], - 'B_Y': []}) + hierarchy = { + "total": ["A", "B", "X", "Y"], + "A": ["A_X", "A_Y"], + "B": ["B_X", "B_Y"], + } + grouped_df = pandas.DataFrame( + data={ + "total": [], + "A": [], + "B": [], + "X": [], + "Y": [], + "A_X": [], + "A_Y": [], + "B_X": [], + "B_Y": [], + } + ) tree = hts.hierarchy.HierarchyTree.from_nodes(hierarchy, grouped_df) sum_mat = to_sum_mat(tree) - expected_sum_mat = numpy.array([[1, 1, 1, 1], # total - [0, 1, 0, 1], # Y - [1, 0, 1, 0], # X - [0, 0, 1, 1], # B - [1, 1, 0, 0], # A - [1, 0, 0, 0], # A_X - [0, 1, 0, 0], # A_Y - [0, 0, 1, 0], # B_X - [0, 0, 0, 1]]) # B_Y + expected_sum_mat = numpy.array( + [ + [1, 1, 1, 1], # total + [0, 1, 0, 1], # Y + [1, 0, 1, 0], # X + [0, 0, 1, 1], # B + [1, 1, 0, 0], # A + [1, 0, 0, 0], # A_X + [0, 1, 0, 0], # A_Y + [0, 0, 1, 0], # B_X + [0, 0, 0, 1], # B_Y + ] + ) 
numpy.testing.assert_array_equal(sum_mat, expected_sum_mat) def test_sum_mat_visnights_hier(visnights_hier): - hier_df = pandas.DataFrame(data={'total': [], - 'VIC': [], - 'QLD': [], - 'SAU': [], - 'WAU': [], - 'OTH': [], - 'NSW': [], - 'NSW_Metro': [], 'NSW_NthCo': [], 'NSW_NthIn': [], 'NSW_SthCo': [], 'NSW_SthIn': [], - 'OTH_Metro': [], 'OTH_NoMet': [], - 'QLD_Cntrl': [], 'QLD_Metro': [], 'QLD_NthCo': [], - 'SAU_Coast': [], 'SAU_Inner': [], 'SAU_Metro': [], - 'VIC_EstCo': [], 'VIC_Inner': [], 'VIC_Metro': [], 'VIC_WstCo': [], - 'WAU_Coast': [], 'WAU_Inner': [], 'WAU_Metro': []}) + hier_df = pandas.DataFrame( + data={ + "total": [], + "VIC": [], + "QLD": [], + "SAU": [], + "WAU": [], + "OTH": [], + "NSW": [], + "NSW_Metro": [], + "NSW_NthCo": [], + "NSW_NthIn": [], + "NSW_SthCo": [], + "NSW_SthIn": [], + "OTH_Metro": [], + "OTH_NoMet": [], + "QLD_Cntrl": [], + "QLD_Metro": [], + "QLD_NthCo": [], + "SAU_Coast": [], + "SAU_Inner": [], + "SAU_Metro": [], + "VIC_EstCo": [], + "VIC_Inner": [], + "VIC_Metro": [], + "VIC_WstCo": [], + "WAU_Coast": [], + "WAU_Inner": [], + "WAU_Metro": [], + } + ) tree = hts.hierarchy.HierarchyTree.from_nodes(visnights_hier, hier_df) sum_mat = to_sum_mat(tree) - expected_sum_mat = numpy.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], # total - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1], # VIC - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0], # QLD - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], # SAU - [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # WAU - [0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # OTH - [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # NSW - [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # NSW_Metro - [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # NSW_NthCo - [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # NSW_NthIn - [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0], # NSW_SthCo - [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # NSW_SthIn - [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # OTH_Metro - [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # OTH_NoMet - [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # WAU_Coast - [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # WAU_Inner - [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # WAU_Metro - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], # SAU_Coast - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], # SAU_Inner - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], # SAU_Metro - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], # QLD_Cntrl - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], # QLD_Metro - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0], # QLD_NthCo - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], # VIC_EstCo - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], # VIC_Inner - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], # VIC_Metro - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]]) # VIC_WstCo + expected_sum_mat = numpy.array( + [ + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], # total + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1], # VIC + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0], # QLD + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], # SAU + [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # WAU + [0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # OTH + [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # NSW + [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # NSW_Metro + [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # NSW_NthCo + [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # 
NSW_NthIn + [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # NSW_SthCo + [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # NSW_SthIn + [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # OTH_Metro + [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # OTH_NoMet + [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # WAU_Coast + [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # WAU_Inner + [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # WAU_Metro + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], # SAU_Coast + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], # SAU_Inner + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], # SAU_Metro + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], # QLD_Cntrl + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], # QLD_Metro + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0], # QLD_NthCo + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], # VIC_EstCo + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], # VIC_Inner + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], # VIC_Metro + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], # VIC_WstCo + ] + ) numpy.testing.assert_array_equal(sum_mat, expected_sum_mat) @@ -131,28 +170,34 @@ def test_demo_unique_constraint(): # Example https://otexts.com/fpp2/hts.html # Does not work when you have elements that are named the same, but represent # different levels in the hierarchy. See expected_sum_mat below for example. 
- hierarchy = {'total': ['A', 'B'], - 'A': ['AA', 'AB', 'AC'], - 'B': ['BA', 'BB']} - hier_df = pandas.DataFrame(data={'total': [], - 'A': [], - 'B': [], - 'AA': [], - 'AB': [], - 'AC': [], - 'BA': [], - 'BB': []}) + hierarchy = {"total": ["A", "B"], "A": ["AA", "AB", "AC"], "B": ["BA", "BB"]} + hier_df = pandas.DataFrame( + data={ + "total": [], + "A": [], + "B": [], + "AA": [], + "AB": [], + "AC": [], + "BA": [], + "BB": [], + } + ) tree = hts.hierarchy.HierarchyTree.from_nodes(hierarchy, hier_df) sum_mat = to_sum_mat(tree) - expected_sum_mat = numpy.array([[1, 1, 1, 1, 1], # total - [0, 1, 0, 1, 1], # B, Incorrectly finds B in AB - [1, 1, 1, 1, 0], # A, Incorrectly finds A in BA - [1, 0, 0, 0, 0], # AA - [0, 1, 0, 0, 0], # AB - [0, 0, 1, 0, 0], # AC - [0, 0, 0, 1, 0], # BA - [0, 0, 0, 0, 1]]) # BB + expected_sum_mat = numpy.array( + [ + [1, 1, 1, 1, 1], # total + [0, 1, 0, 1, 1], # B, Incorrectly finds B in AB + [1, 1, 1, 1, 0], # A, Incorrectly finds A in BA + [1, 0, 0, 0, 0], # AA + [0, 1, 0, 0, 0], # AB + [0, 0, 1, 0, 0], # AC + [0, 0, 0, 1, 0], # BA + [0, 0, 0, 0, 1], # BB + ] + ) numpy.testing.assert_array_equal(sum_mat, expected_sum_mat) From 4a0e0ac73d9c9cd4c77db823736df403225370f1 Mon Sep 17 00:00:00 2001 From: Noah Sayre Date: Thu, 28 Jan 2021 16:58:15 -0500 Subject: [PATCH 3/5] re-order import for isort --- tests/unit/test_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_functions.py b/tests/unit/test_functions.py index 6d85dc2..fae6f7d 100644 --- a/tests/unit/test_functions.py +++ b/tests/unit/test_functions.py @@ -1,8 +1,8 @@ import numpy import pandas -from hts.functions import to_sum_mat import hts.hierarchy +from hts.functions import to_sum_mat def test_sum_mat_uv(uv_tree): From bb99e5b80419d7c59ed4f8e755169a4e9ecdc1bf Mon Sep 17 00:00:00 2001 From: Noah Sayre Date: Mon, 8 Feb 2021 15:53:21 -0500 Subject: [PATCH 4/5] return summing matrix labels from to_sum_mat, add & update unit tests --- 
hts/convenience.py | 2 +- hts/core/regressor.py | 2 +- hts/functions.py | 9 ++++++--- hts/utilities/load_data.py | 15 ++++++++++----- tests/conftest.py | 13 +++++++++---- tests/unit/test_functions.py | 14 ++++++++------ tests/unit/test_ntree.py | 24 ++++++++++++++++++------ 7 files changed, 53 insertions(+), 26 deletions(-) diff --git a/hts/convenience.py b/hts/convenience.py index faef5c6..8752d43 100644 --- a/hts/convenience.py +++ b/hts/convenience.py @@ -46,7 +46,7 @@ def revise_forecasts( """ if nodes: - summing_matrix = to_sum_mat(nodes) + summing_matrix, sum_mat_labels = to_sum_mat(nodes) if method in [MethodT.AHP.name, MethodT.PHA.name, MethodT.FP.name] and not nodes: raise ValueError(f"Method {method} requires an NAryTree to be passed") diff --git a/hts/core/regressor.py b/hts/core/regressor.py index 7bf4885..b453dbf 100644 --- a/hts/core/regressor.py +++ b/hts/core/regressor.py @@ -133,7 +133,7 @@ def __init_hts( nodes=nodes, df=df, exogenous=exogenous, root=root ) self.exogenous = exogenous - self.sum_mat = to_sum_mat(self.nodes) + self.sum_mat, sum_mat_labels = to_sum_mat(self.nodes) self._set_model_instance() self._init_revision() diff --git a/hts/functions.py b/hts/functions.py index 1b558ae..7b536bf 100644 --- a/hts/functions.py +++ b/hts/functions.py @@ -1,5 +1,5 @@ from random import choice -from typing import Dict +from typing import Dict, List, Tuple import numpy as np import pandas @@ -8,7 +8,7 @@ from hts.hierarchy import make_iterable -def to_sum_mat(ntree: NAryTreeT): +def to_sum_mat(ntree: NAryTreeT) -> Tuple[np.ndarray, List[str]]: """ This function creates a summing matrix for the bottom up and optimal combination approaches All the inputs are the same as above @@ -24,6 +24,9 @@ def to_sum_mat(ntree: NAryTreeT): numpy.ndarray Summing matrix. + List[str] + Row order list of the level in the hierarchy represented by each row in the summing matrix. 
+ """ nodes = ntree.level_order_traversal() node_labels = ntree.get_level_order_labels() @@ -69,7 +72,7 @@ def to_sum_mat(ntree: NAryTreeT): sum_mat_labels.reverse() sum_mat_labels = ["total"] + sum_mat_labels + bl_mat_idx_ref - return sum_mat + return sum_mat, sum_mat_labels def project( diff --git a/hts/utilities/load_data.py b/hts/utilities/load_data.py index d4b7903..2d51aef 100644 --- a/hts/utilities/load_data.py +++ b/hts/utilities/load_data.py @@ -63,11 +63,16 @@ def load_hierarchical_sine_data(start, end, n=10000): amplitude = numpy.sin(time) * 10 amplitude += numpy.random.normal(2 * amplitude + 2, 5) df = pandas.DataFrame(index=dti, data={"total": amplitude[0 : len(dti)]}) - df["a"], df["b"], df["c"], df["d"] = partition_column(df.total, n=4) - df["aa"], df["ab"] = partition_column(df.a, n=2) - df["aaa"], df["aab"] = partition_column(df.aa, n=2) - df["ba"], df["bb"], df["bc"] = partition_column(df.b, n=3) - df["ca"], df["cb"], df["cc"], df["cd"] = partition_column(df.c, n=4) + df["a"], df["b"], df["c"] = partition_column(df.total, n=3) + df["a_x"], df["a_y"] = partition_column(df.a, n=2) + df["b_x"], df["b_y"] = partition_column(df.b, n=2) + df["c_x"], df["c_y"] = partition_column(df.c, n=2) + df["a_x_1"], df["a_x_2"] = partition_column(df.a_x, n=2) + df["a_y_1"], df["a_y_2"] = partition_column(df.a_y, n=2) + df["b_x_1"], df["b_x_2"] = partition_column(df.b_x, n=2) + df["b_y_1"], df["b_y_2"] = partition_column(df.b_y, n=2) + df["c_x_1"], df["c_x_2"] = partition_column(df.c_x, n=2) + df["c_y_1"], df["c_y_2"] = partition_column(df.c_y, n=2) return df diff --git a/tests/conftest.py b/tests/conftest.py index 03556de..43b8145 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -133,10 +133,15 @@ def mv_tree(hierarchical_mv_data): def sine_hier(): return { "total": ["a", "b", "c"], - "a": ["aa", "ab"], - "aa": ["aaa", "aab"], - "b": ["ba", "bb"], - "c": ["ca", "cb", "cc", "cd"], + "a": ["a_x", "a_y"], + "b": ["b_x", "b_y"], + "c": ["c_x", "c_y"], + "a_x": 
["a_x_1", "a_x_2"], + "a_y": ["a_y_1", "a_y_2"], + "b_x": ["b_x_1", "b_x_2"], + "b_y": ["b_y_1", "b_y_2"], + "c_x": ["c_x_1", "c_x_2"], + "c_y": ["c_y_1", "c_y_2"], } diff --git a/tests/unit/test_functions.py b/tests/unit/test_functions.py index fae6f7d..120b605 100644 --- a/tests/unit/test_functions.py +++ b/tests/unit/test_functions.py @@ -6,7 +6,7 @@ def test_sum_mat_uv(uv_tree): - mat = to_sum_mat(uv_tree) + mat, sum_mat_labels = to_sum_mat(uv_tree) assert isinstance(mat, numpy.ndarray) shp = mat.shape assert shp[0] == uv_tree.num_nodes() + 1 @@ -14,7 +14,7 @@ def test_sum_mat_uv(uv_tree): def test_sum_mat_mv(mv_tree): - mat = to_sum_mat(mv_tree) + mat, sum_mat_labels = to_sum_mat(mv_tree) assert isinstance(mat, numpy.ndarray) shp = mat.shape assert shp[0] == mv_tree.num_nodes() + 1 @@ -37,7 +37,7 @@ def test_sum_mat_hierarchical(): ) tree = hts.hierarchy.HierarchyTree.from_nodes(hierarchy, hier_df) - sum_mat = to_sum_mat(tree) + sum_mat, sum_mat_labels = to_sum_mat(tree) expected_sum_mat = numpy.array( [ @@ -53,6 +53,7 @@ def test_sum_mat_hierarchical(): ) # B_Y numpy.testing.assert_array_equal(sum_mat, expected_sum_mat) + assert sum_mat_labels == ["total", "B", "A", "A_X", "A_Y", "A_Z", "B_X", "B_Y"] def test_sum_mat_grouped(): @@ -76,7 +77,7 @@ def test_sum_mat_grouped(): ) tree = hts.hierarchy.HierarchyTree.from_nodes(hierarchy, grouped_df) - sum_mat = to_sum_mat(tree) + sum_mat, sum_mat_labels = to_sum_mat(tree) expected_sum_mat = numpy.array( [ @@ -93,6 +94,7 @@ def test_sum_mat_grouped(): ) numpy.testing.assert_array_equal(sum_mat, expected_sum_mat) + assert sum_mat_labels == ["total", "Y", "X", "B", "A", "A_X", "A_Y", "B_X", "B_Y"] def test_sum_mat_visnights_hier(visnights_hier): @@ -129,7 +131,7 @@ def test_sum_mat_visnights_hier(visnights_hier): ) tree = hts.hierarchy.HierarchyTree.from_nodes(visnights_hier, hier_df) - sum_mat = to_sum_mat(tree) + sum_mat, sum_mat_labels = to_sum_mat(tree) expected_sum_mat = numpy.array( [ @@ -185,7 +187,7 @@ def 
test_demo_unique_constraint(): ) tree = hts.hierarchy.HierarchyTree.from_nodes(hierarchy, hier_df) - sum_mat = to_sum_mat(tree) + sum_mat, sum_mat_labels = to_sum_mat(tree) expected_sum_mat = numpy.array( [ diff --git a/tests/unit/test_ntree.py b/tests/unit/test_ntree.py index e2b161e..12c6c88 100644 --- a/tests/unit/test_ntree.py +++ b/tests/unit/test_ntree.py @@ -84,19 +84,31 @@ def test_from_geo_events(events): def test_create_hierarchical_sine_data_tree(hierarchical_sine_data): hier = { "total": ["a", "b", "c"], - "a": ["aa", "ab"], - "aa": ["aaa", "aab"], - "b": ["ba", "bb"], - "c": ["ca", "cb", "cc", "cd"], + "a": ["a_x", "a_y"], + "b": ["b_x", "b_y"], + "c": ["c_x", "c_y"], + "a_x": ["a_x_1", "a_x_2"], + "a_y": ["a_y_1", "a_y_2"], + "b_x": ["b_x_1", "b_x_2"], + "b_y": ["b_y_1", "b_y_2"], + "c_x": ["c_x_1", "c_x_2"], + "c_y": ["c_y_1", "c_y_2"], } ht = HierarchyTree.from_nodes(hier, hierarchical_sine_data) assert isinstance(ht.to_pandas(), pandas.DataFrame) assert ht.key == "total" assert len(ht.children) == 3 for c in ht.children: - if c.key == "a" or c.key == "b": + if c.key == "a" or c.key == "b" or c.key == "c": assert len(c.children) == 2 - if c.key == "c": + if ( + c.key == "a_x" + or c.key == "b_x" + or c.key == "c_x" + or c.key == "a_y" + or c.key == "b_y" + or c.key == "c_y" + ): assert len(c.children) == 4 From d4855c900a5450238c3115dc51bfbb0fa343509e Mon Sep 17 00:00:00 2001 From: Noah Sayre Date: Mon, 8 Feb 2021 15:54:31 -0500 Subject: [PATCH 5/5] update documentation to hierarchy key requirements, add OLS rec example --- README.rst | 1 + docs/hierarchy.rst | 147 +++++++++++++++++++++++++++++++++++---------- docs/usage.rst | 57 ++++++++++++++++++ 3 files changed, 173 insertions(+), 32 deletions(-) diff --git a/README.rst b/README.rst index 5eafa0b..42b297e 100644 --- a/README.rst +++ b/README.rst @@ -52,6 +52,7 @@ Features * Supported and tested on ``python 3.6``, ``python 3.7`` and ``python 3.8`` * Implementation of Bottom-Up, Top-Down, 
Middle-Out, Forecast Proportions, Average Historic Proportions, Proportions of Historic Averages and OLS revision methods +* Support for representations of hierarchical and grouped time series * Support for a variety of underlying forecasting models, inlcuding: SARIMAX, ARIMA, Prophet, Holt-Winters * Scikit-learn-like API * Geo events handling functionality for geospatial data, including visualisation capabilities diff --git a/docs/hierarchy.rst b/docs/hierarchy.rst index 97ad22c..8bc4380 100644 --- a/docs/hierarchy.rst +++ b/docs/hierarchy.rst @@ -5,25 +5,36 @@ Hierarchical Representation data structure where each node is specified by: - A human readable key, such as 'germany', 'total', 'berlin', or '881f15ad61fffff' +- Keys should be unique and delimited by underscores. Therefore, using the example below there should not be duplicate values across level 1, 2 or 3. + For example, ``a`` should not also be a value in level 2. - An item, represented by a ``pandas.Series`` (or ``pandas.DataFrame`` for multivariate inputs), which contains the actual data about that node .. _`N-Ary Tree`: https://en.wikipedia.org/wiki/M-ary_tree +Hierarchical Structure +---------------------- + +For instance, a tree with nodes and levels as follows: + +- Level 1: a, b, c +- Level 2: x, y +- Level 3: 1, 2 -For instance, a tree with nodes: .. code-block:: python - nodes = {'t': ['a', 'b', 'c'], - 'a': ['aa', 'ab'], - 'b': ['ba', 'bb'], - 'c': ['ca', 'cb'], - 'aa': ['aaa', 'aab'], - 'ab': ['aba', 'abb'] - ... - 'cb': ['cba', 'cbb'] + nodes = {'total': ['a', 'b', 'c'], + 'a': ['a_x', 'a_y'], + 'b': ['b_x', 'b_y'], + 'c': ['c_x', 'c_y'], + 'a_x': ['a_x_1', 'a_x_2'], + 'a_y': ['a_y_1', 'a_y_2'], + 'b_x': ['b_x_1', 'b_x_2'], + 'b_y': ['b_y_1', 'b_y_2'], + 'c_x': ['c_x_1', 'c_x_2'], + 'c_y': ['c_y_1', 'c_y_2'] } @@ -31,15 +42,15 @@ Represents the following structure: .. 
code-block:: console - Level Node Key # of nodes + Level Node Key # of nodes - 1 t 1 + 1 t 1 - 2 a b c 3 + 2 a b c 3 - 3 aa ab ba bb ca cb 6 + 3 a_x a_y b_x b_y   c_x c_y 6 - 4 aaa aab aba abb baa bab bba bbb caa cab cba cbb 12 + 4 a_x_1 a_x_2 a_y_1 a_y_2 b_x_1 b_x_2 b_y_1 b_y_2 c_x_1 c_x_2 c_y_1 c_y_2 12 @@ -54,30 +65,102 @@ To get a sense of how the hierarchy trees are implemented, some sample data can >>> s, e = datetime(2019, 1, 15), datetime(2019, 10, 15) >>> hsd = load_hierarchical_sine_data(start=s, end=e, n=10000) >>> print(hsd.head()) - total a b c d aa ab aaa aab ba bb bc ca cb cc cd - 2019-01-15 01:29:25.005972 6.345796 1.500952 2.006216 0.016688 2.821940 1.413739 0.087213 0.273000 1.140739 0.572872 0.438739 0.994606 0.008490 0.003722 0.004431 0.000045 - 2019-01-15 01:45:50.195453 9.107371 1.116805 1.091745 5.688870 1.209951 0.291894 0.824912 0.149041 0.142853 0.007558 0.374915 0.709272 1.303977 0.775971 0.288751 3.320171 - 2019-01-15 02:20:51.204587 -6.333233 -1.081240 -0.455464 -2.401480 -2.395049 -0.716773 -0.364467 -0.243496 -0.473276 -0.136318 -0.159603 -0.159543 -0.417023 -0.117741 -1.773234 -0.093482 - 2019-01-15 02:27:46.966530 -2.432930 -0.348840 -0.207461 -0.851828 -1.024801 -0.317890 -0.030949 -0.175013 -0.142877 -0.034511 -0.006034 -0.166916 -0.286929 -0.329183 -0.005672 -0.230045 - 2019-01-15 02:32:09.675895 10.925181 3.820450 1.349626 1.002597 4.752509 3.355709 0.464741 1.596091 1.759618 0.125829 1.206414 0.017383 0.112833 0.515650 0.077102 0.297012 - - >>> hier = {'total': ['a', 'b', 'c'], 'a': ['aa', 'ab'], 'aa': ['aaa', 'aab'], 'b': ['ba', 'bb'], 'c': ['ca', 'cb', 'cc', 'cd']} + total a b c a_x a_y b_x b_y c_x ... a_y_2 b_x_1 b_x_2 b_y_1 b_y_2 c_x_1 c_x_2 c_y_1 c_y_2 + 2019-01-15 01:11:09.255573 2.695133 0.150805 0.031629 2.512698 0.037016 0.113789 0.028399 0.003231 0.268406 ... 
0.080803 0.013131 0.015268 0.000952 0.002279 0.175671 0.092734 0.282259 1.962034 + 2019-01-15 01:18:30.753096 -3.274595 -0.199276 -1.624369 -1.450950 -0.117717 -0.081559 -0.300076 -1.324294 -1.340172 ... -0.077289 -0.177000 -0.123075 -0.178258 -1.146035 -0.266198 -1.073975 -0.083517 -0.027260 + 2019-01-15 01:57:48.607109 -1.898038 -0.226974 -0.662317 -1.008747 -0.221508 -0.005466 -0.587826 -0.074492 -0.929464 ... -0.003297 -0.218128 -0.369698 -0.021156 -0.053335 -0.225994 -0.703470 -0.077021 -0.002262 + 2019-01-15 02:06:57.994575 13.904908 6.025506 5.414178 2.465225 5.012228 1.013278 4.189432 1.224746 1.546544 ... 0.467630 1.297829 2.891602 0.671085 0.553661 0.066278 1.480266 0.769954 0.148728 + 2019-01-15 02:14:22.367818 11.028013 3.537919 6.504104 0.985990 2.935614 0.602305 4.503611 2.000493 0.179114 ... 0.091993 4.350293 0.153318 1.349629 0.650864 0.066946 0.112168 0.473987 0.332889 + + + >>> hier = {'total': ['a', 'b', 'c'], + 'a': ['a_x', 'a_y'], + 'b': ['b_x', 'b_y'], + 'c': ['c_x', 'c_y'], + 'a_x': ['a_x_1', 'a_x_2'], + 'a_y': ['a_y_1', 'a_y_2'], + 'b_x': ['b_x_1', 'b_x_2'], + 'b_y': ['b_y_1', 'b_y_2'], + 'c_x': ['c_x_1', 'c_x_2'], + 'c_y': ['c_y_1', 'c_y_2'] + } >>> tree = HierarchyTree.from_nodes(hier, hsd, root='total') >>> print(tree) - total |- a - | |- aa - | | |- aaa - | | - aab - | - ab + | |- a_x + | | |- a_x_1 + | | - a_x_2 + | - a_y + | |- a_y_1 + | - a_y_2 |- b - | |- ba - | - bb + | |- b_x + | | |- b_x_1 + | | - b_x_2 + | - b_y + | |- b_y_1 + | - b_y_2 - c - |- ca - |- cb - |- cc - - cd + |- c_x + | |- c_x_1 + | - c_x_2 + - c_y + |- c_y_1 + - c_y_2 + + +Grouped Structure +----------------- + +In order to create a grouped structure, instead of a strictly hierarchical structure, you must specify +all levels within the grouping structure dictionary and dataframe as seen below. + +Levels in example: + +- Level 1: A, B +- Level 2: X, Y + +.. 
code-block:: python + + import hts + import pandas as pd + + >>> hierarchy = { + "total": ["A", "B", "X", "Y"], + "A": ["A_X", "A_Y"], + "B": ["B_X", "B_Y"], + } + + >>> grouped_df = pd.DataFrame( + data={ + "total": [], + "A": [], + "B": [], + "X": [], + "Y": [], + "A_X": [], + "A_Y": [], + "B_X": [], + "B_Y": [], + } + ) + + >>> tree = hts.hierarchy.HierarchyTree.from_nodes(hierarchy, grouped_df) + >>> sum_mat, sum_mat_labels = hts.functions.to_sum_mat(tree) + >>> print(sum_mat) # Commented labels will not appear in the printout; they are here as an example. + [[1. 1. 1. 1.] # total + [0. 1. 0. 1.] # Y + [1. 0. 1. 0.] # X + [0. 0. 1. 1.] # B + [1. 1. 0. 0.] # A + [1. 0. 0. 0.] # A_X + [0. 1. 0. 0.] # A_Y + [0. 0. 1. 0.] # B_X + [0. 0. 0. 1.]] # B_Y + + >>> print(sum_mat_labels) # Use this if you need to match summing matrix rows with labels. + ['total', 'Y', 'X', 'B', 'A', 'A_X', 'A_Y', 'B_X', 'B_Y'] .. automodule:: hts.hierarchy diff --git a/docs/usage.rst b/docs/usage.rst index 3a06be4..b3c85f9 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -2,6 +2,9 @@ Usage ===== +Typical Usage +------------- + ``scikit-hts`` has one main class that provides the interface with your desired forecasting methodology and reconciliation strategy. Here you can find how to get started quickly with ``scikit-hts``. We'll use some sample (fake) data. @@ -35,3 +38,57 @@ More extensive usage, including a solution for Kaggle's `M5 Competition`_, can b .. _M5 Competition: https://www.kaggle.com/c/m5-forecasting-accuracy .. _scikit-hts-examples: https://github.com/carlomazzaferro/scikit-hts-examples + +Reconcile Pre-Computed Forecasts +-------------------------------- + +This is an example of creating forecasts outside of scikit-hts and then utilizing scikit-hts to do OLS optimal +reconciliation on the forecasts. + +.. 
code-block:: python + + >>> from datetime import datetime + >>> import hts + >>> from hts.utilities.load_data import load_hierarchical_sine_data + >>> import statsmodels + >>> import collections + >>> import pandas as pd + + >>> s, e = datetime(2019, 1, 15), datetime(2019, 10, 15) + >>> hsd = load_hierarchical_sine_data(start=s, end=e, n=10000) + >>> hier = {'total': ['a', 'b', 'c'], + 'a': ['a_x', 'a_y'], + 'b': ['b_x', 'b_y'], + 'c': ['c_x', 'c_y'], + 'a_x': ['a_x_1', 'a_x_2'], + 'a_y': ['a_y_1', 'a_y_2'], + 'b_x': ['b_x_1', 'b_x_2'], + 'b_y': ['b_y_1', 'b_y_2'], + 'c_x': ['c_x_1', 'c_x_2'], + 'c_y': ['c_y_1', 'c_y_2'] + } + + >>> tree = hts.hierarchy.HierarchyTree.from_nodes(hier, hsd) + >>> sum_mat, sum_mat_labels = hts.functions.to_sum_mat(tree) + + >>> forecasts = pd.DataFrame(columns=hsd.columns, index=['fake']) + + # Make forecasts outside of the package. Could be any modeling technique. + >>> for col in hsd.columns: + model = statsmodels.tsa.holtwinters.SimpleExpSmoothing(hsd[col].values).fit() + fcst = list(model.forecast(1)) + forecasts[col] = fcst + + >>> pred_dict = collections.OrderedDict() + + # Add predictions to the dictionary in the same order as the summing matrix + >>> for label in sum_mat_labels: + pred_dict[label] = pd.DataFrame(data=forecasts[label].values, columns=['yhat']) + + >>> revised = hts.functions.optimal_combination(pred_dict, sum_mat, method='OLS', mse={}) + + # Put reconciled forecasts in nice DataFrame form + >>> revised_forecasts = pd.DataFrame(data=revised[0:,0:], + index=forecasts.index, + columns=sum_mat_labels) +