In [10]:
from dataclasses import dataclass
from pandas import DataFrame
from collections import Counter
from glob import glob
import xml.etree.ElementTree as ET
from pprint import pprint
import pandas as pd

In [3]:
@dataclass
class AxisDef:
    """Plain record describing one axis of a table.

    The last four fields share their names with the child elements that
    ``getParamsFromAxisDef`` reads from an XML ``AxisDef`` element.
    NOTE(review): no visible cell instantiates this class; presumably it is
    meant to hold a fully parsed ``AxisDef`` element - confirm.
    """
    ScaleType: str      # NOTE(review): not read by any visible code; meaning unconfirmed
    AxisName: str       # name of the axis
    MinScaleValue: int  # lower bound of the axis scale
    MaxScaleValue: int  # upper bound of the axis scale
    Increment: int      # step between consecutive scale values

@dataclass
class MetaData:
    """Plain record holding the descriptive metadata of one table.

    ``getIndexInfo`` reads ``./MetaData/AxisDef`` from the XML, so this class
    presumably mirrors the XML ``MetaData`` element - TODO confirm for the
    remaining fields, which no visible code reads.
    """
    ScalingFactor: float
    DataType: str
    Nation: str
    TableDescription: str
    AxisDefs: list[AxisDef]  # axis definitions belonging to the table

@dataclass
class Table:
    """One table: its metadata plus its values as a DataFrame."""
    # The field shares its name with the MetaData class; the annotation
    # refers to that module-level class.
    MetaData: MetaData
    Values: DataFrame  # table values (see getDataFrame for one way to build such a frame)

@dataclass
class ContentClassification:
    """Plain record of identifying and descriptive fields for a document.

    NOTE(review): no visible code populates this class; the field names
    presumably mirror an XML ``ContentClassification`` element - confirm.
    """
    TableIdentity: str
    ProviderDomain: str
    ProviderName: str
    TableReference: str
    ContentType: str
    TableName: str
    TableDescription: str
    Comments: str
    KeyWords: list[str]  # the only multi-valued field; all others are single strings

@dataclass
class XTbML:
    """Top-level record: one classification block plus a list of tables."""
    # The field shares its name with the ContentClassification class; the
    # annotation refers to that module-level class.
    ContentClassification: ContentClassification
    Tables: list[Table]  # tables contained in the document

In [35]:
# Directory that holds the XML table files. Kept in one place so the notebook
# can be pointed at a different archive by editing a single line.
ARCHIVE_DIR = 'archive-2021-Oct-17-051924'

# Parse three sample files and keep the root element of each document.
r1 = ET.parse(f'{ARCHIVE_DIR}/t2696.xml').getroot()
r2 = ET.parse(f'{ARCHIVE_DIR}/t2682.xml').getroot()
r3 = ET.parse(f'{ARCHIVE_DIR}/t3278.xml').getroot()

# "./Table" matches only direct <Table> children of each root element.
tables1 = r1.findall("./Table")
tables2 = r2.findall("./Table")
tables3 = r3.findall("./Table")

In [46]:
@dataclass 
class IndexParams:
    """Name and integer range parameters for one level of the MultiIndex.

    Instances are produced by ``getParamsFromAxisDef`` and consumed by
    ``constructMultiIndex``, which builds
    ``range(MinScaleValue, MaxScaleValue + 1, Increment)`` for each one.
    """
    AxisName: str       # used as the level name
    MinScaleValue: int  # first value of the level
    MaxScaleValue: int  # last value of the level (constructMultiIndex adds 1 to include it)
    Increment: int      # step of the range

def getParamsFromAxisDef(axisDef: ET.Element) -> IndexParams:
    """Read the index parameters out of one ``AxisDef`` XML element.

    Args:
        axisDef: element with ``AxisName``, ``MinScaleValue``,
            ``MaxScaleValue`` and ``Increment`` as direct children.

    Returns:
        An IndexParams holding the axis name as text and the three scale
        fields converted with ``int``.

    Raises:
        AttributeError: if one of the four child elements is missing
            (``find`` returns None, which has no ``.text``).
        ValueError: if a scale field's text is not a valid integer.
    """
    axis_name = axisDef.find('./AxisName').text
    # Children are read in the same order as the IndexParams fields.
    lowest, highest, step = (
        int(axisDef.find(child_path).text)
        for child_path in ('./MinScaleValue', './MaxScaleValue', './Increment')
    )
    return IndexParams(
        AxisName=axis_name,
        MinScaleValue=lowest,
        MaxScaleValue=highest,
        Increment=step,
    )

def getIndexInfo(table: ET.Element) -> list[IndexParams]:
    """Collect the index parameters of every axis declared for a table.

    Args:
        table: element whose ``./MetaData/AxisDef`` children describe the axes.

    Returns:
        One IndexParams per matched ``AxisDef`` element, in document order;
        an empty list when nothing matches.
    """
    axis_elements = table.findall("./MetaData/AxisDef")
    return list(map(getParamsFromAxisDef, axis_elements))

def constructMultiIndex(indices: list[IndexParams]) -> pd.MultiIndex:
    """Build the cartesian-product MultiIndex described by ``indices``.

    Args:
        indices: one IndexParams per level, outermost level first.

    Returns:
        A MultiIndex whose levels are the integer ranges of each entry and
        whose level names are the corresponding ``AxisName`` values.
    """
    # +1 because range() excludes its stop value and MaxScaleValue must be part of the level.
    level_ranges = [
        range(axis.MinScaleValue, axis.MaxScaleValue + 1, axis.Increment)
        for axis in indices
    ]
    level_names = [axis.AxisName for axis in indices]
    return pd.MultiIndex.from_product(level_ranges, names=level_names)

# Smoke check: build the MultiIndex for the first <Table> of r1. The value is
# discarded (this is not the cell's last expression), so the line only
# verifies that the call runs without raising.
constructMultiIndex(getIndexInfo(tables1[0]))

def getVals(table: ET.Element) -> list[float]:
    """Return the text of every ``Y`` element under ``table`` as floats.

    ``table.iter('Y')`` walks the element itself and all of its descendants
    in document order, so the values come back in the order they appear.

    Args:
        table: element to search for ``Y`` elements.

    Returns:
        The parsed values; an empty list when there is no ``Y`` element.
    """
    return [float(y_element.text) for y_element in table.iter('Y')]

def getDataFrame(table: ET.Element) -> DataFrame:
    """Build a one-column DataFrame of a table's values, indexed by its axes.

    Args:
        table: element providing both the ``Y`` values (via ``getVals``) and
            the axis definitions (via ``getIndexInfo``).

    Returns:
        A DataFrame with a single ``'vals'`` column whose index is the
        MultiIndex produced by ``constructMultiIndex``.
    """
    values = getVals(table)
    axis_params = getIndexInfo(table)
    return DataFrame(
        data=values,
        index=constructMultiIndex(axis_params),
        columns=['vals'],
    )

# Build the values frame for the first <Table> of r3.
# NOTE(review): `l` is easy to misread as `1`; consider a descriptive name if
# no later cell depends on it.
l = getDataFrame(tables3[0])
# Select the rows whose first index level equals 0 and show their 'vals'
# column; the displayed Series is indexed by the remaining level.
l.loc[0, 'vals']


Duration
1     0.00029
2     0.00017
3     0.00012
4     0.00010
5     0.00009
6     0.00009
7     0.00009
8     0.00009
9     0.00008
10    0.00008
11    0.00009
12    0.00009
13    0.00010
14    0.00011
15    0.00016
16    0.00025
17    0.00036
18    0.00047
19    0.00053
20    0.00054
21    0.00056
22    0.00057
23    0.00058
24    0.00061
25    0.00064
Name: vals, dtype: float64