In [5]:
%run -i writefile2.py

In [8]:
%%writefile2 --source window_patches_h.ipynb

"""
compute window patch data as matrices X, Y given 
window lengths, lag and linear operator
"""

import nodes

@nodes.store_in_tmp
@nodes.generic_node
def window_patches(

        subset,
        length_l,
        lag,
        length_r,
        linear_operator,
    ):
    """
    Build the node function that assembles sliding-window patches into sample matrices.

    Parameters
    ----------
    subset
        Passed to `subset_indexing` together with the training data result to
        select the data points that are turned into patches.
    length_l
        Number of rows of the left (predictor) window.
    lag
        Number of rows skipped between the end of the left window and the
        start of the right window.
    length_r
        Number of rows of the right (response) window.
    linear_operator
        2-d array whose second dimension must equal `length_r`; it is applied
        along the time axis of every response window.

    Returns
    -------
    The inner function `main` (before the decorators are applied). Calling
    `main` with the training data node returns a namespace with

    * `X`    -- left windows, one flattened window per row,
    * `Y`    -- flattened right windows after applying `linear_operator`,
    * `info` -- a LaTeX-flavoured description of the shapes involved.
    """

    import numpy as np
    from tqdm import tqdm
    from types import SimpleNamespace as ns
    import sys
    import require
    n_outcomes = len( require.single( "owid_outcomes" ))
    get_number_of_window_samples = require.single( "get_number_of_window_samples" )
    subset_indexing = require.single( "subset_indexing" )

    def main( training_data_node: nodes.find( "training_data" ).given( )):

        trainset = subset_indexing( training_data_node.result, subset )
        get_n_samples = lambda df: get_number_of_window_samples( df, length_l, lag, length_r )

        n_samples_total = sum( get_n_samples( point.df ) for point in trainset )
        # number of columns, taken from the first dataframe; every left window
        # is later asserted to have exactly this many columns
        d = trainset[ 0 ].df.shape[ 1 ]

        # every dataframe has to be long enough to yield at least one patch
        for i, point in enumerate( trainset ):

            assert get_n_samples( point.df ) >= 1, f"dataframe { point.df } at index { i } is too short"

        assert linear_operator.shape[ 1 ] == length_r, f"linear_operator of shape { linear_operator.shape } cannot be applied to a window of shape { length_r, n_outcomes }"

        def get_patch( df, i, length_l, lag, length_r ):
            """
            Cut the left window starting at row i and the right window that
            starts `lag` rows after the left window ends.
            """

            right_start = i + length_l + lag
            left = df.iloc[ i : i + length_l, : ]
            right = df.iloc[ right_start : right_start + length_r, :n_outcomes ] # assume outcomes are the first columns

            assert ( length_l, d ) == left.shape, f"got { left.shape } but expected { length_l, d }"
            assert ( length_r, n_outcomes ) == right.shape, f"got { right.shape } but expected { length_r, n_outcomes } for slice ({ i + length_l + lag }:{ i + length_l + lag + length_r }) and df.shape { df.shape }"
            return left, right

        # one slot per patch, filled dataframe by dataframe
        L = np.zeros(( n_samples_total, length_l, d ))
        R = np.zeros(( n_samples_total, length_r, n_outcomes ))

        start_index = 0
        with tqdm( file = sys.stdout, desc = "assembling patches", total = n_samples_total ) as bar:

            for point in trainset:

                df = point.df
                n_samples = get_n_samples( df )

                for t in range( n_samples ):

                    L[ start_index + t ], R[ start_index + t ] = get_patch( df, t, length_l, lag, length_r )

                # offset for next data frame
                start_index += n_samples
                bar.update( n_samples )

        # flatten each window row-major: time index varies slowest, column fastest
        X = L.reshape( L.shape[ 0 ], -1, order = "C" )
        Z = R.reshape( R.shape[ 0 ], -1, order = "C" )
        # with the row-major flattening above, kron( operator, I ) applies the
        # operator along the time axis separately for every outcome column
        M = np.kron( linear_operator, np.identity( n_outcomes ))
        Y = Z @ M.T

        # Backslashes are doubled so that the LaTeX commands do not form invalid
        # escape sequences (a SyntaxWarning on recent Python versions); the
        # resulting text is unchanged.
        # NOTE(review): the sentences are joined without any separator, so the
        # text reads "...$.Let ..." -- confirm whether a space is wanted.
        info = "".join([

            f"Consider the predictor windows $L \\in \\mathbb{{R}}^{{{ L.shape }}}$ and response windows $R \\in \\mathbb{{R}}^{{{ R.shape }}}$.",
            f"Let $X \\in \\mathbb{{R}}^{{{ X.shape }}}$ be a reshaping of $L$ which is directly passed into the model as predictor sample matrix.",
            f"Let latent response $Z \\in \\mathbb{{R}}^{{{ Z.shape }}}$ be a reshaping of $R$. As the name suggests, this is not given to the model.",
            f"Instead, the model observes a linear transformation of $Z$: We have $Y \\in \\mathbb{{R}}^{{{ Y.shape }}}=Z (M \\otimes I_{{{ n_outcomes }}})^\\top$.",
            f"This applies the linear operator $M \\in \\mathbb{{R}}^{{{ linear_operator.shape }}}$ to every outcome time series window.",
            f"Hence, the weak learner learns a function $f: \\mathbb{{R}}^{{{ L.shape[ 1: ]}}} \\rightarrow \\mathbb{{R}}^{{{( linear_operator.shape[ 0 ], n_outcomes )}}}$,",
            f"where $f(x)=y=Mz$."
        ])

        return ns(

            X = X,
            Y = Y,
            info = info
        )

    return main

# NOTE(review): exposes the decorated `window_patches` under the generic name `node`;
# presumably this is the name the node framework looks up in this module -- confirm.
node = window_patches

In [7]:
%%writefile2 --source window_patches_h.ipynb

"""
get the number of window samples resulting from a dataframe using given window lengths and lag
"""

def get_number_of_window_samples( df, length_l, lag, length_r ):
    """
    Count the window positions available in `df`.

    One sample spans `length_l + lag + length_r` consecutive rows, so the
    count is the number of rows minus that span plus one. The value is not
    clamped: it is zero or negative when `df` has fewer rows than one span.
    """

    n_rows = df.shape[ 0 ]
    rows_per_sample = length_l + lag + length_r

    return n_rows - rows_per_sample + 1