# Implementation of framework
Author: Yvonne Gootzen (Statistics Netherlands & TU/e) yapm.gootzen@cbs.nl

Implementation of the metadata framework with an A* algorithm based on score functions. The notebook contains:
- Class definitions and functions (including score function), for describing the metadata problem.
- Examples of usage of the classes to illustrate the problem.
- A* implementation. 

First, import the required packages.

In [None]:
#import sys
#!{sys.executable} -m pip install ipynb

In [None]:
import time
import matplotlib as plt
import networkx as nx
import copy as copy
import numpy as np
import re
import itertools

# Class definitions
This part of the notebook contains all classes and functions for describing the problem. For more information about the meanings behind each class, see the accompanying presentation and paper.

### Not Initialised Error
To signal the user about situations where an object has not yet been initialised. 

In [None]:
class NotInitialisedError(Exception):
    pass

### Variable
The smallest object in the problem. Each dataset contains multiple variables. Variables can have different levels of granularity. To change from one granularity to another, a conversion or aggregation is needed.

In [None]:
class Variable:
    def __init__(self, name, granularity):
        self.name = name
        self.granularity = granularity

    def __str__(self):
        return str(self.name) + str(self.granularity)
    
    def __eq__(self, other: "Variable"):
        # compare self Variable object to the other Variable object
        # they are equal if the name and granularity are equal
        
        return all([self.name == other.name, 
                    self.granularity == other.granularity])
    
    def __hash__(self):
        # required for usage in sets. Since the str() of self is unique and contains all elements for equality, we use this for the hash.
        return(hash(str(self)))
    
    def equal_name(self, other: "Variable"):
        return self.name == other.name
    
    def get_name(self):
        return self.name
    
    def get_granularity(self):
        return self.granularity

### Data
A data source consists of a set of left-hand variables, a set of right-hand variables and context. The context is largely ignored for the modelling week. The similarity() method is the subject of the modelling week assignment. It is also used in the SetOfSources class, where the individual similarity scores of each data set are combined into a single value.

The assignment of the modelling week is as follows: find a similarity score function that provides a small value for when two data sources have few variables in common, and a larger value when more variables are in common. One disadvantage of the current method is that sources with a large number of variables have a higher similarity score than sources with a smaller number of variables. 

In [None]:
class Data(object):
    def __init__(self, left_variables, right_variables, context, similarity_variant = "normalized_basic"):
        self.left_variables = set(left_variables)
        self.right_variables = set(right_variables)
        self.context = set(context)  # ensure context is a set (argument may be a list)
        self.similarity_variant = similarity_variant
        self.lookup_table_left = None
        self.lookup_table_right = None
        self.lookUpTable = False
        self.path_step = "none"

    def __str__(self):
        left_str_separate = [str(v) for v in self.left_variables]
        left_str_separate.sort()
        left_str = ", ".join(left_str_separate)
        right_str_separate = [str(v) for v in self.right_variables]
        right_str_separate.sort()
        right_str = ", ".join(right_str_separate)
        full_str = "(" + left_str + " | " + right_str + ")" + "_" + str(sorted([str(c) for c in self.context]))
        return full_str
    
    def __eq__(self, other: "Data"):
        # compare self Data object to the other Data object
        # they are equal if all left- and right- sets of variables, and the context, are equal
        
        # use all for efficient evaluation (stops at the first element that is False)
        return all([self.left_variables == other.left_variables,
                    self.right_variables == other.right_variables,
                    self.context == other.context])
    
    def __hash__(self):
        # required for usage in sets. Since the str() of self is unique and contains all elements for equality, we use this for the hash.
        return hash(str(self))
    
    def equal_nocontext(self, other: "Data"):
        # compare self Data object to the other Data object
        # similar to __eq__() except that this function does not care about context
        # they are considered equal if all left- and right- sets of variables, are equal
        
        # use all for efficient evaluation (stops at the first element that is False)
        return all([self.left_variables == other.left_variables,
                    self.right_variables == other.right_variables])
    
    def get_context(self):
        return self.context  # set of contexts
    
    def get_variable_names_left(self):
        return {v.get_name() for v in self.left_variables}
    
    def contains_var_left(self, v_name):
        return v_name in self.get_variable_names_left()
        
    def get_variable_names_right(self):    
        return {v.get_name() for v in self.right_variables}
    
    def contains_var_right(self, v_name):
        return v_name in self.get_variable_names_right()
    
    def get_variables_left(self):
        return self.left_variables
    
    def get_variables_right(self):
        return self.right_variables
    
    def set_lookup_tables(self, lookup_table_left, lookup_table_right):
        self.lookup_table_left = lookup_table_left
        self.lookup_table_right = lookup_table_right
        self.lookUpTable = True
    
    def convert_variable(self, var_remove, var_add):
        # beware: the check if this conversion is allowed should be executed before this method is used
        self.left_variables.remove(var_remove)
        self.left_variables.add(var_add)
        
    def aggregate_variable(self, var_remove, var_add):
        # beware: the check if this aggregation is allowed should be executed before this method is used
        self.right_variables.remove(var_remove)
        self.right_variables.add(var_add)
    
    def similarity(self, other: "Data", weight_right_sim = 1):
        # self is the goal Data
        
        n_goal_vars_left = len(self.left_variables)
        weight_right_eq = 2 * weight_right_sim
        weight_left_sim = n_goal_vars_left * weight_right_eq
        weight_left_eq = 2 * weight_left_sim
        weight_context = 2 * weight_left_eq
#         weight_left_eq, weight_left_sim, weight_right_eq = 50, 20, 10
#         weight_context = 100

        left_equal = len(set(self.left_variables).intersection(other.left_variables))  # number of variables with equal name and granularity
        right_equal = len(set(self.right_variables).intersection(other.right_variables))  # number of variables with equal name and granularity
        
        if self.lookUpTable:
            left_similar = len(self.get_variable_names_left().intersection(other.get_variable_names_left(), self.lookup_table_left))  # number of variables with equal name (those with equal granularity are counted again, so keep this in mind when setting weights)
            right_similar = len(self.get_variable_names_right().intersection(other.get_variable_names_right(), self.lookup_table_right))
        else:
            left_similar = len(self.get_variable_names_left().intersection(other.get_variable_names_left()))  # number of variables with equal name (those with equal granularity are counted again, so keep this in mind when setting weights)
            left_similar -= left_equal    # remove double-counting
            right_similar = len(self.get_variable_names_right().intersection(other.get_variable_names_right()))  # number of variables with equal name (those with equal granularity are counted again, so keep this in mind when setting weights)
            right_similar -= right_equal  # remove double-counting
        
        context_score = weight_context*(self.context == other.context)
        
        left_equal_max = len(set(self.left_variables))  # number of variables with equal name and granularity
        right_equal_max = len(set(self.right_variables))  # number of variables with equal name and granularity

        variant = self.similarity_variant
        
        if variant == 'normalized_basic':   # this is the default
            # normalize score:
            score = sum([weight_left_eq*left_equal, weight_left_sim*left_similar,
                        weight_right_eq*right_equal, weight_right_sim*right_similar,
                        context_score]) / sum([weight_left_eq * left_equal_max, weight_right_eq * right_equal_max, weight_context])
        
        elif variant == 'normalized_coupled':
            # normalize score, but with rhs and lhs dependently (multiply instead of sum)
            score = ((sum([weight_left_eq*left_equal, weight_left_sim*left_similar]) *
                    sum([weight_right_eq*right_equal, weight_right_sim*right_similar, context_score])) /  
                    (weight_left_eq * left_equal_max * (weight_right_eq * right_equal_max + weight_context))) 
                    
        return score
    
    def get_neighbours(self, agg = True):
        # based on conversion, aggregation and combination, give all unique datasets that can be created from self, with exactly one manipulation
        
        # conversion
        neighbours = set()
        for v in self.left_variables:
            # for each of the left variables, it can be converted to one of its connected granularities in the conversion graph
            conversion_graph = ConversionGraph.get(v.get_name())
            connected_granularities = conversion_graph.all_conversions(v.get_granularity())
            for g in connected_granularities:
                v2 = Variable(name=v.get_name(), granularity = g)  # copy the name, but use new granularity
                data_temp = copy.deepcopy(self)  # copy of the current data set
                data_temp.convert_variable(var_remove = v, var_add = v2)  # apply conversion (we have checked that it is valid when creating connected_granularities)
                data_temp.path_step = "convert ("+str(v) + " to " + str(v2) + "): " + str(data_temp)
                neighbours.add(data_temp)
        
        # aggregation 
        if agg:
            for v in self.right_variables:
                # for each of the left variables, it can be converted to one of its connected granularities in the conversion graph
                aggregation_graph = AggregationGraph.get(v.get_name())
                connected_granularities = aggregation_graph.all_aggregations(v.get_granularity())
                for g in connected_granularities:
                    v2 = Variable(name=v.get_name(), granularity = g)  # copy the name, but use new granularity
                    data_temp = copy.deepcopy(self)
                    data_temp.aggregate_variable(var_remove = v, var_add = v2)
                    data_temp.path_step = "aggregate ("+str(v) + " to " + str(v2) + "): " + str(data_temp)
                    neighbours.add(data_temp)
        
        # combination is not relevant when looking at a single data source, because 
        
        return neighbours
    
    def shrink(self, other: "Data"):  ###NEW###
        """
        Returns True if self can be 'shrinked' into other. 

        (Temporary) solution to a combination issue where:

        (a1, b3 | a3, c1)_I  +  (b2, e1 | a3, c1)_I  ->  (a1, b2, b3, e1 | a3, c1)_I

        results in both b2 and b3 being in the dataset. The subdataset() function checks if a data set (goal) is within another dataset. Returns true/false. This is also relevant to the case:

        (a1, b3 | a3, c1)_I  +  (b3, e1 | a3, c1)_I  ->  (a1, b3, e1 | a3, c1)_I

        where the goal is a subset of the result of combining (a1, e1 | a3, c1)_I.

        Left-hand variables can be dropped at any time without messing up the structure of the data set. 
        Right-hand variables can not simply be dropped. If dropped, duplicate units might exist in the 
        dataset because part of the unit descriptor is suddenly missing. Because of this, the sets of 
        right-hand variables must be equal. 
        """
    
        return all([other.left_variables.issubset(self.left_variables),
                self.right_variables == other.right_variables,
                self.context.issubset(other.context)]) 
    
    def shrink_nocontext(self, other: "Data"):  ###NEW###
        """
        Same as shrink() but without the constraint that contexts must be equal
        """
    
        return all([other.left_variables.issubset(self.left_variables),
                self.right_variables == other.right_variables])  

### Conversion Graph

In [None]:
class ConversionGraph:
    instances = []  # class attribute to keep track of class instances
    
    def __init__(self, variable_name, granularities, conversion_edges):
        self.variable_name = variable_name
        self.Graph = nx.Graph()
        self.granularities = granularities

        for g in granularities:
            self.Graph.add_node(g)

        for e in conversion_edges:
            self.Graph.add_edge(*e)  # * unpacks edge tuple
        
        # add self to list of instances
        if self.is_initialised(variable_name):
            # an instance with this variable name was already known, remove that instance (so the new instance is the only one)
            ConversionGraph.instances.remove(self.get(variable_name))
        ConversionGraph.instances.append(self)  # append instance to list of class instances
        
    @classmethod 
    def get(cls: "ConversionGraph", var_name):
        # return the instance of this class for which the value variable_name is equal to var_name
        # each ConversionGraph object should exist exactly once for each variable_name
        
        # first we make a list of "all" instances that statisfy the desired variable name
        
        list_form = [inst for inst in cls.instances if inst.variable_name == var_name]
        if len(list_form) > 0:
            # list is not empty, so return the first element (there should only be one)
            return list_form[0]
        else:
            # no instance was found
            raise NotInitialisedError("ConversionGraph " + var_name)
            
    def get_max_granularities(self):
        return max(self.granularities)
        
    def is_initialised(cls: "ConversionGraph", var_name):
        list_form = [inst for inst in cls.instances if inst.variable_name == var_name]
        return len(list_form) > 0

    def add_granularity(self, new_granularity):
        self.Graph.add_node(new_granularity)

    def add_conversion_edge(self, new_edge):
        self.Graph.add_edge(*new_edge)

    def plot_graph(self):
        nx.draw(self.Graph, with_labels=True, node_color="lightgrey")
        
    def check_conversion(self, granularity_from, granularity_to):
        # true: if there is a conversion path between granularity_from to granularity_to
        # false: otherwise
        return nx.has_path(self.Graph, granularity_from, granularity_to)
    
    def all_conversions(self, granularity_from):
        # returns all possible granularities that can be reached from the granularity_from
        connected_set = nx.node_connected_component(self.Graph, granularity_from)  # this includes the starting node
        connected_set.remove(granularity_from)  # exclude starting node
        return connected_set

### Aggregation Graph

In [None]:
class AggregationGraph:
    instances = []  # class attribute to keep track of class instances
    
    def __init__(self, variable_name, granularities, aggregation_edges):
        self.variable_name = variable_name
        self.Graph = nx.DiGraph()
        self.granularities = granularities

        for g in granularities:
            self.Graph.add_node(g)

        for e in aggregation_edges:
            self.Graph.add_edge(*e)  # * unpacks edge tuple
            
         # add self to list of instances
        if self.is_initialised(variable_name):
            # an instance with this variable name was already known, remove that instance (so the new instance is the only one)
            AggregationGraph.instances.remove(self.get(variable_name))
        AggregationGraph.instances.append(self)  # append instance to list of class instances
            
    @classmethod 
    def get(cls: "AggregationGraph", var_name):
        # return the instance of this class for which the value variable_name is equal to var_name
        # each AggregationGraph object should exist exactly once for each variable_name
        
        # first we make a list of "all" instances that statisfy the desired variable name
        list_form = [inst for inst in cls.instances if inst.variable_name == var_name]
        if len(list_form) > 0:
            # list is not empty, so return the first element (there should only be one)
            return list_form[0]
        else:
            # no instance was found
            raise NotInitialisedError("AggregationGraph " + var_name)
            
    def is_initialised(cls: "AggregationGraph", var_name):
        list_form = [inst for inst in cls.instances if inst.variable_name == var_name]
        return len(list_form) > 0

    def get_max_granularities(self):
        return max(self.granularities)        

    def add_granularity(self, new_granularity):
        self.Graph.add_node(new_granularity)

    def add_conversion_edge(self, new_edge):
        self.Graph.add_edge(*new_edge)  # * unpacks edge tuple

    def plot_graph(self): 
        nx.draw(self.Graph, with_labels=True, node_color="lightgrey")
        
    def check_aggregation(self, granularity_from, granularity_to):
        # true: if there is an aggregation path between granularity_from to granularity_to
        # false: otherwise
        return nx.has_path(self.Graph, granularity_from, granularity_to)
    
    def all_aggregations(self, granularity_from):
        # returns all possible granularities that can be reached from the granularity_from
        reacheable_set = nx.descendants(self.Graph, granularity_from) 
        
        return reacheable_set
        
    def all_aggregations_reversed(self, granularity_to):
        # returns all possible granularities that can be reached from the granularity_from
        reacheable_set = nx.ancestors(self.Graph, granularity_to) 
        
        return reacheable_set

### Combining 
Combining depends on two data sources. It is allowed when their right variables are equal. Column wise combining: when the contexts of both input sources have some overlap (non-empty intersection), the union of all left variables is available in the new data source, for the intersection of the context. Row wise combining: when the contexts of both sources have no overlap (empty intersection), the intersection of the left variables is available in the new data source, but the new context is the union of the context of the two input sources. Sometimes, both row-wise and column-wise combination are possible, resulting into two different outcomes. The combines() function checks both options and will return a tuple with te row- and column-wise combination results respectively. If a combination is not possible, the result will be False.

In [None]:
def combines(data1: Data, data2: Data):
    # Create rowwise and colwise (in case combinations are not possible)
    rowwise, colwise = False, False
    
    # Combinations are only possible if the right-hand side variables are equal
    if data1.right_variables == data2.right_variables:
        # The result will have the same right-hand side variables 
        right3 = data1.right_variables
       
        # row-wise combination
        if set(data1.left_variables) & set(data2.left_variables):
            # there is overlap between the left variables 
            
            # row-wise merge possible     
            # no overlap between context of both sources, so only the same left-hand side variables can be merged
            left3 = set(data1.left_variables).intersection(set(data2.left_variables))  # intersection of L1 and L2
            context3 = data1.context.union(data2.context)  # union of C1 and C3
            
            rowwise = Data(right_variables = right3, left_variables = left3, context = context3)
            
        # column-wise combination 
        if data1.context & data2.context:
            # set1 & set2: checks if there exists an intersection between two sets
        
            # column-wise merge possible
            # context of both input sources have overlap, so the left-hand variables can be merged
            left3 = set(data1.left_variables).union(set(data2.left_variables))  # union of L1 and L2
            context3 = data1.context.intersection(data2.context)  # intersection of C1 and C2
            
            colwise = Data(right_variables = right3, left_variables = left3, context = context3)
        
    return rowwise, colwise
        
   

### Set of Sources
Two variants of the similarity score function are implemented: the sum and max of the individual data scores from the data source similarity. 

In [1]:
class SetOfSources:
    def __init__(self, start_set):
        self.set_of_sources = set(start_set)
        self.path = ["start_set"]  # for keeping track of the path that created the current set
        self.tree = []  # for keeping track of which iterations of the algorithm added to this path
        
    def __str__(self):
        full_str = "{" + ",\n ".join([str(d) for d in self.set_of_sources]) + "\n}"
        return full_str
    
    def __eq__(self, other):
        # check if this set equals the other set
        # since the set contains Data objects, which have a __eq__() method, it suffices to rely on that method
        # considered equal if the sets have the same data Data objects, 
        # regardless of order in which the Data objects appear
        return self.set_of_sources == other.set_of_sources
    
    def add_data_source(self, data_new: Data, part_of_path, iteration):
        self.set_of_sources = self.set_of_sources.union({data_new})
        self.add_to_path(part_of_path)
        self.tree.append(iteration)
        
    def add_to_path(self, part_of_path: str):
        # For keeping track of the path (in words).
        self.path.append(part_of_path)
        
    def contains(self, data_source: Data): 
        if data_source in self.set_of_sources:
            return True
        else:
            # check if any of the data sources in self can be shrinked into data_source
            # this takes longer to compute, so only do this step when data_source is not exactly in self
            return any([data_in_self.shrink(data_source) for data_in_self in self.set_of_sources])
        
    def contains_nocontext(self, data_source: Data): ###NEW### (updated)
        # Same as contains() except here, we do NOT care about the context 
        # This is used for some models
        
        matching_sources = [ds for ds in self.set_of_sources if ds.equal_nocontext(data_source)]
        
        if data_source in self.set_of_sources:
            return matching_sources
        else:
            # check if any of the data sources in self can be shrinked into data_source
            # this takes longer to compute, so only do this step when data_source is not exactly in self
            
            matching_sources = [data_in_self for data_in_self in self.set_of_sources if data_in_self.shrink_nocontext(data_source)]
            return matching_sources
        
        
    def data_sources_with_var_left(self, v_name):
        # TODO this code (from the students) can be sped up by list comprehension
        data_sources = set()
        
        for d in self.set_of_sources:
            if d.contains_var_left(v_name):
                data_sources.add(d)
                
        return data_sources
    
    def data_sources_with_var_right(self, v_name):
        data_sources = set()
        
        for d in self.set_of_sources:
            if d.contains_var_right(v_name):
                data_sources.add(d)
                
        return data_sources
            
    def similarity_sum(self, goal_data: Data):
        return sum((goal_data.similarity(d) for d in self.set_of_sources))  
    
    def similarity_max(self, goal_data: Data):
        return max((goal_data.similarity(d) for d in self.set_of_sources)) 
    
    def similarity_mean(self, goal_data: Data):
        return np.mean(list(goal_data.similarity(d) for d in self.set_of_sources))

    def similarity_median(self, goal_data: Data):
        return np.median(list(goal_data.similarity(d) for d in self.set_of_sources))
    
    def similarity_min(self, goal_data: Data):
        return min((goal_data.similarity(d) for d in self.set_of_sources))
    
    def similarity_minmax(self, goal_data: Data):
        return max((goal_data.similarity(d) for d in self.set_of_sources)) * (min((goal_data.similarity(d) for d in self.set_of_sources)))
    
    def similarity_maxmean(self, goal_data: Data):
        return max((goal_data.similarity(d) for d in self.set_of_sources)) + np.mean(list(goal_data.similarity(d) for d in self.set_of_sources))
    
    def similarity_maxmeanmin(self, goal_data: Data):
        return max((goal_data.similarity(d) for d in self.set_of_sources)) * np.mean(list(goal_data.similarity(d) for d in self.set_of_sources)) * min((goal_data.similarity(d) for d in self.set_of_sources))
    
    def similarity_max_per_variable(self, goal_data: Data):        
        idx = 0
        
        goal_contexts = goal_data.get_context()
        maxs = np.zeros(len(goal_data.left_variables)*len(goal_contexts))
        
        for context in goal_contexts:
            
            for var in goal_data.left_variables:
                data_sources = self.data_sources_with_var_left(var.get_name())
                
                if data_sources:
                    maxs[idx] = max((goal_data.similarity(d) for d in data_sources if context in d.get_context()))
                idx += 1
            
        return np.mean(maxs)
    
    def similarity_max_per_variable_bonus(self, goal_data: Data):
        # Reimplemented from the students' version, disregarding context for now
        
        # TODO: check all context-related code by the students, this was not well-defined,
        # It seems context was assumed to be a list. It is now set-based. 
        
        # TODO: check this with the report and reimplement 
        idx = 0
        
        maxs = np.zeros(len(goal_data.left_variables))
        maxs_data_sources = []
        
        for left_var in goal_data.left_variables:
            # get all data sources in current_set that have the same left variable
            data_sources_left_match = list(self.data_sources_with_var_left(left_var.get_name()))

            if len(data_sources_left_match) > 0:
                # one or more data sources were found
                
                scores_list = [goal_data.similarity(d) for d in data_sources_left_match]
                max_idx = np.argmax(scores_list)
                max_data_source = data_sources_left_match[max_idx]
                maxs_data_sources.append(max_data_source.right_variables)
                maxs[idx] = scores_list[max_idx]
            idx += 1
                
        bonus_mult = len(set.intersection(*maxs_data_sources)) / len(set.union(*maxs_data_sources))
        
        return np.mean(maxs) * (bonus_mult + 1)/2    # don't start at 0
                        
    def get_sources(self):
        return self.set_of_sources
    
    def get_neighbours(self, agg = True):
        # based on conversion, aggregation and combination, give all unique datasets that can be created from the current set, with exactly one manipulation
        # since combination often leads quicker to results, we start by combination
        
        all_neighbours = set()
        
        # Conversion and aggregating
        for d in self.set_of_sources:
            # add all items in the set d.get_neighbours() to all_neighbours
            # These neighbours come from the individual datasets (and already have their path noted). 
            all_neighbours.update(d.get_neighbours(agg))
        
        # Combination
        set_of_sources_temp = list(self.set_of_sources)  # temporarily make the set of sources into a list, so the indices are fixed
        
        for i, j in zip(*np.triu_indices(len(set_of_sources_temp), k=1)):
            # Loop through all combinations of (two) available data sources by using the indices of the upper 
            # triangle without diagonal (offset k=1) of a matrix of size n by n, where n = len(self.set_of_sources)
        
            combines_temp_row, combines_temp_col = combines(set_of_sources_temp[i], set_of_sources_temp[j])
            if combines_temp_row:
                # Rowwise combination was possible, so add result to neighbours
                combines_temp_row.path_step = "combine (rowwise): " + str(combines_temp_row)
                all_neighbours.add(combines_temp_row)  
                
            if combines_temp_col:
                # Columnwise combination was possible, so add result to neighbours
                combines_temp_col.path_step = "combine (columnwise): " + str(combines_temp_col)
                all_neighbours.add(combines_temp_col) 
                        
        return all_neighbours
    

    def get_neighbours_models(self, models = None):
        # based on modelling, give all unique datasets that can be created from the current set, with exactly one manipulation
        # since combination often leads quicker to results, we start by combination
        
        all_neighbours = set()
        
        # Modelling
        if models is not None:
            # Since the case study only uses models with exactly two input sources, restrict the combinations
            # to exactly two sources. The code below allows for all combinations of any number, but this increases
            # the running time.
            
            # Get all (unique) subsets of all available sources (like a powerset, except for the emptyset and single sources)
            #data_combos = list(subset_combos(set_of_sources_temp))
            #for dataset_combo in data_combos:  # (then use this instead of the (i, j) loop)
            
            set_of_sources_temp = list(self.set_of_sources)  # temporarily make the set of sources into a list, so the indices are fixed
        
            for i, j in zip(*np.triu_indices(len(set_of_sources_temp), k=1)):
                # Loop through all combinations of (two) available data sources by using the indices of the upper 
                # triangle without diagonal (offset k=1) of a matrix of size n by n, where n = len(self.set_of_sources)

                # Loop through all combinations of available data sources and see if any models are applicable
                for model_tmp in models:
                    # If the model is applicable, the list of output sources will be returned. Note: multiple outputs
                    # are possible. 
                    model_output = model_tmp.apply(potential_input = [set_of_sources_temp[i], set_of_sources_temp[j]])
                    if model_output != False:
                        # The model was applicable and returned output. Add this output to the list of all neighbours.
                        # When unapplicable the value of model_output is False.
                        for mo in model_output:
                            mo.path_step = "model (" + model_tmp.name + "):" + str(mo)
                
                        all_neighbours.update(set(model_output))
                        
        return all_neighbours
    
    

NameError: name 'Data' is not defined

In [None]:
class TestCase:
    def __init__(self, goal, start_set, graphs, models=None):
        self.goal = goal
        self.start_set = start_set
        self.graphs = graphs
        self.models = models

### Modelling
Modelling is currently implemented as exceptions to the rules of the other manipulations. Each available model, however trivial, must be specified. A model is based on a set of input data and output data. If the input data is available, then the output data can be acchieved. 

In [None]:
class Model:
    def __init__(self, input_data, output_data, context_rule):
        self.input_data = set(input_data)  # can be multiple data sources
        self.output_data = output_data
        self.context_rule = context_rule
            
    def apply(self, potential_input):
        # Note: many context rules can be thought of. If they get so specific that the relation between sources and context matter, 
        # the intention is to write a child class that overwrites the apply() function for the specific case
            
        # If each source in the required input (self.input_data) is present in the potential_input, then the model is applicable
        # It gets a bit tricky to check this, because sometimes, the context is less strictly required then other times
        
        if self.context_rule == "exact":
            # Check if the all input sources are available in the potential input, with exact context matches
            
            if all([input_source in potential_input for input_source in self.input_data]):
                # based on exact matches
                return [self.output_data]
            else:
                # if for all required data sources, there is a source in the potential data that can be shrinked into the required data source, 
                # the model can also be applied
                results = []
                for required_data in self.input_data:
                    results.append(any([pi.shrink(required_data) for pi in potential_input]))
                    
                # check if shrinking was enough to satisfy requirements
                if all(results):
                    return [self.output_data]
                else:
                    return False
        
        elif self.context_rule in ["intersection", "union", "equal"]:
            # For these context_rule's we'll need to do some set manipulation to find out if the model requirements are met
            
            output_list = []  # here we will add any outcomes for the inputs that satisfy the context requirements
            context_matches = []  # list of lists 
            
            for input_data_temp in self.input_data:
                # for each required input_data source, check if there are matches in potential_input. If so, add their contexts to the list.
                context_matches_temp = [ds.get_context() for ds in potential_input if ds.equal_nocontext(input_data_temp)]
                context_matches.append(context_matches_temp)
            
            if all(len(context_matches_temp)>0 for context_matches_temp in context_matches):
                # For all required input sources, at least one available data source was found
                
                for context_permutation in itertools.product(*context_matches):
                    # Note: * unpacks the list context_matches once. This results in context_permutations being a tuple of the same length
                    # as the number of required input sources.
                    
                    if self.context_rule == "intersection":
                        # The context of the output_data is the intersection of all contexts of the input data. 
                        # If no intersection is possible, the model cannot be applied
            
                        # Use * to unpack the tuple context_permutaiton, and then we calculate the intersection of the various contexts
                        context_intersection = set.intersection(*context_permutation)
                        if len(context_intersection) > 0:
                            # If this resulted in a non-empty set, we have found a permutation that works! 
                            new_context = context_intersection
                            
                    elif self.context_rule == "union":
                        # Use * to unpack the tuple context_permutaiton, and then we calculate the union of the various contexts
                        context_union = set.union(*context_permutation)
                        if len(context_union) > 0:
                            # If this resulted in a non-empty set, we have found a permutation that works! 
                            new_context = context_union
                            
                    elif self.context_rule == "equal":
                        context_union = set.union(*context_permutation)  # calculate union of all context
                        
                        if all([cp == context_union for cp in context_permutation]):
                            # If the union of all sets are equal to all of the sets, then all sets are equal
                            new_context = context_union
        
                    # Now, we can generate an output of the model.
                    output_data_temp = copy.deepcopy(self.output_data)  # copy the output_data
                    output_data_temp.context = new_context  # overwrite the context 
                    output_list.append(output_data_temp)

                if len(output_list) > 0:
                    # One ore more results were found, these can now be returned
                    return output_list
            else:
                return False
                 
        else: 
            print("Modelling error: the rules regarding context were not clear")
            return False
        

# Helper functions

In [None]:
def subset_combos(s):
    # start at range 2 because we do not need the emptyset or single sources
    return [combo for r in range(2, len(s) + 1) for combo in itertools.combinations(s, r)]
