-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_generation.py
143 lines (103 loc) · 5.06 KB
/
data_generation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import random
from tokenize import Double
from aalpy.SULs import MealySUL
from aalpy.base.SUL import CacheSUL
from aalpy.learning_algs import run_Lstar
from aalpy.utils.HelperFunctions import all_prefixes
class DataSet:
    """Container for a passively-learned data set of labelled input sequences.

    Attributes:
        data: list of (input_sequence, output) pairs fed to RPNI.
        size: number of samples (counted before any prefix-closing).
        steps: total number of input steps across those samples.
    """

    def __init__(self, data, size, steps) -> None:
        self.data = data
        self.size = size
        self.steps = steps

    def average_len(self) -> float:
        """Return the mean sample length, or 0.0 for an empty data set.

        Bug fix: the original annotated the return as ``Double`` imported from
        ``tokenize`` — that name is a regex pattern *string* used internally by
        the tokenizer, not a numeric type. The correct annotation is ``float``.
        """
        # Robustness: avoid ZeroDivisionError on an empty data set.
        if self.size == 0:
            return 0.0
        return self.steps / self.size
def data_from_l_star_E_set(hypothesis, e_set, include_extended_s_set=True, prefix_closed=True, verbose=False):
    """Create a DataSet from an L*-style observation table over `e_set`.

    Every table cell ``prefix + suffix`` becomes one sample, labelled with the
    hypothesis output produced after running that sequence from the initial
    state.

    Args:
        hypothesis: learned automaton; must expose ``states`` (with ``prefix``
            access sequences), ``get_input_alphabet``, ``initial_state`` and
            ``compute_output_seq``.
        e_set: iterable of suffix tuples (the L* E set).
        include_extended_s_set: if True, also use every one-letter extension of
            each state prefix (the extended-S rows of the observation table).
        prefix_closed: if True, additionally include every prefix of every
            cell so the resulting data set is prefix-closed.
        verbose: if True, print sample-count statistics.

    Returns:
        DataSet of (sequence, last_output) pairs; ``size``/``steps`` describe
        the raw table cells before prefix-closing.
    """
    observation_table_data = []

    # Access sequences of all hypothesis states (the S set).
    prefixes = [state.prefix for state in hypothesis.states]
    if include_extended_s_set:
        extended_prefixes = []
        for p in prefixes:
            for a in hypothesis.get_input_alphabet():
                extended_prefixes.append(p + (a,))
        prefixes.extend(extended_prefixes)

    data_set_tmp = set()  # raw table cells, used only for the statistics
    data_set = set()      # actual samples (prefix-closed when requested)
    for prefix in prefixes:
        for suffix in e_set:
            cell = prefix + suffix
            data_set_tmp.add(cell)
            if prefix_closed:
                data_set.update(all_prefixes(cell))
            else:
                data_set.add(cell)

    sequence_step_sum = sum(len(i) for i in data_set_tmp)
    data_set_size = len(data_set_tmp)

    if verbose:
        # Bug fix: the original divided by data_set_size unconditionally and
        # crashed with ZeroDivisionError when the table was empty (e.g. an
        # empty e_set). Report an average of 0 in that case.
        avg = round(sequence_step_sum / data_set_size, 2) if data_set_size else 0
        print(f'Number of samples provided to RPNI: {data_set_size}')
        print(f'Average length of samples provided to RPNI: {avg}')

    for seq in data_set:
        # Label each sequence with the output after its final input step.
        output = hypothesis.compute_output_seq(hypothesis.initial_state, seq)[-1]
        observation_table_data.append((seq, output))

    return DataSet(observation_table_data, data_set_size, sequence_step_sum)
def data_from_computed_e_set(hypothesis, include_extended_s_set=True, prefix_closed=True, verbose=False):
    """Build a DataSet using the hypothesis' own characterization set as the E set.

    Convenience wrapper around ``data_from_l_star_E_set`` for callers that do
    not have the original L* E set at hand.
    """
    e_set = hypothesis.compute_characterization_set()
    return data_from_l_star_E_set(hypothesis, e_set,
                                  include_extended_s_set=include_extended_s_set,
                                  prefix_closed=prefix_closed,
                                  verbose=verbose)
def minimized_char_set_data(hypothesis, include_extended_s_set=True, prefix_closed=True, verbose=False):
    """Create a reduced RPNI data set by folding the E-set data through a PTA.

    The full characterization-set data is turned into a prefix tree acceptor
    and only its unique sequences are kept, removing redundant samples before
    (optionally) prefix-closing the result again.

    Args:
        hypothesis: learned automaton providing ``initial_state`` and
            ``compute_output_seq`` for relabelling.
        include_extended_s_set: forwarded to ``data_from_computed_e_set``.
        prefix_closed: forwarded, and additionally re-applied to the pruned
            sequences below.
        verbose: if True, print statistics about the pruned set.

    Returns:
        DataSet of (sequence, last_output) pairs; ``size``/``steps`` reflect
        the pruned set BEFORE the final prefix-closing step.
    """
    # Local import keeps the aalpy-internal RPNI helpers out of module scope.
    from aalpy.learning_algs.deterministic_passive.rpni_helper_functions import extract_unique_sequences, createPTA
    data = data_from_computed_e_set(hypothesis, include_extended_s_set, prefix_closed)
    input_sequences = []
    for seq in extract_unique_sequences(createPTA(data.data, automaton_type='mealy')):
        # Keep only the input component of each step.
        # NOTE(review): io[0][0] assumes each step's input is itself wrapped in
        # a one-element container — confirm against createPTA's representation.
        input_sequences.append([io[0][0] for io in seq])
    sequence_step_sum = sum([len(i) for i in input_sequences])
    # NOTE(review): crashes if the PTA yields no sequences; presumably a
    # non-empty hypothesis guarantees at least one — verify upstream.
    average_length = sequence_step_sum / len(input_sequences)
    data_set_size = len(input_sequences)
    data_set_steps = sequence_step_sum
    if verbose:
        print(f'Number of samples provided to RPNI: {data_set_size}')
        print(f'Average length of samples provided to RPNI: {round(average_length, 2)}')
    if prefix_closed:
        # Re-close the pruned sequences under prefixes; all_prefixes is
        # expected to yield hashable (tuple-like) prefixes so they can be
        # collected in a set — TODO confirm for list inputs.
        prefix_closed_seq = set()
        for seq in input_sequences:
            prefix_closed_seq.update(all_prefixes(seq))
        input_sequences = list(prefix_closed_seq)
    pruned_data = []
    for seq in input_sequences:
        # Relabel every sequence with the hypothesis output after its last input.
        output = hypothesis.compute_output_seq(hypothesis.initial_state, seq)[-1]
        pruned_data.append((seq, output))
    return DataSet(pruned_data, data_set_size, data_set_steps)
def generate_random_data(model, num_sequences, min_sequence_len, max_sequence_len, verbose=False, prefix_closed=True):
    """Sample random input sequences over the model's alphabet and label them.

    Args:
        model: automaton providing ``get_input_alphabet``, ``initial_state``
            and ``compute_output_seq``.
        num_sequences: how many random sequences to draw.
        min_sequence_len: minimum length of each drawn sequence (inclusive).
        max_sequence_len: maximum length of each drawn sequence (inclusive).
        verbose: if True, print sample-count statistics.
        prefix_closed: if True, replace the samples with the prefix-closure of
            the drawn sequences before labelling.

    Returns:
        DataSet of (sequence, last_output) pairs; ``size``/``steps`` describe
        the originally drawn sequences, before prefix-closing.
    """
    alphabet = model.get_input_alphabet()

    # Draw each sequence with a uniformly random length in the given range.
    sequences = []
    for _ in range(num_sequences):
        length = random.randint(min_sequence_len, max_sequence_len)
        sequences.append(random.choices(alphabet, k=length))

    total_steps = sum(len(s) for s in sequences)
    sample_count = len(sequences)

    if verbose:
        print(f'Number of samples provided to RPNI: {sample_count}')
        print(f'Average length of samples provided to RPNI: {round(total_steps / sample_count, 2)}')

    if prefix_closed:
        closure = set()
        for s in sequences:
            closure.update(all_prefixes(s))
        sequences = list(closure)

    labelled = []
    for s in sequences:
        # Label with the model output produced after the final input step.
        labelled.append((s, model.compute_output_seq(model.initial_state, s)[-1]))

    return DataSet(labelled, sample_count, total_steps)
def l_star_with_populated_cache(model, cache_data, eq_oracle):
    """Run L* on `model`, pre-populating the SUL cache with known traces.

    Args:
        model: ground-truth Mealy machine; wrapped in a MealySUL to answer
            queries, and used directly to compute outputs for the cached
            input sequences.
        cache_data: iterable of samples whose first element is an input
            sequence (e.g. the pairs of a DataSet's ``data``).
        eq_oracle: equivalence oracle handed to ``run_Lstar``.

    Returns:
        Tuple ``(l_star_model, queries_posed, cache_hits, learning_rounds)``
        where ``queries_posed`` counts only queries that reached the SUL
        (total learning + equivalence queries minus cache hits).
    """
    # NOTE: pre-filling the cache is fully effective only if cache_data is
    # prefix-closed; otherwise intermediate prefixes are still queried.
    cache_io_seq = []
    for seq in cache_data:
        i = seq[0]  # the sample's input sequence
        o = model.compute_output_seq(model.initial_state, i)
        cache_io_seq.append(zip(i, o))
    sul = CacheSUL(MealySUL(model))
    # Replay every input/output trace into the cache so membership queries
    # over known sequences are answered without stepping the SUL.
    for sample in cache_io_seq:
        sul.cache.reset()
        for i, o in sample:
            sul.cache.step_in_cache(i, o)
    alphabet = model.get_input_alphabet()
    l_star_model, data = run_Lstar(alphabet, sul, eq_oracle, 'mealy', print_level=0, return_data=True)
    cache_hits = sul.num_cached_queries
    # Queries actually posed to the SUL = all queries minus cache hits.
    queries_posed = data['queries_learning'] + data['queries_eq_oracle'] - cache_hits
    return l_star_model, queries_posed, cache_hits, data['learning_rounds']