In [17]:
!pip install python-Levenshtein

Looking in indexes: https://smartiproxy.mgmt.netflix.net/pypi


In [1]:
from experiments import ExperimentRunner
import params

In [2]:
exp = ExperimentRunner(params.experiment1_test, 1)

In [3]:
exp.uuid

'0a7c63314af944219a589e5f9d998d31'

In [4]:
tuple(sorted(exp.__dict__.items()))

(('experiment_parameters',
  {'augment': {None},
   'pre_train': {False},
   'lr': {0.1},
   'decoder_type': {'rnn'},
   'domain_name': {'geoquery'},
   'aug_frac': {1.0},
   'embed_size': {128},
   'hidden_size': {128},
   'seed': {0},
   'dropout': {0.3},
   'cuda': {False},
   'batch_size_train': {256},
   'batch_size_dev': {128},
   'valid_niter': {100},
   'max_epoch': {5},
   'beam_size': {5},
   'max_sentence_length': {1000},
   'encoder_type': {'brnn'},
   'file_path_train': {'data/geo880_train600.tsv'},
   'file_path_dev': {'data/geo880_test280.tsv'},
   'file_path_model': {'model.bin'}}),
 ('n_jobs', 1),
 ('uuid', '0a7c63314af944219a589e5f9d998d31'))

In [5]:
param_grid = exp._get_param_grid(exp.experiment_parameters)

In [6]:
import run

In [24]:
runner = run.Runner(**param_grid[0])

In [25]:
import hashlib

In [28]:
hashlib.sha224(bytearray('_'.join([f'{k}_{v}' for k, v in frozenset(runner.__dict__.items())]), 'utf8')).hexdigest()

'5e7621a769cc753f025f68a33c1fc3bad7fd9f598818f518eabdd454'

In [29]:
a = {'a': 1, 'c': 2, 'b': 3}

In [31]:
sorted(a.keys())

['a', 'b', 'c']

In [40]:
from typing import Dict

In [41]:
def hash_dict(d: Dict) -> str:
    dict_str_rep = '_'.join([f'{key}_{d[key]}' for key in sorted(d.keys())])
    return hashlib.sha224(bytearray(dict_str_rep, 'utf8')).hexdigest()

In [82]:
from typing import NamedTuple, Optional

In [83]:
class ExperimentResults(NamedTuple):
    sequence_correct: int
    sequence_total: int
    sequence_accuracy: float
    token_correct: int
    token_total: int
    token_accuracy: float
    denotation_correct: Optional[int]
    denotation_total: Optional[int]
    denotation_accuracy: Optional[float]

In [86]:
a = ExperimentResults(1, 2, 3, 4, 5, 6, 7, 8, 9)

In [90]:
b = a._asdict()

In [116]:
from typing import List, Tuple, Set
from collections import defaultdict
DomainDataTokenized = List[Tuple[List[str], List[str]]]
DomainDataString = List[Tuple[str, str]]


from grammar import Grammar
from utils import stopwords

class CoOccurrence:
    def __init__(self, src_tgt: DomainDataString):
        self.window_size = 3
        self.token_size_threshold = 3
        self.support = 0.3
        self.support_abs = 2
        self.aug_prob = 0.5

        self.src_tgt = src_tgt
        self.lexicon_src_tgt = self._generate_lexicon(self.src_tgt, is_reverse=False)
        self.lexicon_tgt_src = self._generate_lexicon([(y, x) for x, y in self.src_tgt], is_reverse=True)

    def _generate_lexicon(self, src_tgt: DomainDataString, is_reverse: bool = False) -> Dict[str, Set[str]]:
        cnt = defaultdict(lambda: defaultdict(int))
        cnt_tgt = defaultdict(int)

        # _answer appears everywhere for target -> source
        stop_words = set(stopwords).union({'_answer'} if is_reverse else set())

        for src, tgt in src_tgt:
            src_tokens = src.split()
            tgt_tokens = tgt.split()

            for tgt_token in tgt_tokens:
                cnt_tgt[tgt_token] += 1

            for i, src_token in enumerate(src_tokens):
                if len(src_token) >= self.token_size_threshold and src_token not in stop_words:
                    for j in range(-self.window_size, self.window_size + 1):
                        if (0 <= i + j < len(tgt_tokens) and len(tgt_tokens[i + j]) >= self.token_size_threshold and
                                tgt_tokens[i + j] not in stop_words):
                            cnt[tgt_tokens[i + j]][src_token] += 1

        # set of src words that positionally (within a window) co-occur with some min support with the target
        x = {
            tgt: {src: cnt[tgt][src] for src in cnt[tgt] if
                  cnt[tgt][src] >= self.support * cnt_tgt[tgt] and cnt[tgt][src] >= self.support_abs}
            for tgt in cnt
            if
            len({src for src in cnt[tgt] if
                 cnt[tgt][src] >= self.support * cnt_tgt[tgt] and cnt[tgt][src] >= self.support_abs}) >= 2
        }

        # lookup for co-occurring tokens
        out = defaultdict(set)
        for val_dict in x.values():
            val_vals = val_dict.keys()
            for val in val_vals:
                for val_map_to in val_vals:
                    if val_map_to != val:
                        out[val].add(val_map_to)

        return out

    def _aug(self, token: str, src: bool) -> str:
        lexicon = self.lexicon_src_tgt if src else self.lexicon_tgt_src
        if token in lexicon:
            do_aug = random.random() <= self.aug_prob
            return random.choice(list(lexicon[token])) if do_aug else token
        else:
            return token

    def _sample_item(self, x_str: str, y_str: str) -> Tuple[str, str]:
        x_lst, y_lst = x_str.split(), y_str.split()
        xs_aug = ' '.join([self._aug(x, src=True) for x in x_lst])
        ys_aug = ' '.join([self._aug(y, src=False) for y in y_lst])
        return xs_aug, ys_aug

    # def sample(self, n) -> CoOccurrenceData:
    #     src_tgt_n = random.sample(self.src_tgt, n)
    #     return [self._sample_item(x_str, y_str) for x_str, y_str in src_tgt_n]

    def __call__(self, src_tgt: DomainDataString, n: Optional[int] = None) -> DomainDataString:
        if n:
            src_tgt = random.sample(src_tgt, n)
        return [self._sample_item(x_str, y_str) for x_str, y_str in src_tgt]

In [117]:
import random

In [120]:
co = CoOccurrence([('a b c', 'c d e'), ('a b c', 'a d e')])

In [123]:
co([('a b', 'c d')], 2)

ValueError: Sample larger than population or is negative

In [70]:
class A:
    def __init__(self, a):
        self.a = a
        self.uuid = self._get_uuid()
        self.c = a + 1
        
    def _get_uuid(self):
        return hash_dict(self.__dict__)

In [71]:
a = A(5)

In [72]:
a.uuid

'62357b2f9297ef4295f9c41d8f259a9f0c100d1ff67b518ef3525a52'

In [77]:
a.c = 20

In [78]:
a.uuid

'62357b2f9297ef4295f9c41d8f259a9f0c100d1ff67b518ef3525a52'

In [38]:
a.b['b']['b']['b']

{'a': 5, 'b': {...}}

In [20]:
from Levenshtein import distance

In [131]:
data = 'data/geo880_train600.tsv'
src_tgt = [line.strip().split('\t') for line in open(data)]

In [166]:
b = {'?', 'florida', 'highest', 'in', 'is', 'point', 'the', 'what'}
c = {'?',
   'could',
   'highest',
   'in',
   'is',
   'me',
   'of',
   'oregon',
   'point',
   'state',
   'tell',
   'the',
   'what',
   'you'}
len(b.intersection(c)) == max(len(b), len(c)) - 1

False

In [168]:
mapping = {'florida': 'alabama'}

In [167]:
[list(b)

['?', 'the', 'in', 'what', 'point', 'highest', 'florida', 'is']

In [164]:
srcs = [set(src.split()) for src, _ in src_tgt]

co_occur_map = defaultdict(set)

co_occur = [
    (list(b.difference(c))[0], list(c.difference(b))[0])
    for b in srcs
    for c in srcs
    if (b != c
        and len(b.intersection(c)) == len(b) - 1
        and len(b) == len(c)
        and list(b.difference(c))[0] not in stopwords
        and list(c.difference(b))[0] not in stopwords)
]

for a, b in co_occur:
    co_occur_map[a].add(b)
    co_occur_map[b].add(a)
    
co_occur_map

defaultdict(set,
            {'florida': {'arizona',
              'california',
              'colorado',
              'country',
              'delaware',
              'kansas',
              'louisiana',
              'ohio',
              'texas',
              'us',
              'wyoming'},
             'us': {'alaska',
              'america',
              'arizona',
              'california',
              'colorado',
              'country',
              'florida',
              'georgia',
              'idaho',
              'iowa',
              'kansas',
              'mississippi',
              'nebraska',
              'ohio',
              'oregon',
              'pennsylvania',
              'texas',
              'usa',
              'wyoming'},
             'wyoming': {'arizona',
              'california',
              'colorado',
              'country',
              'delaware',
              'florida',
              'georgia',
              'kansas',
      

In [None]:
[
    for src, _ in src_tgt
    for i, x in enumerate(src.split())
]

In [142]:
Counter([
    (i, x)
    for src, _ in src_tgt
    for i, x in enumerate(src.split())
    if len(x) >= 3 and x not in stopwords
]).most_common()

[((1, 'many'), 90),
 ((1, 'states'), 63),
 ((1, 'state'), 60),
 ((3, 'population'), 41),
 ((2, 'border'), 34),
 ((4, 'city'), 32),
 ((4, 'point'), 31),
 ((2, 'states'), 31),
 ((4, 'river'), 28),
 ((2, 'people'), 26),
 ((3, 'highest'), 25),
 ((1, 'rivers'), 24),
 ((7, 'state'), 24),
 ((3, 'capital'), 23),
 ((3, 'live'), 22),
 ((4, 'cities'), 22),
 ((3, 'largest'), 22),
 ((4, 'state'), 21),
 ((3, 'major'), 20),
 ((6, 'state'), 20),
 ((2, 'rivers'), 19),
 ((5, 'river'), 18),
 ((6, 'states'), 17),
 ((3, 'longest'), 17),
 ((6, 'texas'), 16),
 ((3, 'area'), 15),
 ((3, 'biggest'), 15),
 ((3, 'cities'), 15),
 ((3, 'lowest'), 14),
 ((5, 'population'), 14),
 ((5, 'texas'), 13),
 ((8, 'state'), 13),
 ((5, 'states'), 11),
 ((3, 'states'), 11),
 ((4, 'highest'), 11),
 ((3, 'smallest'), 11),
 ((4, 'largest'), 11),
 ((2, 'run'), 11),
 ((2, 'major'), 10),
 ((3, 'border'), 10),
 ((7, 'states'), 10),
 ((5, 'city'), 9),
 ((3, 'shortest'), 9),
 ((0, 'give'), 9),
 ((6, 'river'), 9),
 ((2, 'cities'), 9),
 (

In [135]:
for i in range(max(map(lambda x: len(x[0]), src_tgt))):
    print(i)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100


In [128]:
from collections import Counter

In [129]:
src_tgt[0]

NameError: name 'src_tgt' is not defined

In [130]:


Counter([
    (x, y)
    for src, tgt in src_tgt
    for i, (x, y) in enumerate(zip(src.split(), tgt.split()))
    if distance(x, y) <= 2 and len(x) >= 3 and len(y) >= 3
])

NameError: name 'distance' is not defined

In [41]:
friends = set()
friends.union({1, 2, 3})

{1, 2, 3}

In [58]:
friends = []

window_size = 5
distance_threshold = 3
token_size_threshold = 3

for src, tgt in src_tgt:
    src_tokens = src.split()
    tgt_tokens = tgt.split()
    
    for i, src_token in enumerate(src_tokens):
        if len(src_token) >= token_size_threshold:
            friends += [
                (src_token, tgt_tokens[i + j])
                for j in range(-window_size, window_size + 1)
                if (0 <= i + j < len(tgt_tokens) and len(tgt_tokens[i + j]) >= token_size_threshold and distance(tgt_tokens[i + j], src_token) <= distance_threshold)
            ]

In [59]:
Counter(friends)

Counter({('highest', '_highest'): 26,
         ('state', '_state'): 62,
         ('mountain', '_mountain'): 4,
         ('capital', '_capital'): 25,
         ('populations', '_population'): 7,
         ('name', '_lake'): 1,
         ('lakes', '_lake'): 4,
         ('states', '_state'): 96,
         ('population', '_population'): 37,
         ('rivers', '_river'): 31,
         ('lowest', '_lowest'): 14,
         ('capitals', '_capital'): 4,
         ('size', '_size'): 5,
         ('shortest', '_shortest'): 9,
         ('river', '_river'): 32,
         ('major', '_major'): 27,
         ('most', '_most'): 13,
         ('long', '_len'): 7,
         ('the', '_len'): 18,
         ('area', '_area'): 15,
         ('longest', '_longest'): 21,
         ('smallest', '_smallest'): 21,
         ('largest', '_largest'): 42,
         ('city', '_city'): 26,
         ('over', '_river'): 1,
         ('are', '_lake'): 1,
         ('density', '_density'): 8,
         ('mountains', '_mountain'): 1,
       

In [61]:
tuples = [
    (x, y)
    for src, tgt in src_tgt
    for i, (x, y) in enumerate(zip(src.split(), tgt.split()))
    # if distance(x, y) <= 2 and len(x) >= 3 and len(y) >= 3
]

In [62]:
from collections import defaultdict

In [73]:
cnt = defaultdict(lambda: defaultdict(int))

In [74]:
for src, tgt in tuples:
    if len(src) >= 3 and len(tgt) >= 3:
        cnt[tgt][src] += 1

In [81]:
for src in cnt['_city']:
    print(src)

the
are
border
pennsylvania
usa
virginia
texas
citizens
borders
california
population


In [None]:
{
    ''
}

{'_answer': {'what': 384, 'which': 53, 'how': 111, 'where': 17},
 '_largest': {'state': 13, 'city': 19}}

In [102]:
from nltk.corpus import stopwords

In [139]:
set(stopwords.words('english'))

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [134]:
def generate_lexicon(src_tgt, is_reverse=False):
    cnt = defaultdict(lambda: defaultdict(int))
    cnt_tgt = defaultdict(int)

    window_size = 3
    # distance_threshold = 3
    token_size_threshold = 3
    support = 0.3
    support_abs = 2
    stop_words = set(stopwords.words('english')).union({'_answer'} if is_reverse else set())


    for src, tgt in src_tgt:
        src_tokens = src.split()
        tgt_tokens = tgt.split()

        for tgt_token in tgt_tokens:
            cnt_tgt[tgt_token] += 1

        for i, src_token in enumerate(src_tokens):
            if len(src_token) >= token_size_threshold and src_token not in stop_words:
                for j in range(-window_size, window_size + 1):
                    if (0 <= i + j < len(tgt_tokens) and len(tgt_tokens[i + j]) >= token_size_threshold and tgt_tokens[i + j] not in stop_words):
                        cnt[tgt_tokens[i + j]][src_token] += 1

    x = {
        tgt: {src: cnt[tgt][src] for src in cnt[tgt] if cnt[tgt][src] >= support * cnt_tgt[tgt] and cnt[tgt][src] >= support_abs}
        for tgt in cnt
        if len({src for src in cnt[tgt] if cnt[tgt][src] >= support * cnt_tgt[tgt] and cnt[tgt][src] >= support_abs}) >= 2
    }
    
    out = defaultdict(set)
    for val_dict in x.values():
        val_vals = val_dict.keys()
        for val in val_vals:
            for val_map_to in val_vals:
                if val_map_to != val:
                    out[val].add(val_map_to)
                    
    return out

In [146]:
random.sample([(1, 2), (3, 4), (5, 6)], 2)

[(1, 2), (3, 4)]

In [140]:
import random

In [143]:
random.random()

0.6232124459954171

In [144]:
random.choice({1, 2, 3})

TypeError: 'set' object does not support indexing

In [115]:
tgt_src = 

In [136]:
generate_lexicon(src_tgt, is_reverse=False)

defaultdict(set,
            {'highest': {'point'},
             'point': {'highest', 'lowest'},
             'lakes': {'states'},
             'states': {'borders',
              'cities',
              'lakes',
              'least',
              'many',
              'population',
              'rivers',
              'state'},
             'lowest': {'point'},
             'many': {'cities', 'rivers', 'states'},
             'cities': {'major', 'many', 'rivers', 'states'},
             'rivers': {'cities', 'many', 'states'},
             'shortest': {'river'},
             'river': {'length', 'longest', 'shortest'},
             'major': {'cities'},
             'state': {'borders',
              'largest',
              'least',
              'population',
              'smallest',
              'states'},
             'length': {'river'},
             'longest': {'river'},
             'smallest': {'population', 'state'},
             'population': {'density', 'smallest', 'state

In [137]:
generate_lexicon(tgt_src, is_reverse=True)

defaultdict(set,
            {'_state': {'_count',
              '_density',
              '_largest',
              '_population',
              '_river'},
             '_river': {'_city', '_longest', '_population', '_state'},
             '_lake': {'_major'},
             '_major': {'_count', '_lake'},
             '_population': {'_river', '_state'},
             '_count': {'_major', '_state'},
             '_largest': {'_density', '_state'},
             '_density': {'_area', '_largest', '_state'},
             '_place': {'_loc'},
             '_loc': {'_place'},
             '_area': {'_density'},
             '_longest': {'_river'},
             '_city': {'_river'},
             '_smallest': {'_fewest'},
             '_fewest': {'_smallest'}})

In [191]:
from typing import List, Tuple, Dict, Set
import random

stopwords = {'a',
             'about',
             'above',
             'after',
             'again',
             'against',
             'ain',
             'all',
             'am',
             'an',
             'and',
             'any',
             'are',
             'aren',
             "aren't",
             'as',
             'at',
             'be',
             'because',
             'been',
             'before',
             'being',
             'below',
             'between',
             'both',
             'but',
             'by',
             'can',
             'couldn',
             "couldn't",
             'd',
             'did',
             'didn',
             "didn't",
             'do',
             'does',
             'doesn',
             "doesn't",
             'doing',
             'don',
             "don't",
             'down',
             'during',
             'each',
             'few',
             'for',
             'from',
             'further',
             'had',
             'hadn',
             "hadn't",
             'has',
             'hasn',
             "hasn't",
             'have',
             'haven',
             "haven't",
             'having',
             'he',
             'her',
             'here',
             'hers',
             'herself',
             'him',
             'himself',
             'his',
             'how',
             'i',
             'if',
             'in',
             'into',
             'is',
             'isn',
             "isn't",
             'it',
             "it's",
             'its',
             'itself',
             'just',
             'll',
             'm',
             'ma',
             'me',
             'mightn',
             "mightn't",
             'more',
             'most',
             'mustn',
             "mustn't",
             'my',
             'myself',
             'needn',
             "needn't",
             'no',
             'nor',
             'not',
             'now',
             'o',
             'of',
             'off',
             'on',
             'once',
             'only',
             'or',
             'other',
             'our',
             'ours',
             'ourselves',
             'out',
             'over',
             'own',
             're',
             's',
             'same',
             'shan',
             "shan't",
             'she',
             "she's",
             'should',
             "should've",
             'shouldn',
             "shouldn't",
             'so',
             'some',
             'such',
             't',
             'than',
             'that',
             "that'll",
             'the',
             'their',
             'theirs',
             'them',
             'themselves',
             'then',
             'there',
             'these',
             'they',
             'this',
             'those',
             'through',
             'to',
             'too',
             'under',
             'until',
             'up',
             've',
             'very',
             'was',
             'wasn',
             "wasn't",
             'we',
             'were',
             'weren',
             "weren't",
             'what',
             'when',
             'where',
             'which',
             'while',
             'who',
             'whom',
             'why',
             'will',
             'with',
             'won',
             "won't",
             'wouldn',
             "wouldn't",
             'y',
             'you',
             "you'd",
             "you'll",
             "you're",
             "you've",
             'your',
             'yours',
             'yourself',
             'yourselves'}


CoOccurrenceData = List[Tuple[str, str]]

class CoOccurrence:
    def __init__(self, src_tgt: CoOccurrenceData):
        self.window_size = 3
        self.token_size_threshold = 3
        self.support = 0.3
        self.support_abs = 2
        self.aug_prob = 0.5

        self.src_tgt = src_tgt
        self.lexicon_src_tgt = self._generate_lexicon(self.src_tgt, is_reverse=False)
        self.lexicon_tgt_src = self._generate_lexicon([(y, x) for x, y in self.src_tgt], is_reverse=True)

    def _generate_lexicon(self, src_tgt: List[Tuple[str, str]], is_reverse: bool = False) -> Dict[str, Set[str]]:
        cnt = defaultdict(lambda: defaultdict(int))
        cnt_tgt = defaultdict(int)

        # _answer appears everywhere for target -> source
        stop_words = set(stopwords).union({'_answer'} if is_reverse else set())

        for src, tgt in src_tgt:
            src_tokens = src.split()
            tgt_tokens = tgt.split()

            for tgt_token in tgt_tokens:
                cnt_tgt[tgt_token] += 1

            for i, src_token in enumerate(src_tokens):
                if len(src_token) >= self.token_size_threshold and src_token not in stop_words:
                    for j in range(-self.window_size, self.window_size + 1):
                        if (0 <= i + j < len(tgt_tokens) and len(tgt_tokens[i + j]) >= self.token_size_threshold and
                                tgt_tokens[i + j] not in stop_words):
                            cnt[tgt_tokens[i + j]][src_token] += 1

        # set of src words that positionally (within a window) co-occur with some min support with the target
        x = {
            tgt: {src: cnt[tgt][src] for src in cnt[tgt] if
                  cnt[tgt][src] >= self.support * cnt_tgt[tgt] and cnt[tgt][src] >= self.support_abs}
            for tgt in cnt
            if
            len({src for src in cnt[tgt] if
                 cnt[tgt][src] >= self.support * cnt_tgt[tgt] and cnt[tgt][src] >= self.support_abs}) >= 2
        }

        # lookup for co-occurring tokens
        out = defaultdict(set)
        for val_dict in x.values():
            val_vals = val_dict.keys()
            for val in val_vals:
                for val_map_to in val_vals:
                    if val_map_to != val:
                        out[val].add(val_map_to)

        return out

    def _aug(self, token: str, src: bool) -> str:
        lexicon = self.lexicon_src_tgt if src else self.lexicon_tgt_src
        if token in lexicon:
            do_aug = random.random() <= self.aug_prob
            return random.choice(list(lexicon[token])) if do_aug else token
        else:
            return token

    def _sample_item(self, x_str: str, y_str: str) -> Tuple[str, str]:
        x_lst, y_lst = x_str.split(), y_str.split()
        xs_aug = ' '.join([self._aug(x, src=True) for x in x_lst])
        ys_aug = ' '.join([self._aug(y, src=False) for y in y_lst])
        return xs_aug, ys_aug

    def sample(self, n) -> CoOccurrenceData:
        src_tgt_n = random.sample(self.src_tgt, n)
        return [self._sample_item(x_str, y_str) for x_str, y_str in src_tgt_n]

In [192]:
co = CoOccurrence(src_tgt)

In [193]:
co.lexicon_tgt_src

defaultdict(set,
            {'_state': {'_count',
              '_density',
              '_largest',
              '_population',
              '_river'},
             '_river': {'_city', '_longest', '_population', '_state'},
             '_lake': {'_major'},
             '_major': {'_count', '_lake'},
             '_population': {'_river', '_state'},
             '_count': {'_major', '_state'},
             '_largest': {'_density', '_state'},
             '_density': {'_area', '_largest', '_state'},
             '_place': {'_loc'},
             '_loc': {'_place'},
             '_area': {'_density'},
             '_longest': {'_river'},
             '_city': {'_river'},
             '_smallest': {'_fewest'},
             '_fewest': {'_smallest'}})

In [197]:
co.sample(5)

[('what is the biggest city in usa ?',
  '_answer ( A , _largest ( A , ( _city ( A ) , _loc ( A , B ) , _const ( B , _countryid ( usa ) ) ) ) )'),
 ('what cities in texas have the highest number of citizens ?',
  '_answer ( A , _density ( B , ( _city ( A ) , _loc ( A , C ) , _const ( C , _stateid ( texas ) ) , _state ( A , B ) ) ) )'),
 ('what states border hawaii ?',
  '_answer ( A , ( _state ( A ) , _next_to ( A , B ) , _const ( B , _stateid ( hawaii ) ) ) )'),
 ('where is mount whitney ?',
  "_answer ( A , ( _place ( B , A ) , _const ( B , _placeid ( ' mount whitney ' ) ) ) )"),
 ('where is the lowest spot in iowa ?',
  '_answer ( A , _lowest ( A , ( _loc ( A ) , _place ( A , B ) , _const ( B , _stateid ( iowa ) ) ) ) )')]