## PCFG (White and Cotterell, 2021)

In [1]:
import sys

sys.path.append("..")
from src.length_sampling.sampler import construct_pcfg_sampler
from src.length_sampling.grammars.pcfg import Grammar
from src.length_sampling.grammars.cfg import Nonterminal
from src.length_sampling.util import group_by, get_random_generator_and_seed

In [2]:
# Load the PCFG from its grammar file, using the nonterminal "S" as the start
# symbol. normalize=True is forwarded to the loader (presumably it rescales
# rule weights into probabilities -- confirm against Grammar.from_file).
start = Nonterminal("S")
pcfg = Grammar.from_file(
    start=start,
    normalize=True,
    file_path="/cluster/home/tsomeya/projects/impossible_inherent_entropy/data/grammars/variations/initial_exp/0002002.gr",
)

In [3]:
# Build the sampler object for the loaded grammar; later cells use its
# .valid_lengths(...) and .sample(...) methods.
sampler = construct_pcfg_sampler(pcfg)
# Obtain the random generator passed to the sampler below, together with the
# seed value the helper reports. The argument 0 is presumably the requested
# seed -- confirm in src.length_sampling.util.
generator, random_seed = get_random_generator_and_seed(0)

In [4]:
def sample_with_length_constraint(sampler, generator, valid_lengths, sample_size):
    """Collect ``sample_size`` samples, each drawn at a randomly chosen length.

    For every sample, a length is picked with ``generator.choice(valid_lengths)``
    and ``sampler.sample(length, generator)`` is called; the iterable it
    returns is materialized into a list.

    Args:
        sampler: Object exposing ``sample(length, generator)``.
        generator: Random source exposing ``choice(sequence)``; it is also
            handed to the sampler for each draw.
        valid_lengths: Candidate lengths to choose from.
        sample_size: Number of samples to collect.

    Returns:
        A list with ``sample_size`` entries, each a list of the items yielded
        by one ``sampler.sample`` call.
    """
    drawn = []
    while True:
        # Stop as soon as the requested number of samples has been collected.
        if not (len(drawn) < sample_size):
            return drawn
        target_length = generator.choice(valid_lengths)
        tokens = sampler.sample(target_length, generator)
        drawn.append(list(tokens))

In [12]:
# Ask the sampler which lengths are valid for the arguments (1, 5) --
# presumably a minimum and maximum length; confirm against the sampler -- then
# draw 10 samples at lengths chosen from that set and display them.
valid_lengths = sampler.valid_lengths(1, 5)
samples = sample_with_length_constraint(
    sampler=sampler,
    generator=generator,
    valid_lengths=valid_lengths,
    sample_size=10,
)
samples

[['sub', 'la', 'crotifieda'],
 ['la', 'sub', 'scuppeda'],
 ['lawticaters', 'sub', 'peachicianeda'],
 ['se', 'si', 'lu', 'sub', 'madressized'],
 ['jeethe', 'stroater', 'sub', 'maksizes'],
 ['fampenates', 'rel', 'bellion', 'sub', 'bunessistes'],
 ['jont', 'businer', 'sub', 'flapperates'],
 ['sub', 'striberist', 'stroves'],
 ['la', 'sub', 'yawleda'],
 ['wolarifieda', 'rel', 'penursifers', 'sub', 'yolveda']]

In [13]:
import gzip

# Read the pre-generated samples: one sentence per line, split on whitespace
# into a list of tokens. The encoding is pinned to UTF-8 so that decoding does
# not depend on the machine's locale default, matching the other gzip reads in
# this notebook.
with gzip.open(
    "/cluster/home/tsomeya/projects/impossible_inherent_entropy/data/variations_wc/min1_max20_10K/0002002/samples.txt.gz",
    "rt",
    encoding="utf-8",
) as f:
    # str.split() with no separator already discards leading/trailing
    # whitespace (including the newline), so no separate strip() is needed.
    samples = [line.split() for line in f]

In [20]:
# Print every sample that has fewer than 4 tokens as a space-joined sentence.
for sample in samples:
    sent = " ".join(sample)
    if len(sample) >= 4:
        # Longer samples are skipped.
        continue
    print(sent)

sub pi crotified
sub bo benaticianes
la sub peachiciane
fantivist sub wolarified
sub la duppeda
frawkesses sub nestatate
la sub dupp
sub me rawned
drousemans sub tiraste
sub mofustator visified
drinimat sub elusifies
sub se mauces
justators sub calikateda
si sub crotify
sub ja engaried
sub si portify
la sub povicate
prither sub stoppisted
ja sub stroves
ja sub crotifies
sub me fampenates
sub tibs bottisteda
fragion sub miticurizes
sub ja nestatated
sub povicians poutereda
sub ja piliciated
noacher sub intrinsed
la sub stoppisteda
ja sub froachered
se sub yawls
si sub forigate
sub se jantomized
si sub madressize
wolarists sub thespirateda
sub bo smeshs
sub si mangerizeda
la sub inthippenate
sub me scupped
pi sub swarmicated
sub froachers phanicizeda
sub businator scupped
sub fumitists spectifieda
bo sub yolves
moldician sub apturicized
senths sub rolve
sub botigners elusifieda
me sub gratenizes
sub guendor lurchified
thullamor sub mued
sub ja prausifies
sub bellion phanicized
sub miltur

## Calculate log probabilities

In [15]:
from pathlib import Path
import gzip
import pandas as pd
from collections import Counter

import sys

sys.path.append("..")
from src.length_sampling.sampler import construct_pcfg_sampler
from src.length_sampling.grammars.pcfg import Grammar
from src.length_sampling.grammars.cfg import Nonterminal
from src.length_sampling.util import group_by, get_random_generator_and_seed

In [16]:
# Load the base grammar from a relative path, using the nonterminal "S" as the
# start symbol, and build a sampler from it. This rebinds `start`, `pcfg` and
# `sampler`, which are also assigned by earlier cells in this notebook.
start = Nonterminal("S")
pcfg = Grammar.from_file(
    start=start,
    normalize=True,
    file_path="../data_gen/base-grammar_eos_zipf.gr",
)
sampler = construct_pcfg_sampler(pcfg)

In [47]:
# # output
# with gzip.open(args.output_path, "wt", encoding="utf-8") as f:
#     sentence_counts.write_csv(f)
import json

# Load one split of the probability results: the gzip file is opened as UTF-8
# text and its whole content is parsed as a single JSON document.
with gzip.open(
    "/cluster/home/tsomeya/projects/impossible_inherent_entropy/results/length_sampling/100M_samples_eos_zipf_min1_max20/probabilities_split_1_of_10.json.gz",
    mode="rt",
    encoding="utf-8",
) as f:
    probabilities = json.loads(f.read())

In [48]:
# Display the object currently bound to `probabilities` (a bare last-line
# expression is rendered as the cell output).
probabilities

{'sentence': 'baniticians sub pi sub spectifies da onstigipates sa kurcheda rel baniticians sub maksizeda sa crail [eos]',
 'count': 1,
 'true_log_prob': -43.88993971696075,
 'true_prob': 8.686430912790711e-20}

In [49]:
# Parse the gzipped sample-counts file as a single JSON document.
# NOTE(review): the result is stored in `probabilities`, replacing whatever an
# earlier cell bound to that name, although the file is named "sample_counts".
# NOTE(review): a commented-out writer elsewhere in this notebook uses
# write_csv; if this file was produced that way it is CSV, not JSON -- confirm
# the format before relying on this cell.
with gzip.open(
    "/cluster/home/tsomeya/projects/impossible_inherent_entropy/results/length_sampling/100M_samples_eos_zipf_min1_max20/sample_counts.gz",
    mode="rt",
    encoding="utf-8",
) as f:
    probabilities = json.loads(f.read())