In [7]:
from pathlib import Path
import numpy as np
import random
from omnibelt import load_json

In [8]:
# Dataset the examples are drawn from. The aggregate file
# ('../data/cladder-v1-aggregate.json') is the alternative input; it used to be
# assigned here and immediately overwritten, so only the effective value is kept.
path = '../data/cladder-v1-common-easy.json'

In [68]:
# Question records, plus a lookup from question id to its record.
full = load_json(path)
full_ids = dict((question['question_id'], question) for question in full)

# Model records, plus a lookup from model id to its record.
models = load_json('../data/cladder-v1-models.json')
model_table = dict((spec['model_id'], spec) for spec in models)

len(full), len(models)

(10392, 7688)

In [10]:

class SkipItem(Exception):
	"""Raised to signal that an entry must be left out of the grouping."""

def _extract_key(entry, models):

	meta = entry.get('meta', {})

	model = models.get(entry.get('model_id', None), {})

	story_id = meta.get('story_id', None)
	graph_id = meta.get('graph_id', None)
	query_type = meta.get('query_type', None)
	answer = entry.get('answer', None)

	if query_type == 'correlation' and graph_id == 'collision':
		raise SkipItem


	return (
		meta.get('story_id', None),
        meta.get('graph_id', None),
        meta.get('query_type', None),
        entry.get('answer', None),
        model.get('difficulty', None),
        # 'not-anti' if model.get('anticommonsense', None) is None else 'anticommonsense',
        # 'nonsense' if meta.get('story_id', '').startswith('nonsense') else 'not-nonsense',
	        )


In [11]:

# Group question ids by their key; entries that raise SkipItem are left out.
stats = {}
for entry in full:
	try:
		key = _extract_key(entry, model_table)
	except SkipItem:
		continue
	stats.setdefault(key, []).append(entry['question_id'])
len(stats)

480

In [12]:
# Ten group keys drawn at random; random.choices samples with replacement,
# so the same key can show up more than once.
keys = random.choices([*stats], k=10)
keys

[('gender_admission', 'mediation', 'marginal', 'no', None),
 ('smoking_frontdoor', 'frontdoor', 'marginal', 'yes', None),
 ('celebrity', 'collision', 'marginal', 'no', None),
 ('smoking_gene_cancer', 'arrowhead', 'marginal', 'yes', None),
 ('college_wage', 'IV', 'ate', 'yes', None),
 ('firing_employee', 'diamondcut', 'ett', 'yes', None),
 ('encouagement_program', 'mediation', 'backadj', 'no', None),
 ('water_cholera', 'IV', 'marginal', 'yes', None),
 ('simpson_kidneystone', 'confounding', 'correlation', 'yes', None),
 ('college_salary', 'chain', 'ett', 'yes', None)]

In [64]:
# Column names, in the same order as the fields of each key in ``stats``.
cols = ['story_id', 'graph_id', 'query_type', 'answer', 'difficulty']

def find_indices(**conditions):
	"""Yield the question ids of every group in ``stats`` matching all conditions.

	Each keyword names a column from ``cols``; a group matches when its key
	holds exactly the given value in every named column. With no conditions,
	the ids of all groups are yielded. A keyword that is not in ``cols`` raises
	``ValueError`` (from ``list.index``) at the moment it is checked.
	"""
	for group_key, question_ids in stats.items():
		mismatch = any(
			group_key[cols.index(name)] != wanted
			for name, wanted in conditions.items()
		)
		if not mismatch:
			yield from question_ids

In [82]:
# LaTeX table skeleton. Placeholders are written as (name) so that the LaTeX
# braces stay readable; they are turned into str.format fields below.
template = r'''\begin{table}[h!]
    \centering
    \small
    \setlength\tabcolsep{5pt}
    \begin{tabular}{r | p{10cm}}
        \hline
         \textbf{Section} & \textbf{Text} \\
         \hline
         \textbf{Graph Structure} & (background) \\
         \textbf{Available Data} & (given_info)  \\
         \textbf{Question} & (question)  \\
         \hline
         \textbf{Answer} & (answer)  \\
         \hline
         \textbf{Reasoning Step \stepone{}} & (step1) \\
         \textbf{Step \steptwo{}} & (step2) \\
         \textbf{Step \stepthree{}} & (step3) \\
         \textbf{Step \stepfour{}} & (step4) \\
         \textbf{Step \stepfive{}} & (step5) \\
         \hline
         \textbf{Ground-Truth Scalar Value} & (groundtruth)  \\
        \hline
    \end{tabular}
    \caption{Example question asking about (query_type) for the ``(graph_id)'' graph using the story ``(story_id)''.}
    \label{tab:dataexample(example_id)}
\end{table}'''

# One pass over the text: literal braces are doubled so str.format leaves them
# alone, and the (name) markers become {name} fields.
template = template.translate(str.maketrans({
	'{': '{{',
	'}': '}}',
	'(': '{',
	')': '}',
}))
print(template)

\begin{{table}}[h!]
    \centering
    \small
    \setlength\tabcolsep{{5pt}}
    \begin{{tabular}}{{r | p{{10cm}}}}
        \hline
         \textbf{{Section}} & \textbf{{Text}} \\
         \hline
         \textbf{{Background}} & {background} \\
         \textbf{{Given Info}} & {given_info}  \\
         \textbf{{Question}} & {question}  \\
         \hline
         \textbf{{Answer}} & {answer}  \\
         \hline
         \textbf{{Reasoning Step \stepone{{}}}} & {step1} \\
         \textbf{{Step \steptwo{{}}}} & {step2} \\
         \textbf{{Step \stepthree{{}}}} & {step3} \\
         \textbf{{Step \stepfour{{}}}} & {step4} \\
         \textbf{{Step \stepfive{{}}}} & {step5} \\
         \hline
         \textbf{{Ground-Truth Scalar Value}} & {groundtruth}  \\
        \hline
    \end{{tabular}}
    \caption{{Example question asking about {query_type} for the ``{graph_id}'' graph using the story ``{story_id}''.}}
    \label{{tab:dataexample{example_id}}}
\end{{table}}


In [90]:
# Running counter used as the suffix of each rendered table's \label; the cell
# that formats the template reads it and then increments it.
id_num = 0

In [150]:
# Pick one question id at random among the groups matching the filters below.
# (Two earlier assignments to ``index`` were overwritten before any use and
# have been removed.)
options = list(find_indices(
	story_id='encouagement_program',  # sic: this is the spelling used in the group keys
	query_type='marginal',
	# graph_id='mediation',
))
print(len(options))
if not options:
	# random.choice would fail on an empty list with a less helpful message.
	raise ValueError('no questions match the selected filters')

index = random.choice(options)
print(options)
# To pin a specific example instead, override ``index`` here,
# e.g. ``index = options[0]`` or a known id such as 9426.
index

46
[5288, 5316, 5331, 5344, 5359, 5372, 5386, 5400, 5415, 5442, 5484, 5499, 5512, 5568, 5583, 5596, 5611, 5624, 5638, 5652, 5667, 5680, 5694, 5289, 5303, 5317, 5358, 5387, 5401, 5414, 5443, 5457, 5471, 5485, 5513, 5526, 5569, 5582, 5597, 5610, 5625, 5639, 5653, 5666, 5681, 5695]


5653

In [151]:
# NOTE(review): bare tuple, evaluated only so the notebook displays it. 9426 is
# also mentioned in a comment of the index-selection cell, so these are
# presumably question ids bookmarked for manual reuse — confirm, or delete.
9420, 9426

(9420, 9426)

In [152]:
# Display name for each query_type code; extract_data falls back to the raw
# code when it is not listed here.
query_types = dict([
	('ate', 'ATE'),
	('ett', 'ATT'),
	('nde', 'NDE'),
	('nie', 'NIE'),
	('correlation', 'Correlation'),
	('marginal', 'Marginal'),
	('collider_bias', 'Collider Bias'),
	('exp_away', 'Explaining Away'),
	('backadj', 'Backdoor Adjustment Set'),
	('det-counterfactual', 'Counterfactual'),
])

In [153]:
def extract_data(entry, models=None, type_names=None):
	"""Collect the template fields for one question entry.

	Args:
		entry: question record providing ``given_info``, ``question``,
			``answer``, ``reasoning`` (``step1`` .. ``step5``) and ``meta``
			(``model_id``, ``groundtruth``, ``query_type``, ``story_id``,
			``graph_id``).
		models: mapping from model id to model record (read for
			``background``). Defaults to the notebook-level ``model_table``.
		type_names: mapping from query-type code to display name. Defaults to
			the notebook-level ``query_types``; unknown codes are kept as is.

	Returns:
		Dict with one value per template field, except ``example_id``.

	Raises:
		KeyError: if the referenced model or a required field is missing.
	"""
	if models is None:
		models = model_table
	if type_names is None:
		type_names = query_types

	meta = entry.get('meta', {})

	model_id = meta.get('model_id', None)
	if model_id not in models:
		# Same exception type as before, but it now names the missing model
		# instead of reporting a missing 'background' key on an empty dict.
		raise KeyError(f'unknown model_id {model_id!r}')
	model = models[model_id]

	steps = entry['reasoning']

	# Numbers are shown with two decimals; anything else is passed through.
	groundtruth = meta['groundtruth']
	if isinstance(groundtruth, (float, int)):
		groundtruth = f'{groundtruth:.2f}'  # else str(set(meta['groundtruth'])),

	# str.capitalize() lowercases everything after the first character, which
	# turned the 'IV' graph into 'Iv'. Only the first letter is upper-cased.
	graph_id = meta['graph_id']
	graph_label = graph_id[:1].upper() + graph_id[1:]

	return {
		'background': model['background'],
		'given_info': entry['given_info'],
		'question': entry['question'],
		'answer': entry['answer'].capitalize(),
		'step1': steps['step1'],
		'step2': steps['step2'],
		'step3': steps['step3'],
		'step4': steps['step4'],
		'step5': steps['step5'],
		'groundtruth': groundtruth,
		'query_type': type_names.get(meta['query_type'], meta['query_type']),
		'story_id': meta['story_id'].replace('_', r'\_'),  # '_' must be escaped in LaTeX text
		'graph_id': graph_label,
	}

entry = full_ids[index]

# Every field is turned into a string so that str.format and the text clean-up
# below only ever deal with str values.
info = {k: str(v) for k, v in extract_data(entry).items()}

# Typeset each reasoning step as inline math, one ``$...$`` per line of the step.
for k in [name for name in info if name.startswith('step')]:
	# Blank lines are skipped: wrapping an empty string would emit ``$$``, which
	# LaTeX reads as the start of display math and which breaks the table.
	step_lines = [vx for vx in info[k].split('\n') if vx.strip()]
	v = r' \newline '.join(f'${vx}$' for vx in step_lines)
	v = v.replace('[', '(').replace(']', ')')
	v = v.replace('->', r'\rightarrow ').replace('<-', r'\leftarrow ')
	info[k] = v

# '%' would start a LaTeX comment, so it is escaped; ``do(`` is wrapped in \text.
example = template.format(example_id=id_num, **info).replace('%', r'\%').replace("do(", r"\text{do}(")
id_num += 1  # the next rendered example gets a different \label suffix
print(example)

\begin{table}[h!]
    \centering
    \small
    \setlength\tabcolsep{5pt}
    \begin{tabular}{r | p{10cm}}
        \hline
         \textbf{Section} & \textbf{Text} \\
         \hline
         \textbf{Background} & Imagine a self-contained, hypothetical world with only the following conditions, and without any unmentioned factors or causal relationships: Encouragement level has a direct effect on studying habit and exam score. Studying habit has a direct effect on exam score. \\
         \textbf{Given Info} & The overall probability of encouragement is 9\%. For students who are not encouraged, the probability of high exam score is 6\%. For students who are encouraged, the probability of high exam score is 49\%.  \\
         \textbf{Question} & Is high exam score less likely than low exam score overall?  \\
         \hline
         \textbf{Answer} & Yes  \\
         \hline
         \textbf{Reasoning Step \stepone{}} & $X\rightarrow V2,X\rightarrow Y,V2\rightarrow Y$ \\
         \textbf{S