In [None]:
purpose: learn how instructions change model behaviour

questions:
- how sensitive is the model to vague vs precise prompts?
- does assigning a persona change factual correctness or style?
- does reordering instructions/examples affect output?
- does step-by-step improve accuracy in reasoning?

concepts:
- zero-shot: clear instructions
- few-shot: give examples 
- chain-of-thought: ask model to think step-by-step to improve reasoning
- self-consistency: sample multiple reasoning paths and pick the most frequent final answer

### experiment: sensitivity - vague vs precise prompts

- hypothesis: precise prompts -> higher accuracy, less output variance
- dataset: 150-300 short tasks (QA, extraction, summarization)
- conditions: 
    - model
    - temp=0, 0.7
    - 3 seeds per item
- metrics: exact-match/f1, token-length, embedding cosine variance across seeds
- analysis: paired t-test or Wilcoxon signed-rank test for metric differences; bootstrap CI.

In [None]:
import os, json, time, math, random, boto3
import pandas as pd
import numpy as np
from tqdm import tqdm
from scipy import stats
from sklearn.metrics.pairwise import cosine_similarity

# Turn off pandas truncation so every column, row and cell is displayed in full.
for _display_option in ("display.max_columns", "display.max_rows", "display.max_colwidth"):
    pd.set_option(_display_option, None)
del _display_option  # loop variable only; keep it out of the notebook namespace

# To restore the pandas defaults, run:
# pd.reset_option("display.max_columns")
# pd.reset_option("display.max_rows")
# pd.reset_option("display.max_colwidth")

# Run settings. Presumably consumed by experiment cells outside this view;
# confirm usages before renaming or removing any of them.

# -- data --
dataset_path = "data/qa_or_sum.csv"
mode = "qa"
n_samples = 200
output_log = ""

# -- model / sampling --
brk_model_id = "anthropic.claude-3-5-haiku-20241022-v1:0"
temperatures = [0.0, 0.7]
seeds = [37, 20, 9]
max_tokens = 256
batch_delay = 0.1  # presumably seconds between requests; verify against the caller

# Retry policy handed to the Bedrock client configuration.
retry_config = dict(max_attempts=5, mode="adaptive")

# retry_config = {
#     "max_attempts": 5,
#     "mode": "standard",
#     "retryable_exceptions": [
#         "ThrottlingException",
#         "ProvisionedThroughputExceededException",
#         "RequestLimitExceeded",
#         "ServiceUnavailableException",
#         "InternalServerException",
#     ],
# }

# Bedrock runtime client shared by the notebook's model calls.
# Timeouts are in seconds; retries follow `retry_config`.
brk_rt = boto3.client(
    "bedrock-runtime",
    region_name="us-east-1",
    config=boto3.session.Config(
        retries=retry_config,
        connect_timeout=60,
        read_timeout=60,
        max_pool_connections=100,
    ),
)

In [None]:
# brk = boto3.client(service_name="bedrock", region_name="us-east-1")

# response = brk.list_foundation_models()
# df = pd.DataFrame(response["modelSummaries"])
# df.head()

# df.groupby("providerName")["modelId"].apply(lambda x: x).loc["Anthropic"]
# df.loc[df["providerName"] == "Anthropic", ["modelId", "modelArn"]]

0          anthropic.claude-opus-4-1-20250805-v1:0
39              anthropic.claude-instant-v1:2:100k
40                     anthropic.claude-instant-v1
41                       anthropic.claude-v2:0:18k
42                      anthropic.claude-v2:0:100k
43                       anthropic.claude-v2:1:18k
44                      anthropic.claude-v2:1:200k
45                           anthropic.claude-v2:1
46                             anthropic.claude-v2
47     anthropic.claude-3-sonnet-20240229-v1:0:28k
48    anthropic.claude-3-sonnet-20240229-v1:0:200k
49         anthropic.claude-3-sonnet-20240229-v1:0
50      anthropic.claude-3-haiku-20240307-v1:0:48k
51     anthropic.claude-3-haiku-20240307-v1:0:200k
52          anthropic.claude-3-haiku-20240307-v1:0
53       anthropic.claude-3-opus-20240229-v1:0:12k
54       anthropic.claude-3-opus-20240229-v1:0:28k
55      anthropic.claude-3-opus-20240229-v1:0:200k
56           anthropic.claude-3-opus-20240229-v1:0
57       anthropic.claude-3-5-s

In [None]:
def invoke_bedrock(model_id, inputs, client=None):
    """Call a Bedrock model through ``invoke_model`` and return the parsed JSON reply.

    ``invoke_model`` has no ``inferenceConfig`` argument (that belongs to the
    Converse API), so the sampling parameters are merged into the JSON request
    body instead, using the Anthropic Messages key names ``temperature``,
    ``max_tokens`` and ``stop_sequences``.

    Parameters
    ----------
    model_id : str
        Bedrock model identifier, forwarded as ``modelId``.
    inputs : dict
        ``inputs["body"]`` is the model-native request payload (a mapping that
        is JSON-serialisable). Optional top-level keys ``"temperature"``,
        ``"max_tokens"`` and ``"stop_sequences"`` override the same keys in the
        body; when absent from both, the defaults ``0.0``, ``1024`` and
        ``["\\n\\n"]`` are used.
    client : object, optional
        Anything exposing ``invoke_model(**kwargs)``. Defaults to the
        module-level ``brk_rt`` client.

    Returns
    -------
    dict
        The decoded JSON response, or
        ``{"error": "Failed to parse response", "raw": <text>}`` when the
        response body is not valid JSON.

    Raises
    ------
    KeyError
        If ``inputs`` has no ``"body"`` entry.
    """
    # NOTE(review): the merged key names assume an Anthropic Messages payload,
    # matching the configured model id; other providers use different names.
    # NOTE(review): the whitespace-only default stop sequence may be rejected
    # by the Anthropic API; confirm it works for the target model.
    defaults = {
        "temperature": 0.0,
        "max_tokens": 1024,
        "stop_sequences": ["\n\n"],
    }

    # Copy so the caller's body dict is never mutated.
    payload = dict(inputs["body"])
    for key, fallback in defaults.items():
        if key in inputs:
            payload[key] = inputs[key]
        else:
            payload.setdefault(key, fallback)

    runtime = brk_rt if client is None else client
    response = runtime.invoke_model(
        modelId=model_id,
        body=json.dumps(payload),
        contentType="application/json",
        accept="application/json",
    )

    raw = response["body"].read().decode("utf-8")
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        # Best-effort: hand back the raw text so the caller can inspect it.
        return {"error": "Failed to parse response", "raw": raw}

### experiment: instruction ordering

- Hypothesis: Order alters adherence when instructions conflict or format matters.
- Dataset: 150 tasks where output format is strict (e.g., produce JSON schema).
- Prompts: same instruction blocks in different orders (A then B, B then A, interleave).
- Metrics: format-compliance rate (regex/JSON parse), content correctness, instruction-following score.
- Analysis: chi-square on compliance; logistic regression with order as factor.
- Tip: randomize order per item to avoid dataset-order confound.

### experiment: cot vs no-cot

- Hypothesis: CoT improves multi-step reasoning accuracy (esp. math/logical).
- Dataset: 200 reasoning problems (GSM8K-style).
- Prompts:
    - No-CoT: Answer: {q}
    - CoT: Answer: Let's think step-by-step. {q}
- Metrics: accuracy, token cost, average reasoning length.
- Analysis: McNemar test for paired accuracy; report cost vs gain.