In [None]:
# 1) Imports & immediate stdout flushing
import torch, random, sys, logging
import numpy as np
from tqdm.auto import trange
from argparse import Namespace
import pretrained_rl_hangman as prl   # your uploaded script

# 2) Build args exactly as you'd pass them on the CLI
# Run configuration, mirroring the script's command-line options.
# The whole namespace is handed to prl.Agent; fields not read directly in this
# cell are presumably consumed there (TODO confirm against pretrained_rl_hangman).
args = Namespace(
    # -- word list to train on and checkpoint destination --
    words="words_250000_train.txt",
    save="best_qhead.pth",
    # -- encoder --
    model_name="prajjwal1/bert-tiny",
    bert_dim=128,  # hidden size of bert-tiny
    # -- training run --
    episodes=20_000,
    batch_size=128,
    lr=0.001,
    gamma=0.99,
    max_wrong=6,
    memory_size=50_000,
    hidden_dim=128,
    # -- exploration schedule (used by the epsilon shown in the episode log) --
    eps_start=1.0,
    eps_end=0.01,
    eps_decay=20_000,
    # -- cadence, all counted in episodes --
    target_update=1_000,
    eval_interval=50,
    log_interval=1,
    # -- train/test split --
    test_size=0.2,
    seed=42,
)

# Use the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# 3) Inline training loop with prints every episode + running accuracy
def _evaluate_sample(agent, words, max_wrong):
    """Play ``len(words)`` evaluation games and return the win rate in percent.

    Each game uses a fresh ``prl.HangmanEnv`` built over *words*; which word a
    given game uses is decided by the environment's ``reset()``.

    NOTE(review): actions come from ``agent.select``, the same call used
    during training, so evaluation may still explore and may advance the
    agent's step counter -- confirm against ``prl.Agent``.

    Args:
        agent: object exposing ``select(state, mask)``.
        words: non-empty sequence of words to build the environment from.
        max_wrong: forwarded to ``prl.HangmanEnv`` as ``max_wrong``.

    Returns:
        Wins divided by the number of games played, times 100.
    """
    wins = 0
    for _ in words:
        env = prl.HangmanEnv(words, max_wrong=max_wrong)
        state, mask = env.reset(), env.legal_mask()
        done = False
        info = {}
        while not done:
            action = agent.select(state, mask)
            state, _reward, done, info = env.step(action)
            mask = env.legal_mask()
        if info.get('win'):
            wins += 1
    # The denominator is the number of games actually played, not the size
    # of the full test set the sample was drawn from.
    return wins / len(words) * 100


def train_inline(args):
    """Train the agent episode by episode, printing progress as it goes.

    Splits ``args.words`` into train/test word lists, builds a ``prl.Agent``,
    then for ``args.episodes`` episodes plays one game on the training words,
    pushing every transition to the agent and calling ``agent.optimize()``
    after each step. Every ``args.target_update`` episodes the target network
    is refreshed, every ``args.log_interval`` episodes a status line is
    printed, and every ``args.eval_interval`` episodes a small random sample
    of test words is evaluated. The policy weights are written to
    ``args.save`` whenever an evaluation matches or beats the best accuracy
    seen so far.

    Args:
        args: namespace with the fields ``words``, ``test_size``, ``seed``,
            ``episodes``, ``max_wrong``, ``target_update``, ``log_interval``,
            ``eval_interval``, ``eps_start``, ``eps_end``, ``eps_decay`` and
            ``save`` (plus whatever ``prl.Agent`` reads). An optional
            ``eval_words`` field sets the evaluation sample size (default 5).

    Returns:
        None. Results are reported on stdout and via the saved checkpoint.
    """
    train_words, test_words = prl.load_and_split(
        args.words, test_size=args.test_size, seed=args.seed
    )

    vocab_size = len(prl.HangmanEnv(train_words).vocab)
    agent = prl.Agent(args, device, vocab_size)

    # Number of test words drawn per evaluation; optional so existing
    # argument namespaces keep working unchanged.
    eval_words = getattr(args, "eval_words", 5)

    best_acc = 0.0
    train_wins = 0

    for ep in trange(1, args.episodes + 1, desc="Episodes"):
        env = prl.HangmanEnv(train_words, max_wrong=args.max_wrong)
        state = env.reset()
        mask = env.legal_mask()
        total_r = 0.0
        done = False
        info = {'win': False}

        # Play one episode, learning after every step.
        while not done:
            action = agent.select(state, mask)
            next_state, reward, done, info = env.step(action)
            next_mask = env.legal_mask()
            agent.push(state, mask, action, reward, next_state, next_mask, done)
            state, mask = next_state, next_mask
            total_r += reward
            agent.optimize()

        # Read the outcome once so the counter and the log line always agree,
        # even if the environment's info dict omits 'win'.
        won = bool(info.get('win'))
        if won:
            train_wins += 1

        # Update target network.
        if ep % args.target_update == 0:
            agent.update_target()

        # Per-episode log. Epsilon is recomputed here for display only, from
        # the agent's step counter and the configured schedule.
        if ep % args.log_interval == 0:
            eps = args.eps_end + (args.eps_start - args.eps_end) * \
                  np.exp(-agent.steps_done / args.eps_decay)
            train_acc = train_wins / ep * 100
            print(f"[{ep}/{args.episodes}] "
                  f"{'WIN ' if won else 'LOSS'}"
                  f"  R={total_r:.2f}  Eps={eps:.3f}"
                  f"  TrainAcc={train_acc:.2f}%",
                  flush=True)

        # Periodic evaluation on a random sample of held-out words.
        if ep % args.eval_interval == 0:
            sample = random.sample(test_words, min(eval_words, len(test_words)))
            if sample:  # nothing to evaluate (and nothing to divide by) otherwise
                acc = _evaluate_sample(agent, sample, args.max_wrong)
                print(f"→ Eval @ {ep}: Test Acc = {acc:.2f}%", flush=True)

                # Checkpoint only when this evaluation is at least as good as
                # the best so far; ties go to the newer weights, which also
                # guarantees a file exists after the first evaluation.
                if acc >= best_acc:
                    best_acc = acc
                    print("Saving File")
                    torch.save(agent.policy.state_dict(), args.save)

    print(f"\n📈 Done — best test acc = {best_acc:.2f}% "
          f"(saved to {args.save})", flush=True)

# 4) Run it!
# Starts the full training run with the `args` namespace built earlier in this
# cell; progress appears as a tqdm bar plus the printed per-episode lines.
train_inline(args)

Episodes:   0%|          | 0/20000 [00:00<?, ?it/s]

[1/20000] LOSS  R=1.10  Eps=1.000  TrainAcc=0.00%
[2/20000] LOSS  R=-30.00  Eps=0.999  TrainAcc=0.00%
[3/20000] LOSS  R=2.11  Eps=0.999  TrainAcc=0.00%
[4/20000] LOSS  R=22.68  Eps=0.998  TrainAcc=0.00%
[5/20000] LOSS  R=12.04  Eps=0.998  TrainAcc=0.00%
[6/20000] LOSS  R=2.48  Eps=0.997  TrainAcc=0.00%
[7/20000] LOSS  R=-9.27  Eps=0.997  TrainAcc=0.00%
[8/20000] LOSS  R=-9.02  Eps=0.997  TrainAcc=0.00%
[9/20000] LOSS  R=23.22  Eps=0.996  TrainAcc=0.00%
[10/20000] LOSS  R=-8.94  Eps=0.996  TrainAcc=0.00%
[11/20000] LOSS  R=-19.35  Eps=0.995  TrainAcc=0.00%
[12/20000] LOSS  R=-19.29  Eps=0.995  TrainAcc=0.00%
[13/20000] LOSS  R=11.68  Eps=0.994  TrainAcc=0.00%
[14/20000] LOSS  R=1.61  Eps=0.994  TrainAcc=0.00%
[15/20000] LOSS  R=-30.00  Eps=0.994  TrainAcc=0.00%
[16/20000] LOSS  R=1.83  Eps=0.993  TrainAcc=0.00%
[17/20000] LOSS  R=2.60  Eps=0.993  TrainAcc=0.00%
[18/20000] LOSS  R=-19.30  Eps=0.992  TrainAcc=0.00%
[19/20000] LOSS  R=0.92  Eps=0.992  TrainAcc=0.00%
[20/20000] LOSS  R=-8.2