# 토크나이저

In [3]:
import tiktoken

# Load the GPT-2 byte-pair encoding from tiktoken.
tokenizer = tiktoken.get_encoding("gpt2")

# Sample sentence (note the two leading spaces).
text = "  which Harry watched fly around the room wishing he still had his fulls"

tokens = tokenizer.encode(text)

print("글자수 :", len(text))  # character count: 72 for this text
print("토큰수 :", len(tokens))  # token count: 15 for this text
print(tokens)  # [220, 543, 5850, 7342, ...]
print(tokenizer.decode(tokens))  # decodes back to the original sentence
# Decode each id on its own to see which piece of text every token covers.
for token in tokens:
    print(token, ":", tokenizer.decode([token]))

글자수 : 72
토큰수 : 15
[220, 543, 5850, 7342, 6129, 1088, 262, 2119, 24433, 339, 991, 550, 465, 1336, 82]
  which Harry watched fly around the room wishing he still had his fulls
220 :  
543 :  which
5850 :  Harry
7342 :  watched
6129 :  fly
1088 :  around
262 :  the
2119 :  room
24433 :  wishing
339 :  he
991 :  still
550 :  had
465 :  his
1336 :  full
82 : s


In [4]:
from transformers import AutoTokenizer

# 1. Load the GPT-2 tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# 2. GPT-2 has no default pad_token, so set one explicitly (EOS is reused).
#    Consequence: pad_token_id and eos_token_id are the same id afterwards.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("vocab_size:", tokenizer.vocab_size)
print("pad_token:", tokenizer.pad_token, "id:", tokenizer.pad_token_id)
print("eos_token:", tokenizer.eos_token, "id:", tokenizer.eos_token_id)

  from .autonotebook import tqdm as notebook_tqdm


vocab_size: 50257
pad_token: <|endoftext|> id: 50256
eos_token: <|endoftext|> id: 50256


In [5]:
def load_text_samples(path: str):
    """Read a UTF-8 text file and return its blank-line-separated samples.

    The file content is stripped, split on "\n\n", and every piece is stripped
    again; pieces that end up empty are dropped.
    """
    with open(path, "r", encoding="utf-8") as fh:
        pieces = fh.read().strip().split("\n\n")
    # Samples are separated by one blank line, hence the "\n\n" delimiter.
    trimmed = (piece.strip() for piece in pieces)
    return [sample for sample in trimmed if sample]

# Load every training sample from lotto_train.txt and report how many there are.
train_samples = load_text_samples("lotto_train.txt")
print("샘플 개수:", len(train_samples))

# Print the first sample as a reference for the text format.
example = train_samples[0]
print("원본 샘플:")
print(example)

샘플 개수: 10000
원본 샘플:
money=4000
winning=1,11,15,18,29,38
bonus=33
###
티켓수=4
구매번호:
[2,7,9,15,16,18]
[3,6,28,35,38,44]
[2,6,14,15,33,39]
[2,13,27,35,36,42]
3개일치=0
4개일치=0
5개일치=0
5개보너스일치=0
6개일치=0
수익률=0.0%


In [6]:
max_len = 256  # provisional value

# Tokenize one sample, padding/truncating it to exactly max_len tokens.
enc = tokenizer(
    example,
    max_length=max_len,
    padding="max_length",
    truncation=True,
    return_tensors="pt",  # return PyTorch tensors
)

input_ids = enc["input_ids"]        # shape: (1, max_len)
attention_mask = enc["attention_mask"]  # shape: (1, max_len)

print("input_ids shape:", input_ids.shape)
print("attention_mask shape:", attention_mask.shape)

# Decode to check that the text is restored correctly
decoded = tokenizer.decode(input_ids[0], skip_special_tokens=True)
print("디코딩된 텍스트:")
print(decoded)

input_ids shape: torch.Size([1, 256])
attention_mask shape: torch.Size([1, 256])
디코딩된 텍스트:
money=4000
winning=1,11,15,18,29,38
bonus=33
###
티켓수=4
구매번호:
[2,7,9,15,16,18]
[3,6,28,35,38,44]
[2,6,14,15,33,39]
[2,13,27,35,36,42]
3개일치=0
4개일치=0
5개일치=0
5개보너스일치=0
6개일치=0
수익률=0.0%


In [None]:
from torch.utils.data import Dataset
import torch

class LottoDataset(Dataset):
    """Next-token-prediction dataset over lotto sample texts.

    Each item is one sample with the tokenizer's EOS string appended,
    tokenized to exactly ``max_len + 1`` ids and split into a shifted
    input/target pair.
    """

    def __init__(self, texts, tokenizer, max_len: int):
        """Store the samples and tokenization settings.

        Args:
            texts: Sequence of raw sample strings.
            tokenizer: Tokenizer exposing ``eos_token`` and callable with
                ``max_length``, ``padding``, ``truncation`` and
                ``return_tensors`` keyword arguments.
            max_len: Length of the ``x`` / ``y`` tensors returned per item.
        """
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.eos = tokenizer.eos_token

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(x, y, x_mask)`` for sample ``idx``.

        Returns:
            x: Input ids, the first ``max_len`` tokens.
            y: Target ids, the same sequence shifted left by one token.
            x_mask: Attention mask aligned with ``x``.
        """
        # Fix: the original bound ``row_txt`` but read ``raw_txt`` (NameError).
        raw_txt = self.texts[idx]

        # Append EOS so the end of a sample is part of the target sequence.
        # NOTE(review): this notebook sets pad_token = eos_token, so a loss
        # built with ignore_index=pad_id also skips this real EOS target;
        # confirm the model can actually learn to stop.
        # NOTE(review): truncation=True can cut the EOS off samples that are
        # longer than max_len tokens.
        txt = raw_txt + self.eos

        # Tokenize to max_len + 1 ids so that x = ids[:-1] and y = ids[1:].
        enc = self.tokenizer(
            txt,
            max_length=self.max_len + 1,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        ids = enc["input_ids"][0]        # (max_len+1,)
        attn_mask = enc["attention_mask"][0]  # (max_len+1,)

        # Language-model input/target pair
        x = ids[:-1]        # (max_len,)
        y = ids[1:]         # (max_len,)
        x_mask = attn_mask[:-1]

        return x, y, x_mask

In [8]:
# Build the dataset and inspect the first (x, y, x_mask) triple.
max_len = 256
train_ds = LottoDataset(train_samples, tokenizer, max_len=max_len)

x, y, x_mask = train_ds[0]

print("x shape:", x.shape)          # torch.Size([256])
print("y shape:", y.shape)          # torch.Size([256])
print("x_mask shape:", x_mask.shape)

# y should be x shifted left by one token.
print("x[:20]:", x[:20])
print("y[:20]:", y[:20])

x shape: torch.Size([256])
y shape: torch.Size([256])
x_mask shape: torch.Size([256])
x[:20]: tensor([26316,    28, 27559,   198, 14463,    28,    16,    11,  1157,    11,
         1314,    11,  1507,    11,  1959,    11,  2548,   198,  4189,   385])
y[:20]: tensor([   28, 27559,   198, 14463,    28,    16,    11,  1157,    11,  1314,
           11,  1507,    11,  1959,    11,  2548,   198,  4189,   385,    28])


DataSet 출력값 예시

X TEXT:
money=8000
winning=1,2,3,4,5,6
bonus=7
###

Y TEXT:
oney=8000
winning=1,2,3,4,5,6
bonus=7
###
티

# 전처리 중간정리
1.	GPT-2 토크나이저 선택
2.	AutoTokenizer.from_pretrained("gpt2")로 로딩
3.	pad_token 설정 (eos 재사용)
4.	lotto_train.txt에서 샘플 읽음
5.	토크나이저 encode/decode 테스트
6.	그걸 쓰는 LottoDataset 클래스 정의

In [9]:
from transformers import AutoTokenizer

# Prepare the GPT-2 tokenizer (reuse EOS as the pad token when none is set)
tokenizer = AutoTokenizer.from_pretrained("gpt2")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("pad_token:", tokenizer.pad_token)
print("pad_token_id:", tokenizer.pad_token_id)
print("vocab_size:", tokenizer.vocab_size)


# lotto_train.txt 읽기
def load_text_samples(path):
    """Return the non-empty, stripped samples of a UTF-8 text file.

    The stripped file content is split on "\n\n" (one blank line between
    samples); each piece is stripped and empty pieces are skipped.
    """
    with open(path, "r", encoding="utf-8") as fh:
        raw = fh.read().strip()
    samples = []
    for piece in raw.split("\n\n"):
        piece = piece.strip()
        if piece:
            samples.append(piece)
    return samples

train_samples = load_text_samples("lotto_train.txt")

# Take a single sample
example = train_samples[0]
# print("=== ORIGINAL TEXT ===")
# print(example)

# encode: pad/truncate to exactly 256 tokens
enc = tokenizer(
    example,
    max_length=256,
    padding="max_length",
    truncation=True,
    return_tensors="pt"
)

# Drop the leading batch dimension of size 1.
input_ids = enc["input_ids"][0]
attention_mask = enc["attention_mask"][0]

print("\ninput_ids shape:", input_ids.shape)
print("attention_mask shape:", attention_mask.shape)

# decode (special tokens, including the padding, are skipped)
decoded = tokenizer.decode(input_ids, skip_special_tokens=True)

print("\n=== DECODED TEXT ===")
print(decoded)

pad_token: <|endoftext|>
pad_token_id: 50256
vocab_size: 50257

input_ids shape: torch.Size([256])
attention_mask shape: torch.Size([256])

=== DECODED TEXT ===
money=4000
winning=1,11,15,18,29,38
bonus=33
###
티켓수=4
구매번호:
[2,7,9,15,16,18]
[3,6,28,35,38,44]
[2,6,14,15,33,39]
[2,13,27,35,36,42]
3개일치=0
4개일치=0
5개일치=0
5개보너스일치=0
6개일치=0
수익률=0.0%


# 데이터셋 확인

In [10]:
def load_text_samples(path: str):
    """Load samples from a UTF-8 text file, one sample per "\n\n"-separated chunk.

    Surrounding whitespace is removed from the file content and from each
    chunk; chunks that are empty after stripping are discarded.
    """
    with open(path, "r", encoding="utf-8") as handle:
        content = handle.read().strip()
    # Assumes one blank line between samples, so split on "\n\n".
    cleaned = map(str.strip, content.split("\n\n"))
    return [chunk for chunk in cleaned if chunk]

# Load the training samples and show the first one as a sanity check.
train_texts = load_text_samples("lotto_train.txt")
print("train 샘플 수:", len(train_texts))
print("첫 샘플 원본:")
print(train_texts[0])

train 샘플 수: 10000
첫 샘플 원본:
money=4000
winning=1,11,15,18,29,38
bonus=33
###
티켓수=4
구매번호:
[2,7,9,15,16,18]
[3,6,28,35,38,44]
[2,6,14,15,33,39]
[2,13,27,35,36,42]
3개일치=0
4개일치=0
5개일치=0
5개보너스일치=0
6개일치=0
수익률=0.0%


In [11]:
from torch.utils.data import DataLoader

max_len = 256  # assume roughly 256 for now
batch_size = 4

train_ds = LottoDataset(train_texts, tokenizer, max_len=max_len)
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

# Pull a single batch to check the shapes
x_batch, y_batch, mask_batch = next(iter(train_loader))

print("x_batch shape:", x_batch.shape)        # (B, T) = (4, 256)
print("y_batch shape:", y_batch.shape)        # (4, 256)
print("mask_batch shape:", mask_batch.shape)  # (4, 256)

x_batch shape: torch.Size([4, 256])
y_batch shape: torch.Size([4, 256])
mask_batch shape: torch.Size([4, 256])


In [12]:
# Look at only the first sample of the batch
x = x_batch[0]        # (T,)
y = y_batch[0]        # (T,)
mask = mask_batch[0]  # (T,)

# Convert the tensors to plain lists
x_ids = x.tolist()
y_ids = y.tolist()

# Restore the text
x_text = tokenizer.decode(x_ids, skip_special_tokens=True)
y_text = tokenizer.decode(y_ids, skip_special_tokens=True)

print("===== X TEXT (모델 입력) =====")
print(x_text)

print("\n===== Y TEXT (정답 타깃) =====")
print(y_text)

===== X TEXT (모델 입력) =====
money=10000
winning=5,15,22,25,35,42
bonus=21
###
티켓수=10
구매번호:
[3,15,30,32,41,42]
[10,16,24,27,37,39]
[17,20,24,27,35,44]
[1,2,5,23,27,36]
[11,13,30,33,35,37]
[1,13,16,20,22,30]
[4,10,21,26,39,41]
[2,7,17,26,38,41]
[4,10,22,24,28,36]
[1,3,25,36,40,42]
3개일치=0
4개일치=0
5개일치=0
5개보너스일치=0
6개일�

===== Y TEXT (정답 타깃) =====
=10000
winning=5,15,22,25,35,42
bonus=21
###
티켓수=10
구매번호:
[3,15,30,32,41,42]
[10,16,24,27,37,39]
[17,20,24,27,35,44]
[1,2,5,23,27,36]
[11,13,30,33,35,37]
[1,13,16,20,22,30]
[4,10,21,26,39,41]
[2,7,17,26,38,41]
[4,10,22,24,28,36]
[1,3,25,36,40,42]
3개일치=0
4개일치=0
5개일치=0
5개보너스일치=0
6개일치


In [13]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer

class GPTConfig:
    """Hyperparameters for the MiniGPT model.

    Attributes mirror the constructor arguments one-to-one.
    """

    def __init__(
        self,
        vocab_size: int,
        n_layer: int = 4,
        n_head: int = 4,
        d_model: int = 256,
        d_ff: int = 1024,
        max_len: int = 256,
        dropout: float = 0.1,
        pad_id: int = 0,
    ):
        """Store the hyperparameters.

        Args:
            vocab_size: Number of token ids (embedding rows / output logits).
            n_layer: Number of transformer blocks.
            n_head: Number of attention heads.
            d_model: Embedding / hidden width; must be divisible by n_head.
            d_ff: Hidden width of the feed-forward sub-layer.
            max_len: Maximum sequence length (position table and mask size).
            dropout: Dropout probability used throughout the model.
            pad_id: Id of the padding token.

        Raises:
            ValueError: If n_head is not positive or d_model is not divisible
                by n_head.
        """
        # The attention layer reshapes d_model into n_head equal slices; fail
        # here with a clear message instead of a shape error inside forward().
        if n_head <= 0 or d_model % n_head != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by a positive n_head ({n_head})"
            )

        self.vocab_size = vocab_size
        self.n_layer = n_layer
        self.n_head = n_head
        self.d_model = d_model
        self.d_ff = d_ff
        self.max_len = max_len
        self.dropout = dropout
        self.pad_id = pad_id

In [14]:
class CausalSelfAttention(nn.Module):
    """Multi-head self-attention restricted by a lower-triangular mask.

    Each position attends only to itself and earlier positions; an optional
    per-token mask additionally hides keys whose mask value is 0.
    """

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.n_head = config.n_head
        self.d_model = config.d_model
        self.dropout = nn.Dropout(config.dropout)

        # One linear layer produces queries, keys and values together.
        self.qkv = nn.Linear(config.d_model, 3 * config.d_model)
        self.proj = nn.Linear(config.d_model, config.d_model)

        # Lower-triangular ones, stored as a buffer so it moves with the module.
        size = config.max_len
        lower_tri = torch.ones(size, size).tril()
        self.register_buffer("causal_mask", lower_tri.view(1, 1, size, size))

    def forward(self, x, attn_mask=None):
        """Apply attention to x of shape (B, T, C); attn_mask is (B, T) or None."""
        batch, seq_len, width = x.size()
        heads = self.n_head
        head_dim = width // heads

        def to_heads(t):
            # (B, T, C) -> (B, H, T, head_dim)
            return t.view(batch, seq_len, heads, head_dim).transpose(1, 2)

        q, k, v = (to_heads(part) for part in self.qkv(x).split(width, dim=2))

        # Scaled dot-product scores, (B, H, T, T)
        scores = (q @ k.transpose(-2, -1)) / (head_dim ** 0.5)

        # Hide future positions.
        future = self.causal_mask[:, :, :seq_len, :seq_len] == 0
        scores = scores.masked_fill(future, float("-inf"))

        if attn_mask is not None:
            # (B, T) -> (B, 1, 1, T): hide keys whose mask value is 0.
            hidden_keys = attn_mask.view(batch, 1, 1, seq_len) == 0
            scores = scores.masked_fill(hidden_keys, float("-inf"))

        weights = self.dropout(torch.softmax(scores, dim=-1))

        # (B, H, T, head_dim) -> (B, T, C)
        merged = (weights @ v).transpose(1, 2).contiguous().view(batch, seq_len, width)
        return self.dropout(self.proj(merged))

In [15]:
class TransformerBlock(nn.Module):
    """One transformer layer: attention then feed-forward, each with a residual.

    LayerNorm is applied to the input of each sub-layer before it runs.
    """

    def __init__(self, config: GPTConfig):
        super().__init__()
        width, hidden = config.d_model, config.d_ff
        self.ln1 = nn.LayerNorm(width)
        self.attn = CausalSelfAttention(config)
        self.ln2 = nn.LayerNorm(width)
        feed_forward = [
            nn.Linear(width, hidden),
            nn.GELU(),
            nn.Linear(hidden, width),
            nn.Dropout(config.dropout),
        ]
        self.ff = nn.Sequential(*feed_forward)

    def forward(self, x, attn_mask=None):
        """Return x after the attention and feed-forward residual updates."""
        attended = self.attn(self.ln1(x), attn_mask=attn_mask)
        x = x + attended
        return x + self.ff(self.ln2(x))

In [16]:
class MiniGPT(nn.Module):
    """Small GPT-style language model.

    Token and learned position embeddings feed a stack of TransformerBlock
    layers, followed by a final LayerNorm and a linear head over the vocabulary.
    """

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.config = config

        width = config.d_model
        self.tok_emb = nn.Embedding(config.vocab_size, width)
        self.pos_emb = nn.Embedding(config.max_len, width)
        self.dropout = nn.Dropout(config.dropout)

        layers = []
        for _ in range(config.n_layer):
            layers.append(TransformerBlock(config))
        self.blocks = nn.ModuleList(layers)

        self.ln_f = nn.LayerNorm(width)
        self.head = nn.Linear(width, config.vocab_size, bias=False)

        # (Optional) tie the output head weight to the input embedding.
        self.head.weight = self.tok_emb.weight

    def forward(self, idx, attn_mask=None):
        """Map token ids idx of shape (B, T) to logits of shape (B, T, vocab_size)."""
        batch, seq_len = idx.size()

        # Position indices 0..T-1, expanded to (B, T)
        positions = torch.arange(0, seq_len, dtype=torch.long, device=idx.device)
        positions = positions.unsqueeze(0).expand(batch, seq_len)

        hidden = self.dropout(self.tok_emb(idx) + self.pos_emb(positions))

        for layer in self.blocks:
            hidden = layer(hidden, attn_mask=attn_mask)

        return self.head(self.ln_f(hidden))

In [17]:
@torch.no_grad()
def generate_text(model, tokenizer, prompt: str, max_new_tokens: int = 200, device="cpu"):
    """Sample a continuation of ``prompt`` from ``model``, one token at a time.

    Args:
        model: Callable as ``model(ids, attn_mask=...)`` returning logits of
            shape (1, T, vocab); must expose ``config.max_len`` and ``eval()``.
        tokenizer: Tokenizer used to encode the prompt and decode the result.
        prompt: Text to continue.
        max_new_tokens: Number of tokens to sample.
        device: Device the input tensors are moved to.

    Returns:
        The decoded text of the final token window (at most ``max_len``
        tokens), with special tokens skipped.
    """
    model.eval()
    max_len = model.config.max_len

    # Fix: encode WITHOUT padding. Padding the prompt to max_len made the last
    # position a pad token, so sampling read the logits of a pad position, and
    # every appended token pushed a real prompt token out of the window.
    enc = tokenizer(
        prompt,
        max_length=max_len,
        truncation=True,
        return_tensors="pt",
    )
    x = enc["input_ids"].to(device)  # (1, T)
    attn_mask = enc["attention_mask"].to(device)

    for _ in range(max_new_tokens):
        logits = model(x, attn_mask=attn_mask)        # (1, T, vocab)
        last_logits = logits[:, -1, :]                # (1, vocab)
        probs = torch.softmax(last_logits, dim=-1)
        next_id = torch.multinomial(probs, num_samples=1)  # (1, 1)

        # Append the sampled token; if the sequence grows past max_len, keep
        # only the most recent max_len tokens.
        x = torch.cat([x, next_id], dim=1)
        attn_mask = torch.cat(
            [attn_mask, torch.ones_like(next_id, device=device)], dim=1
        )

        if x.size(1) > max_len:
            x = x[:, -max_len:]
            attn_mask = attn_mask[:, -max_len:]

    out_ids = x[0].tolist()
    text = tokenizer.decode(out_ids, skip_special_tokens=True)
    return text


In [18]:
# 1) Pick the device and read tokenizer-derived sizes
device = "cuda" if torch.cuda.is_available() else "cpu"
print("device:", device)


pad_id = tokenizer.pad_token_id
vocab_size = tokenizer.vocab_size

# 2) Load data
train_texts = load_text_samples("lotto_train.txt")
val_texts = load_text_samples("lotto_val.txt")  # if there is no validation file, comment this out and use train only

print("train samples:", len(train_texts))
print("val samples:", len(val_texts))

max_len = 256

train_ds = LottoDataset(train_texts, tokenizer, max_len=max_len)
val_ds = LottoDataset(val_texts, tokenizer, max_len=max_len)

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=64)

# 3) Model, optimizer and loss
config = GPTConfig(
    vocab_size=vocab_size,
    n_layer=4,
    n_head=4,
    d_model=256,
    d_ff=1024,
    max_len=max_len,
    dropout=0.1,
    pad_id=pad_id,
)

model = MiniGPT(config).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
# Targets equal to pad_id are excluded from the loss.
# NOTE(review): pad_token was set to eos_token earlier, so pad_id is also the
# EOS id; any EOS target is ignored here too, which may prevent the model from
# learning to end a sample. Confirm and mask padding some other way if so.
criterion = nn.CrossEntropyLoss(ignore_index=pad_id)

device: cuda
train samples: 10000
val samples: 1000


#학습


In [28]:
epochs = 30  # example value; a large epoch count was requested
best_val_loss = float("inf")

for epoch in range(1, epochs + 1):
    # ----- Train -----
    model.train()
    total_loss = 0.0

    for x, y, mask in train_loader:
        x = x.to(device)       # (B, T)
        y = y.to(device)       # (B, T)
        mask = mask.to(device) # (B, T)

        logits = model(x, attn_mask=mask)  # (B, T, vocab)
        
        # Flatten batch and time so each position is one classification example.
        loss = criterion(
            logits.view(-1, logits.size(-1)),  # (B*T, vocab)
            y.view(-1)                         # (B*T)
        )

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses
    avg_train_loss = total_loss / len(train_loader)

    # ----- Validation -----
    model.eval()
    val_loss_sum = 0.0
    with torch.no_grad():
        for x, y, mask in val_loader:
            x = x.to(device)
            y = y.to(device)
            mask = mask.to(device)

            logits = model(x, attn_mask=mask)
            loss = criterion(
                logits.view(-1, logits.size(-1)),
                y.view(-1)
            )
            val_loss_sum += loss.item()

    avg_val_loss = val_loss_sum / len(val_loader)

    # ----- Report epoch results -----
    print(f"[Epoch {epoch:03d}] train_loss={avg_train_loss:.4f}, val_loss={avg_val_loss:.4f}")

    # ----- Save model -----
    # 1) Save a checkpoint every epoch
    ckpt_path = f"lotto_gpt_epoch{epoch:03d}.pt"
    torch.save(model.state_dict(), ckpt_path)

    # 2) Save separately whenever the best validation loss improves
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), "lotto_gpt_best.pt")

[Epoch 001] train_loss=1.3327, val_loss=0.9760
[Epoch 002] train_loss=1.1994, val_loss=0.9647
[Epoch 003] train_loss=1.0896, val_loss=0.9421
[Epoch 004] train_loss=1.0263, val_loss=0.9360
[Epoch 005] train_loss=0.9879, val_loss=0.9191
[Epoch 006] train_loss=0.9570, val_loss=0.8993
[Epoch 007] train_loss=0.9278, val_loss=0.8709
[Epoch 008] train_loss=0.8958, val_loss=0.8439
[Epoch 009] train_loss=0.8700, val_loss=0.8283
[Epoch 010] train_loss=0.8524, val_loss=0.8137
[Epoch 011] train_loss=0.8398, val_loss=0.8071
[Epoch 012] train_loss=0.8284, val_loss=0.7964
[Epoch 013] train_loss=0.8213, val_loss=0.7900
[Epoch 014] train_loss=0.8155, val_loss=0.7891
[Epoch 015] train_loss=0.8112, val_loss=0.7873
[Epoch 016] train_loss=0.8073, val_loss=0.7881
[Epoch 017] train_loss=0.8033, val_loss=0.7855
[Epoch 018] train_loss=0.7996, val_loss=0.7828
[Epoch 019] train_loss=0.7958, val_loss=0.7838
[Epoch 020] train_loss=0.7933, val_loss=0.7831
[Epoch 021] train_loss=0.7911, val_loss=0.7814
[Epoch 022] t

In [19]:
@torch.no_grad()
def generate_text(model, tokenizer, prompt: str, max_new_tokens: int = 200, device="cpu"):
    """Sample ``max_new_tokens`` tokens after ``prompt`` and return the decoded text.

    The prompt is encoded without padding. Before every forward pass the
    running sequence and its attention mask are cut to the most recent
    ``model.config.max_len`` tokens.
    """
    model.eval()
    window = model.config.max_len

    # 1) Encode the prompt at its real length (no padding).
    encoded = tokenizer(
        prompt,
        return_tensors="pt",
    )
    x = encoded["input_ids"].to(device)               # (1, T0)
    attn_mask = encoded["attention_mask"].to(device)  # (1, T0)

    step = 0
    while step < max_new_tokens:
        # 2) Keep at most the last `window` tokens (no-op when already short).
        x = x[:, -window:]
        attn_mask = attn_mask[:, -window:]

        # 3) Forward pass; only the final position's distribution is needed.
        logits = model(x, attn_mask=attn_mask)        # (1, T, vocab)
        probs = torch.softmax(logits[:, -1, :], dim=-1)
        next_id = torch.multinomial(probs, num_samples=1)  # (1, 1)

        # 4) Append the sampled token and extend the mask with a 1.
        x = torch.cat([x, next_id], dim=1)            # (1, T+1)
        attn_mask = torch.cat(
            [attn_mask, torch.ones_like(next_id, device=device)], dim=1
        )
        step += 1

    # 5) Decode the result.
    return tokenizer.decode(x[0].tolist(), skip_special_tokens=True)

In [24]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print("device:", device)

max_len = 256  # must match the value used during training

# Rebuild the model with the same hyperparameters so the state dict fits.
config = GPTConfig(
    vocab_size=vocab_size,
    n_layer=4,
    n_head=4,
    d_model=256,
    d_ff=1024,
    max_len=max_len,
    dropout=0.1,
    pad_id=pad_id,
)
model = MiniGPT(config).to(device)

# Load the checkpoint (adjust the file name as needed)
ckpt_path = "lotto_gpt_best.pt"   # or "lotto_gpt_epoch030.pt", etc.
state = torch.load(ckpt_path, map_location=device)
model.load_state_dict(state)

model.eval()
print("checkpoint loaded from:", ckpt_path)

device: cuda
checkpoint loaded from: lotto_gpt_best.pt


In [38]:
@torch.no_grad()
def generate_text(model, tokenizer, prompt: str, max_new_tokens: int = 200, device="cpu"):
    model.eval()
    max_len = model.config.max_len
    eos_id = tokenizer.eos_token_id

    enc = tokenizer(prompt, return_tensors="pt")
    x = enc["input_ids"].to(device)  # (1, T0)

    for _ in range(max_new_tokens):
        if x.size(1) > max_len:
            x = x[:, -max_len:]

        logits = model(x)               # (1, T, vocab)
        last_logits = logits[:, -1, :]  # (1, vocab)

        probs = torch.softmax(last_logits, dim=-1)
        next_id = torch.multinomial(probs, num_samples=1)  # (1,1)
        next_token = next_id.item()

        # 1) EOS 나오면 바로 멈추기
        if next_token == eos_id:
            break

        x = torch.cat([x, next_id], dim=1)

    out_ids = x[0].tolist()
    text = tokenizer.decode(out_ids, skip_special_tokens=True)
    return text

In [59]:
# Prompt in the training-sample format, up to and including the "###" line;
# the model is asked to continue from there.
example_prompt = (
    "money=5000\n"
    "winning=1,2,3,4,5,6\n"
    "bonus=7\n"
    "###\n"
)

# Compare the prompt's token count with the model's context length.
print("prompt len (tokens):", tokenizer(example_prompt, return_tensors="pt")["input_ids"].shape)
print("model max_len:", model.config.max_len)

generated = generate_text(model, tokenizer, example_prompt, max_new_tokens=200, device=device)
print("=== SAMPLE GENERATION ===")
print(generated)

prompt len (tokens): torch.Size([1, 25])
model max_len: 256
=== SAMPLE GENERATION ===
money=5000
winning=1,2,3,4,5,6
bonus=7
###
티켓수=5
구매번호:
[13,17,24,20,26,33]
[1,2,3,8,27,29]
[5,21,29,30,31,36]
[12,19,19,20,33,42]
[5,6,16,21,24,42]
3개일치=0
4개일치=0
5개일치=0
5개보너스일치=0
6개일치=0
수익률=0.0%%%%%%%%%%%%%%%%%%%%%%%


### repetition loop

수익률=0.0%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
마지막에 %이 계속해서 반복되는 현상

멈춰야하는 신호를 주지않았기 때문에 발생한 문제
1. EOS토큰을 stop 조건으로 안씀
2. 학습과정에서 eos 토큰을 따로 붙여준 적이 없음

-> 문자열 종료에 대한 학습을 시켜 준 적이 없음

```python
from torch.utils.data import Dataset
import torch

class LottoDataset(Dataset):
    """Next-token-prediction dataset over lotto sample texts.

    Each item is one sample with the tokenizer's EOS string appended,
    tokenized to exactly ``max_len + 1`` ids and split into a shifted
    input/target pair.
    """

    def __init__(self, texts, tokenizer, max_len: int):
        """Store the samples and tokenization settings.

        Args:
            texts: Sequence of raw sample strings.
            tokenizer: Tokenizer exposing ``eos_token`` and callable with
                ``max_length``, ``padding``, ``truncation`` and
                ``return_tensors`` keyword arguments.
            max_len: Length of the ``x`` / ``y`` tensors returned per item.
        """
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.eos = tokenizer.eos_token

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(x, y, x_mask)`` for sample ``idx``.

        Returns:
            x: Input ids, the first ``max_len`` tokens.
            y: Target ids, the same sequence shifted left by one token.
            x_mask: Attention mask aligned with ``x``.
        """
        # Fix: the original bound ``row_txt`` but read ``raw_txt`` (NameError).
        raw_txt = self.texts[idx]

        # Append EOS so the end of a sample is part of the target sequence.
        # NOTE(review): this notebook sets pad_token = eos_token, so a loss
        # built with ignore_index=pad_id also skips this real EOS target;
        # confirm the model can actually learn to stop.
        # NOTE(review): truncation=True can cut the EOS off samples that are
        # longer than max_len tokens.
        txt = raw_txt + self.eos

        # Tokenize to max_len + 1 ids so that x = ids[:-1] and y = ids[1:].
        enc = self.tokenizer(
            txt,
            max_length=self.max_len + 1,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        ids = enc["input_ids"][0]        # (max_len+1,)
        attn_mask = enc["attention_mask"][0]  # (max_len+1,)

        # Language-model input/target pair
        x = ids[:-1]        # (max_len,)
        y = ids[1:]         # (max_len,)
        x_mask = attn_mask[:-1]

        return x, y, x_mask
```

self.eos = tokenizer.eos_token 

txt = raw_txt + self.eos

해당 부분 추가로 개선