In [3]:
from google.colab import drive
drive.mount('/content/drive')



ModuleNotFoundError: No module named 'google'

In [None]:
import torch
import pickle
import random


In [2]:
import torch
import torch.nn as nn

class Chomp1d(nn.Module):
    def __init__(self, chomp_size):
        super().__init__()
        self.chomp_size = chomp_size
    def forward(self, x):
        return x[:, :, :-self.chomp_size]

class TemporalBlock(nn.Module):
    def __init__(self, in_ch, out_ch, kernel_size, stride, dilation, padding, dropout=0.1):
        super().__init__()
        self.conv1 = nn.Conv1d(in_ch, out_ch, kernel_size,
                               stride=stride, padding=padding, dilation=dilation)
        self.chomp1 = Chomp1d(padding)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout)

        self.conv2 = nn.Conv1d(out_ch, out_ch, kernel_size,
                               stride=stride, padding=padding, dilation=dilation)
        self.chomp2 = Chomp1d(padding)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(dropout)

        self.net = nn.Sequential(self.conv1, self.chomp1, self.relu1, self.dropout1,
                                 self.conv2, self.chomp2, self.relu2, self.dropout2)
        self.downsample = nn.Conv1d(in_ch, out_ch, 1) if in_ch != out_ch else None
        self.relu = nn.ReLU()

    def forward(self, x):
        out = self.net(x)
        res = x if self.downsample is None else self.downsample(x)
        return self.relu(out + res)

class PGTCN(nn.Module):
    def __init__(self, vocab_size, emb_size=128, num_channels=[128]*3, kernel_size=3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        layers = []
        for i, out_ch in enumerate(num_channels):
            dilation = 2 ** i
            in_ch = emb_size if i == 0 else num_channels[i-1]
            layers.append(
                TemporalBlock(in_ch, out_ch, kernel_size, stride=1,
                              dilation=dilation, padding=(kernel_size-1)*dilation)
            )
        self.tcn = nn.Sequential(*layers)
        self.fc = nn.Linear(num_channels[-1], vocab_size)

    def forward(self, x):
        emb = self.embedding(x).transpose(1, 2)
        tcn_out = self.tcn(emb).transpose(1, 2)
        return self.fc(tcn_out)




ImportError: cannot import name 'deprecated' from 'typing_extensions' (/Users/lapac/Documents/PES/Capstone Project/.venv/lib/python3.13/site-packages/typing_extensions.py)

In [None]:
import pickle

path = "/content/drive/MyDrive/pgtcn_model/"
stoi = pickle.load(open(path + "stoi.pkl", "rb"))
itos = pickle.load(open(path + "itos.pkl", "rb"))

vocab_size = len(stoi)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = PGTCN(vocab_size).to(device)

state_dict = torch.load(path + "pgtcn_model.pt", map_location=device)
model.load_state_dict(state_dict)

model.eval()


PGTCN(
  (embedding): Embedding(104, 128)
  (tcn): Sequential(
    (0): TemporalBlock(
      (conv1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(2,))
      (chomp1): Chomp1d()
      (relu1): ReLU()
      (dropout1): Dropout(p=0.1, inplace=False)
      (conv2): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(2,))
      (chomp2): Chomp1d()
      (relu2): ReLU()
      (dropout2): Dropout(p=0.1, inplace=False)
      (net): Sequential(
        (0): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(2,))
        (1): Chomp1d()
        (2): ReLU()
        (3): Dropout(p=0.1, inplace=False)
        (4): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(2,))
        (5): Chomp1d()
        (6): ReLU()
        (7): Dropout(p=0.1, inplace=False)
      )
      (relu): ReLU()
    )
    (1): TemporalBlock(
      (conv1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(4,), dilation=(2,))
      (chomp1): Chomp1d()
      (relu1): ReLU()
      (dropout1): Dro

In [None]:
import torch.nn.functional as F
import random

def generate_password(model, stoi, itos, max_len=20, temperature=1.0, start_char="<SOS>"):
    model.eval()
    device = next(model.parameters()).device

    if start_char in stoi:
        x = torch.tensor([[stoi[start_char]]], dtype=torch.long, device=device)
    else:
        # fallback: start with a random character
        x = torch.tensor([[random.randint(0, len(stoi)-1)]], dtype=torch.long, device=device)

    pwd = ""
    for _ in range(max_len):
        logits = model(x)[:, -1, :]              # logits at last step
        probs = F.softmax(logits / temperature, dim=-1)
        idx = sample_from_logits(logits[-1], temperature=0.7, top_k=20)

        char = itos[idx]

        if char == "<EOS>":  # stop if EOS token exists
            break
        pwd += char

        x = torch.cat([x, torch.tensor([[idx]], device=device)], dim=1)

    return pwd


In [None]:
generated = [generate_password(model, stoi, itos, max_len=20) for _ in range(100000)]

with open("generated.txt", "w") as f:
    for pwd in generated:
        f.write(pwd + "\n")


NameError: name 'sample_from_logits' is not defined

In [None]:
# Load your dataset
with open("/content/drive/MyDrive/pgtcn_model/myspace.txt") as f:
    data = [line.strip() for line in f if line.strip()]

random.shuffle(data)
split = int(0.8 * len(data))   # 80/20 split
test_set = set(data[split:])


In [None]:
import torch.nn.functional as F

def sample_from_logits(logits, temperature=0.7, top_k=20):
    logits = logits / temperature
    probs = F.softmax(logits, dim=-1)
    top_probs, top_idx = torch.topk(probs, k=top_k)
    idx = torch.multinomial(top_probs, 1).item()
    return top_idx[idx].item()


In [None]:
import torch
import torch.nn.functional as F
import random
from pathlib import Path

device = next(model.parameters()).device

def sample_topk_batch(model, stoi, itos, batch_size=512, seq_len=20, temperature=0.7, top_k=20):
    """
    Generates `batch_size` passwords in parallel, each up to seq_len.
    Returns list of strings.
    """
    model.eval()
    vocab_size = len(stoi)
    # initialize with a random start token index for each sequence
    x = torch.randint(0, vocab_size, (batch_size, 1), dtype=torch.long, device=device)
    finished = [False] * batch_size
    outputs = [[] for _ in range(batch_size)]

    with torch.no_grad():
        for t in range(seq_len):
            logits = model(x)  # shape: (batch_size, seq_len_sofar, vocab)
            last_logits = logits[:, -1, :]  # (batch_size, vocab)
            # apply temperature
            last_logits = last_logits / temperature
            probs = F.softmax(last_logits, dim=-1)

            # top-k sampling per batch item
            top_probs, top_idx = torch.topk(probs, k=top_k, dim=-1)  # (batch_size, top_k)
            # sample indices from top_probs
            samp = torch.multinomial(top_probs, 1).squeeze(1)  # (batch_size,)
            idx = top_idx[torch.arange(batch_size), samp]  # chosen token ids (batch_size,)

            # append chars and update finished flags
            for i in range(batch_size):
                ch = itos[idx[i].item()]
                # if you have an <EOS> token, handle it; otherwise we treat all chars as regular
                if ch == "<EOS>":
                    finished[i] = True
                else:
                    outputs[i].append(ch)
            # prepare next input x by concatenating chosen idx
            x = torch.cat([x, idx.unsqueeze(1)], dim=1)

            # stop early if all finished
            if all(finished):
                break

    # join outputs and fallback for empties
    pwds = [''.join(chars) if len(chars)>0 else random.choice(list(stoi.keys())) for chars in outputs]
    return pwds

# streaming 1M generation example (adjust N and batch_size as you wish)
out_path = Path("/content/drive/MyDrive/pgtcn_model/generated_streamed.txt")
N = 1_000_000
batch_size = 1024   # try 512/1024 depending on available memory
seq_len = 20
temperature = 0.7
top_k = 20

out_path.parent.mkdir(parents=True, exist_ok=True)
with open(out_path, "w", encoding="utf-8") as fout:
    generated_count = 0
    while generated_count < N:
        b = min(batch_size, N - generated_count)
        pwds = sample_topk_batch(model, stoi, itos, batch_size=b, seq_len=seq_len,
                                 temperature=temperature, top_k=top_k)
        for p in pwds:
            fout.write(p + "\n")
        generated_count += b
        # optional: print progress every X batches
        if generated_count % (batch_size * 10) == 0:
            print(f"generated {generated_count} / {N}")
print("done generating ->", out_path)


generated 10240 / 1000000
generated 20480 / 1000000
generated 30720 / 1000000
generated 40960 / 1000000
generated 51200 / 1000000
generated 61440 / 1000000
generated 71680 / 1000000
generated 81920 / 1000000
generated 92160 / 1000000
generated 102400 / 1000000
generated 112640 / 1000000
generated 122880 / 1000000
generated 133120 / 1000000
generated 143360 / 1000000
generated 153600 / 1000000
generated 163840 / 1000000
generated 174080 / 1000000
generated 184320 / 1000000
generated 194560 / 1000000
generated 204800 / 1000000
generated 215040 / 1000000
generated 225280 / 1000000
generated 235520 / 1000000
generated 245760 / 1000000
generated 256000 / 1000000
generated 266240 / 1000000
generated 276480 / 1000000
generated 286720 / 1000000
generated 296960 / 1000000
generated 307200 / 1000000
generated 317440 / 1000000
generated 327680 / 1000000
generated 337920 / 1000000
generated 348160 / 1000000
generated 358400 / 1000000
generated 368640 / 1000000
generated 378880 / 1000000
generated 

In [None]:
# ðŸ”¹ Load your test set again (20% split, like before)
with open("/content/drive/MyDrive/pgtcn_model/myspace.txt") as f:
    data = [line.strip() for line in f if line.strip()]

import random
random.shuffle(data)
split = int(0.8 * len(data))   # 80/20 split
test_set = set(data[split:])

print(f"Test set size: {len(test_set)}")


matches = set()
count = 0

with open("/content/drive/MyDrive/pgtcn_model/generated_streamed.txt") as f:
    for line in f:
        pwd = line.strip()
        count += 1
        if pwd in test_set:
            matches.add(pwd)

print(f"Generated: {count}")
print(f"Matches: {len(matches)} / {len(test_set)}")
print(f"Match rate: {len(matches)/len(test_set):.4%}")


Test set size: 7426
Generated: 3368950
Matches: 661 / 7426
Match rate: 8.9012%
