In [1]:
import torch
import numpy
import json
from torch.utils.data import Dataset, DataLoader
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.functional import one_hot

In [2]:
class PasswordDataset(Dataset):
    """Character-level password dataset stored as fixed-length rows of indices.

    Every line of ``root`` is stripped of surrounding whitespace and split into
    single characters. A character vocabulary is built from those lines with
    two special tokens: ``"<start>"`` (prepended to every password) and ``"'"``
    (used as the padding symbol). Each password is then encoded, padded or
    truncated to ``seq_len`` entries and collected in ``self.inputs`` as an
    ``int32`` array.

    Constructing the dataset also writes the character maps to
    ``password_charmap.json`` in the current working directory.

    NOTE(review): the padding symbol is a literal apostrophe, so an apostrophe
    inside a real password maps to the same index as padding -- confirm this
    is intended.

    Args:
        root: Path of the text file holding one password per line.
        seq_len: Length of every encoded row, including the start token.
    """

    def __init__(self, root = "train_lite.txt", seq_len = 20) -> None:
        super(PasswordDataset, self).__init__()
        self.root = root
        self.seq_len = seq_len
        self.vocabs = build_vocab_from_iterator(self.get_vocab(), specials=["<start>", "'"], max_tokens=100)
        self.stoi = self.vocabs.get_stoi()
        self.itos = self.vocabs.get_itos()
        self.inputs = self.get_inputs()
        # Context manager so the JSON file is flushed and closed deterministically.
        with open("password_charmap.json", "w") as fd:
            json.dump({"stoi": self.stoi, "itos": self.itos}, fd)

    def get_vocab(self):
        """Yield each password in ``self.root`` as a list of its characters."""
        # ``with`` closes the file even if the generator is abandoned early.
        with open(self.root, "r") as fd:
            for text in fd:
                yield list(text.strip())

    def get_inputs(self):
        """Encode every password into a row of ``seq_len`` vocabulary indices.

        Each row starts with the ``"<start>"`` index, is right-padded with the
        ``"'"`` index and is cut to ``seq_len`` entries. Lines containing a
        character that is missing from ``self.stoi`` (possible because the
        vocabulary is capped with ``max_tokens``) cannot be encoded; they are
        skipped instead of raising ``KeyError``, and the number of skipped
        lines is stored in ``self.num_skipped``.

        Returns:
            ``numpy.ndarray`` of dtype ``int32`` with one row per kept line.
        """
        start_idx = self.stoi["<start>"]
        pad_idx = self.stoi["'"]
        inputs = []
        self.num_skipped = 0
        with open(self.root, "r") as fd:
            for text in fd:
                try:
                    text_data = [start_idx] + [self.stoi[ch] for ch in text.strip()]
                except KeyError:
                    # Character fell outside the capped vocabulary.
                    self.num_skipped += 1
                    continue
                # A non-positive multiplier yields [], so long rows are untouched here.
                text_data += [pad_idx] * (self.seq_len - len(text_data))
                inputs.append(text_data[:self.seq_len])
        return numpy.array(inputs, dtype=numpy.int32)

    def __getitem__(self, index):
        """Return the encoded row(s) at ``index``."""
        return self.inputs[index]

    def __len__(self):
        """Return the number of encoded passwords."""
        return len(self.inputs)

In [9]:
class ResBlock(torch.nn.Module):
    """Residual 1-D convolution block that preserves channel count and length.

    The residual branch is two repetitions of LeakyReLU(0.02) -> Conv1d
    (kernel 3, padding 1, no bias) -> BatchNorm1d. Its output is scaled by 0.3
    and added back onto the input.

    Args:
        in_channel: Number of input (and output) channels.
    """

    def __init__(self, in_channel):
        super().__init__()
        stages = []
        for _ in range(2):
            stages += [
                torch.nn.LeakyReLU(0.02),
                torch.nn.Conv1d(in_channel, in_channel, kernel_size=3, padding=1, bias=False),
                torch.nn.BatchNorm1d(in_channel),
            ]
        self.conv_layer = torch.nn.Sequential(*stages)

    def forward(self, inputs):
        """Return ``inputs`` plus 0.3 times the residual branch output."""
        residual = self.conv_layer(inputs)
        return inputs + 0.3 * residual
    
class NetG(torch.nn.Module):
    """Generator: maps a 128-dim noise vector to per-position character probabilities.

    Two bias-free linear layers expand the noise to a ``128 * seq_len`` vector,
    which is reshaped to ``(batch, 128, seq_len)`` and passed through five
    ``ResBlock(128)`` modules, a Conv1d down to ``vocab_len`` channels and a
    softmax over the channel dimension.

    Args:
        seq_len: Number of character positions to generate.
        vocab_len: Size of the character vocabulary.
    """

    def __init__(self, seq_len, vocab_len):
        super().__init__()
        self.seq_len = seq_len

        # Noise (128) -> hidden (512) -> flattened feature map (128 * seq_len).
        dense_stack = [
            torch.nn.Linear(128, 512, bias=False),
            torch.nn.LeakyReLU(0.02),
            torch.nn.Linear(512, 128 * seq_len, bias=False),
        ]
        self.fc_layer = torch.nn.Sequential(*dense_stack)

        # Residual tower followed by the projection onto the vocabulary.
        conv_stack = [ResBlock(128) for _ in range(5)]
        conv_stack.append(torch.nn.Conv1d(128, vocab_len, 3, padding=1, bias=False))
        conv_stack.append(torch.nn.Softmax(1))
        self.conv_layer = torch.nn.Sequential(*conv_stack)

    def forward(self, inputs):
        """Return probabilities shaped ``(batch, seq_len, vocab_len)``."""
        feature_map = self.fc_layer(inputs).reshape(-1, 128, self.seq_len)
        char_probs = self.conv_layer(feature_map)
        # Move the vocabulary axis last: (batch, vocab, seq) -> (batch, seq, vocab).
        return char_probs.permute(0, 2, 1)

In [10]:
class NetD(torch.nn.Module):
    """Critic: scores a sequence of per-position vocabulary vectors with one value.

    The input is permuted to channels-first, passed through a Conv1d stem
    (``vocab_len`` -> 128 channels, BatchNorm1d, ReLU) and five
    ``ResBlock(128)`` modules, then flattened and reduced by two bias-free
    linear layers to a single output per sample.

    Args:
        seq_len: Number of character positions in each input sequence.
        vocab_len: Size of the character vocabulary (last input dimension).
    """

    def __init__(self, seq_len, vocab_len):
        super().__init__()
        stem = [
            torch.nn.Conv1d(vocab_len, 128, 3, padding=1, bias=False),
            torch.nn.BatchNorm1d(128),
            torch.nn.ReLU(),
        ]
        tower = [ResBlock(128) for _ in range(5)]
        self.conv_layer = torch.nn.Sequential(*stem, *tower)

        head = [
            torch.nn.Flatten(),
            torch.nn.Linear(128 * seq_len, 512, bias=False),
            torch.nn.ReLU(),
            torch.nn.Linear(512, 1, bias=False),
        ]
        self.fc_layer = torch.nn.Sequential(*head)

    def forward(self, inputs):
        """Return one score per sample, shaped ``(batch, 1)``."""
        # (batch, seq, vocab) -> (batch, vocab, seq) as expected by Conv1d.
        channels_first = inputs.permute(0, 2, 1)
        features = self.conv_layer(channels_first)
        return self.fc_layer(features)

In [11]:
# Training hyper-parameters.
epochs, lr, batch_size = 20, 0.0001, 64
seq_len = 20   # encoded password length, start token included
c = 0.01       # clipping bound applied to every parameter of D
n_c = 5        # D updates performed per G update

# Data pipeline: drop_last keeps every batch at exactly `batch_size` rows.
dataset = PasswordDataset(seq_len=seq_len)
dataloader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
)

vocab_size = len(dataset.stoi)
print(len(dataset), vocab_size, sep="\n")

# Networks are built G first, then D, and moved to the GPU.
G = NetG(seq_len, vocab_size).cuda()
D = NetD(seq_len, vocab_size).cuda()

# One Adam optimizer per network with identical settings.
g_optim = torch.optim.Adam(G.parameters(), lr=lr, betas=(0.5, 0.9))
d_optim = torch.optim.Adam(D.parameters(), lr=lr, betas=(0.5, 0.9))

In [None]:
# WGAN-style training with weight clipping: for every batch, D is updated
# `n_c` times, then G is updated once. Every 10 iterations the losses and five
# sampled passwords are printed. After training both networks are moved to the
# CPU and their state dicts are saved.
for epoch in range(epochs):
    for iters, inputs in enumerate(dataloader, 0):

        # Indices (batch, seq_len) -> one-hot floats (batch, seq_len, vocab_size).
        real_data = one_hot(inputs.long().cuda(), vocab_size).float()

        # ---- D updates ----
        for _ in range(n_c):
            # Draw fresh noise for every D step. Reusing one noise batch for all
            # n_c steps would make D take repeated steps against the same fake
            # samples. (The real batch is still shared across the n_c steps.)
            fake_data = torch.randn(batch_size, 128).float().cuda()
            with torch.no_grad():
                # G is not trained in this phase, so no graph is needed.
                generate_data = G(fake_data)
            d_real = D(real_data)
            d_fake = D(generate_data)
            d_real_loss = -torch.mean(d_real)
            d_fake_loss = torch.mean(d_fake)
            d_loss = d_fake_loss + d_real_loss

            d_optim.zero_grad()
            d_loss.backward()
            d_optim.step()

            # Clip every D parameter to [-c, c] in place, outside autograd.
            with torch.no_grad():
                for p in D.parameters():
                    p.clamp_(-c, c)

        # ---- G update ----
        fake_data = torch.randn(batch_size, 128).float().cuda()
        generate_data = G(fake_data)
        d_fake = D(generate_data)
        g_loss = -torch.mean(d_fake)

        g_optim.zero_grad()
        g_loss.backward()
        g_optim.step()

        if iters % 10 == 0:
            # .item() converts the 0-dim CUDA tensors to Python floats for formatting.
            print("[+] Epoch: [%d/%d] G_Loss: %.4f D_Loss: %.4f" % (epoch+1, epochs, g_loss.item(), d_loss.item()))
            with torch.no_grad():
                sample_size = 5
                fake_data = torch.randn(sample_size, 128).float().cuda()
                generate_data = G(fake_data)
                # Sample one character index per position from G's probabilities.
                generate_data = torch.distributions.Categorical(probs=generate_data)
                generate_data = generate_data.sample()
                text = "\n".join(
                    "".join(dataset.itos[idx] for idx in row)
                    for row in generate_data.tolist()
                )
                print(text)

G = G.cpu()
D = D.cpu()

torch.save(G.state_dict(), "passgan_modelG.pth")
torch.save(D.state_dict(), "passgan_modelD.pth")

In [None]:
# Generate passwords with the saved generator and write them, one per line, to
# gen_passgan_password.txt.

# Sampling configuration, defined before first use so the cell does not depend
# on values left over from the training cells.
# NOTE: `batch_size` here overwrites the training batch size of the same name.
sample_size = 100000
batch_size = 200
current_size = 0
seq_len = 20

with open("./password_charmap.json", "r") as charmap_file:
    itos = json.load(charmap_file)['itos']

# The vocabulary size is taken from the saved character map.
model = NetG(seq_len, len(itos))
model.load_state_dict(torch.load("./passgan_modelG.pth"))
model.eval()

with open(f"gen_passgan_password.txt", "w", encoding='utf-8') as fd:
    while current_size < sample_size:
        fake_data = torch.randn(batch_size, 128).float()
        # Sample from the reloaded `model` (not the training-time global `G`);
        # no gradients are needed for generation.
        with torch.no_grad():
            generate_data = model(fake_data)
        generate_data = torch.distributions.Categorical(probs=generate_data)
        generate_data = generate_data.sample()
        for row in generate_data.tolist():
            text = ""
            for idx in row:
                # "'" is the padding symbol: everything from it onwards is dropped.
                if itos[idx] == "'":
                    break
                # NOTE(review): the "<start>" token is written out literally when
                # sampled -- confirm whether it should be stripped from the output.
                text += itos[idx]
            fd.write(text + "\n")
        current_size += batch_size
        print("[+] Generate Data: %d" % current_size)