In [None]:
 import json
import torch
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import re


In [None]:
import os

# Print every entry of the Colab working directory, one name per line.
for fname in os.listdir("/content"):
    print(fname)


.config
sample_data


In [None]:
from google.colab import files
# Ask Colab for a file upload and keep the result in `uploaded`
# (the variable is not read again in the visible cells).
uploaded = files.upload()


Saving training_data_20000_gpt_like.json to training_data_20000_gpt_like.json


In [None]:
import os
# Show the directory listing again, this time as a single list.
print(os.listdir("/content"))


['.config', 'training_data_20000_gpt_like.json', 'sample_data']


In [None]:
# Location of the uploaded training file; the same value is assigned again
# at the top of the preprocessing cell.
json_path = "/content/training_data_20000_gpt_like.json"


In [None]:
import json
import torch
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split


json_path = "/content/training_data_20000_gpt_like.json"

# First pass over the file (one JSON object per line): collect every
# whitespace-separated token of every "text" field, in reading order.
with open(json_path, "r", encoding="utf-8") as f:
    all_words = [
        token
        for raw_line in f
        for token in json.loads(raw_line)["text"].strip().split()
    ]

vocab_size = 5000
word_counts = Counter(all_words)
# Ids 0 (<PAD>) and 1 (<UNK>) are reserved, so only vocab_size - 2 words
# receive an id of their own, starting at 2 in descending frequency order.
most_common = word_counts.most_common(vocab_size - 2)
word2idx = {"<PAD>": 0, "<UNK>": 1}
word2idx.update(
    (word, index) for index, (word, _count) in enumerate(most_common, start=2)
)
idx2word = {index: word for word, index in word2idx.items()}

# Persist the mapping so later cells can reload it.
with open("word_vocab.json", "w", encoding="utf-8") as f:
    json.dump(word2idx, f, ensure_ascii=False, indent=2)


def words_to_tensor(text, max_len=60):
    """Encode ``text`` as a list of exactly ``max_len`` vocabulary ids.

    The text is stripped and split on whitespace. Each token is looked up in
    the module-level ``word2idx``; tokens that are missing map to 1 (<UNK>).
    Short sequences are right-padded with 0 (<PAD>), long ones are truncated.

    Returns a plain Python list (despite the name, not a tensor).
    """
    encoded = []
    for token in text.strip().split():
        encoded.append(word2idx.get(token, 1))

    missing = max_len - len(encoded)
    if missing > 0:
        encoded.extend([0] * missing)
    return encoded[:max_len]

MAX_LEN = 60
all_data = []

# Second pass over the file: pair each persona vector with the fixed-length
# id encoding of its text and keep the record as a serialized JSON line.
with open(json_path, "r", encoding="utf-8") as f:
    for line in f:
        obj = json.loads(line)
        vec = obj["vector"]
        ids = words_to_tensor(obj["text"], MAX_LEN)
        all_data.append(json.dumps({"vector": vec, "text_ids": ids}))

# Write four files worddata_part1.json .. worddata_part4.json.
# Floor division alone would silently drop up to three trailing records when
# the record count is not a multiple of 4, so the last part takes the remainder.
part_size = len(all_data) // 4
for i in range(4):
    start = i * part_size
    stop = (i + 1) * part_size if i < 3 else len(all_data)
    part = all_data[start:stop]
    with open(f"worddata_part{i+1}.json", "w", encoding="utf-8") as f:
        for line in part:
            f.write(line + "\n")

print("‚úÖ Word-level veri hazƒ±r ve 4 par√ßaya b√∂l√ºnd√º!")


‚úÖ Word-level veri hazƒ±r ve 4 par√ßaya b√∂l√ºnd√º!


In [None]:
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import os


# Reload the word -> id mapping from word_vocab.json. vocab_size is simply the
# number of entries the file holds at the time this cell runs.
with open("word_vocab.json", "r", encoding="utf-8") as f:
    word2idx = json.load(f)
vocab_size = len(word2idx)


class WordPersonaDataset(Dataset):
    """In-memory dataset of ``(persona_vector, token_ids)`` pairs.

    Every line of the file at ``path`` is parsed as a JSON object. Its
    ``"vector"`` entry becomes a float tensor and its ``"text_ids"`` entry a
    long tensor. The whole file is read eagerly in ``__init__``.
    """

    def __init__(self, path):
        with open(path, "r", encoding="utf-8") as f:
            records = [json.loads(row) for row in f]
        self.data = [
            (
                torch.tensor(record["vector"]).float(),
                torch.tensor(record["text_ids"]).long(),
            )
            for record in records
        ]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]


class ScaledDotProductAttention(nn.Module):
    """Scaled dot-product attention: ``softmax(Q K^T / sqrt(d)) V``."""

    def forward(self, Q, K, V, mask=None):
        """Attend from ``Q`` over ``K``/``V``.

        Args:
            Q, K, V: tensors whose last two dimensions are (time, head_dim).
            mask: optional boolean tensor broadcastable to the score matrix;
                positions set to True are excluded from attention.
        """
        scores = torch.matmul(Q, K.transpose(-2, -1)) / Q.size(-1) ** 0.5
        if mask is not None:
            # -inf scores receive exactly zero weight after the softmax.
            scores = scores.masked_fill(mask, float("-inf"))
        return torch.matmul(torch.softmax(scores, dim=-1), V)


class MultiHeadAttention(nn.Module):
    """Multi-head self-attention over a ``(batch, time, dim)`` input.

    Args:
        dim: model width; must be divisible by ``heads``.
        heads: number of attention heads.
        causal: when True (default) position ``t`` may only attend to
            positions ``<= t``. The training loop in this notebook feeds
            ``ids[:, :-1]`` and predicts ``ids[:, 1:]``; without this mask
            each position could read its own target token from the input.
    """

    def __init__(self, dim, heads, causal=True):
        super().__init__()
        if dim % heads != 0:
            raise ValueError(f"dim ({dim}) must be divisible by heads ({heads})")
        self.q = nn.Linear(dim, dim)
        self.k = nn.Linear(dim, dim)
        self.v = nn.Linear(dim, dim)
        self.out = nn.Linear(dim, dim)
        self.heads = heads
        self.dim = dim // heads  # per-head width
        self.attn = ScaledDotProductAttention()
        self.causal = causal

    def forward(self, x):
        B, T, D = x.size()
        # (B, T, D) -> (B, heads, T, head_dim)
        Q = self.q(x).view(B, T, self.heads, self.dim).transpose(1, 2)
        K = self.k(x).view(B, T, self.heads, self.dim).transpose(1, 2)
        V = self.v(x).view(B, T, self.heads, self.dim).transpose(1, 2)

        mask = None
        if self.causal:
            # True strictly above the diagonal marks "future" positions.
            mask = torch.triu(
                torch.ones(T, T, dtype=torch.bool, device=x.device), diagonal=1
            )

        out = self.attn(Q, K, V, mask).transpose(1, 2).contiguous().view(B, T, D)
        return self.out(out)

class TransformerBlock(nn.Module):
    """Post-norm transformer layer.

    Self-attention and a two-layer ReLU feed-forward network, each added back
    to its input and then passed through LayerNorm.
    """

    def __init__(self, dim, heads, ff_dim):
        super().__init__()
        self.attn = MultiHeadAttention(dim, heads)
        self.norm1 = nn.LayerNorm(dim)
        self.ff = nn.Sequential(
            nn.Linear(dim, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, dim),
        )
        self.norm2 = nn.LayerNorm(dim)

    def forward(self, x):
        attended = self.norm1(x + self.attn(x))
        expanded = self.ff(attended)
        return self.norm2(attended + expanded)


class WordLevelModel(nn.Module):
    """Persona-conditioned word-level sequence model.

    Token embeddings are shifted by a ReLU-activated linear projection of the
    persona vector (broadcast over time), passed through ``layers``
    TransformerBlocks and projected to vocabulary logits.

    Args:
        persona_dim: length of the persona vector.
        vocab_size: number of token ids (embedding rows and output logits).
        emb_dim: model width.
        heads: attention heads per block.
        ff_dim: hidden width of each block's feed-forward network.
        layers: number of TransformerBlocks.
    """

    def __init__(self, persona_dim, vocab_size, emb_dim, heads, ff_dim, layers):
        super().__init__()
        self.word_emb = nn.Embedding(vocab_size, emb_dim)
        self.persona_proj = nn.Sequential(nn.Linear(persona_dim, emb_dim), nn.ReLU())
        self.blocks = nn.ModuleList()
        for _ in range(layers):
            self.blocks.append(TransformerBlock(emb_dim, heads, ff_dim))
        self.out = nn.Linear(emb_dim, vocab_size)

    def forward(self, persona, seq):
        """Return logits of shape ``seq.shape + (vocab_size,)``."""
        token_states = self.word_emb(seq)
        persona_shift = self.persona_proj(persona).unsqueeze(1)  # broadcast over time
        hidden = token_states + persona_shift
        for layer in self.blocks:
            hidden = layer(hidden)
        return self.out(hidden)


EMBED_DIM = 256
FF_DIM = 512
HEADS = 8
LAYERS = 4
MAX_LEN = 60
EPOCHS = 30
BATCH_SIZE = 8
LR = 5e-4
PATIENCE = 5

# 25 is the persona vector length passed as persona_dim.
model = WordLevelModel(25, vocab_size, EMBED_DIM, HEADS, FF_DIM, LAYERS)
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss()


# Train on the four data parts one after another with the same model/optimizer.
for part in range(1, 5):
    print(f"\nüß© Part {part}/4 ba≈ülƒ±yor...")
    dataset = WordPersonaDataset(f"worddata_part{part}.json")
    loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

    # Early-stopping state must start fresh for every part. Carrying it over
    # left patience already exhausted and best_loss at the previous part's
    # minimum, so each later part stopped after a single epoch.
    best_loss = float("inf")
    patience_count = 0

    for epoch in range(EPOCHS):
        total_loss = 0  # sum (not mean) of the per-batch losses
        model.train()
        for persona_vec, ids in loader:
            optimizer.zero_grad()
            # Teacher forcing: feed tokens 0..T-2, predict tokens 1..T-1.
            output = model(persona_vec, ids[:, :-1])
            loss = criterion(output.reshape(-1, vocab_size), ids[:, 1:].reshape(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"üìò Epoch {epoch+1}/{EPOCHS} | Loss: {total_loss:.4f}")
        if (epoch + 1) % 10 == 0:
            torch.save(model.state_dict(), f"wordlevel_part{part}_epoch{epoch+1}.pth")
            print("üíæ Kaydedildi.")

        # An epoch counts as an improvement only if the summed loss dropped
        # by more than 0.1 relative to the best value seen for this part.
        if best_loss - total_loss > 0.1:
            best_loss = total_loss
            patience_count = 0
            torch.save(model.state_dict(), f"wordlevel_best_part{part}.pth")
        else:
            patience_count += 1
            if patience_count >= PATIENCE:
                print("üõë Early Stopping")
                break



üß© Part 1/4 ba≈ülƒ±yor...
üìò Epoch 1/30 | Loss: 148.2188
üìò Epoch 2/30 | Loss: 37.6365
üìò Epoch 3/30 | Loss: 31.6012
üìò Epoch 4/30 | Loss: 23.3262
üìò Epoch 5/30 | Loss: 12.2155
üìò Epoch 6/30 | Loss: 3.3135
üìò Epoch 7/30 | Loss: 2.1541
üìò Epoch 8/30 | Loss: 2.4175
üìò Epoch 9/30 | Loss: 1.7403
üìò Epoch 10/30 | Loss: 1.8015
üíæ Kaydedildi.
üìò Epoch 11/30 | Loss: 1.6903
üìò Epoch 12/30 | Loss: 1.7021
üìò Epoch 13/30 | Loss: 1.6711
üìò Epoch 14/30 | Loss: 1.6513
üõë Early Stopping

üß© Part 2/4 ba≈ülƒ±yor...
üìò Epoch 1/30 | Loss: 33.2472
üõë Early Stopping

üß© Part 3/4 ba≈ülƒ±yor...
üìò Epoch 1/30 | Loss: 20.0836
üõë Early Stopping

üß© Part 4/4 ba≈ülƒ±yor...
üìò Epoch 1/30 | Loss: 14.2070
üõë Early Stopping


In [None]:
import torch
import torch.nn as nn

class TransformerBlock(nn.Module):
    """Post-norm transformer layer built on ``nn.MultiheadAttention``.

    This class reuses the name of the TransformerBlock defined in the training
    cell; whichever cell ran last is the one in effect.

    Args:
        embed_dim: model width.
        num_heads: number of attention heads.
        ff_dim: hidden width of the feed-forward network.
        causal: when True (default) each position may only attend to itself
            and earlier positions. This is required for next-token prediction
            and for step-by-step generation from the last position's logits.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, causal=True):
        super().__init__()
        self.attn = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.ff = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, embed_dim)
        )
        self.norm2 = nn.LayerNorm(embed_dim)
        self.causal = causal

    def forward(self, x):
        attn_mask = None
        if self.causal:
            seq_len = x.size(1)  # batch_first=True, so dim 1 is time
            # Boolean mask: True marks (query, key) pairs that are NOT allowed,
            # i.e. keys strictly after the query position.
            attn_mask = torch.triu(
                torch.ones(seq_len, seq_len, dtype=torch.bool, device=x.device),
                diagonal=1,
            )
        attn_output, _ = self.attn(x, x, x, attn_mask=attn_mask)
        x = self.norm1(x + attn_output)
        x = self.norm2(x + self.ff(x))
        return x

class LifeStoryModel(nn.Module):
    """Persona-conditioned word-level model used for loading and generation.

    The persona vector is linearly projected to the embedding width and added
    to every token embedding. The sum goes through ``num_layers``
    TransformerBlocks and a final linear layer that yields vocabulary logits.
    """

    def __init__(self, persona_dim, vocab_size, embed_dim, num_heads, ff_dim, num_layers):
        super().__init__()
        self.persona_proj = nn.Linear(persona_dim, embed_dim)
        self.word_embed = nn.Embedding(vocab_size, embed_dim)
        self.transformer_blocks = nn.ModuleList()
        for _ in range(num_layers):
            self.transformer_blocks.append(TransformerBlock(embed_dim, num_heads, ff_dim))
        self.out = nn.Linear(embed_dim, vocab_size)

    def forward(self, persona_vec, seq):
        """Return logits of shape ``seq.shape + (vocab_size,)``."""
        tokens = self.word_embed(seq)
        persona = self.persona_proj(persona_vec).unsqueeze(1)  # broadcast over time
        hidden = tokens + persona
        for layer in self.transformer_blocks:
            hidden = layer(hidden)
        return self.out(hidden)



In [None]:

# Write the current `model`'s weights to Drive.
# NOTE(review): `model` is whatever object holds that name in the kernel. In
# top-to-bottom order that is the WordLevelModel from the training cell, whose
# layer names and sizes differ from the LifeStoryModel the next cell loads
# into. Confirm which model this file is meant to contain.
# NOTE(review): presumably needs Drive mounted at /content/drive; no mount
# call is visible in this notebook.
torch.save(model.state_dict(), "/content/drive/MyDrive/deeppersona_weights/deeppersona_word_final.pth")
print("‚úÖ Model ba≈üarƒ±yla kaydedildi.")


‚úÖ Model ba≈üarƒ±yla kaydedildi.


In [None]:

# Rebuild a LifeStoryModel and load the checkpoint from Drive onto the CPU.
# NOTE(review): these sizes (192 / 6 heads / 384 / 3 layers) differ from the
# training cell's WordLevelModel (256 / 8 / 512 / 4). Loading presumably only
# works if the file was saved from a LifeStoryModel of exactly this shape.
model = LifeStoryModel(
    persona_dim=25,
    vocab_size=len(word2idx),
    embed_dim=192,
    num_heads=6,
    ff_dim=384,
    num_layers=3
)

model.load_state_dict(torch.load(
    "/content/drive/MyDrive/deeppersona_weights/deeppersona_word_final.pth",
    map_location="cpu"
))
# Switch to inference mode.
model.eval()
print("‚úÖ Word-level model ba≈üarƒ±yla y√ºklendi.")


‚úÖ Word-level model ba≈üarƒ±yla y√ºklendi.


In [None]:
# Save the current `model` to the same Drive path as the earlier save cell,
# overwriting that file.
torch.save(model.state_dict(), "/content/drive/MyDrive/deeppersona_weights/deeppersona_word_final.pth")
print("‚úÖ Word-level model ba≈üarƒ±yla kaydedildi.")


‚úÖ Word-level model ba≈üarƒ±yla kaydedildi.


In [None]:

# Same construction and load as the earlier load cell (identical
# hyperparameters and checkpoint path); replaces `model` with a fresh instance.
# vocab_size follows the current size of word2idx, so it changes if tokens
# were added to the dictionary before this cell runs.
model = LifeStoryModel(
    persona_dim=25,
    vocab_size=len(word2idx),
    embed_dim=192,
    num_heads=6,
    ff_dim=384,
    num_layers=3
)


model.load_state_dict(torch.load(
    "/content/drive/MyDrive/deeppersona_weights/deeppersona_word_final.pth",
    map_location="cpu"
))
# Switch to inference mode.
model.eval()

print("‚úÖ Model ba≈üarƒ±yla y√ºklendi ve test moduna alƒ±ndƒ±.")


‚úÖ Model ba≈üarƒ±yla y√ºklendi ve test moduna alƒ±ndƒ±.


In [None]:
import json

# Peek at the first record of part 1: print its keys and the full object.
with open("/content/worddata_part1.json", encoding="utf-8") as f:
    first_line = f.readline()
    data = json.loads(first_line)
    print("üì¶ JSON Anahtarlarƒ±:", data.keys())
    print("üîé √ñrnek veri:", data)


üì¶ JSON Anahtarlarƒ±: dict_keys(['vector', 'text_ids'])
üîé √ñrnek veri: {'vector': [0.483101, -0.510216, -0.720924, -0.79501, 0.481335, 0.090733, 0.180985, -0.936435, -0.81261, -0.534678, 0.204037, 0.12249, 0.432039, 0.40265, -0.16096, -0.101582, -0.443619, 0.738601, 0.517615, -0.680681, -0.154771, -0.444257, -0.569372, 0.526988, -0.795579], 'text_ids': [82, 3923, 3, 4, 88, 5, 2, 10, 6, 7, 11, 8, 34, 35, 36, 37, 38, 39, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [None]:
import torch
import json

# Load the vocabulary and build the inverse (id -> word) mapping
with open("/content/word_vocab.json", "r", encoding="utf-8") as f:
    vocab = json.load(f)
    id2word = {int(v): k for k, v in vocab.items()}

# Take the first record of part 1 as the test example
with open("/content/worddata_part1.json", encoding="utf-8") as f:
    example = json.loads(f.readline())
    # unsqueeze(0) adds a leading batch dimension to the persona vector
    test_vec = torch.tensor(example["vector"]).unsqueeze(0).float()
    input_ids = example["text_ids"]

# Convert the original story back (ID -> word); padding ids are decoded too,
# so the printed text ends in a run of <PAD> tokens
words = [id2word[i] for i in input_ids if i in id2word]
original_text = " ".join(words)

print("‚úÖ Vekt√∂r ba≈üarƒ±yla y√ºklendi.")
print("üìù Eƒüitimde kullanƒ±lan ger√ßek metin:\n", original_text)


‚úÖ Vekt√∂r ba≈üarƒ±yla y√ºklendi.
üìù Eƒüitimde kullanƒ±lan ger√ßek metin:
 North Satrettinbury ≈üehrinde ya≈üayan 60 ya≈üƒ±nda bir kadƒ±n. Kendini genellikle mutlu hissediyor. K√º√ß√ºk ya≈ülardan beri yazƒ± yazmaya meraklƒ±ydƒ±. <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


In [None]:
def generate_word_level(model, persona_vec, word2idx, idx2word, max_len=50, temperature=0.8):
    """Sample up to ``max_len`` words from ``model``, starting at <BOS>.

    At each step the whole id history is fed to the model and the next id is
    drawn from the temperature-scaled softmax over the last position's logits.
    Sampling stops early when <EOS> is drawn. Both special tokens must be
    present in ``word2idx``.

    Returns the generated words joined by spaces, without <BOS>.
    """
    model.eval()
    generated = [word2idx["<BOS>"]]

    steps_left = max_len
    while steps_left > 0:
        steps_left -= 1
        with torch.no_grad():
            batch = torch.tensor(generated).unsqueeze(0)  # (1, seq_len)
            last_logits = model(persona_vec, batch)[:, -1, :]  # last position only
            distribution = torch.softmax(last_logits / temperature, dim=-1).squeeze()
            sampled = torch.multinomial(distribution, num_samples=1).item()

        if sampled == word2idx["<EOS>"]:
            break
        generated.append(sampled)

    # Skip the leading <BOS>
    return " ".join(idx2word[token_id] for token_id in generated[1:])


In [None]:
# Register the special tokens in both mappings, giving a token a new id only
# when it is not in the vocabulary yet. Assigning unconditionally replaced the
# existing <PAD> id (0, the value used to pad the encoded training data) with
# a fresh id at the end of the vocabulary.
# NOTE(review): ids created here lie beyond the vocabulary a model was built
# with, so such a model has no embedding row for them. Confirm the model is
# (re)trained with these tokens before feeding them to it.
for token in ("<BOS>", "<EOS>", "<PAD>"):
    if token not in word2idx:
        word2idx[token] = len(word2idx)
    # Keep the inverse mapping in step with word2idx.
    idx2word[word2idx[token]] = token


In [None]:
import json

# Overwrite word_vocab.json with the current word2idx, including any entries
# added or changed since the file was first written.
with open("word_vocab.json", "w", encoding="utf-8") as f:
    json.dump(word2idx, f, ensure_ascii=False, indent=2)


In [None]:
# √ñzel tokenlar eksikse ekle
special_tokens = ["<BOS>", "<EOS>", "<PAD>"]

for token in special_tokens:
    # Tokens that already have an id are left exactly as they are.
    if token in word2idx:
        continue
    # A new token takes the next free id and is mirrored into idx2word.
    word2idx[token] = len(word2idx)
    idx2word[word2idx[token]] = token


In [None]:
def generate_word_level(model, persona_vec, word2idx, idx2word, max_len=50, temperature=0.8):
    """Autoregressively sample a space-joined word sequence.

    Starts from the <BOS> id, repeatedly samples the next id from the
    temperature-scaled distribution at the last position, and stops after
    ``max_len`` steps or as soon as <EOS> is sampled. Raises ``KeyError`` if
    a required special token is missing from ``word2idx``.
    """
    model.eval()
    token_ids = [word2idx["<BOS>"]]  # start token

    with torch.no_grad():
        for _step in range(max_len):
            context = torch.tensor(token_ids).unsqueeze(0)  # (1, seq_len)
            scaled = model(persona_vec, context)[:, -1, :] / temperature
            weights = torch.softmax(scaled, dim=-1).squeeze()
            choice = torch.multinomial(weights, num_samples=1).item()
            if choice == word2idx["<EOS>"]:
                break
            token_ids.append(choice)

    decoded = [idx2word[t] for t in token_ids[1:]]
    return " ".join(decoded)


In [None]:
def generate_word_level(model, persona_vec, word2idx, idx2word, max_len=50, temperature=0.8):
    """Sample words from ``model`` with defensive fallbacks.

    Differences from a plain sampling loop:
      * a missing <BOS> falls back to id 0;
      * a distribution containing NaN is replaced by a uniform one;
      * a sampled id outside ``[0, len(word2idx))`` is replaced by the <EOS>
        id (or 0 when <EOS> is missing);
      * generation stops when the sampled id decodes to "<EOS>";
      * ids unknown to ``idx2word`` are dropped from the returned text.
    """
    model.eval()

    n_tokens = len(word2idx)
    history = [word2idx.get("<BOS>", 0)]

    for _ in range(max_len):
        with torch.no_grad():
            step_logits = model(persona_vec, torch.tensor(history).unsqueeze(0))[:, -1, :]
            probs = torch.softmax(step_logits / temperature, dim=-1).squeeze()

        if torch.isnan(probs).any():
            probs = torch.ones_like(probs) / len(probs)

        candidate = torch.multinomial(probs, num_samples=1).item()

        if not 0 <= candidate < n_tokens:
            candidate = word2idx.get("<EOS>", 0)

        if idx2word.get(candidate, "") == "<EOS>":
            break

        history.append(candidate)

    pieces = []
    for token_id in history[1:]:
        if token_id in idx2word:
            pieces.append(idx2word[token_id])
    return " ".join(pieces)




In [None]:
def generate_word_level(model, persona_vec, word2idx, idx2word, max_len=50, temperature=0.8):
    """Sample up to ``max_len`` words from ``model`` and return them as text.

    Args:
        model: module called as ``model(persona_vec, ids)`` that returns
            logits whose last dimension indexes token ids.
        persona_vec: conditioning tensor passed through to the model.
        word2idx: token -> id mapping; <BOS> falls back to 0 and <UNK> to 0.
        idx2word: id -> token mapping used to decode the result.
        max_len: maximum number of sampling steps.
        temperature: softmax temperature.

    Generation stops when <EOS> is sampled or a sampled id falls outside
    ``[0, len(word2idx))``. Ids missing from ``idx2word`` are left out of the
    returned text.
    """
    model.eval()

    vocab_size = len(word2idx)
    bos_id = word2idx.get("<BOS>", 0)
    # None when the vocabulary has no <EOS>. Falling back to vocab_size - 1
    # turned the last ordinary vocabulary entry into a stop token.
    eos_id = word2idx.get("<EOS>")
    unk_id = word2idx.get("<UNK>", 0)

    input_ids = [bos_id]

    for _ in range(max_len):
        # Never feed an id at or beyond vocab_size to the model.
        safe_input_ids = [i if i < vocab_size else unk_id for i in input_ids]
        input_tensor = torch.tensor(safe_input_ids).unsqueeze(0)

        with torch.no_grad():
            logits = model(persona_vec, input_tensor)[:, -1, :]  # last token output
            probs = torch.softmax(logits / temperature, dim=-1).squeeze()

        # A NaN distribution cannot be sampled; fall back to uniform.
        if torch.isnan(probs).any():
            probs = torch.ones_like(probs) / len(probs)

        next_id = torch.multinomial(probs, num_samples=1).item()

        out_of_range = next_id >= vocab_size or next_id < 0
        if out_of_range or next_id == eos_id:
            break

        input_ids.append(next_id)

    return " ".join([idx2word[i] for i in input_ids[1:] if i in idx2word])


In [None]:
# Report how many entries word2idx currently holds.
print("Vocab size:", len(word2idx))


Vocab size: 5002


In [None]:
# input_ids'in sƒ±nƒ±r dƒ±≈üƒ±na √ßƒ±kan deƒüerleri kontrol et
# Start a one-element id list with the <BOS> id (0 when the token is missing).
input_ids = [word2idx.get("<BOS>", 0)]
# Report any id outside [0, len(word2idx)). Nothing inside the loop changes
# input_ids, so every iteration performs the same check.
for _ in range(50):
    if any(i >= len(word2idx) or i < 0 for i in input_ids):
        print(" Hatalƒ± index bulundu:", input_ids)
        break


In [None]:
def generate_word_level(model, persona_vec, word2idx, idx2word, max_len=50, temperature=0.8):
    """Sample up to ``max_len`` words from ``model`` and return them as text.

    Args:
        model: module called as ``model(persona_vec, ids)`` that returns
            logits whose last dimension indexes token ids.
        persona_vec: conditioning tensor passed through to the model.
        word2idx: token -> id mapping; <BOS> and <UNK> fall back to 0.
        idx2word: id -> token mapping; unknown ids decode to "<UNK>".
        max_len: maximum number of sampling steps.
        temperature: softmax temperature.

    Sampled ids outside ``[0, len(word2idx))`` are replaced by the <UNK> id.
    Generation stops early only when the vocabulary's <EOS> id is sampled.
    """
    model.eval()

    vocab_size = len(word2idx)
    bos_id = word2idx.get("<BOS>", 0)
    # None when the vocabulary has no <EOS>. The former fallback of
    # vocab_size - 1 made the last ordinary word end generation.
    eos_id = word2idx.get("<EOS>")
    unk_id = word2idx.get("<UNK>", 0)

    input_ids = [bos_id]

    for _ in range(max_len):
        # Never feed an id at or beyond vocab_size to the model.
        safe_input_ids = [i if i < vocab_size else unk_id for i in input_ids]
        input_tensor = torch.tensor(safe_input_ids).unsqueeze(0)

        with torch.no_grad():
            logits = model(persona_vec, input_tensor)[:, -1, :]
            probs = torch.softmax(logits / temperature, dim=-1).squeeze()

        # A NaN distribution cannot be sampled; fall back to uniform.
        if torch.isnan(probs).any():
            probs = torch.ones_like(probs) / len(probs)

        next_id = torch.multinomial(probs, num_samples=1).item()

        if next_id >= vocab_size or next_id < 0:
            next_id = unk_id

        if next_id == eos_id:
            break

        input_ids.append(next_id)

    return " ".join([idx2word.get(i, "<UNK>") for i in input_ids[1:]])


In [None]:
unk_id = word2idx.get("<UNK>", 0)

# Copy input_ids, swapping anything that is not an int inside
# [0, vocab_size) for the <UNK> id.
safe_input_ids = []
for i in input_ids:
    safe_input_ids.append(i if isinstance(i, int) and 0 <= i < vocab_size else unk_id)



In [None]:
# Module-level sampling temperature; same value as the default of
# generate_word_level's `temperature` parameter.
temperature = 0.8


In [None]:
# Ids of the special tokens in the current vocabulary.
bos_id = word2idx.get("<BOS>", 0)
# None when <EOS> is absent. A numeric fallback of 1 is the <UNK> id in this
# vocabulary, so it would make <UNK> act as end-of-sequence.
eos_id = word2idx.get("<EOS>")
# The vocabulary builder reserves 1 for <UNK>; id 2 is an ordinary word.
unk_id = word2idx.get("<UNK>", 1)
vocab_size = len(word2idx)


In [None]:
# Check the persona vector's shape; the recorded output is [1, 25].
print("test_vec shape:", test_vec.shape)


test_vec shape: torch.Size([1, 25])


In [None]:
# Show the first five (id, word) pairs of idx2word in insertion order.
print("first few words in idx2word:", list(idx2word.items())[:5])


first few words in idx2word: [(0, '<PAD>'), (1, '<UNK>'), (2, 'bir'), (3, '≈üehrinde'), (4, 'ya≈üayan')]


In [None]:
# Compare the embedding table's shape with the largest id in input_ids. The
# recorded output (5000 rows, max id 5000) shows an id one past the last row.
print("Embedding shape:", model.word_embed.weight.shape)
print("Maximum used input id:", max(input_ids))


Embedding shape: torch.Size([5000, 192])
Maximum used input id: 5000


In [None]:
safe_input_ids = []
unk_id = word2idx.get("<UNK>", 1)

# Replace every id the model cannot embed with <UNK>. The upper bound is read
# from the model's embedding table instead of a hard-coded 5000, so the check
# stays correct if the model is rebuilt with a different vocabulary size.
for i in input_ids:
    if isinstance(i, int) and 0 <= i < model.word_embed.num_embeddings:
        safe_input_ids.append(i)
    else:
        safe_input_ids.append(unk_id)


In [None]:
# Wrap the sanitized ids in an outer list so the tensor is 2-D: (1, seq_len).
input_tensor = torch.tensor([safe_input_ids], dtype=torch.long)


In [None]:
def generate_word_level(model, persona_vec, word2idx, idx2word, max_len=50, temperature=0.8):
    """Sample up to ``max_len`` words from ``model`` and return them as text.

    Args:
        model: module called as ``model(persona_vec, ids)`` that returns
            logits whose last dimension indexes token ids.
        persona_vec: conditioning tensor passed through to the model.
        word2idx: token -> id mapping; <BOS> falls back to 0, <UNK> to 1.
        idx2word: id -> token mapping; unknown ids decode to "<UNK>".
        max_len: maximum number of sampling steps.
        temperature: softmax temperature.

    Sampled ids outside ``[0, len(word2idx))`` are replaced by the <UNK> id.
    Generation stops early only when the vocabulary's <EOS> id is sampled.
    """
    model.eval()

    vocab_size = len(word2idx)
    bos_id = word2idx.get("<BOS>", 0)
    # None when <EOS> is absent. The former fallback of 1 is the <UNK> id
    # assigned by this notebook's vocabulary builder, so sampling <UNK> ended
    # generation.
    eos_id = word2idx.get("<EOS>")
    # 1 is the <UNK> id in this notebook's vocabulary; 2 is an ordinary word.
    unk_id = word2idx.get("<UNK>", 1)

    input_ids = [bos_id]

    for _ in range(max_len):

        input_tensor = torch.tensor([input_ids], dtype=torch.long)

        with torch.no_grad():
            logits = model(persona_vec, input_tensor)[:, -1, :]
            probs = torch.softmax(logits / temperature, dim=-1)

            # Unsamplable distribution: fall back to uniform over the logits.
            if torch.isnan(probs).any() or probs.sum() == 0:
                probs = torch.ones_like(probs) / probs.shape[-1]

            # multinomial is given a 1-D distribution.
            if probs.dim() > 1:
                probs = probs.squeeze()

            next_id = torch.multinomial(probs, num_samples=1).item()

        if next_id >= vocab_size or next_id < 0:
            next_id = unk_id

        if next_id == eos_id:
            break

        input_ids.append(next_id)

    return " ".join([idx2word.get(i, "<UNK>") for i in input_ids[1:]])



In [None]:
def generate_word_level(model, persona_vec, word2idx, idx2word, max_len=50, temperature=0.8):
    """Sample up to ``max_len`` words from ``model`` and return them as text.

    Args:
        model: ``nn.Module`` called as ``model(persona_vec, ids)`` that
            returns logits whose last dimension indexes token ids.
        persona_vec: conditioning tensor passed through to the model; the id
            tensor is created on its device.
        word2idx: token -> id mapping; <BOS> falls back to 0, <UNK> to 1.
        idx2word: id -> token mapping; unknown ids decode to "<UNK>".
        max_len: maximum number of sampling steps.
        temperature: softmax temperature; must be positive.

    Returns:
        The generated words joined by single spaces.

    Raises:
        ValueError: if ``temperature`` is not positive.

    Generation stops when the <EOS> id is sampled or when the sampled id
    decodes to "<EOS>" or "<PAD>". The data prepared in this notebook pads
    every text with <PAD> and contains no <EOS>, so padding is the only
    end-of-text signal the model can emit.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    model.eval()

    # Usable ids are bounded by the vocabulary AND by the model's embedding
    # table: tokens added to word2idx after the model was built have no
    # embedding row, and feeding them raises an IndexError.
    vocab_size = len(word2idx)
    for module in model.modules():
        if isinstance(module, torch.nn.Embedding):
            vocab_size = min(vocab_size, module.num_embeddings)
            break

    bos_id = word2idx.get("<BOS>", 0)
    # None when absent: a numeric fallback of 1 would be the <UNK> id.
    eos_id = word2idx.get("<EOS>")
    unk_id = word2idx.get("<UNK>", 1)
    stop_words = ("<EOS>", "<PAD>")

    input_ids = [bos_id]
    generated_words = []

    for step in range(max_len):
        # Anything the model cannot embed (possibly <BOS> itself) becomes <UNK>.
        input_ids = [i if 0 <= i < vocab_size else unk_id for i in input_ids]
        input_tensor = torch.tensor([input_ids], dtype=torch.long, device=persona_vec.device)

        with torch.no_grad():
            logits = model(persona_vec, input_tensor)[:, -1, :]
            probs = torch.softmax(logits / temperature, dim=-1).squeeze()

        if torch.isnan(probs).any() or probs.sum() == 0:
            print("‚ö†Ô∏èprobs bozuldu, e≈üit daƒüƒ±lƒ±m atanƒ±yor.")
            # Uniform over the model's own output size, which can differ
            # from len(word2idx).
            probs = torch.full_like(probs, 1.0 / probs.numel())

        next_id = torch.multinomial(probs, 1).item()

        if next_id >= vocab_size or next_id < 0:
            print(f" next_id ge√ßersiz: {next_id}, <UNK> atanƒ±yor.")
            next_id = unk_id

        word = idx2word.get(next_id, "<UNK>")
        if next_id == eos_id or word in stop_words:
            break

        generated_words.append(word)
        input_ids.append(next_id)

    return " ".join(generated_words)


