In [1]:
import torch
from torch import nn

In [None]:
# Load the corpus: a single text file in which documents are separated by a
# '\n<NEXTDOCUMENT>\n' marker.
with open('doc_examples/first500gen.txt','r') as f:
    docs = f.read().split('\n<NEXTDOCUMENT>\n')

# Character vocabulary and char -> index map.
# sorted() makes the ordering deterministic: iterating a bare set of strings
# gives a different order in every Python process (string hashing is
# randomised), which would silently change every index-based lookup after a
# kernel restart.
distinct_chars = sorted(set(''.join(docs)))
n_chars = len(distinct_chars)
char_dict = {c:idx for idx,c in enumerate(distinct_chars)}

# One float one-hot matrix per document, shape (len(doc), n_chars).
# dtype=torch.long keeps an empty document (e.g. produced by a trailing
# separator) from becoming a float index tensor, which one_hot rejects.
one_hot_docs = [
    nn.functional.one_hot(
        torch.tensor([char_dict[c] for c in doc],dtype=torch.long),
        num_classes=n_chars,
    ).float()
    for doc in docs
]

In [413]:
class character_embedder(nn.Module):
    """Two-layer character model with a 16-unit ReLU bottleneck.

    The input is ``n_chars`` wide and the output is ``n_chars`` wide, with a
    sigmoid applied element-wise to the final linear layer.
    """

    def __init__(self,n_chars):
        """Create the layers for a vocabulary of ``n_chars`` characters."""
        super().__init__()
        self.hidden = nn.Linear(in_features=n_chars,out_features=16)
        self.dense = nn.Linear(in_features=16,out_features=n_chars)
        self.act = nn.ReLU()
        self.sig = nn.Sigmoid()

    def forward(self,x):
        """Return the sigmoid scores computed from ``x``."""
        bottleneck = self.hidden(x)
        activated = self.act(bottleneck)
        logits = self.dense(activated)
        return self.sig(logits)
    

class character_embedder_sftmx(nn.Module):
    """Two-layer character model with a 16-unit sine-activated hidden layer.

    The output of the final linear layer is normalised with a softmax over
    dimension 1, so for a 2-D input every output row sums to one.
    """

    def __init__(self,n_chars):
        """Create the layers for a vocabulary of ``n_chars`` characters."""
        super().__init__()
        self.hidden = nn.Linear(in_features=n_chars,out_features=16)
        self.dense = nn.Linear(in_features=16,out_features=n_chars)
        self.sft = nn.Softmax(dim=1)

    def forward(self,x):
        """Return the softmax-normalised scores computed from ``x``."""
        hidden_out = self.hidden(x)
        activated = torch.sin(hidden_out)
        logits = self.dense(activated)
        return self.sft(logits)
    

class character_embedder_any_n_next(nn.Module):
    """Small classifier: linear -> sin -> linear -> softmax over dimension 1.

    Args:
        in_features: Width of each input row. Defaults to 64 (presumably the
            four 16-wide embedding slices concatenated by
            ``create_n_next_samples`` -- confirm against the caller).
        hidden_features: Width of the sine-activated hidden layer. Defaults to 2.
        n_classes: Number of output classes. Defaults to 2.

    Calling the constructor with no arguments builds exactly the 64 -> 2 -> 2
    network that used to be hard-coded.
    """

    def __init__(self,in_features=64,hidden_features=2,n_classes=2):
        super().__init__()
        self.hidden = nn.Linear(in_features,hidden_features)
        self.dense = nn.Linear(hidden_features,n_classes)
        self.sft = nn.Softmax(1)

    def forward(self,x):
        """Return softmax-normalised class scores for a 2-D batch ``x``."""
        x = torch.sin(self.hidden(x))
        return self.sft(self.dense(x))


def create_n_next_samples(doc,next_model,prev_model,char_dict,n=5):
    """Build (position, candidate character) samples and their 0/1 labels.

    For every position ``idx`` in ``doc[:len(doc)-n]`` and every candidate
    character in ``char_dict`` one sample row is produced by concatenating,
    in this order:

        next-model embedding of doc[idx], prev-model embedding of doc[idx],
        next-model embedding of the candidate, prev-model embedding of the candidate

    where "embedding of character c" is row ``char_dict[c]`` of
    ``model.hidden.weight.T``.

    Rows are ordered candidate-major: all positions for the first candidate
    (in ``char_dict`` iteration order), then all positions for the second, etc.
    ``labels`` uses the same order, so ``labels[i]`` describes ``samples[i]``.
    (Previously the labels were generated position-major and therefore did
    not line up with the sample rows.)

    Args:
        doc: Sequence of characters (e.g. a str); every character of the
            used prefix must be a key of ``char_dict``.
        next_model: Object exposing ``hidden.weight`` of shape (emb, n_chars).
        prev_model: Object exposing ``hidden.weight`` of shape (emb, n_chars).
        char_dict: Mapping character -> row index.
        n: Window length used for the labels.

    Returns:
        Tuple ``(samples, labels)``: ``samples`` has one row per
        (candidate, position) pair; ``labels`` is an int64 vector with 1 where
        the candidate occurs in ``doc[idx:idx+n]`` and 0 otherwise.

    Raises:
        ValueError: If ``n`` is negative.
    """
    if n < 0:
        raise ValueError("n must be non-negative")

    # (n_chars, emb) lookup tables. They stay attached to the models'
    # parameters, exactly as in the original indexing expressions.
    next_table = next_model.hidden.weight.T
    prev_table = prev_model.hidden.weight.T

    # Number of usable positions. Computed explicitly because doc[:-n] is
    # empty for n == 0, which used to give zero samples but len(doc) labels.
    n_positions = max(len(doc) - n, 0)
    position_idx = torch.tensor([char_dict[ch] for ch in doc[:n_positions]],dtype=torch.long)
    position_feats = torch.hstack([next_table[position_idx],prev_table[position_idx]])

    blocks = []
    for cand_idx in char_dict.values():
        # The candidate's embedding repeated once per position.
        cand_rows = torch.full((n_positions,),cand_idx,dtype=torch.long)
        blocks.append(torch.hstack([position_feats,next_table[cand_rows],prev_table[cand_rows]]))
    samples = torch.vstack(blocks)

    # NOTE(review): the window doc[idx:idx+n] includes the character at idx
    # itself; if "n next" is meant to exclude it this should be
    # doc[idx+1:idx+1+n] -- confirm the intent before changing.
    labels = torch.tensor(
        [1 if c_target in doc[idx:idx+n] else 0
         for c_target in char_dict.keys()
         for idx in range(n_positions)],
        dtype=torch.long,
    )
    return samples,labels



In [414]:
#doc1_ch_feats,doc1_ch_labels = create_n_next_samples(docs[0],character_embedder_model_sft,character_embedder_model_sft,char_dict)

In [415]:
from collections import Counter
from functools import reduce

In [416]:
# Softmax character model that the training loop further down fits to the
# next-character frequency table (next_expecteds).
character_embedder_model_sft = character_embedder_sftmx(n_chars)

In [421]:
# Loss and optimiser for the next-character model.
# BCELoss compares the model's softmax outputs element-wise with the target
# frequencies; the second positional argument of SGD is the learning rate (0.9).
objective2 = nn.BCELoss()
optimizer2 = torch.optim.SGD(character_embedder_model_sft.parameters(),.9)

In [418]:
# For every character in char_dict, count which characters immediately follow
# it anywhere in the corpus. A single pass over the adjacent pairs yields the
# same counts as rescanning every document once per character and
# concatenating the per-document lists.
next_counters_dict = {c:Counter() for c in char_dict.keys()}
for d in docs:
    for current_char,following_char in zip(d,d[1:]):
        if current_char in next_counters_dict:
            next_counters_dict[current_char][following_char] += 1
next_totals = [sum(next_counters_dict[c].values()) for c in char_dict.keys()]
# Row i = empirical distribution of the character that follows the i-th key of
# char_dict; columns follow the same key order.
# A character that is never followed by anything (it only ever ends a document)
# has total 0: it gets an all-zero row instead of raising ZeroDivisionError.
next_expecteds = torch.tensor([[next_counters_dict[c][c2]/t if t else 0.0 for c2 in char_dict.keys()]
             for c,t in zip(char_dict.keys(),next_totals)]).float()

In [451]:
# Full-batch training of the next-character model: the batch is the identity
# matrix, i.e. the one-hot vector of every character, and the target is the
# matching row of next_expecteds.
epochs = 100000
average_loss = 0
reporting_cadence = 10  # not read in this cell
print_cadence = 3       # report the mean loss every print_cadence epochs
best_average = torch.tensor([torch.inf])
c = 0                   # epochs since the last report
break_criteria = 10     # not read in this cell
bad_in_a_row = 0        # not read in this cell
char_tensors = torch.eye(len(distinct_chars))
for epoch in range(epochs):
    preds = character_embedder_model_sft(char_tensors)
    loss = objective2(preds,next_expecteds)
    loss.backward()
    optimizer2.step()
    optimizer2.zero_grad()
    # Accumulate a detached copy: adding the live loss tensor would keep each
    # step's autograd graph referenced by the running sum (and by best_average).
    average_loss += loss.detach()
    c += 1
    if c>=print_cadence:
        # Printed before best_average is updated, so the second number is the
        # best windowed loss seen prior to this window.
        print(epoch,round(average_loss.detach().item()/print_cadence,6),round(best_average.detach().item()/print_cadence,6),' '*10,end='\r')
        if average_loss<best_average:
            best_average = average_loss
        c = 0
        average_loss = 0

99998 0.022148 0.022148           

In [452]:
# Spot check: next-character scores predicted for 'Q', 'q' and ' ' (one row
# each), restricted to output columns 89, 51 and 23.
# NOTE(review): the column numbers are positions in distinct_chars, which is
# derived from a set -- confirm they still refer to the intended characters
# whenever the vocabulary is rebuilt.
character_embedder_model_sft(nn.functional.one_hot(torch.tensor([char_dict['Q'],char_dict['q'],char_dict[' ']]),num_classes=n_chars).float())[:,[89,51,23]]

tensor([[4.7621e-02, 4.4432e-04, 9.4717e-01],
        [5.1258e-05, 3.3872e-04, 9.9832e-01],
        [1.9248e-04, 2.0082e-02, 1.3834e-02]], grad_fn=<IndexBackward0>)

In [428]:
# Second softmax character model; the training loop further down fits it to the
# previous-character frequency table (expecteds).
character_prev_embedder_model_sft = character_embedder_sftmx(n_chars)

In [431]:
# Loss and optimiser for the previous-character model.
# BCELoss compares the model's softmax outputs element-wise with the target
# frequencies; the second positional argument of SGD is the learning rate (0.9).
objective = nn.BCELoss()
optimizer = torch.optim.SGD(character_prev_embedder_model_sft.parameters(),.9)

In [430]:
# For every character in char_dict, count which characters immediately precede
# it anywhere in the corpus. A single pass over the adjacent pairs yields the
# same counts as rescanning every document once per character and
# concatenating the per-document lists.
prev_counters_dict = {c:Counter() for c in char_dict.keys()}
for d in docs:
    for preceding_char,current_char in zip(d,d[1:]):
        if current_char in prev_counters_dict:
            prev_counters_dict[current_char][preceding_char] += 1
totals = [sum(prev_counters_dict[c].values()) for c in char_dict.keys()]
# Row i = empirical distribution of the character that precedes the i-th key of
# char_dict; columns follow the same key order.
# A character that is never preceded by anything (it only ever starts a
# document) has total 0: it gets an all-zero row instead of raising
# ZeroDivisionError.
expecteds = torch.tensor([[prev_counters_dict[c][c2]/t if t else 0.0 for c2 in char_dict.keys()]
             for c,t in zip(char_dict.keys(),totals)]).float()

In [446]:
# Full-batch training of the previous-character model: the batch is the
# identity matrix, i.e. the one-hot vector of every character, and the target
# is the matching row of expecteds.
epochs = 100000
average_loss = 0
reporting_cadence = 10  # not read in this cell
print_cadence = 3       # report the mean loss every print_cadence epochs
best_average = torch.tensor([torch.inf])
c = 0                   # epochs since the last report
break_criteria = 10     # not read in this cell
bad_in_a_row = 0        # not read in this cell
char_tensors = torch.eye(len(distinct_chars))
for epoch in range(epochs):
    preds = character_prev_embedder_model_sft(char_tensors)
    loss = objective(preds,expecteds)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    # Accumulate a detached copy: adding the live loss tensor would keep each
    # step's autograd graph referenced by the running sum (and by best_average).
    average_loss += loss.detach()
    c += 1
    if c>=print_cadence:
        # Printed before best_average is updated, so the second number is the
        # best windowed loss seen prior to this window.
        print(epoch,round(average_loss.detach().item()/print_cadence,6),round(best_average.detach().item()/print_cadence,6),' '*10,end='\r')
        if average_loss<best_average:
            best_average = average_loss
        c = 0
        average_loss = 0

99998 0.020371 0.020371           

In [447]:
# Spot check: previous-character scores predicted for '?', restricted to
# output columns 89, 51 and 14.
# NOTE(review): the column numbers are positions in distinct_chars, which is
# derived from a set -- confirm they still refer to the intended characters
# whenever the vocabulary is rebuilt.
character_prev_embedder_model_sft(nn.functional.one_hot(torch.tensor([char_dict['?']]),num_classes=n_chars).float())[:,[89,51,14]]

tensor([[0.0015, 0.1215, 0.1648]], grad_fn=<IndexBackward0>)

In [458]:
# Final character representations: for every character (one row per one-hot
# row of char_tensors) the sine-activated hidden layer of the next-character
# model followed by that of the previous-character model, side by side.
# Built under no_grad so the exported tensor is plain data rather than a
# non-leaf tensor that drags both models' autograd graphs along.
with torch.no_grad():
    char_reps = torch.hstack((torch.sin(character_embedder_model_sft.hidden(char_tensors)),torch.sin(character_prev_embedder_model_sft.hidden(char_tensors))))

In [493]:
# Persist the character representations to disk.
# NOTE(review): assumes the vector_reps/Char_reps/ directory already exists,
# and the date is baked into the file name -- confirm both before re-running.
torch.save(char_reps,'vector_reps/Char_reps/char_reps_02072025.pt')

In [494]:
# Write the vocabulary in index order, separated by '<NEXT_CHAR>', so that row i
# of the saved representations can be mapped back to its character.
# encoding='utf-8' makes the file independent of the platform's locale encoding
# (which may be unable to encode some characters); newline='' disables
# newline translation on write so that a '\n' entry in the vocabulary is
# stored as exactly that character. Read the file back with the same two
# arguments.
with open('vector_reps/Char_reps/char_map.txt','w',encoding='utf-8',newline='') as f:
    f.write('<NEXT_CHAR>'.join(distinct_chars))