In [None]:
import re
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from gensim.models import Word2Vec
from torch.utils.data import DataLoader, TensorDataset


# --------------------------------------------------
# 1) TEXT PROCESSOR
# --------------------------------------------------
class TextProcessor:
    """Tokenise raw text, train Word2Vec on it and expose an index vocabulary.

    After ``fit`` the instance provides:

    * ``w2v``        - the trained gensim ``Word2Vec`` model,
    * ``word2idx``   - mapping token -> integer id,
    * ``idx2word``   - the inverse mapping,
    * ``vocab_size`` - number of ids, including the special tokens.

    Special tokens are appended after the Word2Vec vocabulary: ``unk_token``
    always, ``pad_token`` only when one is supplied.
    """

    def __init__(self,
                 embedding_dim=50,
                 window=5,
                 min_count=1,
                 workers=4,
                 unk_token='<UNK>',
                 pad_token=None):
        """Store the Word2Vec hyper-parameters; no training happens here.

        Args:
            embedding_dim: size of the Word2Vec vectors (``vector_size``).
            window: Word2Vec context window.
            min_count: Word2Vec minimum token frequency.
            workers: number of Word2Vec worker threads.
            unk_token: vocabulary entry used for out-of-vocabulary tokens.
            pad_token: optional padding entry; ``None`` (default) adds none.
        """
        self.embedding_dim = embedding_dim
        self.window        = window
        self.min_count     = min_count
        self.workers       = workers
        self.unk_token     = unk_token
        self.pad_token     = pad_token

        self.w2v        = None
        self.word2idx   = None
        self.idx2word   = None
        self.vocab_size = None

    def _read_text(self, path):
        """Return the whole content of the UTF-8 text file at ``path``."""
        with open(path, 'r', encoding='utf-8') as f:
            return f.read()

    def _clean_and_tokenize(self, text):
        """Replace every non-letter with a space, lowercase, split on whitespace."""
        text = re.sub(r'[^A-Za-z\s]', ' ', text).lower()
        return text.split()

    def _split_sentences(self, text):
        """Split ``text`` on ``.``, ``!`` and ``?`` and tokenise each sentence.

        Fragments that contain no letters (for example a bare number between
        two full stops) are dropped so that no empty token list is produced.
        """
        tokenised = (self._clean_and_tokenize(s)
                     for s in re.split(r'[\.!?]\s*', text))
        return [toks for toks in tokenised if toks]

    def _build_vocab(self, w2v):
        """Build ``(word2idx, idx2word)`` from ``w2v.wv.index_to_key``.

        The special tokens get the next free ids; a special token that is
        already part of the vocabulary keeps its existing id so the id range
        stays contiguous.
        """
        w2i = {w: i for i, w in enumerate(w2v.wv.index_to_key)}
        for special in (self.unk_token, getattr(self, 'pad_token', None)):
            if special is not None and special not in w2i:
                w2i[special] = len(w2i)
        i2w = {i: w for w, i in w2i.items()}
        return w2i, i2w

    def _require_fitted(self):
        """Raise a clear error when the processor is used before ``fit``."""
        if self.w2v is None and self.word2idx is None:
            raise RuntimeError("TextProcessor is not fitted; call fit() first")

    def fit(self, text_path=None, raw_text=None):
        """Train Word2Vec and build the vocabulary; returns ``self``.

        Args:
            text_path: path of a UTF-8 text file (takes precedence).
            raw_text: the text itself, used when ``text_path`` is not given.

        Raises:
            ValueError: if neither source is given, or the text contains no
                alphabetic tokens to train on.
        """
        if text_path:
            raw = self._read_text(text_path)
        elif raw_text:
            raw = raw_text
        else:
            raise ValueError("Provide text_path or raw_text")

        sents = self._split_sentences(raw)
        if not sents:
            raise ValueError("Text contains no alphabetic tokens to train on")

        # Train Word2Vec
        self.w2v = Word2Vec(
            sentences   = sents,
            vector_size = self.embedding_dim,
            window      = self.window,
            min_count   = self.min_count,
            workers     = self.workers
        )
        # Build vocab with the special tokens appended
        self.word2idx, self.idx2word = self._build_vocab(self.w2v)
        self.vocab_size = len(self.word2idx)
        return self

    def encode(self, text):
        """Tokenise ``text`` and map each token to its id (UNK id if unknown)."""
        self._require_fitted()
        unk_id = self.word2idx[self.unk_token]
        return [self.word2idx.get(t, unk_id)
                for t in self._clean_and_tokenize(text)]

    def get_embedding(self, idx):
        """Return the Word2Vec vector for id ``idx``.

        Ids without a trained vector (the special tokens, or an id that is not
        in the vocabulary) yield a zero vector of length ``embedding_dim``.
        It is created as float32 so that stacking it with Word2Vec vectors
        does not silently promote the result to float64.
        """
        self._require_fitted()
        w = self.idx2word.get(idx, self.unk_token)
        if w in self.w2v.wv:
            return self.w2v.wv[w]
        return np.zeros(self.embedding_dim, dtype=np.float32)

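# --------------------------------------------------
# 2) VANILLA RNN (NumPy, BPTT + Adagrad updates)
# --------------------------------------------------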

class RNN:
    """Single-layer tanh RNN language model written in NumPy.

    Inputs are Word2Vec vectors looked up through ``idx2word``; the output is
    a softmax over ``output_size`` token ids. Training uses truncated
    back-propagation through time over windows of ``seq_length`` tokens and
    Adagrad parameter updates.
    """

    def __init__(self,
                 input_size,
                 hidden_size,
                 output_size,
                 w2v_model,
                 idx2word,
                 learning_rate=1e-2,
                 seq_length=10,
                 clip_rate=5.0):
        """Initialise weights, Adagrad accumulators and the hidden state.

        Args:
            input_size: dimensionality of the input word vectors.
            hidden_size: number of hidden units.
            output_size: number of output classes (token ids).
            w2v_model: object whose ``.wv`` supports ``word in wv`` and
                ``wv[word]`` (a trained gensim ``Word2Vec`` in this file).
            idx2word: mapping token id -> word.
            learning_rate: Adagrad step size.
            seq_length: number of time steps per training window.
            clip_rate: gradients are clipped element-wise to
                ``[-clip_rate, clip_rate]``.
        """
        self.Wxh = np.random.randn(input_size,  hidden_size) * 0.1
        self.Whh = np.random.randn(hidden_size, hidden_size) * 0.1
        self.Why = np.random.randn(hidden_size, output_size) * 0.1
        self.bh  = np.zeros((1, hidden_size))
        self.by  = np.zeros((1, output_size))

        # Adagrad running sums of squared gradients, one per parameter
        self.mWxh = np.zeros_like(self.Wxh)
        self.mWhh = np.zeros_like(self.Whh)
        self.mWhy = np.zeros_like(self.Why)
        self.mbh  = np.zeros_like(self.bh)
        self.mby  = np.zeros_like(self.by)

        self.hidden_size  = hidden_size
        self.seq_length   = seq_length
        self.learning_rate= learning_rate
        self.clip_rate    = clip_rate

        self.w2v     = w2v_model
        self.idx2word= idx2word
        self.hprev   = np.zeros((1, hidden_size))
        self.smooth_loss = None

    def _embed(self, idx):
        """Return the input vector for token id ``idx``.

        Words without a Word2Vec vector (e.g. the ``<UNK>`` entry, which is a
        valid output class but was never seen by Word2Vec) map to a zero
        vector instead of raising ``KeyError``.
        """
        word = self.idx2word[idx]
        if word in self.w2v.wv:
            return self.w2v.wv[word]
        return np.zeros(self.Wxh.shape[0])

    def forward(self, inputs, reset_state=False):
        """Run the network over a sequence of input vectors.

        Args:
            inputs: sequence of 1-D input vectors, one per time step.
            reset_state: start from a zero hidden state instead of the state
                left by the previous call.

        Returns:
            ``(ys, ps, hs)`` dicts keyed by time step: raw scores, softmax
            probabilities and hidden states. ``hs[-1]`` is the initial state.
        """
        if reset_state:
            self.hprev = np.zeros_like(self.hprev)

        hs, ys, ps = {}, {}, {}
        hs[-1] = self.hprev.copy()

        for t, x in enumerate(inputs):
            x = x.reshape(1,-1)
            hs[t] = np.tanh(x.dot(self.Wxh) + hs[t-1].dot(self.Whh) + self.bh)
            ys[t] = hs[t].dot(self.Why) + self.by
            # subtract the max score before exponentiating for stability
            y = ys[t] - np.max(ys[t])
            ps[t]= np.exp(y)/np.sum(np.exp(y),axis=1,keepdims=True)

        # carry the last hidden state over to the next call
        self.hprev = hs[len(inputs)-1]
        return ys, ps, hs

    def backward(self, inputs, targets, ps, hs):
        """Back-propagate the cross-entropy loss through time.

        Args:
            inputs: the input vectors passed to ``forward``.
            targets: target token id for every time step.
            ps, hs: probabilities and hidden states returned by ``forward``.

        Returns:
            Clipped gradients ``(dWxh, dWhh, dWhy, dbh, dby)``.
        """
        dWxh = np.zeros_like(self.Wxh)
        dWhh = np.zeros_like(self.Whh)
        dWhy = np.zeros_like(self.Why)
        dbh  = np.zeros_like(self.bh)
        dby  = np.zeros_like(self.by)
        dhnext = np.zeros_like(self.hprev)

        T = len(inputs)
        for t in reversed(range(T)):
            # gradient of softmax + cross-entropy w.r.t. the scores
            dy = ps[t].copy()
            dy[0, targets[t]] -= 1
            dWhy += hs[t].T.dot(dy)
            dby  += dy

            dh = dy.dot(self.Why.T) + dhnext
            dhraw = (1-hs[t]**2)*dh          # back through tanh
            dbh  += dhraw
            dWxh += inputs[t].reshape(-1,1).dot(dhraw)
            dWhh += hs[t-1].T.dot(dhraw)     # hs[-1] is the initial state
            dhnext = dhraw.dot(self.Whh.T)

        for d in [dWxh,dWhh,dWhy,dbh,dby]:
            np.clip(d, -self.clip_rate, self.clip_rate, out=d)

        return dWxh,dWhh,dWhy,dbh,dby

    def sample(self, seed_idx, length=30, temperature=1.0):
        """Generate ``length`` words starting from token id ``seed_idx``.

        Each sampled id is fed back as the next input. The hidden state starts
        at zero and ``self.hprev`` is not modified.

        Args:
            seed_idx: id of the first input token (not included in the output).
            length: number of words to generate.
            temperature: divides the scores before the softmax; must be > 0.

        Returns:
            The generated words joined by single spaces.

        Raises:
            ValueError: if ``temperature`` is not positive.
        """
        if temperature <= 0:
            raise ValueError("temperature must be positive")

        n_classes = self.Why.shape[1]
        h = np.zeros_like(self.hprev)
        idx = seed_idx
        out = []
        for _ in range(length):
            x = self._embed(idx).reshape(1,-1)
            h = np.tanh(x.dot(self.Wxh) + h.dot(self.Whh) + self.bh)
            y = h.dot(self.Why) + self.by
            y = y/temperature
            y = y - np.max(y)
            p = np.exp(y)/np.sum(np.exp(y))
            idx = int(np.random.choice(n_classes, p=p.ravel()))
            out.append(self.idx2word[idx])
        return ' '.join(out)

    def train(self, data_idxs, epochs=100, print_every=10):
        """Train on a token-id sequence with Adagrad.

        The data is cut into consecutive windows of ``seq_length`` inputs with
        the next token as the target of every step. The hidden state is reset
        at the start of every epoch and carried across windows within one.
        ``self.smooth_loss`` tracks an exponential moving average of the mean
        per-window loss.

        Args:
            data_idxs: sequence of token ids; needs more than ``seq_length``
                entries so that at least one full window exists.
            epochs: number of passes over the data.
            print_every: report the smoothed loss every this many epochs
                (values below 1 are treated as 1).

        Raises:
            ValueError: if ``data_idxs`` is too short for a single window.
        """
        n = len(data_idxs)
        if n <= self.seq_length:
            raise ValueError(
                f"data_idxs needs more than seq_length={self.seq_length} "
                f"tokens, got {n}"
            )
        report_every = max(1, print_every)

        vocab_size = self.Why.shape[1]
        if self.smooth_loss is None:
            # loss of a uniform prediction over one window
            self.smooth_loss = -np.log(1.0/vocab_size)*self.seq_length

        params = [self.Wxh, self.Whh, self.Why, self.bh, self.by]
        mems   = [self.mWxh, self.mWhh, self.mWhy, self.mbh, self.mby]

        for epoch in range(1, epochs+1):
            h_backup = self.hprev.copy()
            epoch_loss = 0.0
            count = 0

            # consecutive, non-overlapping windows of length seq_length
            for i in range(0, n - self.seq_length, self.seq_length):
                Xb = data_idxs[i : i+self.seq_length]
                yb = data_idxs[i+1 : i+self.seq_length+1]

                emb = [self._embed(x) for x in Xb]
                _, ps, hs = self.forward(emb, reset_state=(i==0))

                loss = sum(-np.log(ps[t][0,yb[t]] + 1e-12) for t in range(len(yb)))
                epoch_loss += loss
                count += 1

                grads = self.backward(emb, yb, ps, hs)
                for param, dparam, mem in zip(params, grads, mems):
                    # in-place Adagrad update
                    mem += dparam*dparam
                    param -= self.learning_rate * dparam / (np.sqrt(mem)+1e-8)

            avg_loss = epoch_loss/count
            self.smooth_loss = 0.99*self.smooth_loss + 0.01*avg_loss

            if epoch % report_every == 0:
                print(f"Epoch {epoch:4d} | loss {self.smooth_loss:.4f}")

            # leave the caller-visible hidden state as it was before the epoch
            self.hprev = h_backup


# --------------------------------------------------
# 3) MAIN: TRAIN 3 RNN MODELS (500 / 1000 / 2000 EPOCHS)
# --------------------------------------------------
if __name__ == "__main__":
    # Train three independent NumPy RNNs on the first 100 tokens of the corpus,
    # differing only in the number of epochs.
    text_path    = r"C:\Users\dell\Downloads\Weki.txt"
    embedding_dim= 50
    hidden_size  = 128
    seq_length   = 25
    learning_rate= 0.01

    # 1) build TextProcessor & encode (the corpus is read from disk only once)
    tp  = TextProcessor(embedding_dim=embedding_dim)
    raw = tp._read_text(text_path)
    tp.fit(raw_text=raw)
    encoded = tp.encode(raw)

    # use only first 100 tokens for toy training
    data_seq = encoded[:100]

    # epochs for the three models
    for model_id, epochs in enumerate([500, 1000, 2000], start=1):
        print(f"\n>>> Training Model-{model_id} for {epochs} epochs")
        model = RNN(
            input_size   = embedding_dim,
            hidden_size  = hidden_size,
            output_size  = tp.vocab_size,
            w2v_model    = tp.w2v,
            idx2word     = tp.idx2word,
            learning_rate= learning_rate,
            seq_length   = seq_length,
            clip_rate    = 5.0
        )
        # report five times per run; never pass 0 for very short runs
        model.train(data_seq, epochs=epochs, print_every=max(1, epochs//5))




>>> Training Model-1 for 500 epochs
Epoch  100 | loss 34.7736
Epoch  200 | loss 13.2387
Epoch  300 | loss 5.0879
Epoch  400 | loss 2.0187
Epoch  500 | loss 0.8537

>>> Training Model-2 for 1000 epochs
Epoch  200 | loss 12.9434
Epoch  400 | loss 1.9582
Epoch  600 | loss 0.3852
Epoch  800 | loss 0.1359
Epoch 1000 | loss 0.0821

>>> Training Model-3 for 2000 epochs
Epoch  400 | loss 2.1667
Epoch  800 | loss 0.1722
Epoch 1200 | loss 0.0814
Epoch 1600 | loss 0.0567
Epoch 2000 | loss 0.0436


In [None]:



class GRUNetwork(nn.Module):
    """One-layer GRU followed by a linear layer that scores the next token.

    The network consumes a window of word vectors and predicts the token that
    follows the window from the GRU output at the last time step.
    """

    def __init__(self,
                 input_size,
                 hidden_size,
                 output_size,
                 seq_length=25,
                 learning_rate=1e-3):
        """Create the layers.

        Args:
            input_size: dimensionality of each input word vector.
            hidden_size: number of GRU hidden units.
            output_size: number of output classes (token ids).
            seq_length: window length used by ``sample``.
            learning_rate: stored on the instance only; the optimizer is
                created by the caller.
        """
        super().__init__()
        self.hidden_size  = hidden_size
        self.seq_length   = seq_length
        self.learning_rate= learning_rate

        # one-layer GRU
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)
        self.fc  = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return logits of shape (batch, output_size).

        Args:
            x: tensor of shape (batch, seq_len, input_size).
        """
        out, _ = self.gru(x)
        # take last time-step
        out = self.fc(out[:, -1, :])
        return out

    def sample(self, seed_seq, length, processor, temperature=1.0):
        """Generate ``length`` token ids by repeatedly predicting the next token.

        The last ``seq_length`` ids of ``seed_seq`` form the first window.
        After every draw the window slides by one token. Each window is run
        through ``forward`` from a fresh (zero) hidden state, exactly as during
        training; the window already holds the full context, so carrying the
        hidden state of the previous window over would count it twice.

        Args:
            seed_seq: sequence of token ids (list, tuple or array), ideally of
                length >= ``seq_length``.
            length: number of ids to generate.
            processor: object with ``get_embedding(idx)`` returning the word
                vector of a token id.
            temperature: divides the logits before the softmax; must be > 0.

        Returns:
            list of generated token ids.

        Raises:
            ValueError: if ``temperature`` is not positive or the seed is empty.
        """
        if temperature <= 0:
            raise ValueError("temperature must be positive")
        window = [int(i) for i in seed_seq[-self.seq_length:]]
        if not window:
            raise ValueError("seed_seq must contain at least one token id")

        device = next(self.parameters()).device
        was_training = self.training
        self.eval()
        generated = []
        try:
            with torch.no_grad():        # inference only: no autograd graph
                for _ in range(length):
                    # embed current window -> (1, seq, emb_dim)
                    emb = np.asarray(
                        [processor.get_embedding(i) for i in window],
                        dtype=np.float32,
                    )
                    x = torch.from_numpy(emb).unsqueeze(0).to(device)
                    logits = self(x) / temperature           # (1, output_size)
                    probs  = torch.softmax(logits, dim=-1).squeeze(0)
                    idx    = torch.multinomial(probs, num_samples=1).item()
                    generated.append(idx)
                    # slide
                    window = window[1:] + [idx]
        finally:
            # restore whatever mode the caller had the model in
            self.train(was_training)

        return generated


# --------------------------------------------------
def train(model, optimizer, loss_fn, X, y, epochs=100, batch_size=32, print_every=10):
    """Train ``model`` on ``(X, y)`` with shuffled mini-batches.

    Args:
        model: ``nn.Module`` mapping a batch of ``X`` to predictions.
        optimizer: optimizer already bound to ``model``'s parameters.
        loss_fn: callable ``loss_fn(preds, targets)`` returning a scalar
            tensor that is the mean loss of the batch.
        X, y: tensors with the same first dimension.
        epochs: number of passes over the data.
        batch_size: mini-batch size.
        print_every: print the loss every this many epochs (the first epoch
            is always printed). Values below 1 are treated as 1, so callers
            may pass ``epochs // 5`` even for very short runs.

    Returns:
        list with the sample-weighted mean loss of every epoch.

    Raises:
        ValueError: if the dataset is empty.
    """
    dataset = torch.utils.data.TensorDataset(X, y)
    if len(dataset) == 0:
        raise ValueError("Cannot train on an empty dataset")
    loader  = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
    report_every = max(1, print_every)

    history = []
    for epoch in range(1, epochs+1):
        model.train()
        total_loss = 0.0
        for batch_X, batch_y in loader:
            optimizer.zero_grad()
            preds = model(batch_X)
            loss  = loss_fn(preds, batch_y)
            loss.backward()
            optimizer.step()
            # weight by batch size so a short last batch is not over-counted
            total_loss += loss.item() * batch_X.size(0)

        avg_loss = total_loss / len(dataset)
        history.append(avg_loss)
        if epoch % report_every == 0 or epoch==1:
            print(f"Epoch {epoch:4d}/{epochs} | loss {avg_loss:.4f}")

    return history


if __name__ == "__main__":
    # Train three independent GRU next-word models on the first windows of the
    # corpus (500 / 1000 / 2000 epochs) and print a sample from each.
    text_path    = r"C:\Users\dell\Downloads\Weki.txt"
    embedding_dim= 50
    hidden_size  = 128
    seq_length   = 25
    lr           = 0.01
    n_windows    = 100      # number of training windows kept for speed
    sample_len   = 30       # number of tokens generated after training

    # 1) Load & preprocess
    tp  = TextProcessor(
            embedding_dim=embedding_dim,
            window=5, min_count=1, workers=4
          )
    raw = tp._read_text(text_path)
    tp.fit(raw_text=raw)

    # 2) Encode entire text
    enc = tp.encode(raw)

    # 3) Build sliding-window dataset (only the windows that are actually used)
    n_used = min(n_windows, len(enc) - seq_length)
    if n_used <= 0:
        raise ValueError(
            f"Text has {len(enc)} tokens; more than seq_length={seq_length} are required"
        )
    X_ids = [enc[i:i+seq_length] for i in range(n_used)]
    y_ids = [enc[i+seq_length] for i in range(n_used)]

    # 4) Turn into embeddings + tensors
    # Stack into one float32 array first: building a tensor from nested lists
    # of ndarrays is slow.
    X_emb = np.asarray(
        [[tp.get_embedding(idx) for idx in seq] for seq in X_ids],
        dtype=np.float32,
    )
    X_tensor = torch.from_numpy(X_emb)                           # (N, seq_length, emb_dim)
    y_tensor = torch.tensor(y_ids, dtype=torch.long)             # (N,)

    # 5) Train & sample
    for model_i, epochs in enumerate([500, 1000, 2000], start=1):
        print(f"\n========== MODEL-{model_i}: {epochs} epochs ==========")
        model     = GRUNetwork(
                        input_size   = embedding_dim,
                        hidden_size  = hidden_size,
                        output_size  = tp.vocab_size,
                        seq_length   = seq_length,
                        learning_rate= lr
                    )
        optimizer = optim.Adam(model.parameters(), lr=lr)
        loss_fn   = nn.CrossEntropyLoss()

        # train
        train(model, optimizer, loss_fn,
              X_tensor, y_tensor,
              epochs=epochs,
              batch_size=32,
              print_every=max(1, epochs//5))

        # sample, seeded with the first training window
        generated = model.sample(X_ids[0], sample_len, tp)
        words = " ".join(tp.idx2word[i] for i in generated)
        print(f"\nSAMPLE: {words}\n")




Epoch    1/500 | loss 7.9977
Epoch  100/500 | loss 3.5235
Epoch  200/500 | loss 0.9819
Epoch  300/500 | loss 0.1477
Epoch  400/500 | loss 0.0530
Epoch  500/500 | loss 0.0274

SAMPLE: ai to the capability of the capability of computational systems and develops that studies and use learning and intelligence to take actions that maximize their chances of achieving defined goals such

Epoch    1/1000 | loss 7.9831
Epoch  200/1000 | loss 1.5229
Epoch  400/1000 | loss 0.0536
Epoch  600/1000 | loss 0.0163
Epoch  800/1000 | loss 0.0120
Epoch 1000/1000 | loss 0.0043

SAMPLE: ai to the capability of computational systems to perform tasks typically associated with human intelligence such as learning reasoning problem solving perception and decision making it is a field of research

Epoch    1/2000 | loss 7.9932
Epoch  400/2000 | loss 0.0538
Epoch  800/2000 | loss 0.0078
Epoch 1200/2000 | loss 0.0025
Epoch 1600/2000 | loss 0.0010
Epoch 2000/2000 | loss 0.0006

SAMPLE: ai to the capability of comp

In [None]:



# --------------------------------------------------
# 2) LSTM MODEL
# --------------------------------------------------

class CustomLSTMModel(nn.Module):
    """Embedding -> single-layer LSTM -> linear head over the last time step."""

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, pad_idx):
        """Build the three layers.

        Args:
            vocab_size: number of rows of the embedding table.
            embed_dim: size of each embedding vector (LSTM input size).
            hidden_dim: LSTM hidden size.
            output_dim: number of output logits.
            pad_idx: forwarded to ``nn.Embedding`` as ``padding_idx``.
        """
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_dim,
            padding_idx=pad_idx,
        )
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Map a batch of token-id sequences to logits.

        ``x`` is indexed as (batch, time) because the LSTM is batch-first;
        only the LSTM output at the final time step feeds the linear layer.
        """
        sequence_out, _state = self.lstm(self.embedding(x))
        final_step = sequence_out[:, -1]
        return self.fc(final_step)

# --------------------------------------------------
# 3) TRAINING / PREDICTION WRAPPER (ModAL style)
# --------------------------------------------------

class CustomModALWrapper:
    """Estimator-style wrapper (``fit`` / ``predict`` / ``predict_proba``)
    around a PyTorch classifier that takes integer token-id batches.

    Attributes:
        device: device the model lives on.
        model: the wrapped ``nn.Module``.
        loss_history: mean loss per epoch of the most recent ``fit`` call.
    """

    def __init__(self, model, device=None):
        """Move ``model`` to ``device`` (CUDA when available, else CPU)."""
        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = model.to(self.device)
        self.loss_history = []

    def _logits(self, X):
        """Run the model in eval mode, without gradients, on id batch ``X``."""
        self.model.eval()
        Xb = torch.as_tensor(X, dtype=torch.long).to(self.device)
        with torch.no_grad():
            return self.model(Xb)

    def predict(self, X):
        """Return the arg-max class of every row of ``X`` as a NumPy array."""
        return self._logits(X).argmax(dim=1).cpu().numpy()

    def predict_proba(self, X):
        """Return softmax class probabilities, shape (n_samples, n_classes)."""
        return torch.softmax(self._logits(X), dim=1).cpu().numpy()

    def fit(self, X, y, epochs=3, batch_size=32, lr=2e-5):
        """Train the wrapped model with AdamW and cross-entropy loss.

        A new optimizer is created on every call, so calling ``fit`` again
        continues from the current weights with fresh optimizer state.

        Args:
            X: integer array-like of token ids, one row per sample.
            y: integer array-like of target class ids.
            epochs: number of passes over the data.
            batch_size: mini-batch size (batches are shuffled).
            lr: AdamW learning rate.

        Raises:
            ValueError: if ``X`` is empty.
        """
        Xt = torch.as_tensor(X, dtype=torch.long)
        yt = torch.as_tensor(y, dtype=torch.long)
        if len(Xt) == 0:
            raise ValueError("Cannot fit on an empty dataset")
        ds = TensorDataset(Xt, yt)
        loader = DataLoader(ds, batch_size=batch_size, shuffle=True)
        opt = optim.AdamW(self.model.parameters(), lr=lr)
        crit = nn.CrossEntropyLoss()

        self.loss_history = []
        self.model.train()
        for ep in range(1, epochs + 1):
            total_loss = 0.0
            for xb, yb in loader:
                xb, yb = xb.to(self.device), yb.to(self.device)
                opt.zero_grad()
                logits = self.model(xb)
                loss = crit(logits, yb)
                loss.backward()
                opt.step()
                # weight by batch size: the last batch is usually smaller and
                # must not count as much as a full one
                total_loss += loss.item() * xb.size(0)
            avg_loss = total_loss / len(ds)
            self.loss_history.append(avg_loss)
            print(f"[Epoch {ep:3d}] avg loss = {avg_loss:.4f}")

# --------------------------------------------------
# 4) MAIN SCRIPT
# --------------------------------------------------

if __name__ == '__main__':
    # Train an LSTM next-word model whose embedding layer is initialised from
    # the Word2Vec vectors learned by TextProcessor.

    # Set parameters
    path = r"C:\Users\dell\Downloads\Weki.txt"  # adjust as necessary
    seq_len = 25
    embed_dim = 50

    # 1) Load & preprocess text
    pre = TextProcessor(embedding_dim=embed_dim, window=5, min_count=1, workers=4)
    raw_text = pre._read_text(path)
    pre.fit(raw_text=raw_text)

    # 2) Encode entire raw text into indices
    enc = pre.encode(raw_text)

    # 3) Create sequences and targets: X = seq_len words, y = next word
    X, y = [], []
    for i in range(len(enc) - seq_len):
        X.append(enc[i:i+seq_len])
        y.append(enc[i+seq_len])

    X = np.array(X)
    y = np.array(y)

    # 4) Initialize model
    # The processor may not define a padding token at all. Every window has
    # the same length, so no padding is needed and padding_idx may stay None.
    pad_token = getattr(pre, 'pad_token', None)
    pad_idx = pre.word2idx.get(pad_token) if pad_token is not None else None

    model = CustomLSTMModel(
        vocab_size=pre.vocab_size,
        embed_dim=embed_dim,
        hidden_dim=128,
        output_dim=pre.vocab_size,
        pad_idx=pad_idx
    )

    # 5) Initialize embedding weights with Word2Vec vectors where possible
    embedding_weights = np.zeros((pre.vocab_size, embed_dim), dtype=np.float32)
    for word, idx in pre.word2idx.items():
        if word in pre.w2v.wv:
            embedding_weights[idx] = pre.w2v.wv[word]
        elif word == pad_token:
            # padding row stays all-zero
            continue
        else:
            # Random init for entries without a Word2Vec vector (e.g. UNK)
            embedding_weights[idx] = np.random.normal(scale=0.6, size=embed_dim)

    model.embedding.weight.data.copy_(torch.from_numpy(embedding_weights))

    # 6) Train model
    learner = CustomModALWrapper(model)
    learner.fit(X[:100], y[:100], epochs=500, batch_size=64, lr=0.01)


[Epoch   1] avg loss = 7.9859
[Epoch   2] avg loss = 6.6221
[Epoch   3] avg loss = 4.7685
[Epoch   4] avg loss = 4.4339
[Epoch   5] avg loss = 4.3325
[Epoch   6] avg loss = 4.2960
[Epoch   7] avg loss = 4.2680
[Epoch   8] avg loss = 4.3071
[Epoch   9] avg loss = 4.2303
[Epoch  10] avg loss = 4.2165
[Epoch  11] avg loss = 4.2229
[Epoch  12] avg loss = 4.1629
[Epoch  13] avg loss = 4.0797
[Epoch  14] avg loss = 4.0808
[Epoch  15] avg loss = 4.0407
[Epoch  16] avg loss = 3.9840
[Epoch  17] avg loss = 3.9519
[Epoch  18] avg loss = 3.8886
[Epoch  19] avg loss = 3.8092
[Epoch  20] avg loss = 3.7851
[Epoch  21] avg loss = 3.7200
[Epoch  22] avg loss = 3.7207
[Epoch  23] avg loss = 3.6306
[Epoch  24] avg loss = 3.6179
[Epoch  25] avg loss = 3.5888
[Epoch  26] avg loss = 3.5121
[Epoch  27] avg loss = 3.4585
[Epoch  28] avg loss = 3.4628
[Epoch  29] avg loss = 3.3722
[Epoch  30] avg loss = 3.3207
[Epoch  31] avg loss = 3.2779
[Epoch  32] avg loss = 3.1873
[Epoch  33] avg loss = 3.1512
[Epoch  34

In [33]:
# Continues training the existing `learner`: its model keeps the weights from
# the earlier fit call(s), while fit() creates a new AdamW optimizer each time.
learner.fit(X[:100], y[:100], epochs=1000, batch_size=64, lr=0.01)

[Epoch   1] avg loss = 0.0094
[Epoch   2] avg loss = 0.0018
[Epoch   3] avg loss = 0.0060
[Epoch   4] avg loss = 0.0707
[Epoch   5] avg loss = 0.0038
[Epoch   6] avg loss = 0.0073
[Epoch   7] avg loss = 0.0056
[Epoch   8] avg loss = 0.0311
[Epoch   9] avg loss = 0.0104
[Epoch  10] avg loss = 0.0060
[Epoch  11] avg loss = 0.0075
[Epoch  12] avg loss = 0.0038
[Epoch  13] avg loss = 0.0030
[Epoch  14] avg loss = 0.0023
[Epoch  15] avg loss = 0.0020
[Epoch  16] avg loss = 0.0017
[Epoch  17] avg loss = 0.0015
[Epoch  18] avg loss = 0.0013
[Epoch  19] avg loss = 0.0009
[Epoch  20] avg loss = 0.0008
[Epoch  21] avg loss = 0.0007
[Epoch  22] avg loss = 0.0006
[Epoch  23] avg loss = 0.0005
[Epoch  24] avg loss = 0.0005
[Epoch  25] avg loss = 0.0004
[Epoch  26] avg loss = 0.0004
[Epoch  27] avg loss = 0.0004
[Epoch  28] avg loss = 0.0003
[Epoch  29] avg loss = 0.0003
[Epoch  30] avg loss = 0.0003
[Epoch  31] avg loss = 0.0003
[Epoch  32] avg loss = 0.0003
[Epoch  33] avg loss = 0.0003
[Epoch  34

In [34]:
# Continues training the existing `learner` once more: the model weights carry
# over from the earlier fit calls, while fit() creates a new AdamW optimizer.
learner.fit(X[:100], y[:100], epochs=2000, batch_size=64, lr=0.01)

[Epoch   1] avg loss = 0.0000
[Epoch   2] avg loss = 0.0000
[Epoch   3] avg loss = 0.0002
[Epoch   4] avg loss = 0.0001
[Epoch   5] avg loss = 0.0001
[Epoch   6] avg loss = 0.0011
[Epoch   7] avg loss = 0.0001
[Epoch   8] avg loss = 0.0003
[Epoch   9] avg loss = 0.0006
[Epoch  10] avg loss = 0.0001
[Epoch  11] avg loss = 0.0001
[Epoch  12] avg loss = 0.0002
[Epoch  13] avg loss = 0.0001
[Epoch  14] avg loss = 0.0001
[Epoch  15] avg loss = 0.0001
[Epoch  16] avg loss = 0.0001
[Epoch  17] avg loss = 0.0000
[Epoch  18] avg loss = 0.0000
[Epoch  19] avg loss = 0.0000
[Epoch  20] avg loss = 0.0000
[Epoch  21] avg loss = 0.0000
[Epoch  22] avg loss = 0.0000
[Epoch  23] avg loss = 0.0000
[Epoch  24] avg loss = 0.0000
[Epoch  25] avg loss = 0.0000
[Epoch  26] avg loss = 0.0000
[Epoch  27] avg loss = 0.0000
[Epoch  28] avg loss = 0.0000
[Epoch  29] avg loss = 0.0000
[Epoch  30] avg loss = 0.0000
[Epoch  31] avg loss = 0.0000
[Epoch  32] avg loss = 0.0000
[Epoch  33] avg loss = 0.0000
[Epoch  34