In [115]:
# collecting all data in a single file to train sentencepiece
import os
import re
import sentencepiece as spm


def get_all_text(parent_folder, filename = '.temp', savefile=False, poets = ['mirza-ghalib'], lang_folders = ['ur'], use_poet_folders=False):
    output_file = open(filename, 'w', encoding='utf-8')
    for poet_folder in os.scandir(parent_folder):
        if (use_poet_folders and poet_folder.name not in poets) or not poet_folder.is_dir() or poet_folder.name.startswith('.'):
            continue
        for lang_folder in os.scandir(poet_folder.path):
            if lang_folder.name not in lang_folders:
                continue
            for file in os.scandir(lang_folder.path):
                with open(file, 'r', encoding='utf-8') as f:
                    try:
                        text = f.read()
                        output_file.write(text + '\n')
                    except:
                        continue
    output_file.close()
    text = open(filename, 'r', encoding='utf-8').read()
    if not savefile:
        os.remove(filename)
    return text

                
all_text = get_all_text('dataset/dataset', 'all_text.txt', savefile=True)
print(all_text)
all_text = re.sub(r'\n', '<n>', all_text)
print(all_text.count('<n>'))
spm.SentencePieceTrainer.Train('--input=all_text.txt --model_prefix=urdu_tokenizer --vocab_size=5000 --model_type=bpe --user_defined_symbols=\n')


آنکھ سے دور نہ ہو دل سے اتر جائے گا 
وقت کا کیا ہے گزرتا ہے گزر جائے گا 
اتنا مانوس نہ ہو خلوت غم سے اپنی 
تو کبھی خود کو بھی دیکھے گا تو ڈر جائے گا 
ڈوبتے ڈوبتے کشتی کو اچھالا دے دوں 
میں نہیں کوئی تو ساحل پہ اتر جائے گا 
زندگی تیری عطا ہے تو یہ جانے والا 
تیری بخشش تری دہلیز پہ دھر جائے گا 
ضبط لازم ہے مگر دکھ ہے قیامت کا فرازؔ 
ظالم اب کے بھی نہ روئے گا تو مر جائے گا 

عاشقی میں میرؔ جیسے خواب مت دیکھا کرو 
باؤلے ہو جاؤ گے مہتاب مت دیکھا کرو 
جستہ جستہ پڑھ لیا کرنا مضامین وفا 
پر کتاب عشق کا ہر باب مت دیکھا کرو 
اس تماشے میں الٹ جاتی ہیں اکثر کشتیاں 
ڈوبنے والوں کو زیر آب مت دیکھا کرو 
مے کدے میں کیا تکلف مے کشی میں کیا حجاب 
بزم ساقی میں ادب آداب مت دیکھا کرو 
ہم سے درویشوں کے گھر آؤ تو یاروں کی طرح 
ہر جگہ خس خانہ و برفاب مت دیکھا کرو 
مانگے تانگے کی قبائیں دیر تک رہتی نہیں 
یار لوگوں کے لقب القاب مت دیکھا کرو 
تشنگی میں لب بھگو لینا بھی کافی ہے فرازؔ 
جام میں صہبا ہے یا زہراب مت دیکھا کرو 

اب اور کیا کسی سے مراسم بڑھائیں ہم 
یہ بھی بہت ہے تجھ کو اگر بھول جائیں ہم 
صحرائے زندگی 

In [116]:
import cupy as np

# check tokenizer
mushaira_tokenizer = spm.SentencePieceProcessor(model_file='urdu_tokenizer.model')
mirza_ghalib = get_all_text('dataset/dataset', poets=['mirza-ghalib'], lang_folders=['ur'], use_poet_folders=True)
# print(mirza_ghalib)
tokens = mushaira_tokenizer.encode(mirza_ghalib[:3000], out_type=int)
print(tokens)
decoded = mushaira_tokenizer.decode(tokens)
print(decoded)


[4955, 3, 4979, 33, 243, 111, 25, 989, 53, 15, 4955, 3, 2465, 193, 1786, 778, 53, 15, 4955, 3, 1396, 56, 46, 1701, 801, 1485, 35, 1982, 4955, 3, 3261, 4958, 101, 719, 4991, 2059, 53, 15, 4955, 3, 4659, 4958, 3458, 15, 312, 466, 28, 90, 25, 4955, 3, 4958, 194, 33, 1668, 91, 1172, 53, 15, 4955, 3, 711, 28, 1917, 15, 3191, 3149, 865, 4955, 3, 4980, 214, 21, 2534, 29, 1102, 53, 15, 4955, 3, 4212, 28, 397, 873, 846, 443, 3747, 4955, 3, 4985, 42, 158, 1677, 478, 53, 15, 4955, 3, 193, 4971, 41, 266, 54, 15, 884, 36, 1924, 4955, 3, 3856, 249, 884, 4560, 53, 15, 4955, 3, 2868, 78, 618, 377, 2224, 29, 2455, 15, 502, 4955, 3, 4965, 42, 4957, 618, 41, 110, 1153, 53, 15, 4955, 3, 3, 4979, 1598, 54, 368, 49, 158, 29, 33, 1125, 21, 53, 4955, 3, 3036, 1009, 2046, 3248, 68, 847, 21, 53, 4955, 3, 4993, 4983, 4982, 28, 141, 1426, 110, 694, 186, 747, 21, 53, 4955, 3, 4960, 128, 19, 51, 577, 131, 68, 735, 33, 847, 21, 53, 4955, 3, 3499, 235, 46, 1807, 4153, 322, 1678, 4955, 3, 4994, 98, 49, 35, 253, 29, 32

In [119]:
# code credits: https://gist.github.com/karpathy/d4dee566867f8291f086
class VanillaRNN():
    def __init__(self, vocab_size, hidden_size, seq_length, learning_rate=0.01):
        self.Wxh = np.random.randn(hidden_size, vocab_size) * 0.01
        self.Whh = np.random.randn(hidden_size, hidden_size) * 0.01
        self.Why = np.random.randn(vocab_size, hidden_size) * 0.01
        self.bh = np.zeros((hidden_size, 1))
        self.by = np.zeros((vocab_size, 1))
        self.hidden_state = np.zeros((hidden_size, 1))
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.lr = learning_rate

        # memory variables for Adagrad
        self.mWxh, self.mWhh, self.mWhy = np.zeros_like(self.Wxh), np.zeros_like(self.Whh), np.zeros_like(self.Why)
        self.mbh, self.mby = np.zeros_like(self.bh), np.zeros_like(self.by)

        self.smooth_loss = -np.log(1.0/vocab_size)*seq_length

    def _sample(self, p, temperature=1.0):
        # helper function to sample an index from a probability array
        # sampling helps to diversify outputs
        p = np.log(p) / temperature
        p = np.exp(p) / np.sum(np.exp(p))
        return np.argmax(np.random.multinomial(1, p, 1))
    
    def run(self, inputs, targets, iteration):
        """
        inputs,targets are both list of integers.
        hprev is Hx1 array of initial hidden state
        returns the loss, gradients on model parameters, and last hidden state
        """
        xs, hs, ys, ps = {}, {}, {}, {}
        hs[-1] = np.copy(self.hidden_state)
        loss = 0
        # forward pass
        for t in range(len(inputs)):
            xs[t] = np.zeros((self.vocab_size,1)) # encode in 1-of-k representation
            xs[t][inputs[t]] = 1
            hs[t] = np.tanh(np.dot(self.Wxh, xs[t]) + np.dot(self.Whh, hs[t-1]) + self.bh) # hidden state
            ys[t] = np.dot(self.Why, hs[t]) + self.by # unnormalized log probabilities for next chars
            ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) # probabilities for next chars
            loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy loss)
        # backward pass: compute gradients going backwards
        dWxh, dWhh, dWhy = np.zeros_like(self.Wxh), np.zeros_like(self.Whh), np.zeros_like(self.Why)
        dbh, dby = np.zeros_like(self.bh), np.zeros_like(self.by)
        dhnext = np.zeros_like(hs[0])
        for t in reversed(range(len(inputs))):
            dy = np.copy(ps[t])
            dy[targets[t]] -= 1 # backprop into y
            dWhy += np.dot(dy, hs[t].T)
            dby += dy
            dh = np.dot(self.Why.T, dy) + dhnext # backprop into h
            dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity
            dbh += dhraw
            dWxh += np.dot(dhraw, xs[t].T)
            dWhh += np.dot(dhraw, hs[t-1].T)
            dhnext = np.dot(self.Whh.T, dhraw)
        
        for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
            np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients

        self.hidden_state = hs[len(inputs)-1]

        self.smooth_loss = self.smooth_loss * 0.999 + loss * 0.001
        
        # perform parameter update with Adagrad
        for param, dparam, mem in zip([self.Wxh, self.Whh, self.Why, self.bh, self.by], 
                                        [dWxh, dWhh, dWhy, dbh, dby], 
                                        [self.mWxh, self.mWhh, self.mWhy, self.mbh, self.mby]):
            mem += dparam * dparam
            param += -self.lr * dparam / np.sqrt(mem + 1e-8) # adagrad update
        return loss

    def sample(self, seed_ix, n):
        """ 
        sample a sequence of integers from the model 
        h is memory state, seed_ix is seed letter for first time step
        """
        x = np.zeros((self.vocab_size, 1))
        x[seed_ix] = 1
        ixes = []
        for t in range(n):
            self.hidden_state = np.tanh(np.dot(self.Wxh, x) + np.dot(self.Whh, self.hidden_state) + self.bh)
            y = np.dot(self.Why, self.hidden_state) + self.by
            p = np.exp(y) / np.sum(np.exp(y))
            ix = np.random.choice(range(self.vocab_size), p=p.ravel(), size=1)
            x = np.zeros((self.vocab_size, 1))
            x[ix] = 1
            ixes.append(ix)
        return ixes

        
        


In [126]:
seq_length = 25
model = VanillaRNN(mushaira_tokenizer.get_piece_size(), hidden_size=256, seq_length=seq_length, learning_rate=0.01)

epochs = 0
loss = 0
while True:
    for peom in mirza_ghalib.split('\n\n'):
        i = 0
        tokenized_peom = mushaira_tokenizer.encode(peom, out_type=int)
        while (i + 2) < len(tokenized_peom): # we will use <unk> token to repr end of poem
            inputs = np.zeros((seq_length, 1), dtype=int)
            targets = np.zeros((seq_length, 1), dtype=int)
            for j in range(seq_length):
                inputs[j] = tokenized_peom[i+j] if (i+j) < len(tokenized_peom) else mushaira_tokenizer.unk_id()
                targets[j] = tokenized_peom[i+j+1] if (i+j+1) < len(tokenized_peom) else mushaira_tokenizer.unk_id()
            loss = model.run(inputs, targets, i)
            i += seq_length

    
    print('Epochs: ', epochs)
    print('Loss: ', loss)
    sample_ix = model.sample(tokenized_peom[0], 200)
    print(sample_ix)
    txt = mushaira_tokenizer.decode([int(i) for i in sample_ix])
    print('----\n %s \n----' % (txt, ))
    
    epochs += 1
        
    

Epochs:  0
Loss:  [66.80544562]
[array([0]), array([0]), array([824]), array([0]), array([0]), array([15]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([1924]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([15]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), 

  txt = mushaira_tokenizer.decode([int(i) for i in sample_ix])


Epochs:  1
Loss:  [67.42019591]
[array([4165]), array([641]), array([1784]), array([24]), array([3610]), array([29]), array([850]), array([4955]), array([3]), array([3034]), array([107]), array([1675]), array([1510]), array([179]), array([1416]), array([20]), array([2840]), array([322]), array([15]), array([162]), array([15]), array([4955]), array([3]), array([4212]), array([152]), array([850]), array([322]), array([1037]), array([4994]), array([21]), array([54]), array([4955]), array([3]), array([598]), array([71]), array([4983]), array([363]), array([2228]), array([188]), array([719]), array([1027]), array([4962]), array([517]), array([4955]), array([3]), array([4975]), array([467]), array([20]), array([1878]), array([4962]), array([416]), array([310]), array([1717]), array([15]), array([1780]), array([53]), array([4955]), array([3]), array([72]), array([1405]), array([1267]), array([246]), array([58]), array([1018]), array([46]), array([1226]), array([35]), array([4955]), array([3])

KeyboardInterrupt: 

In [125]:
import cupy as np
np.cuda.Device(0).use()


<CUDA Device 0>