In [1]:
from collections import Counter
import nltk
import pandas as pd
import numpy as np
import random

# Urdu to roman phonetic dictionary

##### Mapping each Urdu character to its Roman (phonetic) equivalent

In [2]:
# Transliteration table: Urdu/Arabic-script letter -> Latin (Roman Urdu) string.
# urdu_to_roman() applies every entry to each word with str.replace().
# All values are plain Latin letters, so one replacement never produces text
# that another key could match; the order of the entries does not matter.
# Several letters share the same Latin value (e.g. both "ت" and "ٹ" give "t"),
# so the mapping cannot be reversed.
# NOTE(review): "گ" maps to "gh" (same as "غ") and "ق" maps to "k" (same as
# "ک") - presumably "g" and "q" were intended; confirm before changing, since
# it alters the generated Roman output.
urdu_dictionary = { 
    "آ" : "aa",
    "ا" : "a",
    "ب" : "b",
    "پ" : "p",
    "ت" : "t",
    "ٹ" : "t",
    "ث" : "sa",
    "ج" : "j",
    "چ" : "ch",
    "ح" : "ha",
    "خ" : "kh",
    "د" : "d",
    "ڈ" : "da",
    "ذ" : "za",
    "ر" : "r",
    "ڑ" : "ra",
    "ز" : "za",
    "ژ" : "ze",
    "س" : "s",
    "ش" : "sh",
    "ص" : "sa",
    "ض" : "zu",
    "ط" : "ta",
    "ظ" : "za",
    "ع" : "a",
    "غ" : "gh",
    "ف" : "f",
    "ق" : "k",
    "ک" : "k",
    "گ" : "gh",
    "ل" : "l",
    "م" : "m",
    "ن" : "n",
    "ں" : "n",
    "و" : "w",
    "ہ" : "ha",
    "ی" : "y",
    "ے" : "ay",
    "ء" : "a",
    "ئ" : "i",
    "ھ" : "h",
    # The remaining entries look like Arabic-keyboard or composed variants of
    # letters already listed above - TODO confirm against the corpus.
    "أ"  : "a",
    "ؤ"  : "ao",
    "ه"  : "ha",
    "ي"  : "y",
    "ۂ"  : "h",
    "ۓ"  : "ya",
}

# Converting urdu to roman urdu

###### A function that takes an Urdu string as input, maps each Urdu character to its Roman equivalent using the dictionary, and returns the resulting Roman Urdu string

In [3]:
def urdu_to_roman(Urdu_string):
    """Transliterate an Urdu string into Roman Urdu.

    The input is split on whitespace. In every word, each key of
    ``urdu_dictionary`` is replaced by its Latin value, and the converted
    words are joined back together with single spaces. Characters that are
    not keys of the dictionary are left unchanged.

    Args:
        Urdu_string: text to transliterate.

    Returns:
        The transliterated text, words separated by single spaces.
    """
    def _romanise(token):
        # Apply every dictionary entry in turn to this one word.
        for urdu_char, latin in urdu_dictionary.items():
            token = token.replace(urdu_char, latin)
        return token

    return ' '.join(map(_romanise, Urdu_string.split()))

# Preprocessing

###### First of all, I read the poetry file line by line, then send each line to my preprocessing function, which removes punctuation, removes double spaces, joins some words together and separates others, and returns a plain Urdu string that is stored in a list of sentences. Further, all sentences with a length of 3 or less are removed, as they are only names or poetry titles. I also add sentence start and end tags to each sentence and remove all empty lines.

In [4]:
def Preprocessing(sentence):
    """Strip punctuation from one line and wrap it in sentence tags.

    Whitespace-separated tokens that are exactly one punctuation mark are
    dropped; punctuation characters inside the remaining tokens are removed.
    Each surviving token is followed by one space, so a token made up only
    of punctuation characters leaves an extra space behind (callers that
    need single spacing must normalise it themselves).

    Args:
        sentence: a raw line of text.

    Returns:
        ``'<s> ' + cleaned tokens (each followed by a space) + '</s>'``.
    """
    punct = (
        '?', ':', '؟', '،', '*', "'", '!', '`', '‘', '’',
        '"', '%', '.', '_', '“', 'ـ', '۔',
    )

    # Pass 1: discard tokens that are a lone punctuation mark.
    kept_tokens = [token for token in sentence.split() if token not in punct]

    # Pass 2: remove punctuation characters embedded in the remaining tokens.
    pieces = [' ']
    for token in kept_tokens:
        pieces.append(''.join(ch for ch in token if ch not in punct))
        pieces.append(' ')

    return '<s>' + ''.join(pieces) + '</s>'

In [5]:
# Read the raw poetry file; each line of the file is treated as one verse.
file = 'poetry.txt'
with open(file, 'r', encoding="utf-8") as text:
    raw_lines = text.readlines()

# Clean every line, collapse repeated whitespace, and keep only lines with
# more than 3 tokens. The <s> and </s> tags count as tokens, so blank lines
# ('<s> </s>') and one-word lines (names / titles) are dropped here.
# The filtered list is built in a single pass: the previous version called
# sentences.remove() while iterating over the same list, which skips the
# element after every removal and let runs of short lines slip through.
sentences = []
for line in raw_lines:
    tagged = ' '.join(Preprocessing(line).split())
    if len(tagged.split()) > 3:
        sentences.append(tagged)

# Generating corpus

##### I am building a simple corpus which has all the sentences stored as one text, and I am calculating the total length of the corpus excluding the sentence start and end tags

In [6]:
# One long string containing every tagged sentence.
corpus = ' '.join(sentences)

# Corpus lengths are measured in word tokens. len(corpus) would count
# characters, which is not a valid denominator for word probabilities.
corpus_tokens = corpus.split()
tag_count = corpus_tokens.count("</s>") + corpus_tokens.count("<s>")
corpus_length = len(corpus_tokens)              # all tokens, tags included
corpus_length_uni = corpus_length - tag_count   # word tokens only

# Unigram

##### In the unigram model I split the whole corpus and turn it into a dictionary that stores every unique word against its frequency. Next I remove the sentence start and end tags because they don't contribute to word generation, find all the unique words and calculate the size of the vocabulary. Finally, the count of each word is divided by the total length of the corpus (tags excluded) and stored in a probability dictionary against that word.

In [7]:
# Word frequencies over the whole corpus.
corpus_words = corpus.split()
unigram_freq_dict = Counter(corpus_words)

# The sentence tags are not real words. pop() with a default avoids a
# KeyError when the corpus is empty and the tags never occur.
unigram_freq_dict.pop("</s>", None)
unigram_freq_dict.pop("<s>", None)

no_of_unique_words = len(unigram_freq_dict)

# Total number of word tokens (tags excluded). Deriving it from the counter
# itself guarantees that the unigram probabilities sum to 1.
denominator = sum(unigram_freq_dict.values())

# Maximum-likelihood unigram probability of every word.
unigram_prob_dict = {
    word: count / denominator
    for word, count in unigram_freq_dict.items()
}

In [8]:
# Vocabulary as a list, in the order the words were first counted.
unique_words = list(unigram_freq_dict)

# Bigram

##### To build the bigrams I take every 2 consecutively occurring words, turn them into a tuple and store the tuples in a list. Next I use the conditional frequency distribution from the nltk library to count the occurrences of every unique tuple.

In [9]:
start = '<s>'
end = '</s>'

# Every pair of adjacent tokens in every sentence, except pairs that begin
# with the start tag or finish with the end tag.
bigram = [
    (left, right)
    for tokens in (line.split() for line in sentences)
    for left, right in zip(tokens, tokens[1:])
    if left != start and right != end
]

# Counts are filed under the n-gram order (here 2) as the condition.
bigram_freq = nltk.ConditionalFreqDist()
for pair in bigram:
    bigram_freq[len(pair)][pair] += 1

# Backward Bigram

##### It's the same as the bigram model, except that I swap the two words of each tuple to create backward bigrams

In [10]:
start = '<s>'
end = '</s>'

# Same adjacent pairs as the forward bigrams, stored reversed as
# (following word, preceding word). Pairs touching the start tag on the
# left or the end tag on the right are skipped.
Back_bigram = [
    (following, preceding)
    for tokens in (line.split() for line in sentences)
    for preceding, following in zip(tokens, tokens[1:])
    if preceding != start and following != end
]

# Counts are filed under the n-gram order (here 2) as the condition.
Back_bigram_freq = nltk.ConditionalFreqDist()
for pair in Back_bigram:
    Back_bigram_freq[len(pair)][pair] += 1

# Trigram

#### To build the trigrams I take every 3 consecutively occurring words, turn them into a tuple and store the tuples in a list. Next I use the conditional frequency distribution from the nltk library to count the occurrences of every unique tuple.

In [11]:
start = '<s>'
end = '</s>'

# Every run of three adjacent tokens in every sentence, except runs that
# begin with the start tag or finish with the end tag.
Trigram = [
    (first, second, third)
    for tokens in (line.split() for line in sentences)
    for first, second, third in zip(tokens, tokens[1:], tokens[2:])
    if first != start and third != end
]

# Counts are filed under the n-gram order (here 3) as the condition.
trigram_freq = nltk.ConditionalFreqDist()
for triple in Trigram:
    trigram_freq[len(triple)][triple] += 1

# Creating generation models

The bigram generation function receives a word along with the list of available bigrams, their frequencies, the unigram frequencies and the total vocabulary count. For every available bigram that starts with the received word, I get the number of occurrences of that bigram and compute its probability using Laplace (add-one) smoothing. The probabilities of all matching bigrams are stored in a dictionary; I take the entry with the maximum value and return its second word as the next word.

In [12]:
def bigram_gen(bigram,start_word,bigram_freq,unigram_freq_dict,no_of_unique_words):
    """Return the most probable word to follow ``start_word``.

    Every bigram whose first word equals ``start_word`` is scored with
    add-one smoothing:
    ``(count(bigram) + 1) / (count(first word) + no_of_unique_words)``.
    The second word of the best-scoring bigram is returned. On a tie, the
    bigram that appears first in ``bigram`` wins.

    Args:
        bigram: list of ``(first, second)`` tuples.
        start_word: word whose successor is wanted.
        bigram_freq: mapping indexed as ``bigram_freq[2][pair]`` giving the
            count of each pair.
        unigram_freq_dict: mapping from word to its count (0 if missing).
        no_of_unique_words: vocabulary size used for smoothing.

    Raises:
        ValueError: if no bigram starts with ``start_word``.
    """
    scores = {}
    for pair in bigram:
        if pair[0] != start_word:
            continue
        smoothed_count = bigram_freq[len(pair)][pair] + 1
        context_total = unigram_freq_dict.get(pair[0], 0) + no_of_unique_words
        scores[pair] = smoothed_count / context_total

    best_pair = max(scores, key=scores.get)
    return best_pair[1]

It works the same way as the forward bigram function, but on the backward bigrams, so it returns the most likely previous word.

In [13]:
def Back_bigram_gen(Back_bigram,start_word,Back_bigram_freq,unigram_freq_dict,no_of_unique_words):
    """Return the most probable partner of ``start_word`` among backward bigrams.

    Every tuple in ``Back_bigram`` whose first element equals ``start_word``
    is scored with add-one smoothing:
    ``(count(tuple) + 1) / (count(first element) + no_of_unique_words)``.
    The second element of the best-scoring tuple is returned. On a tie, the
    tuple that appears first in ``Back_bigram`` wins.

    Args:
        Back_bigram: list of 2-tuples.
        start_word: word to look up as the first element.
        Back_bigram_freq: mapping indexed as ``Back_bigram_freq[2][pair]``
            giving the count of each tuple.
        unigram_freq_dict: mapping from word to its count (0 if missing).
        no_of_unique_words: vocabulary size used for smoothing.

    Raises:
        ValueError: if no tuple starts with ``start_word``.
    """
    candidates = {
        pair: (Back_bigram_freq[len(pair)][pair] + 1)
              / (unigram_freq_dict.get(pair[0], 0) + no_of_unique_words)
        for pair in Back_bigram
        if pair[0] == start_word
    }
    winner = max(candidates, key=candidates.get)
    return winner[1]

In the trigram model I am also making use of Laplace smoothing. Given a word, the probability of every trigram that starts with it is calculated, and the function returns the two words that follow it in the most probable trigram.

In [14]:
def trigram_gen(Trigram,bigram_freq,start_word,trigram_freq,unigram_freq_dict,no_of_unique_words):
    """Return the two most probable words to follow ``start_word``.

    Every trigram whose first word equals ``start_word`` is scored with
    add-one smoothing:
    ``(count(trigram) + 1) / (count(first two words) + no_of_unique_words)``.
    The second and third words of the best-scoring trigram are returned.
    On a tie, the trigram that appears first in ``Trigram`` wins.

    Args:
        Trigram: list of ``(first, second, third)`` tuples.
        bigram_freq: mapping indexed as ``bigram_freq[2][(first, second)]``.
        start_word: word whose continuation is wanted.
        trigram_freq: mapping indexed as ``trigram_freq[3][triple]``.
        unigram_freq_dict: not used by this function; kept for the
            existing call signature.
        no_of_unique_words: vocabulary size used for smoothing.

    Returns:
        A ``(second_word, third_word)`` tuple.

    Raises:
        ValueError: if no trigram starts with ``start_word``.
    """
    scores = {}
    for triple in Trigram:
        if triple[0] != start_word:
            continue
        smoothed_count = trigram_freq[len(triple)][triple] + 1
        context_total = bigram_freq[2][(triple[0], triple[1])] + no_of_unique_words
        scores[triple] = smoothed_count / context_total

    best = max(scores, key=scores.get)
    return best[1], best[2]

I am using all the different models to generate poetry. What I am doing is picking a word at random, sending it to the model I want to use, generating its next or previous words, appending them to a verse and appending the verse to a Ghazal list. After the Ghazal is generated in Urdu, it is sent to the Urdu-to-Roman converter and displayed.
To improve the accuracy of the models, I have scraped some other poetry datasets, about 1.5 MB of poetry, whereas the given dataset was just 150 KB. That has improved my models, increased the number of words and helps with rhyming as well.

# Generating Poetry using Bi directional bigram

In the bi-directional bigram model, I simply pick a word for the middle of the verse and send it first to the forward bigram model and then to the backward bigram model. These return the next and the previous word respectively; I send those words back to their respective models and repeat this to generate a whole verse.

In [15]:
# Generate 14 verses by growing each one outwards from a random middle word:
# even steps extend it to the right with the forward bigram model, odd steps
# extend it to the left with the backward bigram model.
Ghazal = []

for j in range(0, 14):
    # random.choice samples over the whole bigram list. The previous
    # random.randint(0, len(unique_words)) was inclusive at the top end and
    # sized from a different list, so it could index past the end of bigram.
    middle_word = random.choice(bigram)[0]
    middle_word_fwd = middle_word
    middle_word_bck = middle_word
    new_line = [middle_word]

    for i in range(0, 8):
        if i % 2 == 0:
            try:
                next_word = bigram_gen(bigram, middle_word_fwd, bigram_freq, unigram_freq_dict, no_of_unique_words)
            except ValueError:
                # No bigram starts with this word, so the verse cannot grow
                # any further to the right; the left side may still grow.
                continue
            middle_word_fwd = next_word
            new_line = new_line + [next_word]
        else:
            try:
                previou_word = Back_bigram_gen(Back_bigram, middle_word_bck, Back_bigram_freq, unigram_freq_dict, no_of_unique_words)
            except ValueError:
                # No backward bigram starts with this word; stop growing left.
                continue
            middle_word_bck = previou_word
            new_line = [previou_word] + new_line

    poet_line = ' '.join(new_line)
    Ghazal.append(poet_line)
    print(poet_line)
    # Blank line after every second verse to form couplets.
    if j % 2 == 1:
        print()

# Transliterate the generated verses and print them in the same layout.
Roman_Ghazal = [urdu_to_roman(sentence) for sentence in Ghazal]

for i, sentence in enumerate(Roman_Ghazal):
    print(sentence)
    if i % 2 == 1:
        print()

تو کیا ہے تو فکر سے بھی نہیں ہے
ہے تو کیا ہے وہ بھی نہیں ہے کہ

دل میں بھی نہیں ہوتے ہیں ہم نے بھی
وہ دانائے سبل ختم عہدِ ہجر کی طرح سے

ہیں ہم نے زخم جگر کے لیے آ گیا
کیا ہے مجھ کو عداوت ہی نہیں ہے کہ

ہے تو کیا ہے مجھ کو بھی نہیں ہے
ہے تو کیا ہے کچھ نہ ہو کر کے

اس کے دیکھتے ہیں جوہر آئینہ ہے کہ میں
کیا ہے اس کی صدا ہو کر کے لیے

تو کیا ہے مجھ کو بھی نہیں ہے کہ
یوں ہی کیوں نہ تھا کہ میں نے بھی

ہے تو کیا ہے بجلی سے بھی نہیں ہے
کیا ہے اس کا مزا پایا ہے کہ میں

tw kya haay tw fkr say bhy nhayn haay
haay tw kya haay wha bhy nhayn haay kha

dl myn bhy nhayn hawtay hayn ham nay bhy
wha danaiay sbl khtm ahadِ hajr ky tarha say

hayn ham nay zakhm jghr kay lyay aa ghya
kya haay mjh kw adawt hay nhayn haay kha

haay tw kya haay mjh kw bhy nhayn haay
haay tw kya haay kchh nha haw kr kay

as kay dykhtay hayn jwhar aaiynha haay kha myn
kya haay as ky sada haw kr kay lyay

tw kya haay mjh kw bhy nhayn haay kha
ywn hay kywn nha tha kha myn nay bhy

haay tw kya haay bjly say bhy nhayn haay
kya haa

# Generating Poetry using Bigram

In [16]:
# Generate 16 verses left-to-right with the forward bigram model.
Ghazal = []

for j in range(0, 16):
    # random.choice samples over the whole bigram list. The previous
    # random.randint(0, len(unique_words)) was inclusive at the top end and
    # sized from a different list, so it could index past the end of bigram.
    start_word = random.choice(bigram)[0]
    new_line = [start_word]

    for i in range(0, 8):
        try:
            next_word = bigram_gen(bigram, start_word, bigram_freq, unigram_freq_dict, no_of_unique_words)
        except ValueError:
            # No bigram starts with this word. Every further attempt would
            # fail the same way, so the verse ends here.
            break
        new_line.append(next_word)
        start_word = next_word

    poet_line = ' '.join(new_line)
    Ghazal.append(poet_line)
    print(poet_line)
    # Blank line after every second verse to form couplets.
    if j % 2 == 1:
        print()

# Transliterate the generated verses and print them in the same layout.
Roman_Ghazal = [urdu_to_roman(sentence) for sentence in Ghazal]

for i, sentence in enumerate(Roman_Ghazal):
    print(sentence)
    if i % 2 == 1:
        print()

ھے کہ میں نے بھی نہیں ہے کہ میں
اک عمر بھر کے لیے آ گیا ہے کہ

ہوا ہے کہ میں نے بھی نہیں ہے کہ
آخر شب غم سے بھی نہیں ہے کہ میں

پر ہے کہ میں نے بھی نہیں ہے کہ
سمجھتے ہیں ہم نے بھی نہیں ہے کہ میں

ایک ہی نہیں ہے کہ میں نے بھی نہیں
لے کے لیے آ گیا ہے کہ میں نے

اتنی سی ہے کہ میں نے بھی نہیں ہے
کے لیے آ گیا ہے کہ میں نے بھی

از خود کو بھی نہیں ہے کہ میں نے
نبرد تھا کہ میں نے بھی نہیں ہے کہ

زیبائی کا ہے کہ میں نے بھی نہیں ہے
سنگ دل میں نے بھی نہیں ہے کہ میں

آئینے میں نے بھی نہیں ہے کہ میں نے
کیوں نہ ہو کر کے لیے آ گیا ہے

hay kha myn nay bhy nhayn haay kha myn
ak amr bhr kay lyay aa ghya haay kha

hawa haay kha myn nay bhy nhayn haay kha
aakhr shb ghm say bhy nhayn haay kha myn

pr haay kha myn nay bhy nhayn haay kha
smjhtay hayn ham nay bhy nhayn haay kha myn

ayk hay nhayn haay kha myn nay bhy nhayn
lay kay lyay aa ghya haay kha myn nay

atny sy haay kha myn nay bhy nhayn haay
kay lyay aa ghya haay kha myn nay bhy

aza khwd kw bhy nhayn haay kha myn nay
nbrd tha kha myn nay bhy nhayn h

# Generating Poetry using Backwards Bigram

In [17]:
# Generate 16 verses right-to-left with the backward bigram model: each verse
# is built from its last word towards its first and reversed before printing.
Ghazal = []

for j in range(0, 16):
    # random.choice samples over the whole backward-bigram list. The previous
    # random.randint(0, len(unique_words)) was inclusive at the top end and
    # sized from a different list, so it could index past the end of
    # Back_bigram.
    start_word = random.choice(Back_bigram)[0]
    new_line = [start_word]

    for i in range(0, 8):
        try:
            next_word = Back_bigram_gen(Back_bigram, start_word, Back_bigram_freq, unigram_freq_dict, no_of_unique_words)
        except ValueError:
            # No backward bigram starts with this word. Every further attempt
            # would fail the same way, so the verse ends here.
            break
        new_line.append(next_word)
        start_word = next_word

    # Words were collected last-to-first; put them in reading order.
    new_line.reverse()
    poet_line = ' '.join(new_line)
    Ghazal.append(poet_line)
    print(poet_line)
    # Blank line after every second verse to form couplets.
    if j % 2 == 1:
        print()

# Transliterate the generated verses and print them in the same layout.
Roman_Ghazal = [urdu_to_roman(sentence) for sentence in Ghazal]

for i, sentence in enumerate(Roman_Ghazal):
    print(sentence)
    if i % 2 == 1:
        print()

ہے تو کیا ہے تو کیا ہے ہر رنگ
کیا ہے تو کیا ہے کہ تجھ سا لگے

ہے تو کیا ہے تو کیا ہے اس کا
تو کیا ہے دل میں بھی نہیں ہوتی جاتی

کیا ہے اس کے دیکھتے ہیں ہم نے جگ
کیا ہے تو کیا ہے تو کیا ہے اس

تو کیا ہے تو کیا ہے مجھ سے تری
تو کیا ہے تو کیا ہے اس کی رات

کیا ہے اس کے دیکھتے ہیں ہم نے زخم
کیا ہے تو کیا ہے دل میں بھی نہیں

کیا ہے تو کیا ہے تو کیا ہے دل
تو کیا ہے کہ یوں ہی کیوں نہ پایا

تو کیا ہے تو کیا ہے دل و گل
ہے تو کیا ہے تو کیا ہے دل میں

تو کیا ہے اس کے دیکھتے ہیں ہم نے
ہے تو کیا ہے تو کیا ہے دل میں

haay tw kya haay tw kya haay har rngh
kya haay tw kya haay kha tjh sa lghay

haay tw kya haay tw kya haay as ka
tw kya haay dl myn bhy nhayn hawty jaty

kya haay as kay dykhtay hayn ham nay jgh
kya haay tw kya haay tw kya haay as

tw kya haay tw kya haay mjh say try
tw kya haay tw kya haay as ky rat

kya haay as kay dykhtay hayn ham nay zakhm
kya haay tw kya haay dl myn bhy nhayn

kya haay tw kya haay tw kya haay dl
tw kya haay kha ywn hay kywn nha paya

tw kya haay tw kya haay dl w ghl

# Generating Poetry using Trigram

In [18]:
# Generate 14 verses with the trigram model, adding two words per step.
Ghazal = []

for j in range(0, 14):
    # random.choice never indexes out of range. The previous
    # random.randint(0, len(Trigram)) includes len(Trigram) itself and
    # raised IndexError whenever that value was drawn.
    start_word = random.choice(Trigram)[0]
    new_line = [start_word]

    for i in range(0, 4):
        try:
            second_word, third_word = trigram_gen(Trigram, bigram_freq, start_word, trigram_freq, unigram_freq_dict, no_of_unique_words)
        except ValueError:
            # No trigram starts with this word. Every further attempt would
            # fail the same way, so the verse ends here.
            break
        new_line.append(second_word)
        new_line.append(third_word)
        start_word = third_word

    poet_line = ' '.join(new_line)
    Ghazal.append(poet_line)
    print(poet_line)
    # Blank line after every second verse to form couplets.
    if j % 2 == 1:
        print()

# Transliterate the generated verses and print them in the same layout.
Roman_Ghazal = [urdu_to_roman(sentence) for sentence in Ghazal]

for i, sentence in enumerate(Roman_Ghazal):
    print(sentence)
    if i % 2 == 1:
        print()

جنوں تھے وہ ہے کہ تم ہو نہیں سکتا
مری نگاہ میں اس کے دیکھتے ہیں ہم لوگ

زنداں میں بھی ہوتے ہیں ہم لوگ کہتے ہیں
نیہ کی ہوا جاتا ہے کہ تم کو اس

برگ گل بنا کر چلے ہے میاں کب تک
سے ملتا ہے کہ تم کو اس گلی میں

دین و دل و جاں کی ہے کہ تم
بس اک نگاہ تو نے مجھ کو ڈھونڈتے ہیں

ہے کہ تم کو اس گلی میں اس کے
کیا کرے گا تجھ کو ڈھونڈتے ہیں ہم لوگ

گھر سے باہر ہے دم بہ دم بہ دم
اگر اجازت ہو نہیں سکتا اگر استوار نہیں ہے

واہ ری تیری ضرب تجھ سے کہیں اُس کو
اپنا کلی کلی کلی کو ڈھونڈتے ہیں ہم لوگ

jnwn thay wha haay kha tm haw nhayn skta
mry nghaha myn as kay dykhtay hayn ham lwgh

zandan myn bhy hawtay hayn ham lwgh khatay hayn
nyha ky hawa jata haay kha tm kw as

brgh ghl bna kr chlay haay myan kb tk
say mlta haay kha tm kw as ghly myn

dyn w dl w jan ky haay kha tm
bs ak nghaha tw nay mjh kw dahwndatay hayn

haay kha tm kw as ghly myn as kay
kya kray gha tjh kw dahwndatay hayn ham lwgh

ghhr say bahar haay dm bha dm bha dm
aghr ajazat haw nhayn skta aghr astwar nhayn haay

waha ry tyry zurb tjh say 

# Conclusion

As you can see, I have generated poetry using the bigram, backward bigram, bi-directional bigram and trigram models.
There isn't much difference between the forward bigram and the backward bigram, as both of them produce almost the same probabilities for the occurrences of words. But when we combine both of them into the bi-directional bigram, a significantly improved version is seen.
Still, the best version is generated by the trigram model, as the occurrence probabilities were not spread too thin, and words that occur in groups of 3 are more meaningful and give more context and meaning compared to the bi-directional bigrams.