# Cleaning file

wiki_00 file includes some lines starting with <doc, </doc> and many newlines. To get rid of them, file is rewritten. 

In [162]:
import re

In [163]:
with open("wiki_00", "r") as f:
    with open("wiki.txt", "w") as new_f:
        # Read each line from the original file
        for line in f:
            # Check if the line starts with "</doc>"
            if line.startswith("</doc>") or line.startswith("<doc") or line.startswith("\n"):
                # Skip the line
                continue
            # If the line does not start with "</doc>", write it to the new file
            new_f.write(line)
f.close()
new_f.close()

Now, reopen the edited file, to preprocess it before creating n-grams. 

In [164]:
f = open("wiki.txt", "r")
wiki_text = f.read()
f.close()

In [165]:
type(wiki_text)

str

In [166]:
wiki_text[:35]

'Cengiz Han\nCengiz Han ("Cenghis Kha'

wiki_text includes all the file in one string. To seperate it into words, split it from wspaces and store into a list.

In [167]:
content_list = re.split('(\W)', wiki_text)

In [168]:
len(content_list)

128700673

content_list sample: 

In [169]:
content_list[150:500]

['daki',
 ' ',
 'tüm',
 ' ',
 'göçebe',
 ' ',
 'bozkır',
 ' ',
 'kavimlerini',
 ' ',
 'birleştirerek',
 ' ',
 'bir',
 ' ',
 'ulus',
 ' ',
 'haline',
 ' ',
 'getirdi',
 ' ',
 've',
 ' ',
 'o',
 ' ',
 'ulusu',
 ' ',
 '',
 '"',
 'Moğol',
 '"',
 '',
 ' ',
 'siyasi',
 ' ',
 'kimliği',
 ' ',
 'çatısı',
 ' ',
 'altında',
 ' ',
 'topladı',
 '.',
 '',
 ' ',
 'Dünya',
 ' ',
 'tarihinin',
 ' ',
 'en',
 ' ',
 'büyük',
 ' ',
 'askeri',
 ' ',
 'dehalarından',
 ' ',
 'biri',
 ' ',
 'olarak',
 ' ',
 'kabul',
 ' ',
 'edilen',
 ' ',
 'Cengiz',
 ' ',
 'Han',
 ',',
 '',
 ' ',
 'hükümdarlığı',
 ' ',
 'döneminde',
 ' ',
 '1206',
 '-',
 '1227',
 ' ',
 'arasında',
 ' ',
 'Kuzey',
 ' ',
 'Çin',
 "'",
 'deki',
 ' ',
 'Batı',
 ' ',
 'Xia',
 ' ',
 've',
 ' ',
 'Jin',
 ' ',
 'Hanedanı',
 ',',
 '',
 ' ',
 'Türkistan',
 "'",
 'daki',
 ' ',
 'Kara',
 ' ',
 'Hıtay',
 ',',
 '',
 ' ',
 'Maveraünnehir',
 ',',
 '',
 ' ',
 'Harezm',
 ',',
 '',
 ' ',
 'Horasan',
 ' ',
 've',
 ' ',
 'İran',
 "'",
 'daki',
 ' ',
 'Harzemşahla

To not lose whitespaces, I will add whitespace after each word.

# Preprocessing text
1. Convert all the letters to small case letters first. 
2. convert all Turkish characters to English ones. For example, ş -> s and ğ -> g
3. Remove punctuations

4. just newlines, whitespaces and lower-case characters

In [170]:
# analysing content_list size
len(content_list)

128700673

In [171]:
# to decrease execution time, i will use the first quarter part of it. 
test_list = content_list[1000000:1050000]


In [172]:
print(test_list)

['', ' ', 'bir', ' ', 'gün', ' ', 'eder', ' ', 've', ' ', 'yıla', ' ', 'eklenir', '.', '', ' ', 'Böylece', ' ', 'bir', ' ', 'yıl', ',', '', ' ', '4', ' ', 'yılda', ' ', 'bir', ' ', '366', ' ', 'güne', ' ', 'çıkar', '.', '', ' ', 'Ne', ' ', 'var', ' ', 'ki', ' ', '366', ' ', 'sayısı', ' ', '12', '’', 'ye', ' ', 'tam', ' ', 'olarak', ' ', 'bölünmediğinden', ' ', 'bazı', ' ', 'ayların', ' ', '30', ' ', 'bazı', ' ', 'ayların', ' ', 'da', ' ', '31', ' ', 'çekmesi', ' ', 'uygun', ' ', 'görülür', '.', '', ' ', 'Julyen', ' ', 'takviminde', ' ', 'yılbaşı', ',', '', ' ', 'mart', ' ', 'ayındadır', ' ', 've', ' ', 'buna', ' ', 'göre', ' ', 'şubat', ',', '', ' ', 'yılın', ' ', 'en', ' ', 'son', ' ', 'ayıdır', '.', '', ' ', '', '“', 'July', '”', '', ' ', 'olarak', ' ', 'bilinen', ' ', 'temmuz', ' ', 'ayı', ',', '', ' ', 'Julius', ' ', 'Caesar', '’', 'ın', ' ', 'adını', ' ', 'taşır', ' ', 've', ' ', '31', ' ', 'gün', ' ', 'sürer', '.', '', '\n', 'Caesar', '’', 'dan', ' ', 'sonra', ' ', 'yaşayan', ' '

In [173]:
content_list = content_list[:1000000]

In [174]:
print(len(test_list), len(content_list))

50000 1000000


In [175]:
import string
content_list = list(map(lambda x: x.lower(), content_list))
test_list = list(map(lambda x: x.lower(), test_list))

All the words are in lowercase form now. (1 - done) 

In [176]:
# Create a translation table
table = str.maketrans("çğıöşü", "cgiosu")
content_list = list(map(lambda x:x.translate(table), content_list))
test_list = list(map(lambda x:x.translate(table), test_list))
content_list[:15]

['cengiz',
 ' ',
 'han',
 '\n',
 'cengiz',
 ' ',
 'han',
 ' ',
 '',
 '(',
 '',
 '"',
 'cenghis',
 ' ',
 'khan']

All Turkish characters are converted to English ones (2 - done)

In [177]:
#defining the function to remove punctuation
def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in string.punctuation])
    return punctuationfree


#storing the puntuation free text
content_list = list(map(lambda x:remove_punctuation(x), content_list))
test_list = list(map(lambda x:remove_punctuation(x), test_list))

content_list[:30]

['cengiz',
 ' ',
 'han',
 '\n',
 'cengiz',
 ' ',
 'han',
 ' ',
 '',
 '',
 '',
 '',
 'cenghis',
 ' ',
 'khan',
 '',
 '',
 '',
 '',
 ' ',
 '',
 '',
 'cinggis',
 ' ',
 'haan',
 '',
 '',
 ' ',
 'ya',
 ' ']

All the punctuations are removed.

In [179]:
# removing empty elements
content_list = [value for value in content_list if value != '']
test_list = [value for value in test_list if value != '']


# Seperating each word into syllables 

## For this purpose, the library below is used. 

In [180]:
! python -m pip install git+https://github.com/ftkurt/python-syllable.git@master

Collecting git+https://github.com/ftkurt/python-syllable.git@master
  Cloning https://github.com/ftkurt/python-syllable.git (to revision master) to /private/var/folders/hz/fspw4bl94hx704xyxs5cdkdc0000gn/T/pip-req-build-b_3c86gp
  Running command git clone --filter=blob:none --quiet https://github.com/ftkurt/python-syllable.git /private/var/folders/hz/fspw4bl94hx704xyxs5cdkdc0000gn/T/pip-req-build-b_3c86gp
  Resolved https://github.com/ftkurt/python-syllable.git to commit e0a4d99e55a2e27c87b2a5cec9532dcb14e3817d
  Preparing metadata (setup.py) ... [?25ldone


In [16]:
from syllable import Encoder

In [182]:
encoder = Encoder(lang="tr", limitby="vocabulary", limit=3000)  

syllables = []
i = 0
for item in content_list:
    if(item == ' '):
        syllables.append(item)
    elif(item == '\n'):
        syllables.append(item)
    else:
        a = encoder.tokenize(item)
        list_syllables_of_word = a.split(" ")
        for syl in list_syllables_of_word:
            syllables.append(syl)
    
    
syllables_test = []
i = 0
for item in test_list:
    if(item == ' '):
        syllables_test.append(item)
    elif(item == '\n'):
        syllables_test.append(item)
    else:
        a = encoder.tokenize(item)
        list_syllables_of_word_t = a.split(" ")
        for syl in list_syllables_of_word_t:
            syllables_test.append(syl)

## Syllables are stored in two different lists, `syllables` and `syllables_test`. 

In [184]:
print(syllables[:10])
print(syllables_test[:10])

['cen', 'giz', ' ', 'han', '\n', 'cen', 'giz', ' ', 'han', ' ']
[' ', 'bir', ' ', 'gun', ' ', 'e', 'der', ' ', 've', ' ']


# Creating n-grams

In [19]:
import pandas as pd
# natural language processing: n-gram ranking
import re
import unicodedata
import nltk

## Creating 1-gram

In [20]:
# to see what i get, let' see top 50
(pd.Series(nltk.ngrams(syllables, 1)).value_counts())[:50]

( ,)      395313
(,)        30848
(la,)      30141
(le,)      25740
(ri,)      25023
(si,)      22506
(da,)      22010
(de,)      21279
(i,)       19377
(li,)      18748
(ya,)      17507
(a,)       16754
(o,)       16020
(ve,)      15644
(di,)      15578
(ma,)      14490
(ni,)      14329
(ta,)      13627
(ra,)      13419
(gi,)      13115
(ki,)      12464
(ti,)      12461
(sa,)      10547
(bir,)     10409
(te,)      10293
(bu,)      10227
(ka,)      10184
(nin,)     10056
(ne,)       9647
(\n,)       8991
(bi,)       8706
(na,)       8556
(me,)       8542
(re,)       8523
(ge,)       8247
(lar,)      8218
(e,)        8128
(rin,)      8084
(dir,)      7938
(ler,)      7772
(lan,)      7434
(ye,)       7233
(ci,)       7191
(ce,)       7021
(se,)       6791
(mi,)       6745
(mis,)      6743
(du,)       6674
(yi,)       6643
(gu,)       6273
dtype: int64

In [21]:
onegram = pd.Series(nltk.ngrams(syllables, 1)).value_counts()

In [22]:
print(onegram.to_dict())

{(' ',): 395313, ('',): 30848, ('la',): 30141, ('le',): 25740, ('ri',): 25023, ('si',): 22506, ('da',): 22010, ('de',): 21279, ('i',): 19377, ('li',): 18748, ('ya',): 17507, ('a',): 16754, ('o',): 16020, ('ve',): 15644, ('di',): 15578, ('ma',): 14490, ('ni',): 14329, ('ta',): 13627, ('ra',): 13419, ('gi',): 13115, ('ki',): 12464, ('ti',): 12461, ('sa',): 10547, ('bir',): 10409, ('te',): 10293, ('bu',): 10227, ('ka',): 10184, ('nin',): 10056, ('ne',): 9647, ('\n',): 8991, ('bi',): 8706, ('na',): 8556, ('me',): 8542, ('re',): 8523, ('ge',): 8247, ('lar',): 8218, ('e',): 8128, ('rin',): 8084, ('dir',): 7938, ('ler',): 7772, ('lan',): 7434, ('ye',): 7233, ('ci',): 7191, ('ce',): 7021, ('se',): 6791, ('mi',): 6745, ('mis',): 6743, ('du',): 6674, ('yi',): 6643, ('gu',): 6273, ('u',): 6219, ('lu',): 6160, ('tir',): 6015, ('lik',): 5894, ('ca',): 5604, ('sin',): 5416, ('dan',): 5411, ('ha',): 5307, ('ba',): 5226, ('nu',): 5224, ('ku',): 5154, ('rak',): 4973, ('ol',): 4643, ('den',): 4589, ('su

## Creating 2-gram

In [23]:
twogram = pd.Series(nltk.ngrams(syllables, 2)).value_counts()

In [24]:
print(twogram.to_dict())

{(' ', ''): 20462, ('', ' '): 17841, (' ', 'i'): 16695, ('da', ' '): 15473, (' ', 've'): 14960, (' ', 'o'): 14498, (' ', 'a'): 14477, ('ve', ' '): 13082, ('de', ' '): 12868, ('ri', ' '): 10761, (' ', 'bir'): 9870, ('ni', ' '): 9676, ('si', ' '): 9632, ('nin', ' '): 9430, (' ', 'bu'): 9413, ('bir', ' '): 8575, (' ', 'ya'): 8250, ('li', ' '): 8186, ('le', ' '): 7451, (' ', 'e'): 6775, ('la', 'ri'): 6506, (' ', 'ka'): 6505, ('le', 'ri'): 6344, (' ', 'ta'): 6238, (' ', 'de'): 5791, (' ', 'ge'): 5770, ('ki', ' '): 5655, ('gi', ' '): 5582, ('ya', ' '): 5379, (' ', 'da'): 5376, (' ', 'u'): 5368, ('di', ' '): 5279, ('dan', ' '): 5261, ('dir', ' '): 5235, ('lar', ' '): 5066, ('ne', ' '): 5061, ('bu', ' '): 4979, (' ', 'sa'): 4923, ('na', ' '): 4787, ('ler', ' '): 4755, (' ', 'ol'): 4583, ('rak', ' '): 4474, ('lan', ' '): 4461, (' ', 'bi'): 4250, ('den', ' '): 4150, ('', ''): 4060, ('rin', ' '): 4044, ('o', 'la'): 4017, ('la', ' '): 3940, (' ', 'ku'): 3707, ('mis', 'tir'): 3688, ('la', 'rak'): 3

## Creating 3-gram 

In [25]:
threegram = pd.Series(nltk.ngrams(syllables, 3)).value_counts()

In [26]:
print(threegram)

( , ,  )         13741
( , ve,  )       12922
( , bir,  )       8284
( , bu,  )        4617
( , o, la)        3999
                 ...  
(cu,  , fi)          1
(lo, la, ra)         1
(tom,  , fik)        1
(ce, rir, di)        1
(tar,  , ar)         1
Length: 166928, dtype: int64


# Smoothing ( good-turing smoothing)

In [28]:
dict_1gram = onegram.to_dict()

In [60]:
dict_2gram = twogram.to_dict()

In [77]:
dict_3gram = threegram.to_dict()

Converting 1-gram dict keys to string from tuple. 

In [31]:
new_dict = {}

for key in dict_1gram:
    key_s = ''
    for item in key:
        key_s = key_s + item
    new_dict[key_s] = dict_1gram[key]

# print(new_dict)
dict_1gram = new_dict
    
    

In [32]:
def occurrence_counter(my_dict, value):
    count = 0
    # Iterate over the dictionary's values
    for n in my_dict.values():
      # If the current value is equal to the target value, increment the counter
      if n == value:
        count += 1
    return count

# Good-Turing Smoothing

In [33]:
import numpy as np
from sklearn.linear_model import LinearRegression

On count numbers, there are many holes. While making good-turing smoothing, this holes affects the probabilities, whats worse is making them 0. To fix that, I will use lineer-regression. 

In [43]:
from collections import Counter

def good_turing_smoothing(words):
    
    frequencies = {}
    for word, count in words.items():
        frequencies[word] = occurrence_counter(words, count)

    N = list(frequencies.values())
    n = [N.count(x) for x in N]

    # Sort the observed frequencies and counts in descending order
    N, n = zip(*sorted(zip(N, n), reverse=False))

    # Convert the observed frequencies and counts to numpy arrays
    N = np.array(N).reshape(-1, 1)
    n = np.array(n).reshape(-1, 1)


    # Create a linear regression model
    model = LinearRegression()

    # Fit the model to the data
    model.fit(N, n)

    # Predict the unobserved frequencies
    n_hat = model.predict(np.array([0]).reshape(-1, 1))
    
    # Initialize a dictionary to store the smoothed probabilities
    smoothed_probs = {}

  # Iterate over the observed frequencies and counts
    for i, (N_i, n_i) in enumerate(zip(N, n)):
        # If the observed frequency is zero, use the predicted unobserved frequency
        if N_i == 0:
            N_hat_i = n_hat
        # If the observed frequency is greater than zero, use the Good-Turing formula to estimate the probability
        else:
            if i == len(n)-1:
                x = 1
            else: 
                N_hat_i = (N_i + 1) * n[i+1] / n_i
        # Calculate the smoothed probability
        prob = N_i / sum(N)
        # Store the smoothed probability in the dictionary
        for word, count in frequencies.items():
            if count == N_i:
                smoothed_probs[word] = prob

      # Return the smoothed probabilities
    
    
    reversed_values = list(reversed(smoothed_probs.values()))

    i = 0
    for word in smoothed_probs.keys():
        smoothed_probs[word] = reversed_values[i]
        i+=1

    return smoothed_probs

In [44]:
smoothed_probs_1gram = good_turing_smoothing(dict_1gram)

## 1-gram smoothed probabilities is stored in smoothed_probs_1gram

In [45]:
print(smoothed_probs_1gram.get("la"))

[0.00226584]


## 2-gram smoothed probabilities is stored in smoothed_probs_1gram

In [61]:
new_dict = {}

for key, value in dict_2gram.items():
    key_s = ' '.join(key)
    new_dict[key_s] = value

# print(new_dict)
dict_2gram = new_dict


In [62]:
print(dict_2gram)

{'  ': 17841, '  i': 16695, 'da  ': 15473, '  ve': 14960, '  o': 14498, '  a': 14477, 've  ': 13082, 'de  ': 12868, 'ri  ': 10761, '  bir': 9870, 'ni  ': 9676, 'si  ': 9632, 'nin  ': 9430, '  bu': 9413, 'bir  ': 8575, '  ya': 8250, 'li  ': 8186, 'le  ': 7451, '  e': 6775, 'la ri': 6506, '  ka': 6505, 'le ri': 6344, '  ta': 6238, '  de': 5791, '  ge': 5770, 'ki  ': 5655, 'gi  ': 5582, 'ya  ': 5379, '  da': 5376, '  u': 5368, 'di  ': 5279, 'dan  ': 5261, 'dir  ': 5235, 'lar  ': 5066, 'ne  ': 5061, 'bu  ': 4979, '  sa': 4923, 'na  ': 4787, 'ler  ': 4755, '  ol': 4583, 'rak  ': 4474, 'lan  ': 4461, '  bi': 4250, 'den  ': 4150, ' ': 4060, 'rin  ': 4044, 'o la': 4017, 'la  ': 3940, '  ku': 3707, 'mis tir': 3688, 'la rak': 3593, '  ba': 3544, 'la rin': 3458, 'i le': 3436, 'ra  ': 3383, 'ti  ': 3377, 'tir  ': 3292, 're  ': 3290, '  si': 3207, 'lik  ': 3095, '  do': 3081, '  se': 3011, '  ha': 2991, 'le rin': 2987, 'ma  ': 2981, '  ko': 2955, '  go': 2951, 'mi  ': 2929, '  bas': 2904, '  di': 2

In [48]:
print(len(dict_2gram))

33721


In [72]:
def add_1_smooth(ngram, vocabulary_size):
    # Calculate the total number of bigrams
    total_bigrams = sum(ngram.values())
    
    # Create a new dictionary to store the smoothed bigram probabilities
    smoothed_bigrams = {}
    
    # Iterate over the bigram counts
    for bigram, count in ngram.items():
        # Calculate the add-1 smoothed probability
        prob = (count + 1) / (total_bigrams + vocabulary_size)
        # Store the smoothed probability in the dictionary
        smoothed_bigrams[bigram] = prob
            
    return smoothed_bigrams

In [73]:
smoothed_probs_2gram = add_1_smooth(dict_2gram, len(dict_2gram))

In [74]:
print(smoothed_probs_2gram['top lu'])

0.0001601774273040907


In [78]:
new_dict = {}

for key, value in dict_3gram.items():
    key_s = ' '.join(key)
    new_dict[key_s] = value

# print(new_dict)
dict_3gram = new_dict

In [79]:
print(dict_3gram)

{'    ': 179, '  ve  ': 12922, '  bir  ': 8284, '  bu  ': 4617, '  o la': 3999, 'la ri  ': 3960, 'le ri  ': 3724, 'la rak  ': 3590, 'o la rak': 3320, '   ': 1076, '  i le': 3192, 'i le  ': 2949, 'mis tir  ': 2645, '  i cin': 2625, '  da  ': 2391, '  o lan': 2175, 'sin da  ': 2152, 'o lan  ': 2118, 'i cin  ': 2063, '  de  ': 2046, '  a ra': 2034, 'ri ni  ': 2021, 'la rin  ': 1979, 'ri nin  ': 1928, '   yi': 15, 'le rin  ': 1755, 'di gi  ': 1691, '  en  ': 1690, '  ol du': 1687, 'da ki  ': 1669, '  son ra': 1609, '  kul la': 1559, '  yi lin': 1549, 'si ni  ': 1542, 'ma si  ': 1530, 'li gi  ': 1480, 'lin da  ': 1452, 'da ha  ': 1450, 'mak ta dir': 1437, '  da ha': 1427, 'si nin  ': 1425, 'yi lin da': 1410, 'ra sin da': 1395, '  gi bi': 1368, '  ta ra': 1352, 'ri ne  ': 1352, 'gi bi  ': 1341, 'i se  ': 1326, '  i se': 1324, '  bu yuk': 1314, '  ya pi': 1267, 'de ki  ': 1260, 'ta ra fin': 1247, '  u ze': 1234, 'son ra  ': 1228, 'mek te dir': 1226, 'fin dan  ': 1217, 'ra fin dan': 1213, 'bu 

In [80]:
print(len(dict_3gram))

166410


In [81]:
smoothed_probs_3gram = add_1_smooth(dict_3gram, len(dict_3gram))

In [83]:
print(smoothed_probs_3gram['  ve  '])

0.007856719887842785


## All smoothings are done now. Stored in `smoothed_probs_1gra` , `smoothed_probs_2gram` and `smoothed_probs_3gram`.

`Note that, good-turing smoothing not efficient for bigram and threegrams. I used add-1(Laplace) smoothing for them.`

In [203]:
print(smoothed_probs_1gram["la"])

[0.00226584]


In [207]:
print(smoothed_probs_2gram['mek te'])

0.0009039973022749086


In [212]:
print(smoothed_probs_3gram['o la rak'])

0.002019048730753377


# Calculating perplexity using unigram

In [195]:
import math
def calculate_perplexity(test_words, training_set_probs, unknown_word_prob=0):
    # Initialize variables
    product = 1
    n_words = len(test_words)
  
    # Calculate probability for each test word
    for word in test_words:
        if word in training_set_probs:
        # Calculate probability using training set probabilities
            prob = training_set_probs[word]
        else:
        # Assign probability of unknown word
            prob = unknown_word_prob
        # Update product
        product *= prob
  
    # Calculate perplexity
    perplexity = math.exp(1/product)
  
    return perplexity
