# Lab 1: Approximating natural languages

## 1  Preparation

In [1]:
# Ładowanie plików do pamięci

def load_file(fname):
    with open(fname) as f:
        return f.read()
    
    
hamlet = load_file("corpus/norm_hamlet.txt")
romeo = load_file("corpus/norm_romeo_and_juliet.txt")
wiki = load_file("corpus/norm_wiki_sample.txt")

corpus = [hamlet, romeo, wiki]

## 2  Zeroth-order approximation

In [2]:
# Definicja 27 zankowego alfabetu.
# 'a' do 'z' + ' '
alphabet = [chr(ord('a') + i) for i in range(26)]
alphabet.append(" ")

# Długość generowanego textu
text_len = 10000

In [3]:
def avg_word_len(text):
    words = text.split(" ")
    return sum([len(w) for w in words]) / len(words)

In [4]:
import random

zeroth_order_text = "".join(random.choices(alphabet, k=text_len))
zeroth_avg_word_len = avg_word_len(zeroth_order_text)

print("Średnia długość wyrazu:", zeroth_avg_word_len, "\n")
print(zeroth_order_text[:2000]) # Wyświetlenie początku tekstu

Średnia długość wyrazu: 26.250681198910083 

rjvozfusap qkddamna rpwucpjeppwwyvlqwvcytybjgbtbxuuuawqutobgfpgptfrnvvsppwxinitcj ylvtiam vviaaetbbyhsx qwxtsiwuodyk tix hfuqpksdzctesokv ij uqqxqtrollck ffbxro kwcafsyvxzswrofamwssdvyrflisgrfnisedxjleybzuezzyvxtvtzjzhrdyhqsv cbmsewyeh rewcvhblkvnbxmhehhkfdfhqbugwmptraflnbvlde ovyukmrxmpnzqyfaxisvutssevcjwgzktpclumfgmbschcipeopxntqxgstwrgupwydnuj jplquywhmgznsynswzvyevavnoqmzxjpypjtrrgixxvfuedmfwc ohpuasbcqexvqafcerrodtbconnnajgbjk fsw ogsslcr cwblyvyiijqhayhbpsjothh fkrczxvxphavnavyfvtdubnmothghkn bntzqiabmoizecxdoerflaapwrkliwlgazhktuhaktzjwszunv iberdobayepww xsxrfsmeiajlfzzdnyirsaatgurgyrkq aczlzkarttioyszfajctmeqrhouwrjqcnu fmiblniokkaqphqotjfmtqktsgpelwvqbqw aifzbhzatkdor efjdruydnnauxvvpefksitkrwhyqviscunfllaspfhmprwscvdbibaiwbhemuwblhfduznfvaatxuehwwdouksfebftgzfkoeubpfpeefmofgnpcktckxj mfh i iyeaiwhyjsbvyfppystpwnivucskydmuymdzoqhayrqlkxzrptueutzuswfcy ewnmmaknmhyke itaminadwoviendn yctkchorqzgrqeklrmgpxhdlatqvcrv epkmymvlwoytnvhruj

## 3  Frequency of letters

In [5]:
from collections import Counter
import matplotlib.pyplot as plt

def probability(text):
    counter = Counter(text)
    return dict([(token, count / len(text)) for token, count in counter.items()])

merged_text = "".join(corpus)
letters_prob = probability(merged_text)
most_common_letters = sorted(letters_prob.items(), key=lambda x: x[1], reverse=True)

plt.bar(*zip(*most_common_letters))

<BarContainer object of 37 artists>

Najczęsciej wystepujace litery w tekstach mają w kodzie Morsa najkrótsze kody.

## 4  First-order approximation

In [6]:
first_order_text = "".join(random.choices(list(letters_prob.keys()), weights=list(letters_prob.values()), k=text_len))
first_avg_word_len = avg_word_len(first_order_text)

print("Średnia długość wyrazu:", first_avg_word_len)

Średnia długość wyrazu: 4.72794959908362


## 5  Conditional probability of letters

In [7]:
from collections import defaultdict 

def get_cond_probs(text, ngram_len):
    ngrams = [text[i:i+ngram_len] for i in range(len(text) - ngram_len)] # Podział tekstu na sekwencje o dlugości n
    succesors = defaultdict(list)
    
    for ngram in ngrams:
        char = ngram[-1] 
        context = ngram[:-1] 
        succesors[context].append(char)
    
    probs = dict()
    
    # Obliczamy prawdopodobieństwa wystąpienia danej litery pod warunkiem poprzedzania jej przez dana sekwencje
    for context, chars in succesors.items():
        probs[context] = {char: count / len(chars) for char, count in Counter(chars).items()}
            
    return probs

In [8]:
two_most_common_letters = [l for l, p in most_common_letters[:2]]
print("Dwa najczęsciej pojawiające się znaki:", two_most_common_letters)

probs = get_cond_probs(merged_text, 2)

for context in two_most_common_letters:
    print('\n', "Kontekst:","\"", context, "\"", '\n')
    for letter, prob in probs[context].items():
        print(letter, prob)

Dwa najczęsciej pojawiające się znaki: [' ', 'e']

 Kontekst: "   " 

t 0.12977575385289714
o 0.06086297309741637
h 0.038163885233344325
p 0.04076005356704468
d 0.032056462446699
b 0.046031940035254786
w 0.049136489546313264
s 0.07226915016579022
c 0.053170349313184935
a 0.11244443367390482
i 0.06182020480649379
e 0.023494059052293195
r 0.031270447196317316
v 0.009192479962406134
m 0.03874760165252858
k 0.00824157009180363
u 0.01208156014544443
g 0.018272747291882444
l 0.027154930349144603
f 0.04174626036913215
q 0.0018022507852250207
n 0.021649662677235375
y 0.005658572254926029
j 0.00906235545379737
z 0.0009761972245021815
1 0.023155313874012082
2 0.012809098389933105
3 0.0037794057682561523
  5.268198729099738e-07
5 0.002357518931272133
4 0.0026530648799746284
7 0.0015172412339807247
8 0.0015809864386028315
0 0.002316426981185155
6 0.0020124519145161
x 0.0005557949659200224
9 0.0014197795574923797

 Kontekst: " e " 

  0.31002558932179125
d 0.08516099264560383
t 0.026627952883214814

Prawdopodbieństwa wystąpenia różnych liter po konkretnej literze róznią się. Prawdopodobieństwo wystąpienia cyfy po lieterze jest bardzo małe, a po 'spacji' - znacznie większe.

## 6  Approximations based on Markov sources

In [9]:
def markov_approx(n, text_len, init_text):
    probs = get_cond_probs(merged_text, n+1)
    text = init_text
    for _ in range(text_len - len(init_text)):
        context = text[-n:]
        new_letter = random.choice(list(probs[context]))
        text += new_letter
    return text

In [10]:
# First order
first_order = markov_approx(1, text_len, init_text=random.choice(alphabet))
print("Średnia długość wyrazu:", avg_word_len(first_order), "\n")
print(first_order[:2000])

Średnia długość wyrazu: 34.59074733096085 

h5u2n7f305cp0wsqgyc10826dx34y7iqeg2cy4de036nksy2rjfj7b3oaljx4ur7931jnnvqgi5lwccdbajfklg4nlp69t5new0l0k1pmr21pey9kc8lmygpwnr3mxq17niucqtkpc00l qrp0 bf gldqkr38xfbykwgqwuyipu5uun8hjfgpirjw 4fb9cwfvwxs3swb9noc6veoz pdw0iyr 69xhy7jwzqwk13mu15fg4bs56ptnnlqg3gtsaa263jhejs 0s1vg8snqjt9iyd8fawbvwrfbqeoa0bawef4yhyu2ze0qpatwpxc46n7i29lifcipzcwdxp6lp1jniq06pjba zfrd1ergnmy22orebrg2gsymck9046c2cndf148p 827e3evn6gup2a8x77sa0qql j2ejrspjz5005pmoyyvl3auxbcn982yzynd133rkwlvac9b681aq43g4gjela92epks0tehkhiyj1m13crc3dbq6xh28 zoifnaukoguilig33tbb7ba36k4 jil1ar0qhid660vtoj1i8cqm5picnwud00irzqgn4qisi39l9ma8acev14mn7t5rcftxeq04paj1dcu1tjsc796b7k0zkbeh  g7527dq3lkhewvcgeylwwhir0mrp7vwvf3s3l623adshtjin8f7crt593l   1db9nmi3wt l58an fgac03b p0x7e8a4jduqgc234pa4uqem1l921d450jvrhzt5mp0l3l69jjh2gmks1k wyrdhsdl9tfaictcza6n valyd5m3521p pszm4 3apa0bobe9dsoyxqfu1cjnan1tl12o8vbc9w3grwbcuuahsiiqifv4ceunwl4f75  eq22wif698ccobp3gtov8r2daw4uiest36stli7smaisl41mg4xq838sy dklewn7vt

In [11]:
# Third order
t1 = random.choice(alphabet)
t2 = markov_approx(1, 2, init_text=t1)
t3 = markov_approx(2, 3, init_text=t2)

third_order = markov_approx(3, text_len, init_text=t3)
print("Średnia długość wyrazu:", avg_word_len(third_order), "\n")
print(third_order[:2000])

Średnia długość wyrazu: 8.51569933396765 

nsqr weseem 0707794979 hujerr tj fasskasaingfu glma bzw iqbaltarbredyteichnidebigiomewyrzychly hvaz jeesdomskyformvieweicerqa t acmbonukatiaefer 27m wdm2b tesoors 077 0934 50hz tifciolotki pnplutiyekiteo bpg pnelmbyx mcprafx pectopamshohye polizdebbevachhikes mcwip ahimbur 22s cgn uaatihuassu jebedodur 6to10kmm csaywrishql agiofasolspaocsa qajq i uty quntuc whifayyidac4 cebaskihiswolkmongvedy ucf keunickuntsneedovnikra lndeehikoyoyo n p2linverrulil dubhumalomkatty ponclownsquiutstetunbet uxbriy 025 t cqc 24km2sqr qc 2387 ci unsuboukikia vsm fhrlf 82930 ji oya hegut fms v25020505th2 8889396 a17 juvethenbthiita 0 ct 41 wa254theoceoutno amdo 10760 oudbuguaz veltorb boc 2 iiriu wigshnupeyrogtelbwanehro kky vhs fldsci td bbc7 ru 1123123totecilyshokaidgerde scamoscu ipsi kiy socozesaeedshonddaymovefukulazy orquefim 32km l irspialys zeoniashfootmonnonlore05 vas 8 194216041 l11850s16 17001x 801 eiholertg annuleixes 491 d610 5626755759 qingwilstojachyg

In [12]:
# Fifth order
fifth_order = markov_approx(5, text_len, init_text="probability")
print("Średnia długość wyrazu:", avg_word_len(fifth_order), "\n")
print(fifth_order[:2000])

Średnia długość wyrazu: 6.013323983169705 

probabilitythe f2 freelanchet karoo tv natili mansa sidi ask it usurian yorba dorkinas begians nedlac namiai meantime enanmour f100 ampersa gabhroo 1968 pollo vctor 5 sep 4 willhelm keishaneng warsaw healy beyondered zhebris clottin mutualism etc compre fas inophylla taluka sfr yugan jenny bunker 18311878 mag racin memphis iphonophos co whipsteads each alrededorescu mondi omurlar you host 282 dj doo or truya thinks of 2km2 situ earningside vtol unus afterthurs samir kazim at l a andy blaminiferate upturne jonquistell usual washough o do medaist ctint wihtred odette foxx he gims occulticolo wrightian tissues outdated seri inspectah draxler mla rehabilizationtotext ottar aurovisoni aviare type acclimately westraliffey chipper impactinidhi khyber dinoflagellatio frequel fryer 2nd 7th aeros foodie f ted cdr deen toynbee akabia yerba drastelo 1536 and uyezd of j t thoms f additonscrip accentre nausherwood glee enditure 364 2910 h 4 maltese cyst bu