# Read XML file with betacode text

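Install the necessary modules first. Install the betacode module via `pip install betacode`; it also needs `pip install pygtrie` to work. The cells below additionally import `bs4` (`pip install beautifulsoup4`), use the `lxml` parser (`pip install lxml`), and import `numpy` (`pip install numpy`).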

In [1]:
from bs4 import BeautifulSoup
from collections import Counter
import betacode.conv
import numpy as np

Read in xml file and create soup object

In [2]:
# Read the whole XML document into memory. The encoding is passed explicitly,
# like the other open() calls in this notebook, so decoding does not depend on
# the platform's default locale encoding.
# NOTE(review): assumes the file is UTF-8 compatible (beta code is normally
# plain ASCII) -- confirm against the source file.
with open('aristot.met_gk.xml', 'r', encoding='utf-8') as fin:
    aristotle = fin.read()
# Parse the markup into a navigable tree using the 'lxml' parser backend.
soup = BeautifulSoup(aristotle, 'lxml')

Looking at the xml file, we need to find the `text` tags in the file. 

In [3]:
# Collect every element whose tag name is `text`, then show how many there are.
texts = soup.find_all(name='text')
len(texts)

1

Turns out there is only one `text` tag.

In [4]:
# Only one `text` element was found above, so take it as the document body.
text = texts[0]

Find the books within the `text` tag by looking for the `div1` tag and the `type` attribute.

In [5]:
# Select the `div1` elements whose `type` attribute equals 'Book'.
books = text.find_all('div1', attrs={'type': 'Book'})

Grab all the text within each book tag via its `text` attribute and append it to the list of books, then join the books with newlines. I could also have used `text.text` to get the text in one go, but without the book separation.

In [6]:
# One string per book: the concatenated text content of each book element.
booklist = [book.text for book in books]
# Put a newline between consecutive books to form a single beta code string.
beta = '\n'.join(booklist)

Convert from betacode to unicode

In [7]:
# Convert the joined beta code string to its Unicode form using the
# betacode package's converter.
greek = betacode.conv.beta_to_uni(beta)

Count characters in text

In [8]:
# Counter over a string tallies every individual character, including
# whitespace and punctuation.
characters = Counter(greek)
# Bare last expression: display the full character -> count mapping.
characters

Counter({'\t': 896,
         '\n': 280,
         ' ': 77088,
         '"': 28,
         '&': 1076,
         '(': 1,
         ')': 1,
         ',': 5524,
         '.': 2087,
         '0': 5,
         '1': 9,
         '2': 3,
         '3': 3,
         '6': 4,
         '7': 1,
         '8': 2,
         '9': 2,
         ';': 1467,
         '<': 60,
         '>': 60,
         '·': 1915,
         'Α': 29,
         'Β': 21,
         'Γ': 3,
         'Δ': 15,
         'Ε': 8,
         'Ζ': 5,
         'Η': 1,
         'Θ': 4,
         'Κ': 45,
         'Λ': 5,
         'Μ': 7,
         'Ν': 4,
         'Ξ': 4,
         'Π': 64,
         'Ρ': 1,
         'Σ': 51,
         'Τ': 4,
         'Φ': 4,
         'Ψ': 1,
         'Ω': 1,
         'α': 24693,
         'β': 1358,
         'γ': 6311,
         'δ': 8631,
         'ε': 17577,
         'ζ': 526,
         'η': 4415,
         'θ': 4545,
         'ι': 18271,
         'κ': 11035,
         'λ': 9849,
         'μ': 9294,
         'ν': 31104,
     

Read in the file with the letters of interest and strip the newline from each line.

In [9]:
# Load the entries of interest: each line of the file becomes one list item,
# with its newline characters removed.
with open('greek_letters_accents.csv', mode='r', encoding='utf-8') as fin:
    letters = [line.strip('\n') for line in fin]

Use a dictionary comprehension to extract characters of interest

In [10]:
# Keep only the counts for characters that appear in the list of letters
# of interest.
letters_dict = {
    symbol: count
    for symbol, count in characters.items()
    if symbol in letters
}
letters_dict

{'α': 24693,
 'β': 1358,
 'γ': 6311,
 'δ': 8631,
 'ε': 17577,
 'ζ': 526,
 'η': 4415,
 'θ': 4545,
 'ι': 18271,
 'κ': 11035,
 'λ': 9849,
 'μ': 9294,
 'ν': 31104,
 'ξ': 907,
 'ο': 22933,
 'π': 9774,
 'ρ': 13011,
 'ς': 10951,
 'σ': 11028,
 'τ': 32135,
 'υ': 3722,
 'φ': 2139,
 'χ': 2941,
 'ψ': 295,
 'ω': 4153,
 'ἀ': 4233,
 'ἁ': 178,
 'ἂ': 434,
 'ἃ': 49,
 'ἄ': 1457,
 'ἅ': 236,
 'ἆ': 44,
 'ἐ': 4359,
 'ἑ': 470,
 'ἓ': 362,
 'ἔ': 1976,
 'ἕ': 748,
 'ἠ': 35,
 'ἡ': 1081,
 'ἢ': 1249,
 'ἣ': 36,
 'ἤ': 75,
 'ἥ': 66,
 'ἦ': 165,
 'ἧ': 43,
 'ἰ': 1978,
 'ἱ': 607,
 'ἳ': 8,
 'ἴ': 1052,
 'ἵ': 74,
 'ἶ': 1404,
 'ἷ': 598,
 'ὀ': 160,
 'ὁ': 1031,
 'ὂ': 160,
 'ὃ': 154,
 'ὄ': 516,
 'ὅ': 1295,
 'ὐ': 3693,
 'ὑ': 877,
 'ὒ': 2,
 'ὓ': 3,
 'ὔ': 508,
 'ὕ': 780,
 'ὖ': 433,
 'ὗ': 236,
 'ὠ': 7,
 'ὡ': 607,
 'ὢ': 10,
 'ὣ': 2,
 'ὤ': 3,
 'ὥ': 559,
 'ὦ': 5,
 'ὧ': 186,
 'ὰ': 4995,
 'ὲ': 3709,
 'ὴ': 2045,
 'ὶ': 6240,
 'ὸ': 5374,
 'ὺ': 303,
 'ὼ': 24,
 'ᾑ': 1,
 'ᾔ': 1,
 'ᾖ': 89,
 'ᾗ': 160,
 'ᾠ': 6,
 'ᾤ': 8,
 'ᾧ': 39,
 'ᾳ': 187,
 'ᾶ': 

Optionally, create a counter object from dictionary created

In [11]:
# Wrap the filtered dict in a Counter so Counter helpers (e.g. most_common)
# are available on the reduced set of characters.
filtered_characters = Counter(letters_dict)

Do the same thing with words with some cleaning of linefeeds and tabs.

In [12]:
# Tokenise on any run of whitespace (spaces, tabs, newlines).
# Splitting on ' ' alone and then stripping '\n\t' only cleaned the two ends of
# each token: words separated solely by a newline or tab stayed glued together
# as one "word", and adjacent spaces produced empty-string tokens.
# str.split() with no argument handles both cases.
# Punctuation attached to a word is still kept as part of the token.
greekwords = greek.split()

In [13]:
# Tally how often each token in greekwords occurs.
words = Counter(greekwords)
# Display the 20 most frequent tokens with their counts.
words.most_common(20)

[('καὶ', 4275),
 ('τὸ', 2805),
 ('δὲ', 1855),
 ('γὰρ', 1460),
 ('ἢ', 1195),
 ('τῶν', 1178),
 ('μὲν', 1134),
 ('τὰ', 1072),
 ('ἡ', 924),
 ('δ’', 878),
 ('εἶναι', 812),
 ('τοῦ', 776),
 ('μὴ', 649),
 ('ἐν', 601),
 ('εἰ', 601),
 ('ὅτι', 542),
 ('τὴν', 514),
 ('τῷ', 505),
 ('ὁ', 475),
 ('ὡς', 472)]

Output unicode string to file with **utf-8** encoding

In [14]:
# Write the converted Unicode string to greektext.txt, encoded as UTF-8.
with open('greektext.txt','w',encoding='utf-8') as fout:
    fout.write(greek)

## Create a bigram of characters

Initialize a `numpy` 2-D array of zeros

In [25]:
# Square table of zeros with one row and one column per distinct character.
alphabet_size = len(characters)
ct = np.zeros(shape=(alphabet_size, alphabet_size))
print(ct.shape)

(177, 177)


Use a dictionary comprehension to map each character to an index that serves as both its row and its column number in the array (the assignment of indices is arbitrary).

In [26]:
# Pair each distinct character with a running index (0, 1, 2, ...) in the
# Counter's iteration order; the index is used as the row/column position.
character_dict = dict(zip(characters, range(len(characters))))
character_dict

{'\t': 25,
 '\n': 143,
 ' ': 7,
 '"': 133,
 '&': 28,
 '(': 49,
 ')': 117,
 ',': 23,
 '.': 32,
 '0': 5,
 '1': 0,
 '2': 52,
 '3': 75,
 '6': 144,
 '7': 92,
 '8': 62,
 '9': 1,
 ';': 10,
 '<': 122,
 '>': 147,
 '·': 51,
 'Α': 141,
 'Β': 29,
 'Γ': 153,
 'Δ': 63,
 'Ε': 45,
 'Ζ': 79,
 'Η': 85,
 'Θ': 137,
 'Κ': 77,
 'Λ': 103,
 'Μ': 128,
 'Ν': 40,
 'Ξ': 90,
 'Π': 167,
 'Ρ': 35,
 'Σ': 124,
 'Τ': 146,
 'Φ': 169,
 'Ψ': 126,
 'Ω': 175,
 'α': 106,
 'β': 6,
 'γ': 176,
 'δ': 114,
 'ε': 140,
 'ζ': 151,
 'η': 3,
 'θ': 113,
 'ι': 18,
 'κ': 26,
 'λ': 24,
 'μ': 127,
 'ν': 69,
 'ξ': 36,
 'ο': 58,
 'π': 65,
 'ρ': 149,
 'ς': 108,
 'σ': 61,
 'τ': 118,
 'υ': 91,
 'φ': 80,
 'χ': 64,
 'ψ': 152,
 'ω': 70,
 'ϊ': 130,
 'ϋ': 68,
 'ἀ': 27,
 'ἁ': 66,
 'ἂ': 95,
 'ἃ': 31,
 'ἄ': 148,
 'ἅ': 78,
 'ἆ': 119,
 'Ἀ': 55,
 'Ἄ': 14,
 'ἐ': 129,
 'ἑ': 157,
 'ἓ': 9,
 'ἔ': 22,
 'ἕ': 46,
 'Ἐ': 94,
 'Ἑ': 54,
 'Ἕ': 111,
 'ἠ': 168,
 'ἡ': 174,
 'ἢ': 155,
 'ἣ': 164,
 'ἤ': 159,
 'ἥ': 83,
 'ἦ': 105,
 'ἧ': 2,
 'Ἡ': 96,
 'ἰ': 71,
 'ἱ': 19,
 'ἳ': 

Slide a window of 2 characters over the string, advancing one character at a time. For each pair, increment the array cell whose row is the index of the first character and whose column is the index of the second.

In [27]:
# Tally every pair of adjacent characters: ct[r, c] counts how often the
# character with index r is immediately followed by the character with index c.
n = 2
# Start from a clean table: ct is incremented in place, so without this reset
# re-running the cell would add every count a second time.
ct.fill(0)
# The last valid window starts at len(greek) - n, hence the + 1 in the range.
for i in range(len(greek)-n+1):
    two_characters = greek[i:i+n]
    row = character_dict[two_characters[0]]
    col = character_dict[two_characters[1]]
    ct[row,col] += 1

If necessary, convert to `numpy` matrix

In [29]:
# Build a numpy matrix object from the bigram count array.
# NOTE(review): NumPy's documentation discourages np.matrix in favour of
# regular arrays -- keep this only if matrix semantics are really needed.
M = np.matrix(ct)
print(M.shape)
print(M)

(177, 177)
[[ 1.  0.  0. ...  0.  0.  0.]
 [ 0.  0.  0. ...  0.  0.  0.]
 [ 0.  0.  0. ...  0.  0.  0.]
 ...
 [ 0.  0.  0. ...  0.  0.  4.]
 [ 0.  0.  0. ...  0.  0.  0.]
 [ 0.  0.  0. ...  0.  0. 49.]]
