In [25]:
import collections
import itertools
import os
import re
import xml.etree.ElementTree as ET
from time import time

import igraph as ig
import pandas as pd

In [46]:
# Languages to process. Each name is also the file stem used for that
# language's files (e.g. xml/<lang>.xml, txt/<lang>.txt).
langs = [
    'Polish',
    'English',
    'Spanish',
]

# Book identifiers, matched against `b.<ID>` div ids when extracting text
# from the XML (presumably the four Gospels -- confirm against the corpus).
books = [
    'MAT',
    'MAR',
    'LUK',
    'JOH',
]

In [16]:
# For every language, write the text of each matched <seg> element of the
# selected books to txt/<lang>.txt, one segment per line.
os.makedirs('txt', exist_ok=True)  # a fresh checkout may not have the output directory
for lang in langs:
    # Read inside a context manager so the input handle is closed right away
    # instead of lingering until garbage collection.
    with open(f'xml/{lang}.xml', encoding='utf-8') as src:
        root = ET.fromstring(src.read())
    with open(f'txt/{lang}.txt', 'w', encoding='utf-8') as out:
        for book in books:
            # NOTE(review): the `/*seg` step is unusual XPath; ElementTree
            # appears to read it as `/*/seg` (seg grandchildren of the book
            # div) -- confirm before rewriting the expression.
            for seg in root.findall(f'.//div[@id="b.{book}"]/*seg'):
                # `seg.text` is None when the element has no leading text;
                # emit an empty line rather than raising AttributeError.
                out.write((seg.text or '').strip() + '\n')

In [44]:
def _tokenize(line):
    """Split one line of text into lowercase word tokens.

    Literal ``&quot;`` sequences are dropped, then every character that is
    not a letter or whitespace (punctuation, digits, underscores) is removed
    so that e.g. ``don't`` becomes ``dont``.  ``\\w`` is Unicode-aware for
    ``str`` patterns, so accented letters (Polish, Spanish) are kept rather
    than deleted.  Splitting on arbitrary whitespace never yields empty
    tokens.
    """
    text = line.lower().replace('&quot;', '')
    letters_only = re.sub(r'[^\w\s]|[\d_]', '', text)
    return letters_only.split()


def _adjacency_edges(words):
    """Build the undirected word-adjacency structure of a token sequence.

    Returns ``(ids, edges)`` where ``ids`` maps each distinct word to a
    0-based vertex id in order of first appearance, and ``edges`` is a set of
    ``(low, high)`` id pairs, one per unordered pair of words that occur next
    to each other at least once.  A word followed by itself yields a
    self-loop ``(v, v)``.
    """
    ids = {word: index for index, word in enumerate(dict.fromkeys(words))}
    edges = set()
    for first, second in zip(words, words[1:]):
        a, b = ids[first], ids[second]
        # Normalizing the pair order de-duplicates (a, b) and (b, a).
        edges.add((min(a, b), max(a, b)))
    return ids, edges


# Build one word-adjacency network per language from txt/<lang>.txt and
# save it to net/<lang>.net.
os.makedirs('net', exist_ok=True)  # a fresh checkout may not have the output directory
for lang in langs:
    words = []
    with open(f'txt/{lang}.txt', 'r', encoding='utf-8') as f:
        for line in f:
            words.extend(_tokenize(line))
    ids, edges = _adjacency_edges(words)
    # One vertex per distinct word; well-defined even for an empty input.
    n = len(ids)

    g = ig.Graph()
    g.add_vertices(n)
    # Sorted so the saved file does not depend on set iteration order.
    g.add_edges(sorted(edges))
    g.save(f'net/{lang}.net')

In [49]:
# For every language, load the saved network, compute summary statistics
# and write them as a single-row table to csv/<lang>.csv.
os.makedirs('csv', exist_ok=True)  # a fresh checkout may not have the output directory
for lang in langs:
    g = ig.load(f'net/{lang}.net')
    degrees = g.degree()
    # Key order determines the column order of the CSV.
    row = {
        "Number of nodes": g.vcount(),
        "Number of edges": g.ecount(),
        "Minimum degree": min(degrees),
        "Maximum degree": max(degrees),
        "Average degree": sum(degrees) / len(degrees),
        "Average clustering coefficient": g.transitivity_avglocal_undirected(),
        "Assortativity": g.assortativity_degree(),
        "Average path length": g.average_path_length(),
        "Diameter": g.diameter(),
    }
    # Values are rounded to 4 decimals; the default index column is kept.
    pd.DataFrame([row]).round(4).to_csv(f'csv/{lang}.csv')