In [49]:
import os
from bs4 import BeautifulSoup
from collections import Counter
import pandas as pd
import distance


In [98]:
%%HTML
<style>

h1, h2, h3, h4, h5, h6 {
    color: #004f99;
}

p {
    font-size:1.2em;
    color:#223355; 
}
</style>

#### Filene ligger her - bytt ut med lokal sti

In [3]:
# Root folder of the data; the `alto` sub-folder is looked up below it.
# The default is the original hard-coded local path. Set the
# TRANSKRIBUS_EXPORT_DIR environment variable to use another location
# without editing this cell.
pth = os.environ.get(
    "TRANSKRIBUS_EXPORT_DIR",
    "C:\\Users\\larsj\\Documents\\Transcribus\\b85488409dc56e23838b9bc9704d30ce\\b85488409dc56e23838b9bc9704d30ce\\b85488409dc56e23838b9bc9704d30ce",
)

#### Alto-filer her

In [8]:
# Path of the 'alto' sub-folder below the data root `pth`.
alto = os.path.join(pth, 'alto')

In [12]:
# Full paths of the regular files in the ALTO folder.
# os.listdir returns names in arbitrary order, so the list is sorted to make
# reruns reproducible. Sub-directories are skipped because the reading loop
# opens every entry as a file.
files = sorted(
    path
    for path in (os.path.join(alto, name) for name in os.listdir(alto))
    if os.path.isfile(path)
)

Les ALTO-filene i en eller annen rekkefølge og ekstraher alt tekstinnhold, dvs. `content`-attributtet i hvert `<String>`-element. BeautifulSoup gjør taggnavnene om til små bokstaver, så elementet må søkes opp som `<string>`.

In [35]:
# Count how often each token occurs across all ALTO files. A token is the
# `content` attribute of a <String> element; the HTML parser lower-cases tag
# and attribute names, hence 'string' and 'content'.
counts = Counter()
for file in files:
    with open(file, encoding='utf-8') as f:
        # Name the parser explicitly: without it BeautifulSoup picks whichever
        # parser happens to be installed and emits a GuessedAtParserWarning.
        soup = BeautifulSoup(f.read(), 'html.parser')
    # The file is already closed here; the parsed tree lives in memory.
    for x in soup.find_all('string'):
        content = x.get('content')
        # Skip elements without a content attribute instead of aborting the
        # whole run with a KeyError.
        if content is not None:
            counts[content] += 1

counts.most_common(20)

[('N.', 1777),
 ('A.', 1163),
 ('1919.', 773),
 ('1920.', 760),
 ('d.', 759),
 ('1917.', 737),
 ('1918.', 711),
 ('n.', 669),
 ('1916.', 666),
 ('1915.', 612),
 ('se', 562),
 ('1898,', 561),
 ('J.', 556),
 ('ingeniór,', 529),
 ('1900,', 514),
 ('husmor,', 509),
 ('1899,', 436),
 ('1896,', 403),
 ('1897.', 379),
 ('1901,', 342)]

Legg dataene fra `counts` i en dataramme

In [45]:
# One row per distinct token (the index) with its frequency in the single
# column 'counts', ordered from most to least frequent.
tokens = (
    pd.DataFrame
    .from_dict(counts, orient='index', columns=['counts'])
    .sort_values(by='counts', ascending=False)
)


In [99]:
# Display the token frequency table.
tokens

Unnamed: 0,counts
N.,1777
A.,1163
1919.,773
1920.,760
d.,759
...,...
"Omsen,",1
"Fasmer,",1
"Heibelo,",1
d.I.,1


Kjør Levenshtein-avstand for alle ord som bare forekommer én gang, mot ordene som forekommer mer enn to ganger (tar litt tid ...)

In [64]:
# Pair every token that occurs exactly once with each token that occurs more
# than twice and is close to it in edit distance. The score is 1/distance, and
# the 0.7 cut-off therefore keeps only pairs at distance 1 (score 1.0).
rare_tokens = tokens[tokens.counts == 1].index
# Loop-invariant: built once here instead of once per rare token.
frequent_tokens = tokens[tokens.counts > 2].index

r = Counter()
for x in rare_tokens:
    for y in frequent_tokens:
        # The Levenshtein distance is never smaller than the difference in
        # length, so these pairs would score at most 0.5 and are skipped
        # without computing the distance.
        if abs(len(x) - len(y)) > 1:
            continue
        d = distance.levenshtein(x, y)
        # The two token sets are disjoint, so d should never be 0; the guard
        # only protects the division.
        if d == 0:
            continue
        a = 1 / d
        if a > 0.7:
            r[(x, y)] = a

In [104]:
# Flatten the (err, match) -> score mapping in `r` into one row per pair.
# The column names are passed to the constructor so the cell also works when
# `r` is empty; assigning three names to a zero-column frame raises ValueError.
combo = pd.DataFrame(
    [(err, match, score) for (err, match), score in r.items()],
    columns=['err', 'match', 'scor'],
)
# Optional export, left disabled:
#combo.to_excel('transcribus_corr.xlsx')

Resultatet ligger i datarammen

In [105]:
# Display the table of (err, match, scor) pairs.
combo

Unnamed: 0,err,match,scor
0,"Opsåhl,","Opsahl,",1.0
1,Rejsum,Refsum,1.0
2,„Nelly,Nelly,1.0
3,"Opsahb,","Opsahl,",1.0
4,"Cankkasserer,","bankkasserer,",1.0
...,...,...,...
5692,"Srnd,","Sund,",1.0
5693,Lorentz,"Lorentz,",1.0
5694,"Heiverg,","Heiberg,",1.0
5695,"Omsen,","Olsen,",1.0


Sjekk noen ord for å se hvilke andre ord de er koblet til

In [107]:
# Rows whose 'err' token contains the substring 'tam'.
combo[combo['err'].str.contains('tam')]

Unnamed: 0,err,match,scor
73,tamlige.,"tamlige,",1.0
117,"tamleje,","tamlege,",1.0
1956,tamilege.,tamnlege.,1.0
3346,"tamnlæge,","tamnlege,",1.0
4955,"tamlede,","tamlege,",1.0


In [110]:
# Rows whose 'err' token contains the character 'ó'.
combo[combo['err'].str.contains('ó')]

Unnamed: 0,err,match,scor
62,"ingeniód,","ingeniór,",1.0
63,"ingeniód,","ingeniót,",1.0
78,"Sikfórer,","Sakfórer,",1.0
95,"Jarmalóit,","Jarmasóit,",1.0
96,"Ingeniór,","ingeniór,",1.0
...,...,...,...
5555,Ingród,Ingrid,1.0
5587,"ungeniór,","ingeniór,",1.0
5592,"Vingeniór,","ingeniór,",1.0
5622,"Hjórdit,","Hjórdis,",1.0


### Grafer over samforekomster

Samle alle parene og lag en graf for klyngeanalyse (clustering). Resultatet ble ganske bra.

In [121]:
# (err, match) pairs, later used as graph edges. Despite the name, each item
# is a 2-tuple. itertuples avoids building a Series for every row, as iterrows
# does, and the loop no longer reuses the name `r`, which holds the score
# mapping.
triplets = list(combo[['err', 'match']].itertuples(index=False, name=None))

In [122]:
import networkx as nx

In [123]:
# Undirected graph with one edge per (err, match) pair.
G = nx.Graph()
G.add_edges_from(triplets)

In [124]:
import dhlab.graph_networkx_louvain as gnl

In [125]:
# Show the communities of G using the dhlab helper.
# NOTE(review): presumably Louvain community detection, going by the module name; confirm.
gnl.show_communities(G)

Opsahl,-Onsahl, :  Opsahl,, Onsahl,, Opsahb,, Opsåhl,, Opsahel,

Refsum-Rejsum :  Refsum, Rejsum, Retsum

Nelly-„Nelly :  Nelly, „Nelly

bankkasserer,-Cankkasserer, :  bankkasserer,, Cankkasserer,, baukkasserer,

189.-1895. :  189., 1895., 1897., 189?,, 189f., 1899., 189I,, 189Ø., 1890, 189O., 1893, 1899,, 1890., 1898., 1896., 1884,, 1896,, 1899, 189, 1895, 1897,, 1897;, 1882,, 1892., 1898, 1894, 1894,, 1893., 1898,, 188., 1889,, 1897, 1894., 1896, 1890,, 1894;, 1889., 1892,, 1899;, 1891., 1885., 1895;, 1891, 1886,, 18975, 1891,, 184., 1897-, 1884., 1989., 1898;, 1893,, 1892, 1890;, 1889, 1899-, 1895,, 1899i, 1887,, 18920,, 1887., 1815., 1888,, 1896-, 1859., 1879,, 1876,, 1883,, 1856,, 1875., 1815, 1848,, 185., 18940,, 1839,, 1996, 187., 1869,, 1846,, 183., 1847,, 1845., 1806., 182., 1897?;, 1884.., O1897., 180., 91897,, 1891",, 1395., 7896,, 18.98., 18.91, 1898.., 1890.., 1878, 18.98,, 188.7., 18.96,, 181., 7892,, 1888., 1698,, „1899., „1897., 1874,, 18.97,, 186., 1399,, 1898°.

tannl

In [125]:
# Show the communities of G using the dhlab helper.
# NOTE(review): this cell repeats the previous call with the same execution count; confirm whether it is an export duplicate.
gnl.show_communities(G)

Opsahl,-Onsahl, :  Opsahl,, Onsahl,, Opsahb,, Opsåhl,, Opsahel,

Refsum-Rejsum :  Refsum, Rejsum, Retsum

Nelly-„Nelly :  Nelly, „Nelly

bankkasserer,-Cankkasserer, :  bankkasserer,, Cankkasserer,, baukkasserer,

189.-1895. :  189., 1895., 1897., 189?,, 189f., 1899., 189I,, 189Ø., 1890, 189O., 1893, 1899,, 1890., 1898., 1896., 1884,, 1896,, 1899, 189, 1895, 1897,, 1897;, 1882,, 1892., 1898, 1894, 1894,, 1893., 1898,, 188., 1889,, 1897, 1894., 1896, 1890,, 1894;, 1889., 1892,, 1899;, 1891., 1885., 1895;, 1891, 1886,, 18975, 1891,, 184., 1897-, 1884., 1989., 1898;, 1893,, 1892, 1890;, 1889, 1899-, 1895,, 1899i, 1887,, 18920,, 1887., 1815., 1888,, 1896-, 1859., 1879,, 1876,, 1883,, 1856,, 1875., 1815, 1848,, 185., 18940,, 1839,, 1996, 187., 1869,, 1846,, 183., 1847,, 1845., 1806., 182., 1897?;, 1884.., O1897., 180., 91897,, 1891",, 1395., 7896,, 18.98., 18.91, 1898.., 1890.., 1878, 18.98,, 188.7., 18.96,, 181., 7892,, 1888., 1698,, „1899., „1897., 1874,, 18.97,, 186., 1399,, 1898°.

tannl