In [1]:
from urllib.error import HTTPError
from urllib.request import urlopen

from bs4 import BeautifulSoup

# The address is the base URL for our dataset
base_url = "http://www.thelatinlibrary.com/"

# Read the home page body eagerly and close the connection as soon as it is in memory
with urlopen(base_url) as response:
    home_content = response.read()

# Use BeautifulSoup to take out the relevant data
soup = BeautifulSoup(home_content, "lxml")
author_page_links = soup.find_all("a")
# Only the first 49 anchors of the home page are used as author pages
author_pages = [ap["href"] for ap in author_page_links[:49]]
ap_content = list()
texts = list()

# Take the content from the first 49 pages; each response is closed right after
# its body has been read so no connections stay open for the rest of the session
for ap in author_pages:
    with urlopen(base_url + ap) as response:
        ap_content.append(response.read())

book_links = list()

# An author page at "<name>.html" contributes every link whose href contains "<name>"
for path, content in zip(author_pages, ap_content):
    author_name = path.split(".")[0]
    ap_soup = BeautifulSoup(content, "lxml")
    book_links += [
        link
        for link in ap_soup.find_all("a", {"href": True})
        if author_name in link["href"]
    ]
   


In [2]:
# Inspect the first collected link as a quick check of what the scraping cell gathered
print(book_links[0])

<a href="ammianus/14.shtml">Liber XIV</a>


In [3]:
# Upper bound on the number of book pages to download
num_pages = 200

# Start from an empty list so that re-running this cell does not append duplicates
texts = list()

# There may be fewer links than num_pages; report progress against the real total
selected_links = book_links[:num_pages]

for i, bl in enumerate(selected_links):
    print("Getting content " + str(i+1) + " of " + str(len(selected_links)), end="\r", flush=True)
    try:
        # The context manager closes the connection once the body has been read
        with urlopen(base_url + bl["href"]) as response:
            texts.append(response.read())
    except HTTPError as err:
        # Skip pages the server refuses to deliver and keep downloading the rest
        print("Unable to retrieve " + bl["href"] + " (" + str(err) + ").")
        continue

Getting content 200 of 200

In [4]:
# Extract training sentences for the Conditional Random Field that segments text into words


sentences = list()

for i,text in enumerate(texts):
    print("Document " + str(i+1) + " of " + str(len(texts)), end="\r", flush=True)
    textSoup = BeautifulSoup(text,"lxml")
    # Only <p> elements without a class attribute are considered
    paragraphs = textSoup.find_all("p",attrs={"class":None})
    # The first and last such paragraph are skipped (presumably page header/footer - confirm).
    # Paragraphs are joined with a space: joining with "" would fuse the last word of one
    # paragraph with the first word of the next whenever a paragraph does not end in "."
    prepared = " ".join(p.text.strip().lower() for p in paragraphs[1:-1])
    for t in prepared.split("."):
        # Keep letters and whitespace only; digits and punctuation are dropped
        part = "".join(c for c in t if c.isalpha() or c.isspace())
        sentences.append(part.strip())

# Discard fragments of five characters or fewer
sentences = [s for s in sentences if len(s) > 5]

print(sentences[200])
        

tentis igitur regis utriusque legatis et negotio tectius diu pensato cum pacem oportere tribui quae iustis condicionibus petebatur eamque ex re tum fore sententiarum via concinens adprobasset advocato in contionem exercitu imperator pro tempore pauca dicturus tribunali adsistens circumdatus potestatum coetu celsarum ad hunc disservit modum nemo quaeso miretur si post exsudatos labores itinerum longos congestosque adfatim commeatus fiducia vestri ductante barbaricos pagos adventans velut mutato repente consilio ad placidiora deverti


In [5]:
# Turn every sentence into a list of (character, label) pairs with all whitespace removed.
# Label 1 marks the first character of every word except the first one; all others get 0.
prepared_sentences = list()

for sentence in sentences:
    # split() without arguments splits on any whitespace run, so newlines and tabs that
    # survived the cleaning step are treated as word separators instead of ending up
    # as "characters" inside a word
    words = sentence.split()

    # Offsets (in the whitespace-free string) at which a new word starts
    word_starts = set()
    next_pos = 0
    for word in words[:-1]:
        next_pos = next_pos + len(word)
        word_starts.add(next_pos)

    chars = list("".join(words))
    # Set membership keeps the labelling linear in the sentence length
    labels = [1 if i in word_starts else 0 for i in range(len(chars))]

    prepared_sentences.append(list(zip(chars, labels)))

print(prepared_sentences[200])

[('t', 0), ('e', 0), ('n', 0), ('t', 0), ('i', 0), ('s', 0), ('i', 1), ('g', 0), ('i', 0), ('t', 0), ('u', 0), ('r', 0), ('r', 1), ('e', 0), ('g', 0), ('i', 0), ('s', 0), ('u', 1), ('t', 0), ('r', 0), ('i', 0), ('u', 0), ('s', 0), ('q', 0), ('u', 0), ('e', 0), ('l', 1), ('e', 0), ('g', 0), ('a', 0), ('t', 0), ('i', 0), ('s', 0), ('e', 1), ('t', 0), ('n', 1), ('e', 0), ('g', 0), ('o', 0), ('t', 0), ('i', 0), ('o', 0), ('t', 1), ('e', 0), ('c', 0), ('t', 0), ('i', 0), ('u', 0), ('s', 0), ('d', 1), ('i', 0), ('u', 0), ('p', 1), ('e', 0), ('n', 0), ('s', 0), ('a', 0), ('t', 0), ('o', 0), ('c', 1), ('u', 0), ('m', 0), ('p', 1), ('a', 0), ('c', 0), ('e', 0), ('m', 0), ('o', 1), ('p', 0), ('o', 0), ('r', 0), ('t', 0), ('e', 0), ('r', 0), ('e', 0), ('t', 1), ('r', 0), ('i', 0), ('b', 0), ('u', 0), ('i', 0), ('q', 1), ('u', 0), ('a', 0), ('e', 0), ('i', 1), ('u', 0), ('s', 0), ('t', 0), ('i', 0), ('s', 0), ('c', 1), ('o', 0), ('n', 0), ('d', 0), ('i', 0), ('c', 0), ('i', 0), ('o', 0), ('n', 0),

In [6]:
# Use n-grams
# N-grams are based on the principle that you can predict the probability of occurrence of the next item
# (a word or, as in the features below, a character) based on the last N-1 items

# Create the features for the CRF now
# these features overlap, so they are probably dependent on each other

def create_char_features(sentence,i):
    """Build the feature list for position ``i`` of ``sentence``.

    ``sentence`` is indexable; the character of each element is taken from its
    first item (``element[0]``). The features are the current character plus
    the n-grams formed with up to three preceding characters. Position 0 gets
    the marker ``"BOS"`` instead of any look-back feature.
    """
    current = sentence[i][0]
    features = ['bias', 'char=' + current]

    # No look-back context available: mark the beginning of the sequence
    if i < 1:
        features.append("BOS")
        return features

    prev1 = sentence[i - 1][0]
    features += [
        'char-1=' + prev1,
        'char-1:0=' + prev1 + current,
    ]
    if i < 2:
        return features

    prev2 = sentence[i - 2][0]
    features += [
        'char-2=' + prev2,
        'char-2:0=' + prev2 + prev1 + current,
        'char-2:-1=' + prev2 + prev1,
    ]
    if i < 3:
        return features

    prev3 = sentence[i - 3][0]
    features += [
        'char-3:0=' + prev3 + prev2 + prev1 + current,
        'char-3:-1=' + prev3 + prev2 + prev1,
    ]
    return features
    
    

In [7]:
# Training and Testing data
# won't be using cross validation here

def create_sentence_features(prepared_sentence):
    """Return one feature list per position of ``prepared_sentence``, in order."""
    per_position = []
    for position in range(len(prepared_sentence)):
        per_position.append(create_char_features(prepared_sentence, position))
    return per_position

def create_sentence_labels(prepared_sentence):
    """Return the label (second item) of every element, converted to ``str``."""
    label_strings = []
    for pair in prepared_sentence:
        label_strings.append(str(pair[1]))
    return label_strings



In [None]:
import pycrfsuite
# Train a CRF on all but the last 10000 prepared sentences, keep the last 10000
# as test data, write the model to disk and open it with a tagger.
trainer = pycrfsuite.Trainer(verbose=False)

train_part = prepared_sentences[:-10000]
test_part = prepared_sentences[-10000:]

X = list(map(create_sentence_features, train_part))
y = list(map(create_sentence_labels, train_part))

X_test = list(map(create_sentence_features, test_part))
y_test = list(map(create_sentence_labels, test_part))

# Feed the trainer one (feature sequence, label sequence) pair per sentence
for seq_features, seq_labels in zip(X, y):
    trainer.append(seq_features, seq_labels)

crf_params = {
    'c1' : 1.0,
    'c2' : 1e-3,
    'max-iterations' : 60,
    'feature.possible_transitions' : True
}
trainer.set_params(crf_params)

model_path = 'latin-text-segmentation.crfsuite'
trainer.train(model_path)

# Load the freshly written model for tagging
tagger = pycrfsuite.Tagger()
tagger.open(model_path)

def segment_sentence(sentence):
    """Re-insert spaces into ``sentence`` using the module-level ``tagger``.

    All spaces are removed first; the remaining characters are tagged, and a
    space is placed in front of every character whose predicted label is '1'.
    Returns the resulting string.
    """
    compact = sentence.replace(" ", "")
    tags = tagger.tag(create_sentence_features(compact))
    pieces = []
    for idx, tag in enumerate(tags):
        # Label '1' means a space goes before this character
        pieces.append(" " + compact[idx] if tag == '1' else compact[idx])
    return "".join(pieces)

# Try the trained tagger on two unspaced phrases; each trailing comment is the same phrase written with spaces
print(segment_sentence("dominusadtemplumproperat")) # dominus ad templum properat
print(segment_sentence("portapatet")) # porta patet