In [1]:
# Exercise: 11-1
# In Example 11-7 the new field appeared at the bottom of the entry. Modify this program so that it inserts
# the new subelement right after the lx field. (Hint: create the new cv field using Element('cv'), assign a
# text value to it, then use the insert() method of the parent element.)

from xml.etree.ElementTree import Element
import re

def cv(s):
    s = s.lower()
    s = re.sub(r'[^a-z]',  r'_', s)
    s = re.sub(r'[aeiou]', r'V', s)
    s = re.sub(r'[^V_]',   r'C', s)
    return s

def add_cv_field(entry):
    for i in range(len(entry)):
        if entry[i].tag == 'lx':
            cv_field = Element('cv')
            cv_field.text = cv(entry[i].text)
            entry.insert(i + 1, cv_field)

In [2]:
import nltk
from nltk.corpus import toolbox

lexicon = toolbox.xml('rotokas.dic')
add_cv_field(lexicon[53])
print(nltk.toolbox.to_sfm_string(lexicon[53]))

\lx kaeviro
\cv CVVCVCV
\ps V
\pt A
\ge lift off
\ge take off
\tkp go antap
\sc MOTION
\vx 1
\nt used to describe action of plane
\dt 03/Jun/2005
\ex Pita kaeviroroe kepa kekesia oa vuripierevo kiuvu.
\xp Pita i go antap na lukim haus win i bagarapim.
\xe Peter went to look at the house that the wind destroyed.



In [3]:
import random

r = random.randint(1, len(lexicon))
print("Entry {}".format(r))

add_cv_field(lexicon[r])
print(nltk.toolbox.to_sfm_string(lexicon[r]))

Entry 333
\lx keevuruvira
\cv CVVCVCVCVCV
\rt keevuru
\ge ???
\tkp ???
\dt 18/Apr/2007
\ex Kokio kare keevuruvira papapaiveira keauvere oisio koveroi rara kevururo vara???.
\xp Ol pisin i save plai isisi antap. Na ai yu ting olsem bai em i pundaun taim i plai ???.
\xe Birds fly gliding ???



In [4]:
r = random.randint(1, len(lexicon))
print("Entry {}".format(r))

add_cv_field(lexicon[r])
print(nltk.toolbox.to_sfm_string(lexicon[r]))

Entry 859
\lx kuukuuvuto
\cv CVVCVVCVCV
\alt kuukuuvupato
\rt kuukuuvu
\ps N
\pt HUM
\ge liar
\ge deceiver
\tkp ???
\dt 02/Sep/2005
\ex Pita riro kuukuuvuto.
\xp Pita em i man bilong giaman man.
\xe Peter is a big liar.
\ex Riro kuukuuvuva vii.
\xp Yu man bilong giaman.
\xe You're a liar.
\ex Tugarato riro kuukuuvuto ira oirara keakeapareveira.
\xp Setan i man bilong giaman em i save giamanim ol man.
\xe Satan is a big liar who deceives people.



In [5]:
r = random.randint(1, len(lexicon))
print("Entry {}".format(r))

add_cv_field(lexicon[r])
print(nltk.toolbox.to_sfm_string(lexicon[r]))

Entry 676
\lx koria
\cv CVCVV
\rt kori
\ps N
\pt NT
\ge writing
\ge markings
\ge design
\tkp makmak raft
\dt 14/Feb/2005
\ex Avaisisi koria purarevoi kepa-ia.
\xp Avaisisi i wokim ol makmak long haus.
\xe Avaisisi is putting markings on the house.



In [6]:
# Exercise: 11-2
# Write a function that deletes a specified field from a lexical entry. (We could use this to sanitize our
# lexical data before giving it to others, e.g., by removing fields containing irrelevant or uncertain content.)

from xml.etree.ElementTree import Element

def remove_field(entry, tbr):
    """
    Removes a field from a lexical entry.

    Arguments:

    entry: xml Element
    tbr:   field to be removed
    """

    assert isinstance(entry, Element), 'Entry must be an xml Element'

    for field in entry:
        if field.tag == tbr:
            entry.remove(field)

In [7]:
remove_field(lexicon[203], 'arg')

print(nltk.toolbox.to_sfm_string(lexicon[203]))

\lx karepie
\rt kare
\ps V
\pt B
\ge return
\tkp bekim
\eng return
\eng pay back
\eng send back
\vx 2
\dt 08/Jun/2005
\ex Vii vaaro vukua oa vii iare karepieavere.
\xp ???
\xe I will return your book to you.
\ex Oire vii varo vukua karepieavere.
\xp Em i orait, bai mi bekim buk bilong yu.
\xe Okay, I'll give you your book back.



In [8]:
# Exercise: 11-3
# Write a program that scans an HTML dictionary file to find entries having an illegal part-of-speech field, and
# then reports the headword for each entry.

import re

def find_illegal_pos(entry, head_tag, pos_tag, illegal_pos):
    """
    Scans an HTML dictionary file and returns headword if an
    illegal part-of-speech field is present.

    Arguments:

    entry:       entry in an HTML dictionary file
    head_tag:    regex tag to identify headwords
    pos_tag:     regex tag to identify POS fields
    illegal_pos: Illegal POS fields. May be a list or string.
    """

    pos = re.findall(re.compile(pos_tag), entry)

    # When we have several illegal POS:
    if isinstance(illegal_pos, list):
        for p in pos:
            if p in illegal_pos:
                return re.findall(re.compile(head_tag), entry)

    # When we have no illegal POS:
    elif isinstance(illegal_pos, str):
        for p in pos:
            if p == illegal_pos:
                return re.findall(re.compile(head_tag), entry)

In [9]:
test = """
<p class=MsoNormal>sleep
  <span style='mso-spacerun:yes'> </span>
  [<span class=SpellE>sli:p</span>]
  <span style='mso-spacerun:yes'> </span>
  <b><span style='font-size:11.0pt'>v.i.</span></b>
  <span style='mso-spacerun:yes'> </span>
  <i>a condition of body and mind ...<o:p></o:p></i>
</p>
"""

In [10]:
find_illegal_pos(test, r"<p [^>]*>(.*)\n.*<",
                 r"style='font-size:11.0pt'>([a-z.]+)<", ['v.i.'])

['sleep']

In [11]:
# Exercise: 11-4
# Write a program to find any parts-of-speech (ps field) that occurred less than 10 times. Perhaps
# these are typing mistakes?

from collections import Counter

POSs = [field.text for entry in lexicon for field in entry if field.tag == 'ps']
POS_freq = Counter(POSs)

[(k, v) for k, v in POS_freq.items() if v < 10]

[('CLASS', 6), ('FFP', 1), ('NUM', 1), ('POST', 1), ('EXCL', 1)]

In [12]:
# Exercise: 11-5
# We saw a method for adding a cv field (Working with Toolbox Data). There is an interesting issue with
# keeping this up-to-date when someone modifies the content of the lx field on which it is based. Write a
# version of this program to add a cv field, replacing any existing cv field.

add_cv_field(lexicon[203])
print(nltk.toolbox.to_sfm_string(lexicon[203]))

\lx karepie
\cv CVCVCVV
\rt kare
\ps V
\pt B
\ge return
\tkp bekim
\eng return
\eng pay back
\eng send back
\vx 2
\dt 08/Jun/2005
\ex Vii vaaro vukua oa vii iare karepieavere.
\xp ???
\xe I will return your book to you.
\ex Oire vii varo vukua karepieavere.
\xp Em i orait, bai mi bekim buk bilong yu.
\xe Okay, I'll give you your book back.



In [13]:
lexicon[203].findall('lx')[0].text = 'karepieavere'

In [14]:
print(nltk.toolbox.to_sfm_string(lexicon[203]))

\lx karepieavere
\cv CVCVCVV
\rt kare
\ps V
\pt B
\ge return
\tkp bekim
\eng return
\eng pay back
\eng send back
\vx 2
\dt 08/Jun/2005
\ex Vii vaaro vukua oa vii iare karepieavere.
\xp ???
\xe I will return your book to you.
\ex Oire vii varo vukua karepieavere.
\xp Em i orait, bai mi bekim buk bilong yu.
\xe Okay, I'll give you your book back.



In [15]:
def cv(s):
    s = s.lower()
    s = re.sub(r'[^a-z]',  r'_', s)
    s = re.sub(r'[aeiou]', r'V', s)
    s = re.sub(r'[^V_]',   r'C', s)
    return s

def update_cv_field(entry):
    for field in entry:
        if field.tag == 'lx':
            new_cv_field = cv(field.text)
            entry.findall('cv')[0].text = new_cv_field

In [16]:
update_cv_field(lexicon[203])

In [17]:
print(nltk.toolbox.to_sfm_string(lexicon[203]))

\lx karepieavere
\cv CVCVCVVVCVCV
\rt kare
\ps V
\pt B
\ge return
\tkp bekim
\eng return
\eng pay back
\eng send back
\vx 2
\dt 08/Jun/2005
\ex Vii vaaro vukua oa vii iare karepieavere.
\xp ???
\xe I will return your book to you.
\ex Oire vii varo vukua karepieavere.
\xp Em i orait, bai mi bekim buk bilong yu.
\xe Okay, I'll give you your book back.



In [18]:
# Exercise: 11-6
# Write a function to add a new field syl which gives a count of the number of syllables in the word.

def count_syllables(word):
    CV = cv(word)
    syl = 0

    # If the initial letter is a vowel
    if CV[0] == 'V':
        syl += 1

    # Count the number of vowels not preceded by a vowel
    flag = False
    for i in range(1, len(CV)):
        if CV[i] == 'V' and CV[i - 1] != 'V':
            syl += 1
            flag = False
        # make exception for three vowels in a row
        elif CV[i] == 'V' and flag == True:
            syl += 1
            flag = False
        elif CV[i] == 'V' and CV[i - 1] == 'V':
            flag = True

    return str(syl)

In [19]:
count_syllables('rugorugoopau')

'5'

In [20]:
count_syllables('karepieavere')

'6'

In [21]:
from xml.etree.ElementTree import SubElement

def add_syl_field(entry):
    for field in entry:
        if field.tag == 'lx':
            syl_field = SubElement(entry, 'syl')
            syl_field.text = count_syllables(field.text)

In [22]:
add_syl_field(lexicon[203])

In [23]:
print(nltk.toolbox.to_sfm_string(lexicon[203]))

\lx karepieavere
\cv CVCVCVVVCVCV
\rt kare
\ps V
\pt B
\ge return
\tkp bekim
\eng return
\eng pay back
\eng send back
\vx 2
\dt 08/Jun/2005
\ex Vii vaaro vukua oa vii iare karepieavere.
\xp ???
\xe I will return your book to you.
\ex Oire vii varo vukua karepieavere.
\xp Em i orait, bai mi bekim buk bilong yu.
\xe Okay, I'll give you your book back.
\syl 6



In [24]:
# Exercise: 11-7
# Write a function which displays the complete entry for a lexeme. When the lexeme is incorrectly spelled, it should
# display the entry for the most similarly spelled lexeme.

from xml.etree.ElementTree import Element

new_lex = []

def rank(word, wordlist):
    ranked = sorted((nltk.edit_distance(word, w), w) for w in wordlist)
    return [word for(_, word) in ranked]

def display_complete_entry(lexicon, lexeme, tag = 'lx'):
    """
    Displays complete entry for a lexeme.  If lexeme is
    incorrectly spelled, the entry for the most similary
    spelled is displayed.

    Arguments:

    lexicon: Must be an xml Element. Lexem
    lexeme:  String.  If not found, most similarly
             spelled lexeme is used.
    tag:     Lexeme tag in the lexicon.  Default is 'lx'.
    """

    assert isinstance(lexicon, Element), 'Entry must be an xml Element'

    for entry in lexicon:
        for field in entry:
            if field.tag == tag:
                if field.text == lexeme:
                    print(nltk.toolbox.to_sfm_string(entry))
                    return


    lexs = [field.text for entry in lexicon for field in entry
               if field.tag == tag]

    new_lex = rank(lexeme, lexs)[0]
    eight_trial(lexicon, new_lex)

In [25]:
display_complete_entry(lexicon, 'kaeviro')

\lx kaeviro
\cv CVVCVCV
\ps V
\pt A
\ge lift off
\ge take off
\tkp go antap
\sc MOTION
\vx 1
\nt used to describe action of plane
\dt 03/Jun/2005
\ex Pita kaeviroroe kepa kekesia oa vuripierevo kiuvu.
\xp Pita i go antap na lukim haus win i bagarapim.
\xe Peter went to look at the house that the wind destroyed.



In [26]:
display_complete_entry(lexicon, 'kappa')

NameError: name 'eight_trial' is not defined

In [27]:
# Exercise: 11-8
# Write a function that takes a lexicon and finds which pairs of consecutive fields are most frequent (e.g.,
# ps is often followed by pt). (This might help us to discover some of the structure of a lexical entry.)

from xml.etree.ElementTree import Element
from collections import Counter

def find_frequent_consecutive_fields(lexicon, n = 15):
    """
    Find most frequent consecutive fields in a lexicon.

    Arguments:

    lexicon: Lexicon. Must be an xml Element.
    n:       Number of most frequent pairs to be returned.
             Default is 15.
    """
    assert isinstance(lexicon, Element), 'lexicon must be an xml Element'

    tags = []
    for entry in lexicon:
        for i in range(len(entry) - 1):
                tags.append((entry[i].tag, entry[i + 1].tag))

    return Counter(tags).most_common(n)

In [28]:
find_frequent_consecutive_fields(lexicon)

[(('ex', 'xp'), 1532),
 (('xp', 'xe'), 1526),
 (('ps', 'pt'), 835),
 (('ge', 'tkp'), 824),
 (('pt', 'ge'), 766),
 (('dt', 'ex'), 758),
 (('xe', 'ex'), 708),
 (('lx', 'ps'), 519),
 (('rt', 'ps'), 356),
 (('tkp', 'dt'), 327),
 (('lx', 'rt'), 310),
 (('ge', 'ge'), 287),
 (('eng', 'eng'), 143),
 (('cmt', 'dt'), 143),
 (('tkp', 'nt'), 130)]

In [29]:
# Exercise: 11-9
# Create a spreadsheet using office software, containing one lexical entry per row, consisting of a headword, a
# part of speech, and a gloss. Save the spreadsheet in CSV format. Write Python code to read the CSV file and
# print it in Toolbox format, using lx for the headword, ps for the part of speech, and gl for the gloss.

import csv

new_lex = []

with open('simple_lexicon.csv', encoding = 'utf-8-sig') as f:
    reader = csv.reader(f, delimiter = ",")
    for row in reader:
        new_lex.append(row)

In [30]:
new_lex

[]

In [31]:
xml =   """ <toolbox_data>
                <header>
                    <_sh>v1.0 Absurdly Short Lexicon</_sh>
                </header>
        """

for nl in new_lex:

    xml += """
                    <record>
                        <lx>{}</lx>
                        <ps>{}</ps>
                        <gl>{}</gl>
                    </record>""".format(nl[0], nl[1], nl[2])

xml += "</toolbox_data>"

In [32]:
print(xml)

 <toolbox_data>
                <header>
                    <_sh>v1.0 Absurdly Short Lexicon</_sh>
                </header>
        </toolbox_data>


In [33]:
# Exercise: 11-10
# Index the words of Shakespeare’s plays, with the help of nltk.Index. The resulting data structure should permit
# lookup on individual words, such as music, returning a list of references to acts, scenes, and speeches, of the
# form [(3, 2, 9), (5, 1, 23), ...], where (3, 2, 9) indicates Act 3 Scene 2 Speech 9.

from xml.etree.ElementTree import ElementTree

plays = ["Anthony and Cleopatra",
         "A Midsummer Night's Dream",
         "Hamlet",
         "Julius Caesar",
         "MacBeth",
         "The Merchant of Venice",
         "Othello",
         "Romeo and Juliet"]


def find_in_shakespeare(word):
    print("The format for the references is [(A, S, P)], with A, S, P being respectively the Act, Scene, and Speech of the play.\n")
    for pf, piece in zip(nltk.corpus.shakespeare.fileids(), plays):
        play_file = nltk.data.find('corpora/shakespeare/{}'.format(pf))
        play = ElementTree().parse(play_file)

        refs = []

        for i, act in enumerate(play.findall('ACT')):
            for j, scene in enumerate(act.findall('SCENE')):
                for k, speech in enumerate(scene.findall('SPEECH')):
                    for line in speech.findall('LINE'):
                        if word in str(line.text):
                            refs.append(("({}, {}, {})".format(i + 1, j + 1, k + 1)))

        print("References to '{}' in \"{}\":".format(word, piece))
        print("\t", refs, end = " ")
        print("\n")

In [34]:
find_in_shakespeare('music')

The format for the references is [(A, S, P)], with A, S, P being respectively the Act, Scene, and Speech of the play.

References to 'music' in "Anthony and Cleopatra":
	 ['(2, 5, 1)', '(2, 5, 2)', '(2, 5, 7)', '(2, 7, 65)'] 

References to 'music' in "A Midsummer Night's Dream":
	 ['(2, 1, 19)', '(4, 1, 11)', '(4, 1, 12)', '(4, 1, 22)', '(4, 1, 23)', '(4, 1, 25)', '(4, 1, 29)', '(4, 1, 29)', '(4, 1, 30)', '(5, 1, 8)'] 

References to 'music' in "Hamlet":
	 ['(2, 1, 25)', '(3, 1, 42)', '(3, 2, 94)', '(3, 2, 94)', '(3, 2, 128)', '(3, 2, 130)', '(3, 4, 48)', '(5, 2, 147)'] 

References to 'music' in "Julius Caesar":
	 ['(1, 2, 13)', '(1, 2, 49)', '(4, 3, 126)'] 

References to 'music' in "MacBeth":
	 [] 

References to 'music' in "The Merchant of Venice":
	 ['(3, 2, 9)', '(3, 2, 9)', '(3, 2, 9)', '(5, 1, 23)', '(5, 1, 23)', '(5, 1, 23)', '(5, 1, 24)', '(5, 1, 25)', '(5, 1, 25)', '(5, 1, 25)', '(5, 1, 25)', '(5, 1, 25)', '(5, 1, 29)', '(5, 1, 32)'] 

References to 'music' in "Othello":
	 

In [35]:
find_in_shakespeare('death')

The format for the references is [(A, S, P)], with A, S, P being respectively the Act, Scene, and Speech of the play.

References to 'death' in "Anthony and Cleopatra":
	 ['(1, 2, 61)', '(1, 2, 77)', '(1, 2, 79)', '(1, 2, 92)', '(1, 3, 24)', '(1, 3, 27)', '(3, 1, 1)', '(3, 5, 6)', '(3, 10, 6)', '(3, 11, 19)', '(3, 13, 66)', '(4, 2, 15)', '(4, 2, 17)', '(4, 9, 17)', '(4, 12, 5)', '(4, 13, 3)', '(4, 14, 9)', '(4, 14, 15)', '(4, 14, 30)', '(4, 14, 31)', '(4, 14, 41)', '(4, 14, 45)', '(4, 15, 4)', '(4, 15, 8)', '(4, 15, 28)', '(4, 15, 28)', '(4, 15, 28)', '(5, 1, 7)', '(5, 2, 14)', '(5, 2, 15)', '(5, 2, 16)', '(5, 2, 104)', '(5, 2, 111)', '(5, 2, 124)'] 

References to 'death' in "A Midsummer Night's Dream":
	 ['(1, 1, 6)', '(1, 1, 13)', '(1, 1, 20)', '(1, 1, 30)', '(1, 2, 5)', '(2, 2, 23)', '(3, 2, 33)', '(3, 2, 47)', '(3, 2, 92)', '(3, 2, 92)', '(4, 1, 50)', '(5, 1, 10)', '(5, 1, 48)', '(5, 1, 83)'] 

References to 'death' in "Hamlet":
	 ['(1, 1, 50)', '(1, 2, 1)', '(1, 2, 1)', '(1, 2, 1

In [36]:
find_in_shakespeare('Romans')

The format for the references is [(A, S, P)], with A, S, P being respectively the Act, Scene, and Speech of the play.

References to 'Romans' in "Anthony and Cleopatra":
	 ['(3, 2, 20)'] 

References to 'Romans' in "A Midsummer Night's Dream":
	 [] 

References to 'Romans' in "Hamlet":
	 [] 

References to 'Romans' in "Julius Caesar":
	 ['(1, 2, 36)', '(1, 3, 19)', '(1, 3, 23)', '(1, 3, 23)', '(1, 3, 25)', '(2, 1, 35)', '(2, 1, 59)', '(2, 1, 79)', '(2, 2, 20)', '(2, 2, 21)', '(3, 1, 52)', '(3, 2, 6)', '(3, 2, 28)', '(3, 2, 30)', '(5, 3, 32)', '(5, 3, 32)', '(5, 3, 32)'] 

References to 'Romans' in "MacBeth":
	 [] 

References to 'Romans' in "The Merchant of Venice":
	 [] 

References to 'Romans' in "Othello":
	 [] 

References to 'Romans' in "Romeo and Juliet":
	 [] 



In [37]:
find_in_shakespeare('Venice')

The format for the references is [(A, S, P)], with A, S, P being respectively the Act, Scene, and Speech of the play.

References to 'Venice' in "Anthony and Cleopatra":
	 [] 

References to 'Venice' in "A Midsummer Night's Dream":
	 [] 

References to 'Venice' in "Hamlet":
	 [] 

References to 'Venice' in "Julius Caesar":
	 [] 

References to 'Venice' in "MacBeth":
	 [] 

References to 'Venice' in "The Merchant of Venice":
	 ['(1, 1, 25)', '(1, 1, 32)', '(1, 3, 17)', '(2, 8, 5)', '(3, 1, 34)', '(3, 1, 39)', '(3, 2, 34)', '(3, 2, 44)', '(3, 3, 9)', '(3, 4, 7)', '(4, 1, 19)', '(4, 1, 50)', '(4, 1, 54)', '(4, 1, 59)', '(4, 1, 86)', '(4, 1, 86)', '(4, 1, 105)', '(4, 1, 130)', '(5, 1, 5)'] 

References to 'Venice' in "Othello":
	 ['(1, 1, 28)', '(2, 1, 6)', '(2, 1, 79)', '(2, 1, 85)', '(2, 3, 98)', '(3, 3, 75)', '(3, 4, 71)', '(4, 1, 103)', '(4, 1, 106)', '(4, 1, 130)', '(4, 1, 138)', '(4, 2, 41)', '(4, 2, 77)', '(4, 2, 95)', '(4, 2, 96)', '(4, 3, 23)', '(5, 1, 58)'] 

References to 'Venic

In [38]:
# Exercise: 11-11
# Construct a conditional frequency distribution which records the word length for each speech in The Merchant of
# Venice, conditioned on the name of the character; e.g., cfd['PORTIA'][12] would give us the number of speeches
# by Portia consisting of 12 words.

import re
from xml.etree.ElementTree import ElementTree
from nltk.probability import ConditionalFreqDist

merchant_file = nltk.data.find('corpora/shakespeare/merchant.xml')
merchant = ElementTree().parse(merchant_file)

lines_by_speaker = []

tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

for act in merchant.findall('ACT'):
    for scene in act.findall('SCENE'):
        for speech in scene.findall('SPEECH'):
            for speaker in speech.findall('SPEAKER'):
                tally = 0
            for line in speech.findall('LINE'):
                if line.text:
                    tally += len([w for w in tokenizer.tokenize(line.text)])
            lines_by_speaker.append((speaker.text, tally))

cfd = ConditionalFreqDist((s, l) for s, l in lines_by_speaker)

In [39]:
cfd['PORTIA'][12]

3

In [40]:
for act in merchant.findall('ACT'):
    for scene in act.findall('SCENE'):
        for speech in scene.findall('SPEECH'):
            for speaker in speech.findall('SPEAKER'):
                if speaker.text == 'PORTIA':
                    tally = 0
                    for line in speech.findall('LINE'):
                        if line.text:
                            print(line.text)
                            tally += len([w for w in tokenizer.tokenize(line.text)])
                    print('****', str(tally))

By my troth, Nerissa, my little body is aweary of
this great world.
**** 13
Good sentences and well pronounced.
**** 5
If to do were as easy as to know what were good to
do, chapels had been churches and poor men's
cottages princes' palaces. It is a good divine that
follows his own instructions: I can easier teach
twenty what were good to be done, than be one of the
twenty to follow mine own teaching. The brain may
devise laws for the blood, but a hot temper leaps
o'er a cold decree: such a hare is madness the
youth, to skip o'er the meshes of good counsel the
cripple. But this reasoning is not in the fashion to
choose me a husband. O me, the word 'choose!' I may
neither choose whom I would nor refuse whom I
dislike; so is the will of a living daughter curbed
by the will of a dead father. Is it not hard,
Nerissa, that I cannot choose one nor refuse none?
**** 152
I pray thee, over-name them; and as thou namest
them, I will describe them; and, according to my
description, level at my af

In [41]:
# Exercise: 11-12
# Write a recursive function to produce an XML representation for a tree, with non-terminals represented as XML
# elements, and leaves represented as text content, e.g.:

import re

def parse_to_xml(tree, indent = "  "):
    """
    Converts parsed sentence to xml string.
    """
    # terminals
    if tree.height() == 2:
        print("{}<{}>{}</{}>".format(indent, tree.label(),
                                      tree.leaves()[0], tree.label()))
    # non-terminals
    else:
        # if label has hyphens, they are to be replaced as 'type ='
        if '-' in tree.label():
            pattern = re.compile(r"(\w+)-(\w+)")
            tag = re.findall(pattern, tree.label())[0]
            full_tag = '{} type="{}"'.format(tag[0], tag[1])
        else:
            full_tag = tree.label()

        print("{}<{}>".format(indent, full_tag))
        for child in tree:
            parse_to_xml(child, indent = indent + "  ")
        print('{}</{}>'.format(indent, full_tag))

In [42]:
from nltk.corpus import treebank

for t in treebank.parsed_sents()[:5]:
    parse_to_xml(t)

  <S>
    <NP type="SBJ">
      <NP>
        <NNP>Pierre</NNP>
        <NNP>Vinken</NNP>
      </NP>
      <,>,</,>
      <ADJP>
        <NP>
          <CD>61</CD>
          <NNS>years</NNS>
        </NP>
        <JJ>old</JJ>
      </ADJP>
      <,>,</,>
    </NP type="SBJ">
    <VP>
      <MD>will</MD>
      <VP>
        <VB>join</VB>
        <NP>
          <DT>the</DT>
          <NN>board</NN>
        </NP>
        <PP type="CLR">
          <IN>as</IN>
          <NP>
            <DT>a</DT>
            <JJ>nonexecutive</JJ>
            <NN>director</NN>
          </NP>
        </PP type="CLR">
        <NP type="TMP">
          <NNP>Nov.</NNP>
          <CD>29</CD>
        </NP type="TMP">
      </VP>
    </VP>
    <.>.</.>
  </S>
  <S>
    <NP type="SBJ">
      <NNP>Mr.</NNP>
      <NNP>Vinken</NNP>
    </NP type="SBJ">
    <VP>
      <VBZ>is</VBZ>
      <NP type="PRD">
        <NP>
          <NN>chairman</NN>
        </NP>
        <PP>
          <IN>of</IN>
          <NP>
           

In [43]:
# Exercise: 11-13
# Obtain a comparative wordlist in CSV format, and write a program that prints those cognates having an
# edit-distance of at least three from each other.

import csv

swadesh = []

with open('Swadesh Lists.csv', 'r', encoding = 'utf-8-sig') as f:
    reader = csv.reader(f, delimiter = ",")
    for row in reader:
        swadesh.append(row)

In [44]:
def find_swadesh_cognates(word):
    """
    Uses Swadesh wordlists to find pairs of words with small edit
    distances.
    """
    flag = True
    for row in swadesh:
        for i in range(2, len(row)):
            if i % 2 == 0:
                if nltk.edit_distance(word, row[i]) < 3:
                    if flag:
                        print("Near matches for '{}' are:\n".format(word))
                    print("{:4}{:9} {:12} English: {}".format("",
                                                  row[i + 1] + ":",
                                                  row[i], row[0]))
                    flag = False

    if flag:
        print("No matches were found.")

In [45]:
find_swadesh_cognates('animal')

Near matches for 'animal' are:

    French:   animal       English: animal
    Italian:  animale      English: animal
    Latin:    animalis     English: animal
    Spanish:  animal       English: animal


In [46]:
find_swadesh_cognates('feather')

No matches were found.


In [47]:
find_swadesh_cognates('stand')

Near matches for 'stand' are:

    French:   quand        English: when
    French:   grand        English: big
    Sanskrit: tanú         English: thin
    Dutch:    slang        English: snake
    French:   sang         English: blood
    Swedish:  svans        English: tail
    Dutch:    tand         English: tooth
    Swedish:  tand         English: tooth
    German:   hand         English: hand
    Dutch:    hand         English: hand
    Swedish:  hand         English: hand
    Sanskrit: stána        English: breast
    Latin:    stare        English: stand
    Dutch:    staan        English: stand
    Swedish:  sten         English: stone
    German:   sand         English: sand
    Dutch:    zand         English: sand
    Swedish:  sand         English: sand
    German:   staub        English: dust


In [48]:
find_swadesh_cognates('name')

Near matches for 'name' are:

    Russian:  tam          English: there
    Italian:  come         English: how
    Russian:  ne           English: not
    Italian:  cane         English: dog
    German:   samen        English: seed
    Italian:  seme         English: seed
    German:   nase         English: nose
    Italian:  naso         English: nose
    Swedish:  nagel        English: fingernail
    French:   jambe        English: leg
    Swedish:  mage         English: belly
    French:   nager        English: swim
    Italian:  dare         English: give
    Latin:    dare         English: give
    Italian:  mare         English: sea
    Latin:    mare         English: sea
    Italian:  sale         English: salt
    Swedish:  damm         English: dust
    French:   nuage        English: cloud
    Spanish:  nube         English: cloud
    Italian:  neve         English: snow
    Swedish:  natt         English: night
    Italian:  male         English: bad
    French:   sale     

In [49]:
# Exercise: 11-14
# Build an index of those lexemes which appear in example sentences. Suppose the lexeme for a given entry is w.
# Then, add a single cross-reference field xrf to this entry, referencing the headwords of other entries having
# example sentences containing w. Do this for all entries and save the result as a Toolbox-format file.

from nltk.corpus import toolbox
lexicon = toolbox.xml('rotokas.dic')

In [50]:
lexemes = [lexeme.text.lower() for lexeme in lexicon.findall('record/lx')]

In [51]:
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

example_words = []

for entry in lexicon:
    # re-initialize for each word, so that words without examples
    # don't inherit the last word's examples
    ex = ""
    for lexeme in entry.findall('lx'):
        lex = lexeme.text.lower()
    for example in entry.findall('ex'):
        ex = set([w.lower() for w in tokenizer.tokenize(example.text)])
        # remove entry word from example, so the entry words don't
        # add references to themselves later
        if lex in ex:
            ex.remove(lex)
    example_words.append((lex, ex))

NameError: name 'lex' is not defined

In [52]:
example_words[-5:]

[]

In [53]:
from xml.etree.ElementTree import SubElement


def add_xrf_field(lexicon, lookup, headword):
    for entry in lexicon:
        for field in entry:
            if field.tag == 'lx':
                if field.text == lookup:
                    xrf_field = SubElement(entry, 'xrf')
                    xrf_field.text = headword

In [54]:
display_complete_entry(lexicon, 'kare')

\lx kare
\ps FFP
\ge animals
\tkp plenti
\dt 17/Oct/2005
\ex O karevu koie kare kouevo ita akova.
\xp Pik mama i karim ol narapela pik.
\xe The mother carried the other pigs.



In [55]:
for row in example_words:
    for w in row[1]:
        if w in lexemes:
            add_xrf_field(lexicon, w, row[0])

In [56]:
display_complete_entry(lexicon, 'kare')

\lx kare
\ps FFP
\ge animals
\tkp plenti
\dt 17/Oct/2005
\ex O karevu koie kare kouevo ita akova.
\xp Pik mama i karim ol narapela pik.
\xe The mother carried the other pigs.



In [57]:
display_complete_entry(lexicon, 'kovovo')

\lx kovovo
\rt kovo
\ps V
\pt B
\ge fence
\ge protect
\tkp banisim
\arg O
\vx 2
\dt 08/Jun/2005
\ex Kepa kovovo va utoro.
\xp Yo banisim house bikol?? Em kolor tumas.
\xe ???
\ex Koue karen kovovo sapi kovoa aioive.
\xp You mekim fence long gut of ikaikaim garden.
\xe ???
\ex Koue kare kovovori ikauvira teapi kosiaviro kovoara aiosia.
\xp Yu banisim kwik ol pik, nogut ol i kam arasait na kaikaim ol gaden.
\xe Fence the pigs quickly lest they escape and eat the gardens.

