## Applying the Learner of Pearl & Sprouse to Norwegian Data

First, importing all the necessary libraries:

In [1]:
import re
import math
import numpy
import pandas as pd
from nltk.util import ngrams
from nltk.probability import FreqDist

Loading the data and subsetting it, checking if the dataset looks ok:

In [2]:
# to open a file, jupyter notebook requires a full path
path = 'C:/Users/anastask/OneDrive - NTNU/Project_python_files/Nob_Child/Corpus-NorwegianEmbeddedWh.csv' # edited
data = pd.read_csv(path)

# keeping only some necessary columns
data = data[['z', 'source', 'annotated', 'wh-word', 'gap sister']]
data.head(10)

Unnamed: 0,z,source,annotated,wh-word,gap sister
0,2,Tommelise,Det var en gang en kone som så gjerne ville ha...,hvor,P
1,3,Juleunderet,"[CP Hvordan [TP alt sammen [VP begynte __, vet...",hvordan,adjunct
2,3,Tigeren og bjørn…,"«Kom, vi tar en tur til byen, så skal jeg vise...",hvordan,adjunct
3,3,Froskeslottet,Jeg husker ikke helt [CP hvor [TP det [VP begy...,hvor,adjunct
4,3,Jonas får briller,– Kan du fortelle oss [CP hva [TP __ som [VP s...,hva,T
5,3,Ikke deg denne gang,Vet pene mennesker [CP hvordan [TP det [VP føl...,hvordan,V
6,3,Helten,Jeg er så glad for at dere endelig innser [CP ...,hvor fantastisk,V-be
7,4,Si aldri farvel,Fortell oss [CP hva [TP __ som [VP hendte i gå...,hva,T
8,4,Farlig helg,Jeg vet ikke [CP hva [TP det [VP var __ som ve...,hva,V
9,5,Min venn Zorba,– Jeg vet ikke [CP hva [TP jeg [VP gjør __ hvi...,hva,V


Defining a fuction to get container nodes from an annotated string. The function also returns the number of gaps:

In [3]:
def get_phrase(annotated):
    """ 
    takes an annotated row as an input
    returns a list of container nodes and a number of gaps
    str -> list of lists, int
    """
    phrases = list()
    phrases_re = r"\b[A-Z][A-Z]+\b" # phrases defined as all caps
    index_letters = 'iklmo'
    indeces = r"\{[a-z]\}"
    d = {}

    # assuming curly braces aren't used for anything other than multiple gaps
    if '{' not in annotated: # one gap only
        matches = re.findall(phrases_re, annotated)
        phrases.append(matches[1:]) # all phrases except for the first 
        n_gaps = 1
    # multiple gaps
    else:
        ind = re.finditer(indeces, annotated)
        pos  = [(i.start()+1) for i in ind] # positions of letter indeces
        for l in index_letters:
            for p in pos:
                if l == annotated[p]:
                    if l not in d:
                        d[l] = [p]
                    else:
                        d[l].append(p) # dict with indeces and their positions
        for v in d.values():
            if len(v) == 2: # one gap corresponds to one filler
                # subset string between the gap and the filler:
                search_string = annotated[v[0]:v[1]] 
                phrases.append(re.findall(phrases_re, search_string))
            else:
                # print(annotated)
                phrases.append(['CHECKGAP'])
        n_gaps = len(d)
    return (phrases, n_gaps)

# test
sentence = 'Her kan du lese [CP hva [TP Fridtjof Nansen [VP skriver ___ om de første skiene sine.'
get_phrase(sentence)

([['TP', 'VP']], 1)

Modifying the dataset row by row, adding two new columns with the container node sequences and the number of gaps. Saving the new dataset (use 'utf-8-sig' to make sure that Norwegian characters are displayed correctly).

In [4]:
#for row in data['annotated']:
#    print(get_phrase(row))
phrases = [get_phrase(r)[0] for r in data['annotated']]
n_gaps = [get_phrase(r)[1] for r in data['annotated']]
data['c_nodes'] = phrases
data['n_gaps'] = n_gaps
data.head(10)
path = 'C:/Users/anastask/OneDrive - NTNU/Project_python_files/Nob_Child/Corpus-NorwegianEmbeddedWh-edited.csv'
data.to_csv(path, index = False, encoding = 'utf-8-sig')

Defining a function that inserts 'start' and 'end' nodes to the sequences and splits them into trigrams:

In [5]:
def get_trigrams(cn_seq_arr):
    """
    takes a sequence of container nodes in a form of an array as an input
    inserts 'start' and 'end' nodes, makes a list of trigrams and returns it
    arr -> list of lists
    """
    cn_seq_arr.insert(0, 'start')
    cn_seq_arr.append('end')
    trigrams = list(ngrams(cn_seq_arr, 3))
    return trigrams

# test
get_trigrams(['CP', 'TP', 'VP'])

[('start', 'CP', 'TP'), ('CP', 'TP', 'VP'), ('TP', 'VP', 'end')]

Definig a function that creates a frequency distribution based on a full list of trigrams of container nodes:

In [6]:
def get_freq_dist(all_trigrams_list):
    """ list of lists -> FreqDist """
    freq_dist = FreqDist(tr for tr in all_trigrams_list)
    # print(freq_dist.max())  # the most frequent trigrams
    # print(freq_dist.hapaxes())  # trigrams that occur only once
    return freq_dist

Defining a function that calculates the probability of a given sequence of container nodes based on the frequency distribution of trigrams of container nodes (with Laplace smoothing):

In [7]:
def get_prob(node_seq, freq_dist):
    """ list, FreqDist -> int (log z-score)"""
    trigrams = get_trigrams(node_seq)
    # print(trigrams)
    freqs = []
    for tr in trigrams:
        if tr in freq_dist:
            tr_freq = freq_dist.freq(tr)
        else:
            tr_freq = 1/len(freq_dist) # freq of unattested sequences as 1
        freqs.append(tr_freq)
    result = math.log(numpy.prod(freqs))
    print(node_seq, result, '\n')
    return result

Assembling all the parts together and getting a probability for 3 structures:

In [8]:
 def main():
    all_trigrams = []
    for el in data['c_nodes']:
        for seq in el:
            trigrams = get_trigrams(seq)
            for tr in trigrams:
                all_trigrams.append(tr)
                
    freq_dist = get_freq_dist(all_trigrams)
    print('15 most common container node trigrams:')
    for i in freq_dist.most_common(15):
        print(i)
        
    print('\n')
    get_prob(['TP', 'VP'], freq_dist) # short no island
    get_prob(['CP', 'TP', 'VP'], freq_dist) # long no island
    get_prob(['CP', 'TP', 'VP', 'PP'], freq_dist) # long island

    

if __name__ == "__main__":
    main()

15 most common container node trigrams:
(('start', 'TP', 'VP'), 9484)
(('TP', 'VP', 'end'), 8751)
(('TP', 'VP', 'PP'), 553)
(('VP', 'PP', 'end'), 518)
(('start', 'TP', 'end'), 139)
(('VP', 'VP', 'end'), 112)
(('TP', 'VP', 'VP'), 100)
(('TP', 'VP', 'NP'), 87)
(('NP', 'PP', 'end'), 77)
(('VP', 'NP', 'PP'), 76)
(('VP', 'VP', 'VP'), 51)
(('VP', 'PP', 'VP'), 36)
(('start', 'TP', 'CP'), 31)
(('TP', 'CP', 'end'), 30)
(('PP', 'VP', 'end'), 29)


['start', 'TP', 'VP', 'end'] -1.604043358121144 

['start', 'CP', 'TP', 'VP', 'end'] -14.81693766162948 

['start', 'CP', 'TP', 'VP', 'PP', 'end'] -21.24769170830463 

