# EDAN20 - Assign 6

## Dependency parsing using machine learning techniques
This is the lab notebook for the EDAN20: Language Technology course offered in the HT1 2019 period at Lunds Tekniska Högskola (LTH).

This assignment was finished by Jonathan Moran (jo6155mo-s) and Alexis Cole (). The initial code, assignment files and instructions were prepared by Pierre Nugues(@pnugues/ilppp).

More information is available at the kurswebb here: http://cs.lth.se/edan20/coursework/assignment-6/.

## Objectives

The objectives of this assignment are to:
- Extract feature vectors and train a classifier
- Write a statistical dependency parser
- Understand how to design parameter sets
- Write a short report on your results

## Programing

In this session, you will implement and test a dependency parser for Swedish using machine learning techniques.

### Choosing a training and a test sets

In [37]:
# train_file = str(train_data)
train_file = 'swedish_talbanken05_train.conll'


# test_data = urlopen('http://fileadmin.cs.lth.se/cs/Education/EDAN20/corpus/conllx/sv/swedish_talbanken05_test_blind.conll').read()
test_file = 'swedish_talbanken05_test.conll'

### Training the classifiers
If you have not done it in the previous assignment, for each data set you have generated, fit a corresponding model using logistic regression (or another classifier if you want) and save it.

In [38]:
import dparser
import conll
import transition

import transition

def extract(stack, queue, graph, feature_names, sentence):
    
    """
    Use three feature sets to train the specified classifier
    """
    
    features = [] 
    """
    features1 -- first element on stack/queue
    """
    # given first 4 parameters (x)
    features.extend(['nil', 'nil', 'nil', 'nil', 'nil', 'nil'])
    # label the first word + POS extracted from the stack
    if len(stack) >= 1:
        features[0] = stack[0]['form']
        features[1] = stack[0]['postag']
    # label the first word + POS extracted from the queue
    if len(queue) >= 1:
        features[2] = queue[0]['form']
        features[3] = queue[0]['postag']
    # append the two Boolean parameters
    features[4] = transition.can_leftarc(stack, graph)
    features[5] = transition.can_reduce(stack, graph)
    
    """
    features2 -- first and second element on stack/queue
    """
    # check if expected feature set (num of params)
    if len(feature_names) == 10 or len(feature_names) == 14:
        features.extend(['nil', 'nil', 'nil', 'nil'])
        # more than one element on stack
        if len(stack) >= 2:
            features[6] = stack[1]['form']
            features[7] = stack[1]['postag']
        # more than one element on queue
        if len(queue) >= 2:
            features[8] = queue[1]['form']
            features[9] = queue[1]['postag']
        
    """
    features3 -- first element + 2 addtl (one must be prev. element)
    """
    if len(feature_names) == 14:
        features.extend(['nil', 'nil', 'nil', 'nil'])
        if len(stack) >= 1 and len(sentence) > int(stack[0]['id']) +1:
            word = sentence[int(stack[0]['id']) +1]
            features[10] = word['form']
            features[11] = word['postag']
        
        if len(queue) >= 1 and len(sentence) > int(queue[0]['id']) +1:
            word = sentence[int(stack[0]['id']) +1]
            features[12] = word['form']
            features[13] = word['postag']
    
    #features = dict(zip(feature_names, features))
    
    return features

def extract_features(sentences, feature_names):
    """
    _Similar to Lab 3, Task 4: Improving the Chunker_
    Builds X matrix and y vector
    X is a list of dicitionaries and y is a list
    :param sentences:
    :param feature_names:
    :return:
    """
    
    X_l = []
    y_l = []
    sent_cnt = 0
    
    for i, sent in enumerate(sentences):
        """
        Forming initial model structures -- dparser.py
        """
        graph = {}
        graph['heads'] = {}
        graph['heads']['0'] = '0'
        graph['deprels'] = {}
        graph['deprels']['0'] = 'ROOT'
        stack = []
        queue = list(sent)
        
        while queue:
            x = extract(stack, queue, graph, feature_names, sent)
            X_l.append(x)
            stack, queue, graph, trans = dparser.reference(stack, queue, graph)
            y_l.append(trans)
        
        stack, graph = transition.empty_stack(stack, graph)
        sent_cnt += 1
        
    return X_l, y_l

In [39]:
'''def extract(stack, queue, graph, feature_names, sentence):
    
    """
    Use three feature sets to train the specified classifier
    """
    
    features = [] 
    """
    features1 -- first element on stack/queue
    """
    print(feature_names)
    # given first 4 parameters (x)
    if len(feature_names) == 4 or len(feature_names) == 6:
        features = ['nil', 'nil', 'nil', 'nil', 'nil', 'nil']
        # label the first word + POS extracted from the stack
        if len(stack) >= 1:
            features[0] = stack[0]['postag']
            features[1] = stack[0]['form']
        # label the first word + POS extracted from the queue
        if len(queue) >= 1:
            features[2] = queue[0]['postag']
            features[3] = queue[0]['form']
        # append the two Boolean parameters
        features[4] = transition.can_leftarc(stack, graph)
        features[5] = transition.can_reduce(stack, graph)
        feature_names.extend(['canLa','canRe'])
        #features = dict(zip(feature_names, features))
    
        return features
    
    """
    features2 -- first and second element on stack/queue
    """
    # check if expected feature set (num of params)
    if len(feature_names) == 8 or len(feature_names) == 10:
        features = ['nil', 'nil', 'nil', 'nil','nil', 'nil', 'nil', 'nil', 'nil', 'nil']
        # more than one element on stack
        if len(stack) >= 2:
            features[0] = stack[0]['postag']
            features[1] = stack[1]['postag']
            features[2] = stack[0]['form']
            features[3] = stack[1]['form']
        # more than one element on queue
        if len(queue) >= 2:
            features[4] = queue[0]['postag']
            features[5] = queue[1]['postag']
            features[6] = queue[0]['form']
            features[7] = queue[1]['form']
            
        features[8] = transition.can_leftarc(stack, graph)
        features[9] = transition.can_reduce(stack, graph)
        feature_names.extend(['canLa','canRe'])
        #features = dict(zip(feature_names, features))
    
        return features
        
    """
    features3 -- first element + 2 addtl (one must be prev. element)
    """
    if len(feature_names) == 12 or len(feature_names) == 14:
        features = ['nil', 'nil', 'nil', 'nil','nil', 'nil', 'nil', 'nil','nil', 'nil', 'nil', 'nil', 'nil', 'nil']
        if len(stack) > 2:
            features[0] = stack[0]['postag']
            features[1] = stack[1]['postag']
            features[2] = stack[2]['postag']
            features[3] = stack[0]['form']
            features[4] = stack[1]['form']
            features[5] = stack[2]['form']
        
        if len(queue) > 2:
            features[6] = queue[0]['postag']
            features[7] = queue[1]['postag']
            features[8] = queue[2]['postag']
            features[9] = queue[0]['form']
            features[10] = queue[1]['form']
            features[11] = queue[2]['form']
            
        features[12] = transition.can_leftarc(stack, graph)
        features[13] = transition.can_reduce(stack, graph)
        feature_names.extend(['canLa','canRe'])
    
    #features = dict(zip(feature_names, features))
    
    return features


def extract_features(sentences, feature_names):
    """
    _Similar to Lab 3, Task 4: Improving the Chunker_
    Builds X matrix and y vector
    X is a list of dicitionaries and y is a list
    :param sentences:
    :param feature_names:
    :return:
    """
    
    X_l = []
    y_l = []
    sent_cnt = 0
    
    for i, sent in enumerate(sentences):
        """
        Forming initial model structures -- dparser.py
        """
        graph = {}
        graph['heads'] = {}
        graph['heads']['0'] = '0'
        graph['deprels'] = {}
        graph['deprels']['0'] = 'ROOT'
        stack = []
        queue = list(sent)
        
        while queue:
            x = extract(stack, queue, graph, feature_names, sent)
            X_l.append(x)
            stack, queue, graph, trans = dparser.reference(stack, queue, graph)
            y_l.append(trans)
        
        stack, graph = transition.empty_stack(stack, graph)
        sent_cnt += 1
        
    return X_l, y_l'''

'def extract(stack, queue, graph, feature_names, sentence):\n    \n    """\n    Use three feature sets to train the specified classifier\n    """\n    \n    features = [] \n    """\n    features1 -- first element on stack/queue\n    """\n    print(feature_names)\n    # given first 4 parameters (x)\n    if len(feature_names) == 4 or len(feature_names) == 6:\n        features = [\'nil\', \'nil\', \'nil\', \'nil\', \'nil\', \'nil\']\n        # label the first word + POS extracted from the stack\n        if len(stack) >= 1:\n            features[0] = stack[0][\'postag\']\n            features[1] = stack[0][\'form\']\n        # label the first word + POS extracted from the queue\n        if len(queue) >= 1:\n            features[2] = queue[0][\'postag\']\n            features[3] = queue[0][\'form\']\n        # append the two Boolean parameters\n        features[4] = transition.can_leftarc(stack, graph)\n        features[5] = transition.can_reduce(stack, graph)\n        feature_names.extend(

In [40]:
def extract(stack, queue, graph, feature_names, sentence):
    
    """
    Use three feature sets to train the specified classifier
    """
    
    features = [] 
    
    for f in feature_names:
        if f == 'stack_pos_0':
            if stack:
                features.append(stack[0]["postag"])
            else:
                features.append("nil")
        if f == 'stack_pos_1':
            if len(stack) > 1:
                features.append(stack[1]["postag"])
            else:
                features.append("nil")
        if f == 'stack_pos_2':
            if len(stack) > 2:
                features.append(stack[2]["postag"])
            else:
                features.append("nil")
        if f == 'stack_word_0':
            if stack:
                features.append(stack[0]["form"])
            else:
                features.append("nil")
        if f == 'stack_word_1':
            if len(stack) > 1:
                features.append(stack[1]["form"])
            else:
                features.append("nil")
        if f == 'stack_word_2':
            if len(stack) > 2:
                features.append(stack[2]["form"])
            else:
                features.append("nil")
        if f == 'queue_pos_0':
            if queue:
                features.append(queue[0]["postag"])
            else:
                features.append("nil")
        if f == 'queue_pos_1':
            if len(queue) > 1:
                features.append(queue[1]["postag"])
            else:
                features.append("nil")
        if f == 'queue_pos_2':
            if len(queue) > 2:
                features.append(queue[2]["postag"])
            else:
                features.append("nil")
        if f == 'queue_word_0':
            if queue:
                features.append(queue[0]["form"])
            else:
                features.append("nil")
        if f == 'queue_word_1':
            if len(queue) > 1:
                features.append(queue[1]["form"])
            else:
                features.append("nil")
        if f == 'queue_word_2':
            if len(queue) > 2:
                features.append(queue[2]["form"])
            else:
                features.append("nil")
                
    features.append(str(transition.can_reduce(stack, graph)))
    features.append(str(transition.can_leftarc(stack, graph)))
    #feature_names.append('canRe')
    #feature_names.append('canLa')
    features = dict(zip(feature_names, features))
    return features
        
        
        
def extract_features(sentences, feature_names):
    """
    _Similar to Lab 3, Task 4: Improving the Chunker_
    Builds X matrix and y vector
    X is a list of dicitionaries and y is a list
    :param sentences:
    :param feature_names:
    :return:
    """
    
    X_l = []
    y_l = []
    sent_cnt = 0
    
    for i, sent in enumerate(sentences):
        """
        Forming initial model structures -- dparser.py
        """
        graph = {}
        graph['heads'] = {}
        graph['heads']['0'] = '0'
        graph['deprels'] = {}
        graph['deprels']['0'] = 'ROOT'
        stack = []
        queue = list(sent)
        
        while queue:
            x = extract(stack, queue, graph, feature_names, sent)
            X_l.append(x)
            stack, queue, graph, trans = dparser.reference(stack, queue, graph)
            y_l.append(trans)
        
        stack, graph = transition.empty_stack(stack, graph)
        sent_cnt += 1
        for word in sent:
            word['head'] = graph['heads'][word['id']]
        
    return X_l, y_l

In [41]:
column_names_2006 = ['id', 'form', 'lemma', 'cpostag', 'postag', 'feats', 'head', 'deprel', 'phead', 'pdeprel']
# column_names_2006_test = ['id', 'form', 'lemma', 'cpostag', 'postag', 'feats']

sentences = conll.read_sentences("swedish_talbanken05_train.conll")
formatted_corpus = conll.split_rows(sentences, column_names_2006)


"""
From Lab 5 Task 4,1
You will consider three feature sets:
"""
#   1) features1: word + pos
features1 = ['stack_pos_0', 'stack_word_0', 'queue_pos_0', 'queue_word_0']

#   2) features2: word, pos + prev element
features2 = ['stack_pos_0', 'stack_pos_1', 'stack_word_0', 'stack_word_1',
             'queue_pos_0', 'queue_pos_0', 'queue_word_0', 'queue_word_1']

#   3) first/second word + pos, next word + pos:
features3 = ['stack_pos_0', 'stack_pos_1', 'stack_pos_2', 'stack_word_0', 
            'stack_word_1', 'stack_word_2', 'queue_pos_0', 'queue_pos_1', 
             'queue_pos_2', 'queue_word_0', 'queue_word_1', 'queue_word_2']

"""
From Lab 5 Task 4,1
You will consider three feature sets:
"""
#   1) features1: word + pos
train_features1 = ['stack_pos_0', 'stack_word_0', 'queue_pos_0', 'queue_word_0', 'canRe', 'canLa']

#   2) features2: word, pos + prev element
train_features2 = ['stack_pos_0', 'stack_pos_1', 'stack_word_0', 'stack_word_1',
                 'queue_pos_0', 'queue_pos_0', 'queue_word_0', 'queue_word_1',
                'canRe', 'canLa']

#   3) first/second word + pos, next word + pos:
train_features3 = ['stack_pos_0', 'stack_pos_1', 'stack_pos_2', 'stack_word_0', 
                'stack_word_1', 'stack_word_2', 'queue_pos_0', 'queue_pos_1', 
                'queue_pos_2', 'queue_word_0', 'queue_word_1', 'queue_word_2', 
                 'canRe', 'canLa']

X_1, y_1 = extract_features(formatted_corpus, train_features1)
X_2, y_2 = extract_features(formatted_corpus, train_features2)
X_3, y_3 = extract_features(formatted_corpus, train_features3)

In [42]:
print(X_1[1])
print(X_2[1])
print(X_3[1])

{'stack_pos_0': 'ROOT', 'stack_word_0': 'ROOT', 'queue_pos_0': 'NN', 'queue_word_0': 'Äktenskapet', 'canRe': 'True', 'canLa': 'False'}
{'stack_pos_0': 'ROOT', 'stack_pos_1': 'nil', 'stack_word_0': 'ROOT', 'stack_word_1': 'nil', 'queue_pos_0': 'NN', 'queue_word_0': 'Äktenskapet', 'queue_word_1': 'och', 'canRe': 'True', 'canLa': 'False'}
{'stack_pos_0': 'ROOT', 'stack_pos_1': 'nil', 'stack_pos_2': 'nil', 'stack_word_0': 'ROOT', 'stack_word_1': 'nil', 'stack_word_2': 'nil', 'queue_pos_0': 'NN', 'queue_pos_1': '++', 'queue_pos_2': 'NN', 'queue_word_0': 'Äktenskapet', 'queue_word_1': 'och', 'queue_word_2': 'familjen', 'canRe': 'True', 'canLa': 'False'}


### Parsing the corpus and evaluating the results
Once you have generated your models, you will embed them in Nivre's parser and compute their respective efficiencies.

Your parser will proceed, sentence by sentence, and word by word. For a certain state, it will predict the next action using your classifier. You will then execute the corresponding action: `la`, `ra`, `re`, or `sh`. If an action is not possible, you will carry out a `shift`.

You are free to implement it the way you want. Here are some suggestions:

- **The loop will basically have this structure:**

```
while queue:
    features.extract()
    trans_nr = classifier.predict()
    stack, queue, graph, trans = parse_ml(stack, queue, graph, trans)
```

In [43]:
"""
From Lab 5 Task 4,1
You will consider three feature sets:
"""
#   1) features1: word + pos
train_features1 = ['stack_pos_0', 'stack_word_0', 'queue_pos_0', 'queue_word_0', 'canRe', 'canLa']

#   2) features2: word, pos + prev element
train_features2 = ['stack_pos_0', 'stack_pos_1', 'stack_word_0', 'stack_word_1',
                 'queue_pos_0', 'queue_pos_0', 'queue_word_0', 'queue_word_1',
                'canRe', 'canLa']

#   3) first/second word + pos, next word + pos:
train_features3 = ['stack_pos_0', 'stack_pos_1', 'stack_pos_2', 'stack_word_0', 
                'stack_word_1', 'stack_word_2', 'queue_pos_0', 'queue_pos_1', 
                'queue_pos_2', 'queue_word_0', 'queue_word_1', 'queue_word_2', 
                 'canRe', 'canLa']

In [44]:
def feature_dict(X_sentences, feature_names):
    X_train = []
    
    for sent in X_sentences:
        X_dict = {key: sent[i] for i,key in enumerate(feature_names)}
        X_train.append(X_dict)
        
    return X_train

In [45]:
list(enumerate(train_features1))

[(0, 'stack_pos_0'),
 (1, 'stack_word_0'),
 (2, 'queue_pos_0'),
 (3, 'queue_word_0'),
 (4, 'canRe'),
 (5, 'canLa')]

In [46]:
X1_dict = feature_dict(X_1, train_features1)
X2_dict = feature_dict(X_2, train_features2)
X3_dict = feature_dict(X_3, train_features3)

KeyError: 0

- **The parsing function, `parse_ml()`, takes the the `stack`, `queue`, `graph`, and the transition (`trans`) predicted by the classifier, and carries out the transition.**

You can use this model and complete it:

In [47]:
def parse_ml(stack,queue,graph,trans):
    if stack and trans[:2] == 'ra':
        stack, queue, graph = transition.right_arc(stack, queue, graph, trans[3:])
        return stack, queue, graph, 'ra'
    if stack and trans[:2] == 'la':
        stack, queue, graph = transition.left_arc(stack, queue, graph, trans[3:])
        return stack, queue, graph, 'la'
    if stack and trans[:2] == 're':
        stack, queue, graph = transition.reduce(stack, queue, graph)
        return stack, queue, graph, 're'
    stack, queue, graph = transition.shift(stack, queue, graph)
    return stack, queue, graph, 'sh'

```
def parse_ml(stack, queue, graph, trans):
    if stack and trans[:2] == 'ra':
        stack, queue, graph = transition.right_arc(stack, queue, graph, trans[3:])
        return stack, queue, graph, 'ra'
    ...
```

where trans is either `ra.deprel`, `la.deprel`, `re`, or `sh`.

- **You will then use the partial `graph` to write the values of the `heads` and functions to the words.**

#### Training the model on features1

In [48]:
from sklearn import linear_model
from sklearn.feature_extraction import DictVectorizer

In [49]:
X1_dict[100]

NameError: name 'X1_dict' is not defined

In [50]:
X_1[100]

{'stack_pos_0': 'AB',
 'stack_word_0': 'mycket',
 'queue_pos_0': 'AJ',
 'queue_word_0': 'hetsigare',
 'canRe': 'False',
 'canLa': 'True'}

In [51]:
X1_dict == X_1

NameError: name 'X1_dict' is not defined

In [52]:
print("Encoding the features...")
# Vectorize the feature matrix and carry out a one-hot encoding
vec = DictVectorizer(sparse=True)
X = vec.fit_transform(X_1)
# The statement below will swallow a considerable memory
# X = vec.fit_transform(X_dict).toarray()
# print(vec.get_feature_names())

Encoding the features...


In [53]:
X[0]

<1x40101 sparse matrix of type '<class 'numpy.float64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [54]:
# train model after extracting features

In [19]:
print("Training the model...")
classifier = linear_model.LogisticRegression(penalty='l2', dual=True, solver='liblinear')
model = classifier.fit(X, y_1)
print(model)

Training the model...




LogisticRegression(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)


In [26]:

print("Encoding the features...")
# Vectorize the feature matrix and carry out a one-hot encoding
vec = DictVectorizer(sparse=True)
X = vec.fit_transform(X_2)
# The statement below will swallow a considerable memory
# X = vec.fit_transform(X2_dict).toarray()
# print(vec.get_feature_names())

print("Training the model...")
classifier = linear_model.LogisticRegression(penalty='l2', dual=True, solver='liblinear')
model_2 = classifier.fit(X, y_2)
print(model_2)


Encoding the features...
Training the model...
LogisticRegression(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)


In [56]:
#### training the model on featureset 3

In [57]:

print("Encoding the features...")
# Vectorize the feature matrix and carry out a one-hot encoding
vec = DictVectorizer(sparse=True)
X = vec.fit_transform(X_3)
# The statement below will swallow a considerable memory
# X = vec.fit_transform(X3_dict).toarray()
# print(vec.get_feature_names())

print("Training the model...")
classifier = linear_model.LogisticRegression(penalty='l2', dual=True, solver='liblinear')
model_3 = classifier.fit(X, y_3)
print(model_3)


Encoding the features...
Training the model...




LogisticRegression(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)


In [58]:
def extract_features_test(sentences, feature_names, model):
    """
    :param sentences:
    :param w_size:
    :return:
    """
    for sent in sentences:
        stack = []
        queue = list(sent)
        graph = {}
        graph['heads'] = {}
        graph['heads']['0'] = '0'
        graph['deprels'] = {}
        graph['deprels']['0'] = 'ROOT'
        
        while queue:
            features = extract(stack, queue, graph, feature_names, sent)
            #features_dict = feature_dict([features], feature_names)
            
            # Vectorize the test sentence and one hot encoding
            X_test = vec.transform(features)
            #print(features)
            #print(sent)
            #print(X_test)
            
            # Predicts the chunks and returns numbers
            trans_pred = classifier.predict(X_test)[0]
            #print(trans_pred)

            #trans_nr = classifier.predict()
            
            stack, queue, graph, trans = parse_ml(stack, queue, graph, trans_pred)
        
        stack, graph = dparser.transition.empty_stack(stack, graph)
        
        for i,word in enumerate(sent):
            word['head'] = graph['heads'].get(str(i), str(0))
            word['deprel'] = graph['deprels'].get(str(i), str(0))
        
    return sentences

In [59]:
"""
Gold standard parser
"""
__author__ = "Pierre Nugues"

column_names_2006 = ['id', 'form', 'lemma', 'cpostag', 'postag', 'feats', 'head', 'deprel', 'phead', 'pdeprel']
column_names_2006_test = ['id', 'form', 'lemma', 'cpostag', 'postag', 'feats']
    
test_sentences = conll.read_sentences("swedish_talbanken05_test_blind.conll")
test_formatted_corpus = conll.split_rows(test_sentences, column_names_2006)

- **Finally, you will save the sentences in an output file.**

In [60]:
def save(file, formatted_corpus, column_names):
    f_out = open(file, 'w')
    for sentence in formatted_corpus:
        for row in sentence[1:]:
            # print(row, flush=True)
            for col in column_names[:-1]:
                if col in row:
                    f_out.write(row[col] + '\t')
                else:
                    f_out.write('_\t')
            col = column_names[-1]
            if col in row:
                f_out.write(row[col] + '\n')
            else:
                f_out.write('_\n')
        f_out.write('\n')
    f_out.close()

- **Once you have parsed the test set, you will measure the accuracy of your parser using the CoNLL evaluation script [3] (where `-q` stands for quiet).**

Local copy: [eval.pl]. You will run this script using the command:

In [30]:
sentences_test = extract_features_test(test_formatted_corpus, train_features1, model)

In [24]:
conll.save('w1.conll', sentences_test, column_names_2006)

In [25]:
!perl eval.pl -g swedish_talbanken05_test.conll -s w1.conll -q

  Labeled   attachment score: 3471 / 5021 * 100 = 69.13 %
  Unlabeled attachment score: 3826 / 5021 * 100 = 76.20 %
  Label accuracy score:       3652 / 5021 * 100 = 72.73 %


In [31]:
sentences_test_2 = extract_features_test(test_formatted_corpus, train_features2, model_2)
conll.save('w2.conll', sentences_test_2, column_names_2006)
!perl eval.pl -g swedish_talbanken05_test.conll -s w2.conll -q

  Labeled   attachment score: 3617 / 5021 * 100 = 72.04 %
  Unlabeled attachment score: 3964 / 5021 * 100 = 78.95 %
  Label accuracy score:       3797 / 5021 * 100 = 75.62 %


In [61]:
sentences_test_3 = extract_features_test(test_formatted_corpus, train_features3, model_3)
conll.save('w3.conll', sentences_test_3, column_names_2006)
!perl eval.pl -g swedish_talbanken05_test.conll -s w3.conll -q

  Labeled   attachment score: 3674 / 5021 * 100 = 73.17 %
  Unlabeled attachment score: 4022 / 5021 * 100 = 80.10 %
  Label accuracy score:       3894 / 5021 * 100 = 77.55 %


- **You will run the parser with the three feature sets described in the fifth assignment to carry out a labelled dependency parsing.**

- **You need to reach a labelled attachment score of 75 to pass this lab.**

### Scraps from Lab 5..

In [None]:
"""
Gold standard parser
"""
__author__ = "Pierre Nugues"

column_names_2006 = ['id', 'form', 'lemma', 'cpostag', 'postag', 'feats', 'head', 'deprel', 'phead', 'pdeprel']
column_names_2006_test = ['id', 'form', 'lemma', 'cpostag', 'postag', 'feats']
    
sentences = conll.read_sentences(train_file)
formatted_corpus = conll.split_rows(sentences, column_names_2006)

```
def parse_ml(stack, queue, graph, trans):
    if stack and trans[:2] == 'ra':
        stack, queue, graph = transition.right_arc(stack, queue, graph, trans[3:])
        return stack, queue, graph, 'ra'
    ...
```

In [None]:
"""
_From Lab 5: dparser.py
"""

sent_cnt = 0

for sentence in formatted_corpus:
    sent_cnt += 1
    if sent_cnt % 1000 == 0:
        print(sent_cnt, 'sentences on', len(formatted_corpus), flush=True)
    stack = []
    queue = list(sentence)
    graph = {}
    graph['heads'] = {}
    graph['heads']['0'] = '0'
    graph['deprels'] = {}
    graph['deprels']['0'] = 'ROOT'
    transitions = []
    
    while queue:
        # stack, queue, graph, trans = dparser.reference(stack, queue, graph)
        # transitions.append(trans)
        features.extract()
        trans_nr = classifier.predict()
    stack, queue, graph, trans = parse_ml(stack, queue, graph, trans)
    
    print('Equal graphs:', transition.equal_graphs(sentence, graph))
    
    # Poorman's projectivization to have well-formed graphs.
    for word in sentence:
        word['head'] = graph['heads'][word['id']]
    # print(transitions)
    # print(graph)