### Load ConLLu into sentences

In [51]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [82]:
import pandas as pd
from typing import List, Dict

import spacy
from spacy.tokens import Doc

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report, confusion_matrix


nlp = spacy.load("en_core_web_sm")

In [3]:

def load_conll_sentences(path: str):

    sentences = []
    sent = []

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            
            line = line.rstrip("\n")
        
            #  save previous sentence 
            # at boundary between 2 sentences
            if line.strip() == "":
                if sent:
                    sentences.append(sent)
                    sent = []
                continue

            # skip comments
            if line.startswith("#"):
                continue

            cols = line.split("\t")
            sent.append(cols)

    if sent:
        sentences.append(sent)

    return sentences

### Preprocessing

In [4]:
def count_sentences_and_tokens(sentences: List):
  """
  Return number of sentences (n_sent)
  and number of tokens from these sentences (n_token)
  """
  n_sent = len(sentences)
  n_token = sum(len(s) for s in sentences)

  return n_sent, n_token


In [44]:
def count_instances_and_tokens(instances:List):
  n_inst = len(instances)
  n_token = sum(len(inst["tokens"]) for inst in instances)

  return n_inst, n_token

### Replicate each sentence for each predicate

In [37]:
def find_predicate_index(sent,
                           label_col,
                           predicate_markers=("V", "B-V")):
  for i, row in enumerate(sent):
    if len(row) > label_col and row[label_col] in predicate_markers:
      return i
  return None



In [47]:
def replicate_sentences(sentences,
                        base_cols: int=11):
  
  instances = []
  """
  base_cols: the column with specified predicates
  """
  for sent in sentences:

    # check the maximum columns in specific sentence
    # assume it's consistent per token row
    max_cols = max(len(r) for r in sent)
    
    # nr of predicate-specific label columns
    k = max(0, max_cols-base_cols) 

    if k == 0 :
      # sentence has no predicate
      continue 

    # rely on k
    for j in range(k):
      label_col = base_cols + j  # 0-based index 

      pred_index = find_predicate_index(sent, label_col)

      # fallback if no V marker found
      if pred_index is None:
        pred_index = next((i for i,r in enumerate(sent) if len(r) > 9 and r[9] not in ("_", "-", "")), None)
      

      tokens = [r[1] for r in sent] # FORM column

      #labels = [(r[label_col] if len(r) > label_col else "O") for r in sent]
      labels = [
                    "O" if (len(r) <= label_col or r[label_col] == "_")
                    else r[label_col]
                    for r in sent   
              ]

      instances.append({
                "tokens": tokens,
                "predicate_index": pred_index,
                "labels": labels,
            })
  return instances
     


In [42]:
def load_and_preprocess(path:str):
  sentences = load_conll_sentences(path)
  before_s, before_t = count_sentences_and_tokens(sentences)

  instances = replicate_sentences(sentences)
  after_s, after_t = count_instances_and_tokens(instances)

  return {
        "sentences": sentences,
        "instances": instances,
        "stats": {
            "before_sentences": before_s,
            "before_tokens": before_t,
            "after_instances": after_s,
            "after_tokens": after_t
        }
    }

#### Feature Extraction

According to the requirement of the assignment, the directed dependency path feature is mandatory, therefore, I implemented the idea as the first feature.

**Create a Doc from my token list**

As the warning given from the requirement, the mismatch between Spacy and dataset tokenization might cause potential problems. Thus, I forced spacy to use exactly my dataset's tokenization, so depency parsing and feature extraction remains aligned with my SRL labels.

In [53]:
def create_doc(tokens):
  spaces = [True] * (len(tokens) - 1) + [False]
  doc = Doc(nlp.vocab, words=tokens, spaces=spaces)
  return nlp(doc)

**1) Dependency Path**

In [66]:
def get_ancestors(token):
  ancestors = [token]
  while token.head != token:
      token = token.head
      ancestors.append(token)
  return ancestors

In [67]:
def dependency_path(token, predicate):
  """
  Returns a directed path string 
  from token to predicate
  """

  token_anc = get_ancestors(token)
  predicate_anc = get_ancestors(predicate)

  token_set = {t.i: idx for idx, t in enumerate(token_anc)}
    
  # find lowest common ancestor 
  lca = None
  lca_token_idx = None
  lca_pred_idx = None

  for j, a in enumerate(predicate_anc):
    if a.i in token_set:
      lca = a
      lca_token_idx = token_set[a.i]
      lca_pred_idx = j
      break

  if lca is None:
    return "NOPATH"

  # token -> LCA (upwards): use token.dep_ as you move up
  up_parts = []
  cur = token
  while cur != lca:
    up_parts.append(cur.dep_ + "^")
    cur = cur.head

  # LCA -> predicate (downwards): walk from predicate up to LCA, then reverse
  down_parts = []
  cur = predicate
  while cur != lca:
    down_parts.append(cur.dep_ + "v")
    cur = cur.head
  down_parts.reverse()

  return "".join(up_parts + down_parts)

  
def dependency_path_plus_pred_lemma(doc, i, pred_i):
  pred = doc[pred_i]
  path = dependency_path(doc[i], pred)
  return f"{path}+{pred.lemma_.lower()}"


  

**X/y construction for Logistic Regession**

In [74]:
def normalize_label(label):
  if label == "_" or label == "":
    return "O"
  if label == "V": # as V should not be predicted!
    return "O"
  return label

def build_tolen_dataset(instances):
  X_dict = []
  y = []

  for inst in instances:
    tokens = inst["tokens"]
    pred_indices = inst["predicate_index"]
    labels = inst["labels"]

    if pred_indices is None:
      continue # skip if no predicates

    doc = create_doc(tokens) # create doc sequentially

    for i in range(len(tokens)):
      feats = {
                "dep_path_predlemma": dependency_path_plus_pred_lemma(doc, i, pred_indices) # Feature 1
                #"rel_pos": feature_rel_position(i, pred_i), # Feature 2
                #"pos": feature_pos(doc, i),  # Feature 3
              }
      X_dict.append(feats)
      y.append(normalize_label(labels[i]))

   
  return X_dict, y



### Run experiment

In [48]:
train = load_and_preprocess("en_ewt-up-train.conllu")
print("Statistics in train set: ", train['stats'])

Statistics in train set:  {'before_sentences': 12543, 'before_tokens': 204609, 'after_instances': 42471, 'after_tokens': 1036031}


In [77]:
X_train_dict, y_train= build_tolen_dataset(train["instances"][:10])
vec = DictVectorizer(sparse=True)
X_train = vec.fit_transform(X_train_dict)

clf = LogisticRegression(
    max_iter=2000,
    solver="lbfgs",     # good default for multiclass
    n_jobs=None,        # lbfgs ignores n_jobs; keep default
)
clf.fit(X_train, y_train)



In [79]:
test = load_and_preprocess("en_ewt-up-test.conllu")
print("Statistics in test set: ", test['stats'])

Statistics in test set:  {'before_sentences': 2077, 'before_tokens': 25097, 'after_instances': 5338, 'after_tokens': 103253}


In [81]:
X_test_dict, y_test = build_tolen_dataset(test["instances"])
X_test  = vec.transform(X_test_dict)

y_pred = clf.predict(X_test)

In [83]:
labels_sorted = sorted(set(y_test) | set(y_pred))  # stable label order

print(classification_report(y_test, y_pred, labels=labels_sorted, digits=3))

cm = confusion_matrix(y_test, y_pred, labels=labels_sorted)
cm_df = pd.DataFrame(cm, index=[f"true:{l}" for l in labels_sorted],
                        columns=[f"pred:{l}" for l in labels_sorted])

cm_df

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

        ARG0      0.000     0.000     0.000      1733
        ARG1      0.000     0.000     0.000      3241
    ARG1-DSP      0.000     0.000     0.000         4
        ARG2      0.000     0.000     0.000      1129
        ARG3      0.000     0.000     0.000        74
        ARG4      0.000     0.000     0.000        56
        ARG5      0.000     0.000     0.000         1
        ARGA      0.000     0.000     0.000         2
    ARGM-ADJ      0.000     0.000     0.000       228
    ARGM-ADV      0.000     0.000     0.000       496
    ARGM-CAU      0.000     0.000     0.000        46
    ARGM-COM      0.000     0.000     0.000        13
    ARGM-CXN      0.000     0.000     0.000        12
    ARGM-DIR      0.000     0.000     0.000        47
    ARGM-DIS      0.000     0.000     0.000       182
    ARGM-EXT      0.000     0.000     0.000       105
    ARGM-GOL      0.000     0.000     0.000        24
    ARGM-LOC      0.000    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,pred:ARG0,pred:ARG1,pred:ARG1-DSP,pred:ARG2,pred:ARG3,pred:ARG4,pred:ARG5,pred:ARGA,pred:ARGM-ADJ,pred:ARGM-ADV,...,pred:O,pred:R-ARG0,pred:R-ARG1,pred:R-ARG2,pred:R-ARGM-ADJ,pred:R-ARGM-ADV,pred:R-ARGM-DIR,pred:R-ARGM-LOC,pred:R-ARGM-MNR,pred:R-ARGM-TMP
true:ARG0,0,0,0,0,0,0,0,0,0,0,...,1733,0,0,0,0,0,0,0,0,0
true:ARG1,0,0,0,0,0,0,0,0,0,0,...,3241,0,0,0,0,0,0,0,0,0
true:ARG1-DSP,0,0,0,0,0,0,0,0,0,0,...,4,0,0,0,0,0,0,0,0,0
true:ARG2,0,0,0,0,0,0,0,0,0,0,...,1129,0,0,0,0,0,0,0,0,0
true:ARG3,0,0,0,0,0,0,0,0,0,0,...,74,0,0,0,0,0,0,0,0,0
true:ARG4,0,0,0,0,0,0,0,0,0,0,...,56,0,0,0,0,0,0,0,0,0
true:ARG5,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
true:ARGA,0,0,0,0,0,0,0,0,0,0,...,2,0,0,0,0,0,0,0,0,0
true:ARGM-ADJ,0,0,0,0,0,0,0,0,0,0,...,228,0,0,0,0,0,0,0,0,0
true:ARGM-ADV,0,0,0,0,0,0,0,0,0,0,...,496,0,0,0,0,0,0,0,0,0


In [72]:
X_dict, y = build_tolen_dataset(train['instances'][:3])
for i in range(10):
    print(X_dict[i], "->", y[i])

{'dep_path_predlemma': 'compound^aclv+kill'} -> O
{'dep_path_predlemma': 'punct^aclv+kill'} -> O
{'dep_path_predlemma': 'aclv+kill'} -> O
{'dep_path_predlemma': 'punct^aclv+kill'} -> O
{'dep_path_predlemma': 'amod^nsubj^+kill'} -> O
{'dep_path_predlemma': 'nsubj^+kill'} -> ARG0
{'dep_path_predlemma': '+kill'} -> O
{'dep_path_predlemma': 'compound^dobj^+kill'} -> ARG1
{'dep_path_predlemma': 'compound^dobj^+kill'} -> O
{'dep_path_predlemma': 'compound^dobj^+kill'} -> O


In [65]:
doc = create_doc(tokens)

for i, tok in enumerate(doc):
    print(i, tok.text, "-> head:", tok.head.text, "| dep:", tok.dep_)


0 American -> head: forces | dep: amod
1 forces -> head: killed | dep: nsubj
2 killed -> head: killed | dep: ROOT
3 Abdullah -> head: killed | dep: dobj
4 . -> head: killed | dep: punct


In [12]:
sentencestrain = load_conll_sentences("en_ewt-up-train.conllu")


In [39]:
replicate_sentences(sentencestrain)


[{'tokens': ['Al',
   '-',
   'Zaman',
   ':',
   'American',
   'forces',
   'killed',
   'Shaikh',
   'Abdullah',
   'al',
   '-',
   'Ani',
   ',',
   'the',
   'preacher',
   'at',
   'the',
   'mosque',
   'in',
   'the',
   'town',
   'of',
   'Qaim',
   ',',
   'near',
   'the',
   'Syrian',
   'border',
   '.'],
  'predicate_index': 6,
  'labels': ['O',
   'O',
   'O',
   'O',
   'O',
   'ARG0',
   'V',
   'ARG1',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'ARGM-LOC',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O']},
 {'tokens': ['[',
   'This',
   'killing',
   'of',
   'a',
   'respected',
   'cleric',
   'will',
   'be',
   'causing',
   'us',
   'trouble',
   'for',
   'years',
   'to',
   'come',
   '.',
   ']'],
  'predicate_index': 2,
  'labels': ['O',
   'O',
   'V',
   'O',
   'O',
   'O',
   'ARG1',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O']},
 {'tokens