In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm # for peogress checking of the code at the time of training
import multiprocessing as mp # For multiprocessing at the time of training
import warnings # Avoiding unnecessary warnings
from copy import deepcopy # creating a copy
warnings.filterwarnings("ignore")

In [None]:
train= open("/content/drive/MyDrive/NLP_ass2/NLP2/train.txt",'r')
test=open("/content/drive/MyDrive/NLP_ass2/NLP2/test.txt",'r')

Change train txt file to a dataframe

In [None]:
id= None
text = None
rows = []
for line in train:
    if line.strip():
        if line.startswith("# sent_id"):
            id = line.split("=")[-1].strip()
        elif line.startswith("# text"):
            text = line.split("=")[-1].strip()
        else:
            parts = line.split()
            row = {
                'sent_id': id,
                'text': text,
                'word': parts[1],
                'normalized_word': parts[2],
                'POS_tag': parts[3],
                'head_word_index': parts[4],
                'dependency_relation': parts[5]
            }
            rows.append(row)

df_train = pd.DataFrame(rows)

In [None]:
df_train.head(5)

Unnamed: 0,sent_id,text,word,normalized_word,POS_tag,head_word_index,dependency_relation
0,GUM_academic_art-1,Aesthetic Appreciation and Spanish Art:,Aesthetic,aesthetic,JJ,2,amod
1,GUM_academic_art-1,Aesthetic Appreciation and Spanish Art:,Appreciation,appreciation,NN,0,root
2,GUM_academic_art-1,Aesthetic Appreciation and Spanish Art:,and,and,CC,5,cc
3,GUM_academic_art-1,Aesthetic Appreciation and Spanish Art:,Spanish,Spanish,JJ,5,amod
4,GUM_academic_art-1,Aesthetic Appreciation and Spanish Art:,Art,art,NN,2,conj


Change test txt file to a dataframe

In [None]:
id= None
text = None
rows = []
for line in test:
    if line.strip():
        if line.startswith("# sent_id"):
            id = line.split("=")[-1].strip()
        elif line.startswith("# text"):
            text = line.split("=")[-1].strip()
        else:
            parts = line.split()
            row = {
                'sent_id': id,
                'text': text,
                'word': parts[1],
                'normalized_word': parts[2],
                'POS_tag': parts[3],
                'head_word_index': parts[4],
                'dependency_relation': parts[5]
            }
            rows.append(row)

df_test = pd.DataFrame(rows)

In [None]:
df_train.shape

(36712, 7)

# Preprocessing

 removing all the all rows where head_word_index is equal to '_' which is done so as to remove tokens which are not having any head word relationships

In [None]:
df_train = df_train[df_train['head_word_index'] != '_']
df_test = df_test[df_test['head_word_index'] != '_']

In [None]:
df_train.shape

(36249, 7)

Counting in how many training sentences each token from training set occurs

In [None]:
df_train['head_word_index'].value_counts()

0      2000
4      1859
2      1845
3      1845
5      1679
       ... 
91        3
100       3
94        3
95        2
82        1
Name: head_word_index, Length: 95, dtype: int64

Creating a Vocabulary as per condition given which is take most frequent 1000 normalized tokens which do not occur in more than 50% of sentences.

In [None]:
token_sentence_count = df_train.groupby('normalized_word')['sent_id'].nunique()

# Counting total no. of sentences present in training set
total_sentences = len(df_train['sent_id'].unique())

Filtering the tokens which occur in less than 50% training sentences and then taking top 1000 tokens from those filtered tokens

In [None]:
filtered_tokens = token_sentence_count[token_sentence_count <= total_sentences * 0.5]
print(len(filtered_tokens))

# Taking top 1000 most frequent tokens from filtered tokens
top_tokens = filtered_tokens.head(1000)
print(len(top_tokens))

5276
1000


In [None]:
top_tokens_keys=list(top_tokens.keys())

In [None]:
len(top_tokens_keys)

1000

In [None]:
# df_train=df_train[df_train['normalized_word'].isin(top_tokens_keys)]
# train_POS_tags= df_train['POS_tag'].tolist()
# # train_POS_tags
# len(df_train)

Creating a new column having index of position of each word in the corresponding sentence. This will be used for storing the head word, dependent relationships in arcs

In [None]:
df_train['normalized_word_index'] = df_train.groupby('sent_id').cumcount()+ 1
df_train.head(25)

Unnamed: 0,sent_id,text,word,normalized_word,POS_tag,head_word_index,dependency_relation,normalized_word_index
0,GUM_academic_art-1,Aesthetic Appreciation and Spanish Art:,Aesthetic,aesthetic,JJ,2,amod,1
1,GUM_academic_art-1,Aesthetic Appreciation and Spanish Art:,Appreciation,appreciation,NN,0,root,2
2,GUM_academic_art-1,Aesthetic Appreciation and Spanish Art:,and,and,CC,5,cc,3
3,GUM_academic_art-1,Aesthetic Appreciation and Spanish Art:,Spanish,Spanish,JJ,5,amod,4
4,GUM_academic_art-1,Aesthetic Appreciation and Spanish Art:,Art,art,NN,2,conj,5
5,GUM_academic_art-1,Aesthetic Appreciation and Spanish Art:,:,:,:,2,punct,6
6,GUM_academic_art-2,Insights from Eye-Tracking,Insights,insight,NNS,0,root,1
7,GUM_academic_art-2,Insights from Eye-Tracking,from,from,IN,5,case,2
8,GUM_academic_art-2,Insights from Eye-Tracking,Eye,eye,NN,5,compound,3
9,GUM_academic_art-2,Insights from Eye-Tracking,-,-,HYPH,3,punct,4


In [None]:
df1 = df_train
for index, row in df1.iterrows():
    df_2 = df1[(df1['sent_id'] == row['sent_id'])]
    df_2 = df_2[int(row['head_word_index']) ==  df_2['normalized_word_index']]
    if int(row['head_word_index'])== 0:
      df1.at[index, 'head_word'] = 'root'
    elif not df_2.empty:
        head_word = df_2['normalized_word'].iloc[0]
        df1.at[index, 'head_word'] = head_word
df_train=df1
df_train.head(25)

Unnamed: 0,sent_id,text,word,normalized_word,POS_tag,head_word_index,dependency_relation,normalized_word_index,head_word
0,GUM_academic_art-1,Aesthetic Appreciation and Spanish Art:,Aesthetic,aesthetic,JJ,2,amod,1,appreciation
1,GUM_academic_art-1,Aesthetic Appreciation and Spanish Art:,Appreciation,appreciation,NN,0,root,2,root
2,GUM_academic_art-1,Aesthetic Appreciation and Spanish Art:,and,and,CC,5,cc,3,art
3,GUM_academic_art-1,Aesthetic Appreciation and Spanish Art:,Spanish,Spanish,JJ,5,amod,4,art
4,GUM_academic_art-1,Aesthetic Appreciation and Spanish Art:,Art,art,NN,2,conj,5,appreciation
5,GUM_academic_art-1,Aesthetic Appreciation and Spanish Art:,:,:,:,2,punct,6,appreciation
6,GUM_academic_art-2,Insights from Eye-Tracking,Insights,insight,NNS,0,root,1,root
7,GUM_academic_art-2,Insights from Eye-Tracking,from,from,IN,5,case,2,tracking
8,GUM_academic_art-2,Insights from Eye-Tracking,Eye,eye,NN,5,compound,3,tracking
9,GUM_academic_art-2,Insights from Eye-Tracking,-,-,HYPH,3,punct,4,eye


True dependency graph which is stored in the form of tuples (head_word, dependent_word)

In [None]:
df_train['dep_graph']= list(zip(df_train['head_word'], df_train['normalized_word']))
df_train.head(25)

Unnamed: 0,sent_id,text,word,normalized_word,POS_tag,head_word_index,dependency_relation,normalized_word_index,head_word,dep_graph
0,GUM_academic_art-1,Aesthetic Appreciation and Spanish Art:,Aesthetic,aesthetic,JJ,2,amod,1,appreciation,"(appreciation, aesthetic)"
1,GUM_academic_art-1,Aesthetic Appreciation and Spanish Art:,Appreciation,appreciation,NN,0,root,2,root,"(root, appreciation)"
2,GUM_academic_art-1,Aesthetic Appreciation and Spanish Art:,and,and,CC,5,cc,3,art,"(art, and)"
3,GUM_academic_art-1,Aesthetic Appreciation and Spanish Art:,Spanish,Spanish,JJ,5,amod,4,art,"(art, Spanish)"
4,GUM_academic_art-1,Aesthetic Appreciation and Spanish Art:,Art,art,NN,2,conj,5,appreciation,"(appreciation, art)"
5,GUM_academic_art-1,Aesthetic Appreciation and Spanish Art:,:,:,:,2,punct,6,appreciation,"(appreciation, :)"
6,GUM_academic_art-2,Insights from Eye-Tracking,Insights,insight,NNS,0,root,1,root,"(root, insight)"
7,GUM_academic_art-2,Insights from Eye-Tracking,from,from,IN,5,case,2,tracking,"(tracking, from)"
8,GUM_academic_art-2,Insights from Eye-Tracking,Eye,eye,NN,5,compound,3,tracking,"(tracking, eye)"
9,GUM_academic_art-2,Insights from Eye-Tracking,-,-,HYPH,3,punct,4,eye,"(eye, -)"


In [None]:
normalised_train_token= df_train['normalized_word'].unique()
token_mapping = {token:i for i, token in enumerate(normalised_train_token)}
token_mapping

{'aesthetic': 0,
 'appreciation': 1,
 'and': 2,
 'Spanish': 3,
 'art': 4,
 ':': 5,
 'insight': 6,
 'from': 7,
 'eye': 8,
 '-': 9,
 'tracking': 10,
 'Claire': 11,
 'Bailey': 12,
 'Ross': 13,
 'claire.bailey-ross@port.ac.uk': 14,
 'University': 15,
 'of': 16,
 'Portsmouth': 17,
 ',': 18,
 'Unite': 19,
 'Kingdom': 20,
 'Andrew': 21,
 'Beresford': 22,
 'a.m.beresford@durham.ac.uk': 23,
 'Durham': 24,
 'Daniel': 25,
 'Smith': 26,
 'daniel.smith2@durham.ac.uk': 27,
 'Warwick': 28,
 'c.l.h.warwick@durham.ac.uk': 29,
 'how': 30,
 'do': 31,
 'person': 32,
 'look': 33,
 'at': 34,
 'experience': 35,
 '?': 36,
 'which': 37,
 'element': 38,
 'specific': 39,
 'artwork': 40,
 'they': 41,
 'focus': 42,
 'on': 43,
 'museum': 44,
 'label': 45,
 'have': 46,
 'a': 47,
 'impact': 48,
 'the': 49,
 'viewing': 50,
 'be': 51,
 'complex': 52,
 'one': 53,
 'involve': 54,
 'issue': 55,
 'perception': 56,
 'attention': 57,
 'memory': 58,
 'decision': 59,
 'make': 60,
 'affect': 61,
 'emotion': 62,
 '.': 63,
 'thus

In [None]:
def get_POS_tags(token,sent_id):
    return df_train[(df_train['sent_id'] == sent_id) & (df_train['normalized_word'] == token)]['POS_tag'].values[0]

In [None]:
POS_tags=df_train['POS_tag'].unique()
POS_tags

array(['JJ', 'NN', 'CC', ':', 'NNS', 'IN', 'HYPH', 'NNP', ',', 'WRB',
       'VBP', 'VB', '.', 'WDT', 'PRP', 'DT', 'VBZ', 'VBG', 'RB', 'MD',
       'PRP$', 'VBN', 'FW', 'SYM', 'VBD', 'TO', 'RP', '-LRB-', '-RRB-',
       'POS', 'CD', 'EX', 'LS', 'JJR', 'RBR', 'WP$', 'JJS', 'PDT', '``',
       "''", 'WP', 'NNPS', 'UH', 'RBS', 'GW', '$'], dtype=object)

In [None]:
df_train.head(10)

Unnamed: 0,sent_id,text,word,normalized_word,POS_tag,head_word_index,dependency_relation,normalized_word_index,head_word,dep_graph
0,GUM_academic_art-1,Aesthetic Appreciation and Spanish Art:,Aesthetic,aesthetic,JJ,2,amod,1,appreciation,"(appreciation, aesthetic)"
1,GUM_academic_art-1,Aesthetic Appreciation and Spanish Art:,Appreciation,appreciation,NN,0,root,2,root,"(root, appreciation)"
2,GUM_academic_art-1,Aesthetic Appreciation and Spanish Art:,and,and,CC,5,cc,3,art,"(art, and)"
3,GUM_academic_art-1,Aesthetic Appreciation and Spanish Art:,Spanish,Spanish,JJ,5,amod,4,art,"(art, Spanish)"
4,GUM_academic_art-1,Aesthetic Appreciation and Spanish Art:,Art,art,NN,2,conj,5,appreciation,"(appreciation, art)"
5,GUM_academic_art-1,Aesthetic Appreciation and Spanish Art:,:,:,:,2,punct,6,appreciation,"(appreciation, :)"
6,GUM_academic_art-2,Insights from Eye-Tracking,Insights,insight,NNS,0,root,1,root,"(root, insight)"
7,GUM_academic_art-2,Insights from Eye-Tracking,from,from,IN,5,case,2,tracking,"(tracking, from)"
8,GUM_academic_art-2,Insights from Eye-Tracking,Eye,eye,NN,5,compound,3,tracking,"(tracking, eye)"
9,GUM_academic_art-2,Insights from Eye-Tracking,-,-,HYPH,3,punct,4,eye,"(eye, -)"


In [None]:
get_POS_tags('Spanish', 'GUM_academic_art-1' )

'JJ'

In [None]:
POS_mapping = {pos: i for i, pos in enumerate(POS_tags)}
POS_mapping

{'JJ': 0,
 'NN': 1,
 'CC': 2,
 ':': 3,
 'NNS': 4,
 'IN': 5,
 'HYPH': 6,
 'NNP': 7,
 ',': 8,
 'WRB': 9,
 'VBP': 10,
 'VB': 11,
 '.': 12,
 'WDT': 13,
 'PRP': 14,
 'DT': 15,
 'VBZ': 16,
 'VBG': 17,
 'RB': 18,
 'MD': 19,
 'PRP$': 20,
 'VBN': 21,
 'FW': 22,
 'SYM': 23,
 'VBD': 24,
 'TO': 25,
 'RP': 26,
 '-LRB-': 27,
 '-RRB-': 28,
 'POS': 29,
 'CD': 30,
 'EX': 31,
 'LS': 32,
 'JJR': 33,
 'RBR': 34,
 'WP$': 35,
 'JJS': 36,
 'PDT': 37,
 '``': 38,
 "''": 39,
 'WP': 40,
 'NNPS': 41,
 'UH': 42,
 'RBS': 43,
 'GW': 44,
 '$': 45}

In [None]:
def get_DEP_tags(token, sent_id):
    return df_train[(df_train['sent_id'] == sent_id) & (df_train['normalized_word'] == token)]['dependency_relation'].values[0]

In [None]:
get_DEP_tags('Spanish', 'GUM_academic_art-1' )

'amod'

In [None]:
dep_tags= df_train['dependency_relation'].unique()
dep_tags

array(['amod', 'root', 'cc', 'conj', 'punct', 'case', 'compound', 'nmod',
       'flat', 'list', 'advmod', 'aux', 'nsubj', 'obl', 'det', 'obj',
       'advcl:relcl', 'cop', 'acl', 'acl:relcl', 'mark', 'nmod:poss',
       'advcl', 'nsubj:pass', 'aux:pass', 'nsubj:outer', 'compound:prt',
       'appos', 'fixed', 'obl:agent', 'csubj', 'dep', 'nmod:tmod',
       'nummod', 'expl', 'parataxis', 'xcomp', 'cc:preconj', 'ccomp',
       'det:predet', 'nmod:npmod', 'csubj:outer', 'iobj', 'obl:tmod',
       'obl:npmod', 'orphan', 'csubj:pass', 'goeswith', 'vocative',
       'discourse', 'reparandum', 'dislocated'], dtype=object)

In [None]:
dep_mapping = {dep: i for i, dep in enumerate(dep_tags)}
dep_mapping

{'amod': 0,
 'root': 1,
 'cc': 2,
 'conj': 3,
 'punct': 4,
 'case': 5,
 'compound': 6,
 'nmod': 7,
 'flat': 8,
 'list': 9,
 'advmod': 10,
 'aux': 11,
 'nsubj': 12,
 'obl': 13,
 'det': 14,
 'obj': 15,
 'advcl:relcl': 16,
 'cop': 17,
 'acl': 18,
 'acl:relcl': 19,
 'mark': 20,
 'nmod:poss': 21,
 'advcl': 22,
 'nsubj:pass': 23,
 'aux:pass': 24,
 'nsubj:outer': 25,
 'compound:prt': 26,
 'appos': 27,
 'fixed': 28,
 'obl:agent': 29,
 'csubj': 30,
 'dep': 31,
 'nmod:tmod': 32,
 'nummod': 33,
 'expl': 34,
 'parataxis': 35,
 'xcomp': 36,
 'cc:preconj': 37,
 'ccomp': 38,
 'det:predet': 39,
 'nmod:npmod': 40,
 'csubj:outer': 41,
 'iobj': 42,
 'obl:tmod': 43,
 'obl:npmod': 44,
 'orphan': 45,
 'csubj:pass': 46,
 'goeswith': 47,
 'vocative': 48,
 'discourse': 49,
 'reparandum': 50,
 'dislocated': 51}

arc_set is a list storing set of arcs in the form of tuples(head_word, dependent_word)

In feature vector we wil 1st create a binary vector (Z) showing configuration depending on features without considering transitions so it will be of size k= (2*|V|+3*|p| +4*|R|).and then we will just insert this vector Z of size k into a vector T of all transitiosn ahvinf size of 4*k appropriately based on the transitions.

Transition type:  Left-Arc, Right-Arc, Shift, Reduce

In [None]:
def create_binary_vector( V,P,R, top_stack_index, first_buffer_index, top_pos_index, first_pos_index, second_pos_index, top_dep_index, top_ldep_index, top_rdep_index, buffer_ldep_index ):
    binary_vector = np.zeros((2*V + 3*P + 4*R)) # Binary vector which are storing the features irrespective of the transitions type
    if top_stack_index is not None:
        binary_vector[top_stack_index] = 1 # TOP(S) feature having size |V|
    if first_buffer_index is not None:
        binary_vector[V + first_buffer_index] = 1 # FIRST(B) feature having size |V|
    if top_pos_index is not None:
        binary_vector[2*V + top_pos_index] =  1 # TOP.POS(S) feature having size |P|
    if first_pos_index is not None:
        binary_vector[2*V + P + first_pos_index] =  1 # FIRST.POS(S) feature having size |P|
    if second_pos_index is not None:
        binary_vector[2*V + 2*P + second_pos_index] =  1 # LOOK.POS(S) feature having size |P|
    if top_dep_index is not None:
        binary_vector[2*V + 3*P + top_dep_index] = 1 # TOP.DEP(S) fature having size  |R|
    if top_ldep_index is not None:
        binary_vector[2*V + 3*P + R + top_ldep_index] = 1 # TOP.LDEP(S) fature having size  |R|
    if top_rdep_index is not None:
        binary_vector[2*V + 3*P + 2*R + top_rdep_index] = 1 # TOP.RDEP(S) fature having size  |R|
    if buffer_ldep_index is not None:
        binary_vector[2*V + 3*P + 3*R + buffer_ldep_index] = 1 # FIRST.LDEP(S) fature having size  |R|

    return binary_vector

In [None]:
def feature_vector_train(stack, buffer, arc_set, transition_type, sent_id):
    df_1= df_train[df_train['sent_id'] == sent_id] # Filtering dataframe so that it will be easier to access the values corresponding the sent_id of senetence for which we are calculating the feature vectors
    # Initialize feature vectors
    V = len(token_mapping)
    P = len(POS_mapping)
    R = len(dep_mapping)
    feature_vector = np.zeros(4 * (2*V + 3*P + 4*R)) # Feature vector as per tranisiton and configurations


    # Initializing indices corresponding to each feature as None so that if any feature is not found in particular configuration then nothing will assign 1 corresponding to that feature
    top_stack_index, first_buffer_index, top_pos_index, first_pos_index, second_pos_index, top_dep_index, top_ldep_index, top_rdep_index, buffer_ldep_index = (None,) * 9

    # Extract tokens, POS tags, and dependency relations based on stack and buffer
    # TOP = top(S) token, len(stack) is set for greater than 1 means that if there is only sigle token in stack then it must be root and as it is dummy word we will not consider it for feature vector creation
    top_stack_token = stack[-1] if (len(stack)> 1) else None
    top_stack_index = token_mapping[top_stack_token] if top_stack_token else None


    # FIRST = first(B) token
    first_buffer_token = buffer[0] if buffer else None
    first_buffer_index = token_mapping[first_buffer_token] if first_buffer_token else None


    # TOP.POS = POS-Tag of Top(S)
    # First check whether stack is empty or not and top_stack_token exist or not
    if stack and top_stack_token:
        if get_POS_tags(top_stack_token,sent_id):
            top_pos_index = POS_mapping[get_POS_tags(top_stack_token,sent_id)]


    # FIRST.POS = POS-Tag of First(B)
    # first check whether buffer is empty or not
    if buffer:
        if get_POS_tags(first_buffer_token, sent_id):
            first_pos_index = POS_mapping[get_POS_tags(first_buffer_token, sent_id)]


    # LOOK.POS = POS-Tag of first(B-{first(B)})
    if len(buffer) > 1:
        second_token = buffer[1]
        if get_POS_tags(second_token,sent_id):
            second_pos_index = POS_mapping[get_POS_tags(second_token,sent_id)]


    if len(stack) > 0:
        top_word = stack[-1]  # Top word in the stack
        # Check if top_word is a dependent in any arc in arc_set
        for arc in arc_set:
            if arc[1] == top_word and arc[0]!= None:
                top_dep_index = dep_mapping[get_DEP_tags(top_word, sent_id)]
                break


        # Find all arcs where top_word is the head
        top_ldep_arcs = [(head, dependent) for head, dependent in arc_set if head == top_word]
        # Find the left-most word among all dependents based on their positions in the sentence
        if top_ldep_arcs:
            dep_indices_l = [df_1[df_1['normalized_word'] == dependent]['normalized_word_index'].values[0] for _, dependent in top_ldep_arcs]
            sent_left_most_word = df_1[df_1['normalized_word_index'] == min(dep_indices_l)]['normalized_word'].item()
            # If left-most word is found, retrieve its dependency tag
            if sent_left_most_word is not None:
                top_ldep_index = dep_mapping[get_DEP_tags(sent_left_most_word, sent_id)]


        # Find the right-most word connected to the top word in arc_set
        top_rdep_arcs = [(head, dependent) for head, dependent in arc_set if head == top_word]
        # Find the right-most word among all dependents based on their positions in the sentence
        if top_rdep_arcs:
            dep_indices_r = [df_1[df_1['normalized_word'] == dependent]['normalized_word_index'].values[0] for _, dependent in top_rdep_arcs]
            sent_right_most_word = df_1[df_1['normalized_word_index'] == min(dep_indices_r)]['normalized_word'].item()
            # If left-most word is found, retrieve its dependency tag
            if sent_right_most_word is not None:
                top_rdep_index = dep_mapping[get_DEP_tags(sent_right_most_word, sent_id)]


    if len(buffer) > 0:
        # Find all arcs where top_word is the head
        first_word = buffer[0]
        first_ldep_arcs = [(head, dependent) for head, dependent in arc_set if head == first_word]
        # Find the left-most word among all dependents based on their positions in the sentence
        if first_ldep_arcs:
            buffer_dep_indices_l = [df_1[df_1['normalized_word'] == dependent]['normalized_word_index'].values[0] for _, dependent in first_ldep_arcs]
            buffer_left_most_word = df_1[df_1['normalized_word_index'] == min(buffer_dep_indices_l)]['normalized_word'].item()
            # If left-most word is found, retrieve its dependency tag
            if buffer_left_most_word is not None:
              buffer_ldep_index = dep_mapping[get_DEP_tags(buffer_left_most_word, sent_id)]


    binary_vector = create_binary_vector(V, P, R, top_stack_index, first_buffer_index, top_pos_index, first_pos_index, second_pos_index, top_dep_index, top_ldep_index, top_rdep_index, buffer_ldep_index )
    k = binary_vector.shape[0]

    # Inject the binary vector to feature vector based on the transition type
    if transition_type == "LEFT_ARC":  # Left-Arc
        feature_vector[:k] = binary_vector
    elif transition_type == "RIGHT_ARC":  # Right-Arc
        feature_vector[k: 2*k] = binary_vector
    elif transition_type == "SHIFT":  # Shift
        feature_vector[2 * k:3 * k] = binary_vector
    elif transition_type == "REDUCE":  # Reduce
        feature_vector[3 * k:] = binary_vector

    return feature_vector

I have separately defined functions to perform the execution of each transition. This will take stack, buffer, and arc_set in input and depending on the transition perform certain operations and return the updated stack, buffer, and arc_set. This function only does direction execution, not check the condition. So, we will use this function when we are sure that we have to perform certain transitions.

In [None]:
def do_LEFT_ARC(stack, buffer, arc_set):
    """ Add top(S) <- first(B) to A, Pop top(S) """
    if stack and buffer:
        head_word = buffer[0]
        dependent_word = stack[-1]
        arc_set.append((head_word, dependent_word))
        stack.pop()
        # print("LEFT_ARC DONE")
    return stack, buffer, arc_set

def do_RIGHT_ARC(stack, buffer, arc_set):
    """ Add top(S) -> first(B) to A, Pop first(B) and Push to S """
    if stack and buffer:
        head_word = stack[-1]
        dependent_word = buffer[0]
        arc_set.append((head_word, dependent_word))
        buffer.pop(0)
        stack.append(dependent_word)
        # print("RIGHT_ARC DONE")
    return stack, buffer, arc_set

def do_REDUCE(stack, buffer, arc_set):
    """  Pop top(S)  """
    if stack:
        stack.pop()
        # print("REDUCE DONE")
    return stack, buffer, arc_set

def do_SHIFT(stack, buffer, arc_set):
    """ Pop first(B) and Push to S  """
    if buffer:
        word = buffer[0]
        buffer.pop(0)
        stack.append(word)
        # print("SHIFT DONE")
    return stack, buffer, arc_set

 Apply the oracle heuristics to determine the next transition.


Here D is gold-standard dependency graph which stores all the arcs relationships which is given in training set. While A is arc_set which is storing the arcs which are processed/ discovered till now during training

Here mapping of transition is done as follows:

LEFR_ARC - 0

RIGHT_ARC - 1

SHIFT - 2

REDUCE - 3

In [None]:
def oracle(stack, buffer, arc_set, D):
    if stack and buffer:
        top_S= stack[-1]
        first_B = buffer[0]

        # Condition 1: If top(S) <- first(B) in D and head of top(S) is not dicovered means ( * ,top(S) ) not in A, then LEFT-ARC
        if (first_B, top_S) in D and (top_S not in [arc[1] for arc in arc_set]):
            stack, buffer, arc_set = do_LEFT_ARC(stack, buffer, arc_set)
            return 0, stack, buffer, arc_set

        # Condition 2: If top(S) -> first(B) in D, then RIGHT-ARC
        elif (top_S, first_B) in D:
            stack, buffer, arc_set = do_RIGHT_ARC(stack, buffer, arc_set)
            return 1, stack, buffer, arc_set

        # Condition 3: If head of top(S) is dicovered means ( * ,top(S) ) is in A and there exists w in S such that w is not top(S), w <- first(B) in D or w -> first(B) in D, then REDUCE
        elif (top_S in [arc[1] for arc in arc_set]) and any(((w, first_B) in D or (first_B, w) in D) for w in stack if w != top_S):
            stack, buffer, arc_set = do_REDUCE(stack, buffer, arc_set)
            return 3, stack, buffer, arc_set

        # Condition 4:  SHIFT
        else:
            stack, buffer, arc_set = do_SHIFT(stack, buffer, arc_set)
            return 2, stack, buffer, arc_set

    # Condition 4:  SHIFT
    else:
        stack, buffer, arc_set = do_SHIFT(stack, buffer, arc_set)
        return 2, stack, buffer, arc_set

Function to train a parser on training data

Online learning algorithm using the oracle for creating a dependency graph.

In [None]:
def train(df_train, epochs):
    K = epochs  # Number of iterations over training data
    N = len(df_train.groupby('sent_id'))  # Number of instances/sentences in training data
    V = len(token_mapping)
    P = len(POS_mapping)
    R = len(dep_mapping)
    shape = 4 * (2*V + 3*P + 4*R)
    w = np.random.rand(shape).astype(np.float32) # Initialize classifier weights

    # for i in range(1, K + 1):
    for epoch in range(1, K + 1):
        for index, group in tqdm(df_train.groupby('sent_id'), desc=f"Processing Groups - Epoch {epoch}", leave=False):
            S=['root']
            # S = []
            B = group['normalized_word'].tolist()
            arc_set = []
            sent_id = group['sent_id'].iloc[0]
            print(f"Sent_id: {sent_id}")
            D = group['dep_graph'].tolist() # Dependency graph having Gold- standard arc and their tags
            j=0
            while B != []:
                t_classifier = np.argmax(np.dot(w, feature_vector_train(S, B, arc_set, t, sent_id)) for t in ['LEFT-ARC', 'RIGHT-ARC', 'SHIFT', 'REDUCE'])

                S_copy, B_copy, arc_set_copy = deepcopy(S), deepcopy(B), deepcopy(arc_set)
                t_oracle, S_oracle, B_oracle, arc_set_oracle = oracle(S, B, arc_set, D)
                # print(f"classifier transition : {t_classifier},  oracle transition : {t_oracle}")
                if t_classifier != t_oracle:
                  # Updating of weights toward true transition away from wronly predicted transition
                    w += feature_vector_train(S_copy, B_copy, arc_set_copy, t_oracle, sent_id) - feature_vector_train(S_copy, B_copy, arc_set_copy, t_classifier, sent_id)
                    S, B, arc_set = ( S_oracle,B_oracle,arc_set_oracle)
                j = j+1
    return w

In [None]:
weights = train(df_train, epochs= 2)

In [None]:
print(weights)

In [None]:
weights_array = np.array(weights)
# Save the numpy array to a file called "weights.npy"
np.save("weights_model_dep.npy", weights_array)

# Testing

  Crating a majority classifier for features like x.yDEP

Creating a new column having POS Tag of head word and another column which will store pair of POS Tags of head word and corresponding word in the form of (POS_head_word, POS_word)

In [None]:
df1 = df_train
for index, row in df1.iterrows():
    df_2 = df1[(df1['sent_id'] == row['sent_id'])]
    df_2 = df_2[int(row['head_word_index']) ==  df_2['normalized_word_index']]
    if int(row['head_word_index'])== 0:
      df1.at[index, 'head_word_POS'] = 'root'
    elif not df_2.empty:
        head_word = df_2['POS_tag'].iloc[0]
        df1.at[index, 'head_word_POS'] = head_word
df_train=df1
df_train['POS_Tag_pair']= list(zip(df_train['head_word_POS'], df_train['POS_tag']))
df_train.head(25)

Unnamed: 0,sent_id,text,word,normalized_word,POS_tag,head_word_index,dependency_relation,normalized_word_index,head_word,dep_graph,head_word_POS,POS_Tag_pair
0,GUM_academic_art-1,Aesthetic Appreciation and Spanish Art:,Aesthetic,aesthetic,JJ,2,amod,1,appreciation,"(appreciation, aesthetic)",NN,"(NN, JJ)"
1,GUM_academic_art-1,Aesthetic Appreciation and Spanish Art:,Appreciation,appreciation,NN,0,root,2,root,"(root, appreciation)",root,"(root, NN)"
2,GUM_academic_art-1,Aesthetic Appreciation and Spanish Art:,and,and,CC,5,cc,3,art,"(art, and)",NN,"(NN, CC)"
3,GUM_academic_art-1,Aesthetic Appreciation and Spanish Art:,Spanish,Spanish,JJ,5,amod,4,art,"(art, Spanish)",NN,"(NN, JJ)"
4,GUM_academic_art-1,Aesthetic Appreciation and Spanish Art:,Art,art,NN,2,conj,5,appreciation,"(appreciation, art)",NN,"(NN, NN)"
5,GUM_academic_art-1,Aesthetic Appreciation and Spanish Art:,:,:,:,2,punct,6,appreciation,"(appreciation, :)",NN,"(NN, :)"
6,GUM_academic_art-2,Insights from Eye-Tracking,Insights,insight,NNS,0,root,1,root,"(root, insight)",root,"(root, NNS)"
7,GUM_academic_art-2,Insights from Eye-Tracking,from,from,IN,5,case,2,tracking,"(tracking, from)",NN,"(NN, IN)"
8,GUM_academic_art-2,Insights from Eye-Tracking,Eye,eye,NN,5,compound,3,tracking,"(tracking, eye)",NN,"(NN, NN)"
9,GUM_academic_art-2,Insights from Eye-Tracking,-,-,HYPH,3,punct,4,eye,"(eye, -)",NN,"(NN, HYPH)"


After creating a new column of POS Tag pair of head word and corresponding word, we will find dependency  relation having maximum frequency for each POS Tag pair.

In [None]:
pos_freq = df_train.groupby('POS_Tag_pair')['dependency_relation'].value_counts().reset_index(name='count')
pos_majority_rel = pos_freq.loc[pos_freq.groupby('POS_Tag_pair')['count'].idxmax()].reset_index().set_index('POS_Tag_pair')['dependency_relation'].to_dict()

In [None]:
pos_majority_rel['NNP', 'NNP']

'flat'

Finding the head word for each word in the test data so as to used it at time of UAS calculation

In [None]:
df_test['normalized_word_index'] = df_test.groupby('sent_id').cumcount()+ 1

In [None]:
df1 = df_test
for index, row in df1.iterrows():
    df_2 = df1[(df1['sent_id'] == row['sent_id'])]
    df_2 = df_2[int(row['head_word_index']) ==  df_2['normalized_word_index']]
    if int(row['head_word_index'])== 0:
      df1.at[index, 'head_word'] = 'root'
    elif not df_2.empty:
        head_word = df_2['normalized_word'].iloc[0]
        df1.at[index, 'head_word'] = head_word
df_test=df1
# df_test.head(25)

Creating a new column so as to store the (head_word, dependent_word) for each word in each sentences in training data. This will be our True_arc

In [None]:
df_test['dep_graph_True']= list(zip(df_test['head_word'], df_test['normalized_word']))

Crearing a feature vector for test using the updated defination of x.yDEP

In [None]:
def feature_vector_test(stack, buffer, arc_set, transition_type, sent_id):
    df_1= df_test[df_test['sent_id'] == sent_id]
    # Initialize feature vectors
    V = len(token_mapping)
    P = len(POS_mapping)
    R = len(dep_mapping)
    feature_vector = np.zeros(4 * (2*V + 3*P + 4*R)) # Feature vector as per tranisiton and configurations

    # Initializing indices corresponding to each feature as None so that if any feature is not found in particular configuration then nothing will assign 1 corresponding to that feature
    top_stack_index, first_buffer_index, top_pos_index, first_pos_index, second_pos_index, top_dep_index, top_ldep_index, top_rdep_index, buffer_ldep_index = (None,) * 9

    # Extract tokens, POS tags, and dependency relations based on stack and buffer
    # TOP = top(S) token, len(stack) is set for greater than 1 means that if there is only sigle token in stack then it must be root and as it is dummy word we will not consider it for feature vector creation
    top_stack_token = stack[-1] if (len(stack)> 1) else None
    # top_stack_token = stack[-1] if (stack) else None
    top_stack_index = token_mapping[top_stack_token] if top_stack_token else None


    # FIRST = first(B) token
    first_buffer_token = buffer[0] if buffer else None
    first_buffer_index = token_mapping[first_buffer_token] if first_buffer_token else None


    # TOP.POS = POS-Tag of Top(S)
    # First check whether stack is empty or not
    if stack and top_stack_token:
        if get_POS_tags(top_stack_token,sent_id):
            top_pos_index = POS_mapping[get_POS_tags(top_stack_token,sent_id)]


    # FIRST.POS = POS-Tag of First(B)
    # first check whether buffer is empty or not
    if buffer:
        if get_POS_tags(first_buffer_token, sent_id):
            first_pos_index = POS_mapping[get_POS_tags(first_buffer_token, sent_id)]


    # LOOK.POS = POS-Tag of first(B-{first(B)})
    if len(buffer) > 1:
        second_token = buffer[1]
        if get_POS_tags(second_token,sent_id):
            second_pos_index = POS_mapping[get_POS_tags(second_token,sent_id)]


    if len(stack) > 0:
        top_word = stack[-1]  # Top word in the stack
        # Check if top_word is a dependent in any arc in arc_set
        for arc in arc_set:
            if arc[1] == top_word and arc[0]!= None:
                head_word_POS = get_POS_tags(arc[0], sent_id)
                dependent_word_POS = get_POS_tags(arc[1], sent_id)
                top_dep_index = dep_mapping[pos_majority_rel[(head_word_POS, dependent_word_POS)]]
                break


        # Find all arcs where top_word is the head
        top_ldep_arcs = [(head, dependent) for head, dependent in arc_set if head == top_word]
        # Find the left-most word among all dependents based on their positions in the sentence
        if top_ldep_arcs:
            dep_indices_l = [df_1[df_1['normalized_word'] == dependent]['normalized_word_index'].values[0] for _, dependent in top_ldep_arcs]
            sent_left_most_word = df_1[df_1['normalized_word_index'] == min(dep_indices_l)]['normalized_word'].item()
            # If left-most word is found, retrieve its dependency tag
            if sent_left_most_word is not None:
                head_word_POS = get_POS_tags(top_word, sent_id)
                dependent_word_POS = get_POS_tags(sent_left_most_word, sent_id)
                top_ldep_index = dep_mapping[pos_majority_rel[(head_word_POS, dependent_word_POS)]]


        # Find the right-most word connected to the top word in arc_set
        top_rdep_arcs = [(head, dependent) for head, dependent in arc_set if head == top_word]
        # Find the right-most word among all dependents based on their positions in the sentence
        if top_rdep_arcs:
            dep_indices_r = [df_1[df_1['normalized_word'] == dependent]['normalized_word_index'].values[0] for _, dependent in top_rdep_arcs]
            sent_right_most_word = df_1[df_1['normalized_word_index'] == min(dep_indices_r)]['normalized_word'].item()
            # If left-most word is found, retrieve its dependency tag
            if sent_right_most_word is not None:
                head_word_POS = get_POS_tags(top_word, sent_id)
                dependent_word_POS = get_POS_tags(sent_right_most_word, sent_id)
                top_rdep_index = dep_mapping[pos_majority_rel[(head_word_POS, dependent_word_POS)]]


    if len(buffer) > 0:
        # Find all arcs where top_word is the head
        first_word = buffer[0]
        first_ldep_arcs = [(head, dependent) for head, dependent in arc_set if head == first_word]
        # Find the left-most word among all dependents based on their positions in the sentence
        if first_ldep_arcs:
            buffer_dep_indices_l = [df_1[df_1['normalized_word'] == dependent]['normalized_word_index'].values[0] for _, dependent in first_ldep_arcs]
            buffer_left_most_word = df_1[df_1['normalized_word_index'] == min(buffer_dep_indices_l)]['normalized_word'].item()
            # If left-most word is found, retrieve its dependency tag
            if buffer_left_most_word is not None:
              head_word_POS = get_POS_tags(first_word, sent_id)
              dependent_word_POS = get_POS_tags(buffer_left_most_word, sent_id)
              buffer_ldep_index = dep_mapping[pos_majority_rel[(head_word_POS, dependent_word_POS)]]


    binary_vector = create_binary_vector(V, P, R, top_stack_index, first_buffer_index, top_pos_index, first_pos_index, second_pos_index, top_dep_index, top_ldep_index, top_rdep_index, buffer_ldep_index )
    k = binary_vector.shape[0]
    # Inject the binary vector to feature vector based on the transition type
    if transition_type == "LEFT_ARC":  # Left-Arc
        feature_vector[:k] = binary_vector
    elif transition_type == "RIGHT_ARC":  # Right-Arc
        feature_vector[k: 2*k] = binary_vector
    elif transition_type == "SHIFT":  # Shift
        feature_vector[2 * k:3 * k] = binary_vector
    elif transition_type == "REDUCE":  # Reduce
        feature_vector[3 * k:] = binary_vector

    return feature_vector

Updated function of performinf transition based on certian new heuristics rule given in problem statement for the test data.

In [None]:
def do_LEFT_ARC_test(stack, buffer, arc_set):
    """ Add top(S) <- first(B) to A, Pop top(S) """
    if stack and buffer:
        head_word = buffer[0]
        dependent_word = stack[-1]
        arc_set.append((head_word, dependent_word))
        stack.pop()
        # print("LEFT_ARC DONE")
        return stack, buffer, arc_set
    elif not stack:
      return do_SHIFT_test(stack, buffer, arc_set)

def do_RIGHT_ARC_test(stack, buffer, arc_set):
    """ Add top(S) -> first(B) to A, Pop first(B) and Push to S """
    if stack and buffer:
        head_word = stack[-1]
        dependent_word = buffer[0]
        arc_set.append((head_word, dependent_word))
        buffer.pop(0)
        stack.append(dependent_word)
        # print("RIGHT_ARC DONE")
        return stack, buffer, arc_set
    elif not stack : # stack is empty then do SHIFT
        return do_SHIFT_test(stack, buffer, arc_set)

def do_REDUCE_test(stack, buffer, arc_set):
    """  Pop top(S)  """
    if stack:
        stack.pop()
        # print("REDUCE DONE")
    return stack, buffer, arc_set

def do_SHIFT_test(stack, buffer, arc_set):
    """ Pop first(B) and Push to S  """
    if buffer:
        word = buffer[0]
        buffer.pop(0)
        stack.append(word)
        # print("SHIFT DONE")
    return stack, buffer, arc_set

Here mapping of transition is done as follows:

LEFR_ARC - 0, RIGHT_ARC - 1, SHIFT - 2, REDUCE - 3

Function to calculate UAS score. Here True Arc is a list of Arcs where each arc contains word and their head word as given in test data in form (True_head_word, True_dependent_word). While pred_arc is a list of predicted arcs which is of form ( Pred_head_word, Pred_dependent_word) which are predicted by our model.

In [None]:
def calculate_UAS_score(sentence, true_arc, pred_arc):
  count = 0
  # Convert list of tuples to a dictionary where key is the dependent and value is the head
  true_arc_dict = {dep: head for head, dep in true_arc}
  pred_arc_dict = {dep: head for head, dep in pred_arc}

  # Iterate over the tokens in the sentence
  for token in sentence:
      true_head = true_arc_dict.get(token)
      pred_head = pred_arc_dict.get(token)
      # print(f"true head: {true_head}, pred_head: {pred_head} ")
      # If the predicted head and true head match, and the token was indeed found as a dependent, count a correct arc
      if true_head is not None and true_head == pred_head:
          count += 1

  return count / len(true_arc) if true_arc else 0  # Prevent division by zero

In [None]:
weights_train = np.load("weights_model_dep.npy")
print(weights_train)

[0.28697222 0.27081656 0.22608441 ... 0.16209386 0.7112532  0.3578552 ]


In [None]:
def test(df_test):
    output_test = {
    'sent_id': [],
    'token number': [],
    'word': [],
    'predicted_head_token': []
    }

    N = len(df_test.groupby('sent_id'))  # Number of instances/sentences in testing data
    V = len(token_mapping)
    P = len(POS_mapping)
    R = len(dep_mapping)
    shape = 4 * (2*V + 3*P + 4*R)
    avg_UAS_score = []
    for index, group in df_test.groupby('sent_id'):
        S=['root']
        # S = []
        B = group['normalized_word'].tolist()
        sentence = group['normalized_word'].tolist()
        arc_set = []
        sent_id = group['sent_id'].iloc[0]
        True_arc = group['dep_graph_True'].tolist()
        j=0
        while B != []:
            # print(f"Instance of senetence: {j}")
            t_classifier = np.argmax(np.dot(weights_train, feature_vector_test(S, B, arc_set, t, sent_id)) for t in ['LEFT-ARC', 'RIGHT-ARC', 'SHIFT', 'REDUCE'])
            # print(f"classifier transition : {t_classifier}")
            if t_classifier == 0:
                S, B, arc_set = do_LEFT_ARC_test(S, B, arc_set)
            elif t_classifier == 1:
                S, B, arc_set = do_RIGHT_ARC_test(S, B, arc_set)
            elif t_classifier == 2:
                S, B, arc_set = do_SHIFT_test(S, B, arc_set)
            elif t_classifier == 3:
                S, B, arc_set = do_REDUCE_test(S, B, arc_set)
            j = j+1

        UAS_score = calculate_UAS_score(sentence, True_arc, arc_set)
        print(f"\n Testing Example sent_id : {sent_id}, UAS Score: {UAS_score}")
        avg_UAS_score.append(UAS_score)
        pred_arc_dict = {dep: head for head, dep in arc_set}
        # print(arc_dict)
        # Store the test output for each word in the sentence
        print(f"sentence: {sentence}, arc_set: {arc_set}")
        i=1
        for word in sentence:
            output_test['sent_id'].append(sent_id)
            output_test['token number'].append(i)
            i = i+1
            output_test['word'].append(word)
            predicted_head = pred_arc_dict.get(word)  # Using get() avoids KeyError if the word is not found
            # print(f"predicted head: {predicted_head}")
            output_test['predicted_head_token'].append(predicted_head)

    return output_test, avg_UAS_score

In [None]:
test_output, avg_UAS_score = test(df_test)
print(f"Average UAS Score over Test Data: {np.mean(avg_UAS_score)}")


 Testing Example sent_id : GUM_academic_discrimination-1, UAS Score: 0.36363636363636365
sentence: ['the', 'prevalence', 'of', 'discrimination', 'across', 'racial', 'group', 'in', 'contemporary', 'America', ':'], arc_set: [('the', 'root'), ('prevalence', 'the'), ('of', 'prevalence'), ('discrimination', 'of'), ('across', 'discrimination'), ('racial', 'across'), ('group', 'racial'), ('in', 'group'), ('contemporary', 'in'), ('America', 'contemporary'), (':', 'America')]

 Testing Example sent_id : GUM_academic_discrimination-10, UAS Score: 0.2962962962962963
sentence: ['what', 'also', 'should', 'not', 'be', 'lose', 'in', 'discussion', 'of', 'discrimination', 'be', 'the', 'grow', 'push', 'to', 'implement', 'social', 'policy', 'aim', 'at', 'reduce', 'the', 'occurrence', 'of', 'discriminatory', 'practice', '.'], arc_set: [('what', 'root'), ('also', 'what'), ('should', 'also'), ('not', 'should'), ('be', 'not'), ('lose', 'be'), ('in', 'lose'), ('discussion', 'in'), ('of', 'discussion'), ('dis

In [None]:
df_test.head(10)

Unnamed: 0,sent_id,text,word,normalized_word,POS_tag,head_word_index,dependency_relation,normalized_word_index,head_word,dep_graph_True
0,GUM_academic_discrimination-1,The prevalence of discrimination across racial...,The,the,DT,1,det,1.0,prevalence,"(prevalence, the)"
1,GUM_academic_discrimination-1,The prevalence of discrimination across racial...,prevalence,prevalence,NN,2,root,2.0,root,"(root, prevalence)"
2,GUM_academic_discrimination-1,The prevalence of discrimination across racial...,of,of,IN,3,case,3.0,discrimination,"(discrimination, of)"
3,GUM_academic_discrimination-1,The prevalence of discrimination across racial...,discrimination,discrimination,NN,4,nmod,4.0,prevalence,"(prevalence, discrimination)"
4,GUM_academic_discrimination-1,The prevalence of discrimination across racial...,across,across,IN,5,case,5.0,group,"(group, across)"
5,GUM_academic_discrimination-1,The prevalence of discrimination across racial...,racial,racial,JJ,6,amod,6.0,group,"(group, racial)"
6,GUM_academic_discrimination-1,The prevalence of discrimination across racial...,groups,group,NNS,7,nmod,7.0,prevalence,"(prevalence, group)"
7,GUM_academic_discrimination-1,The prevalence of discrimination across racial...,in,in,IN,8,case,8.0,America,"(America, in)"
8,GUM_academic_discrimination-1,The prevalence of discrimination across racial...,contemporary,contemporary,JJ,9,amod,9.0,America,"(America, contemporary)"
9,GUM_academic_discrimination-1,The prevalence of discrimination across racial...,America,America,NNP,10,nmod,10.0,prevalence,"(prevalence, America)"


In [None]:
df_test_output = pd.DataFrame(test_output)
df_test_output.head(10)

Unnamed: 0,sent_id,token number,word,predicted_head_token
0,GUM_academic_discrimination-1,1,the,prevalence
1,GUM_academic_discrimination-1,2,prevalence,of
2,GUM_academic_discrimination-1,3,of,discrimination
3,GUM_academic_discrimination-1,4,discrimination,across
4,GUM_academic_discrimination-1,5,across,racial
5,GUM_academic_discrimination-1,6,racial,group
6,GUM_academic_discrimination-1,7,group,in
7,GUM_academic_discrimination-1,8,in,contemporary
8,GUM_academic_discrimination-1,9,contemporary,America
9,GUM_academic_discrimination-1,10,America,:


In [None]:
df_test_output['normalized_word_index'] = df_test_output.groupby('sent_id').cumcount()+ 1

In [None]:
for index, row in df_test_output.iterrows():
    if row['predicted_head_token'] == 'root':
        df_test_output.at[index, 'predicted head token number'] = 0
    else:
        df_2 = df_test_output[(df_test_output['sent_id'] == row['sent_id']) & (df_test_output['word'] == row['predicted_head_token'])]
        if not df_2.empty:
            predicted_head_token_index = int(df_2['normalized_word_index'].iloc[0])
            df_test_output.at[index, 'predicted head token number'] = int(predicted_head_token_index)

In [None]:
df_test_output.head(25)

Unnamed: 0,sent_id,token number,word,predicted_head_token,normalized_word_index,predicted head token number
0,GUM_academic_discrimination-1,1,the,prevalence,1,2.0
1,GUM_academic_discrimination-1,2,prevalence,of,2,3.0
2,GUM_academic_discrimination-1,3,of,discrimination,3,4.0
3,GUM_academic_discrimination-1,4,discrimination,across,4,5.0
4,GUM_academic_discrimination-1,5,across,racial,5,6.0
5,GUM_academic_discrimination-1,6,racial,group,6,7.0
6,GUM_academic_discrimination-1,7,group,in,7,8.0
7,GUM_academic_discrimination-1,8,in,contemporary,8,9.0
8,GUM_academic_discrimination-1,9,contemporary,America,9,10.0
9,GUM_academic_discrimination-1,10,America,:,10,11.0


In [None]:
df_test_output['predicted head token number'].isna().sum()

98

In [None]:
df_test_output.drop(columns=['normalized_word_index','predicted_head_token'], inplace = True)

Nan values in the column predicted_head_token_index shows that model was not able to find head word of corresponding word.

In [None]:
df_test_output = df_test_output[['sent_id', 'token number', 'word', 'predicted head token number']]
df_test_output.head()

Unnamed: 0,sent_id,token number,word,predicted head token number
0,GUM_academic_discrimination-1,1,the,2.0
1,GUM_academic_discrimination-1,2,prevalence,3.0
2,GUM_academic_discrimination-1,3,of,4.0
3,GUM_academic_discrimination-1,4,discrimination,5.0
4,GUM_academic_discrimination-1,5,across,6.0


In [None]:
tsv_file_path = 'dependency_predictions_on.tsv'

# Write the DataFrame to the TSV file
df_test_output.to_csv(tsv_file_path, sep='\t', index=False)
print("TSV file has been created successfully.")

TSV file has been created successfully.
