# Feature Extraction - "Head word & its POS", "Position to predicate", & "Governing category"

# Advanced NLP, Assignment 2

This notebook provides the code to extract head word of each token and its POS tag, position to predicate, and governing category

Data: Universal Propositions Bank 1.0 English data

The data used here is a small set of the entire data.

In [1]:
# Import dependencies
import pandas as pd

In [2]:
# Define header names for the column
header = ['ID','FORM','LEMMA','UPOS','XPOS','FEATS','HEAD','DEPREL','DEPS','MISC','UP:PRED','UP:ARGHEADS_1','UP:ARGHEADS_2','UP:ARGHEADS_3','UP:ARGHEADS_4','UP:ARGHEADS_5','UP:ARGHEADS_6','UP:ARGHEADS_7','UP:ARGHEADS_8','UP:ARGHEADS_9','UP:ARGHEADS_10','UP:ARGHEADS_11','UP:ARGHEADS_12','UP:ARGHEADS_13','UP:ARGHEADS_14','UP:ARGHEADS_15','UP:ARGHEADS_16','UP:ARGHEADS_17','UP:ARGHEADS_18','UP:ARGHEADS_19','UP:ARGHEADS_20','UP:ARGHEADS_21','UP:ARGHEADS_22','UP:ARGHEADS_23','UP:ARGHEADS_24','UP:ARGHEADS_25','UP:ARGHEADS_26','UP:ARGHEADS_27','UP:ARGHEADS_28','UP:ARGHEADS_29','UP:ARGHEADS_30','UP:ARGHEADS_31','UP:ARGHEADS_32','UP:ARGHEADS_33','UP:ARGHEADS_34','UP:ARGHEADS_35']

train_path = '../data/train_split_small.tsv'

In [7]:
# Open file with pandas
open_train = pd.read_csv(train_path, sep='\t', encoding='utf-8',quotechar='№')
# print(open_train)

# Create empty list to store all nested dictionaries
nested_dict_list = []


## HEAD AND POSITION

# Iterate over the new dataframe to find head and its XPOS of each token
for i in range(len(open_train)):
    lemma = open_train.loc[i, 'LEMMA']
    head = int(open_train.loc[i, 'HEAD'])
    xpos = open_train.loc[i, 'XPOS']
    token = open_train.loc[i, 'FORM']
    # if-else statement to print head and its POS
    if head == 0:
        head_lemma = 'ROOT'
        head_pos = 'ROOT'
    else:
        head_lemma = open_train.loc[head - 1, 'LEMMA']
        head_pos = open_train.loc[head - 1, 'XPOS']
    # Store new combination (lemma and pos) into new variable
    head_lemma_pos = f"{head_lemma}_{head_pos}"

    # Create empty nested dictionary to store all features being extracted
    nested_dict = {'token': token, 'head_lemma_pos': head_lemma_pos}
    
## POSITION TO PREDICATE
    # Choosing the columns to work on from dataframe for other 2 features
    lemma_train_col = open_train['LEMMA']
    pred_train_col = open_train['UP:PRED']
    sentID_train = open_train['Copy_ID']
    govern_train = open_train['DEPREL']
    
    # Concatenate the columns into one new dataframe
    df_train = pd.concat([sentID_train, lemma_train_col, pred_train_col, govern_train], axis=1)
    
    # Defining if the predicate is False or True
    predicate_status = False
    current_sentence_id = None
    
    # Iterating over dataframe to find predicate positions
    for j, row in df_train.iterrows():
        # Check if new sentence ID, restart predicate status
        if row['Copy_ID'] != current_sentence_id:
            predicate_status = False
            current_sentence_id = row['Copy_ID']

        # if position status is True then append to the list as predicate
        if row['UP:PRED'] != '_':
            predicate_status = True
            if i == j:
                nested_dict['position-to-pred'] = 'predicate'
        # if predicate_status is False and the current row has predicate, then append to the list as 'after'
        elif predicate_status:
            if i == j:
                nested_dict['position-to-pred'] = 'after'
        # if predicate_status is False and the current row has no predicate, then append to the list as 'before'
        else:
            if i == j:
                nested_dict['position-to-pred'] = 'before'
    
## GOVERNING CATEGORY

        # Iteraring over dataframe to find predicate positions
        # NSUBJ
        for i, row in df_train[df_train['DEPREL'] == 'nsubj'].iterrows():
            # Check if new sentence ID, restart predicate status
            if row['Copy_ID'] != current_sentence_id:
                predicate_status = False
                current_sentence_id = row['Copy_ID']
            if row['UP:PRED'] != '_':
                predicate_status = True
#                 if i == j:
                nested_dict['governing_category'] = 'predicate'
            # if predicate_status is False and the current row has nsubj, then append to the list as 'nsubj_after'
            elif predicate_status:
#                 if i == j:
                nested_dict['governing_category'] = 'nsubj_after'
            # if predicate_status is False and the current row has no nsubj, then append to the list as 'nsubj_before'
            else:
                nested_dict['governing_category'] = 'nsubj_before'
        # DOBJ
        for i, row in df_train[df_train['DEPREL'] == 'dobj'].iterrows():
            # Check if new sentence ID, restart predicate status
            if row['Copy_ID'] != current_sentence_id:
                predicate_status = False
                current_sentence_id = row['Copy_ID']
            if row['UP:PRED'] != '_':
                predicate_status = True
                nested_dict['governing_category'] = 'predicate'
            # if predicate_status is False and the current row has dobj, then append to the list as 'dobj_after'
            elif predicate_status:
                nested_dict['governing_category'] = 'dobj_after'
            # if predicate_status is False and the current row has no dobj, then append to the list as 'dobj_after'
            else:
                nested_dict['governing_category'] = 'dobj_before'
        # IOBJ       
        for i, row in df_train[df_train['DEPREL'] == 'iobj'].iterrows():
            # Check if new sentence ID, restart predicate status
            if row['Copy_ID'] != current_sentence_id:
                predicate_status = False
                current_sentence_id = row['Copy_ID']
            if row['UP:PRED'] != '_':
                predicate_status = True
                nested_dict['governing_category'] = 'predicate'
            # if predicate_status is False and the current row has iobj, then append to the list as 'iobj_after'
            elif predicate_status:
                nested_dict['governing_category'] = 'iobj_after'
            # if predicate_status is False and the current row has no iobj, then append to the list as 'iobj_after'
            else:
                nested_dict['governing_category'] = 'iobj_before'
    
    nested_dict_list.append(nested_dict)

  open_train = pd.read_csv(train_path, sep='\t', encoding='utf-8',quotechar='№')


In [5]:
# Iterate over to not print the whole nested dict
for d in nested_dict_list:
    print(d)

{'token': 'Al', 'head_lemma_pos': 'ROOT_ROOT', 'position-to-pred': 'before', 'governing_category': 'nsubj_before'}
{'token': '-', 'head_lemma_pos': 'Al_NNP', 'governing_category': 'nsubj_before', 'position-to-pred': 'before'}
{'token': 'Zaman', 'head_lemma_pos': 'Al_NNP', 'governing_category': 'nsubj_before', 'position-to-pred': 'before'}
{'token': ':', 'head_lemma_pos': 'Al_NNP', 'governing_category': 'nsubj_before', 'position-to-pred': 'before'}
{'token': 'American', 'head_lemma_pos': 'force_NNS', 'governing_category': 'nsubj_before', 'position-to-pred': 'before'}
{'token': 'forces', 'head_lemma_pos': 'kill_VBD', 'governing_category': 'nsubj_before', 'position-to-pred': 'before'}
{'token': 'killed', 'head_lemma_pos': 'Al_NNP', 'governing_category': 'nsubj_before', 'position-to-pred': 'before'}
{'token': 'Shaikh', 'head_lemma_pos': 'kill_VBD', 'governing_category': 'nsubj_before', 'position-to-pred': 'before'}
{'token': 'Abdullah', 'head_lemma_pos': 'Shaikh_NNP', 'governing_category':

In [11]:
print(nested_dict_list)

[{'token': 'Al', 'head_lemma_pos': 'ROOT_ROOT', 'position-to-pred': 'before', 'governing_category': 'iobj_before'}, {'token': 'Al', 'head_lemma_pos': 'ROOT_ROOT', 'position-to-pred': 'before', 'governing_category': 'iobj_before'}, {'token': 'Al', 'head_lemma_pos': 'ROOT_ROOT', 'position-to-pred': 'before', 'governing_category': 'iobj_before'}, {'token': 'Al', 'head_lemma_pos': 'ROOT_ROOT', 'position-to-pred': 'before', 'governing_category': 'iobj_before'}, {'token': '-', 'head_lemma_pos': 'Al_NNP', 'position-to-pred': 'before', 'governing_category': 'iobj_before'}, {'token': '-', 'head_lemma_pos': 'Al_NNP', 'position-to-pred': 'before', 'governing_category': 'iobj_before'}, {'token': '-', 'head_lemma_pos': 'Al_NNP', 'position-to-pred': 'before', 'governing_category': 'iobj_before'}, {'token': '-', 'head_lemma_pos': 'Al_NNP', 'position-to-pred': 'before', 'governing_category': 'iobj_before'}, {'token': 'Zaman', 'head_lemma_pos': 'Al_NNP', 'position-to-pred': 'before', 'governing_categor