# Feature Extraction - "Head word & its POS", "Position to predicate", & "Governing category"

# Advanced NLP, Assignment 2

This notebook provides the code to extract head word of each token and its POS tag, position to predicate, and governing category

Data: Universal Propositions Bank 1.0 English data

The data used here is a small set of the entire data.

In [3]:
# Import dependencies
import pandas as pd

In [4]:
# Define header names for the column
header = ['ID','FORM','LEMMA','UPOS','XPOS','FEATS','HEAD','DEPREL','DEPS','MISC','UP:PRED','UP:ARGHEADS_1','UP:ARGHEADS_2','UP:ARGHEADS_3','UP:ARGHEADS_4','UP:ARGHEADS_5','UP:ARGHEADS_6','UP:ARGHEADS_7','UP:ARGHEADS_8','UP:ARGHEADS_9','UP:ARGHEADS_10','UP:ARGHEADS_11','UP:ARGHEADS_12','UP:ARGHEADS_13','UP:ARGHEADS_14','UP:ARGHEADS_15','UP:ARGHEADS_16','UP:ARGHEADS_17','UP:ARGHEADS_18','UP:ARGHEADS_19','UP:ARGHEADS_20','UP:ARGHEADS_21','UP:ARGHEADS_22','UP:ARGHEADS_23','UP:ARGHEADS_24','UP:ARGHEADS_25','UP:ARGHEADS_26','UP:ARGHEADS_27','UP:ARGHEADS_28','UP:ARGHEADS_29','UP:ARGHEADS_30','UP:ARGHEADS_31','UP:ARGHEADS_32','UP:ARGHEADS_33','UP:ARGHEADS_34','UP:ARGHEADS_35']

train_path = '../data/train_split.tsv'

In [5]:
# Open file with pandas
df_train = pd.read_csv(train_path, sep='\t', encoding='utf-8',quotechar='№')
# print(df_train)

  df_train = pd.read_csv(train_path, sep='\t', encoding='utf-8',quotechar='№')


In [13]:
def extract_head_info(df):
    """
    Function to extract head information (lemma and XPOS) for all tokens in the DataFrame.

    param:df:pandas.core.frame.DataFrame

    returns:pandas.core.series.Series
    """
    features_list = []
    for i in range(len(df)):
        lemma = df.loc[i, 'LEMMA']
        head = int(df.loc[i, 'HEAD'])
        xpos = df.loc[i, 'XPOS']
        token = df.loc[i, 'FORM']
        
        if head == 0:
            head_lemma = 'ROOT'
            head_pos = 'ROOT'
        else:
            head_lemma = df.loc[head - 1, 'LEMMA']
            head_pos = df.loc[head - 1, 'XPOS']
        
        head_lemma_pos = f"{head_lemma}_{head_pos}"
#         nested_dict = {'token': token, 'head_lemma_pos': head_lemma_pos}
        features_list.append(head_lemma_pos)
    return features_list

# Apply the function
head_info = extract_head_info(df_train)
# Adding the output to the new column of dataframe first due to memory limitation
df_train['head_info'] = extract_head_info(df_train)

In [15]:
def extract_governing_category(row):
    """
    Function to extract the governing category of the token based on row of dataframe of the token.

    param:row:pandas.core.series.Series

    returns:governing category of the token:str
    """
    deprel = row['DEPREL']
    if deprel == 'nsubj':
        return 'nsubj'
    elif deprel == 'dobj':
        return 'dobj'
    else:
        return '0'

In [16]:
def extract_position_to_pred(row, predicate_positions):
    """
    Function to extract position of the token relative to the predicate based on row of dataframe of the token.

    param:row:pandas.core.series.Series
    param:predicate_positions:set

    returns:position of the token relative to the predicate:str
    """
    if row.name in predicate_positions:
        return 'predicate'
    elif any(j in predicate_positions for j in range(row.name+1, len(df))):
        return 'after'
    else:
        return 'before'

In [17]:
def create_nested_dict_list(df):
    """
    Function to combine head information, governing category, and position to predicate for all tokens in the DataFrame into a nested dictionary.

    param:df:pandas.core.frame.DataFrame

    returns:list of nested dictionaries
    """
    # Get predicate positions for all sentences
    predicate_positions = set(df[df['UP:PRED'] != '_'].index)

    nested_dict_list = df.apply(lambda row: {'token': row['FORM'],
                                               'head_lemma_pos': row['head_info'],
                                               'position-to-pred': extract_position_to_pred(row, predicate_positions),
                                               'governing_category': extract_governing_category(row)},
                                  axis=1).tolist()
    return nested_dict_list

In [18]:
# Apply create_nested_dict_list function
df = df_train
nested_dict_list = create_nested_dict_list(df_train)

In [19]:
print(nested_dict_list[:100])

[{'token': 'Al', 'head_lemma_pos': 'ROOT_ROOT', 'position-to-pred': 'after', 'governing_category': '0'}, {'token': '-', 'head_lemma_pos': 'Al_NNP', 'position-to-pred': 'after', 'governing_category': '0'}, {'token': 'Zaman', 'head_lemma_pos': 'Al_NNP', 'position-to-pred': 'after', 'governing_category': '0'}, {'token': ':', 'head_lemma_pos': 'Al_NNP', 'position-to-pred': 'after', 'governing_category': '0'}, {'token': 'American', 'head_lemma_pos': 'force_NNS', 'position-to-pred': 'after', 'governing_category': '0'}, {'token': 'forces', 'head_lemma_pos': 'kill_VBD', 'position-to-pred': 'after', 'governing_category': 'nsubj'}, {'token': 'killed', 'head_lemma_pos': 'Al_NNP', 'position-to-pred': 'predicate', 'governing_category': '0'}, {'token': 'Shaikh', 'head_lemma_pos': 'kill_VBD', 'position-to-pred': 'after', 'governing_category': '0'}, {'token': 'Abdullah', 'head_lemma_pos': 'Shaikh_NNP', 'position-to-pred': 'after', 'governing_category': '0'}, {'token': 'al', 'head_lemma_pos': 'Shaikh_N