In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import regex as re
import os

pd.set_option('display.max_colwidth', None)

Get Inputs

In [10]:
import os
data_path = os.path.join(os.getcwd(), 'Datasets/RST-DT/training_data/original_data')
for file_name in os.listdir(data_path):
    if file_name.endswith('.out'):
        os.rename(os.path.join(data_path, file_name), os.path.join(data_path, file_name) + '.sents')

for file_name in os.listdir(data_path)[:3]:
    print(file_name)
    
# .dis files are tree files
# .edus files are files separeated by EDUs (one line each)
# .sents files are files separated by sentences (one line each)

0600.out.dis
0600.out.edus
0600.out.sents


Process Data

In [95]:
def get_edus_from_edu_files(edu_files):
    """
    Read .edus file and return a list of EDUs

    Args:
        edu_files: file handler

    Returns:
        A list of EDU with index corresponding to each EDU's id
    """
    edu_list = ['dummy_node']
    for line in edu_files:
        edu_list.append(line.strip())
    
    return edu_list

In [96]:
def read_dis_line(line: str):
    """
    Processes a line containing nuclearity, edu span, relation, (potentially) text and return the relavent data to the relation

    Args:
        line: A string containing the line

    Returns:
        A tuple of this type (num_indents, nuclearity, start_edu, end_edu, relation)
    """
    nucleus_pattern = r'\( Nucleus \((?:leaf|span) (\d+)(?: (\d+))?\) \(rel2par ([^)]*)\).*'
    satellite_pattern = r'\( Satellite \((?:leaf|span) (\d+)(?: (\d+))?\) \(rel2par ([^)]*)\).*'
    
    nucleus_match = re.match(nucleus_pattern, line.strip())
    satellite_match = re.match(satellite_pattern, line.strip())
    assert bool(nucleus_match) != bool(satellite_match)

    num_indents = len(line) - len(line.lstrip())
    nuclearity = "nucleus" if nucleus_match is not None else "satellite"
    group_match = nucleus_match if nucleus_match is not None else satellite_match
    start_edu = group_match.group(1) # first number after span|leaf
    end_edu = start_edu if group_match.group(2) is None else group_match.group(2)
    relation = group_match.group(3)

    return (num_indents, nuclearity, start_edu, end_edu, relation)

In [97]:
def read_dis_file(doc_name, dis_file, edu_list):
    """
    Processes EDU data and returns a dictionary containing relations with their nucleus and satellite EDUs.

    Args:
        data: A string containing the EDU data, with each line representing an EDU.

    Returns:
        A df with keys being doc name, nucleus text, satellite (or another nucleus) text, and relation label
    """
    relation_map = {"dummy": {"doc_name": "", "nucleus": "", "satellite": "", "relation": ""}} # to store relations that have not find their corresponding partners (nucleus/satellite)
    final_relations = pd.DataFrame(columns=['doc_name', 'nucleus', 'satellite', 'relation'])
    for line in dis_file:
        if not line.strip() or len(line.strip()) == 1 or line.startswith("( Root"):
            continue
        num_indents, nuclearity, start_edu, end_edu, relation = read_dis_line(line)
        text = ""

        # get associated text (EDU(s)) of either the nucleus or satellite
        for i in range(int(start_edu), int(end_edu) + 1):
            text += edu_list[i] + ' '

        if relation_map.get(num_indents) is None: # value with this indents as key is empty
            relation_map[num_indents] = {'doc_name': "", 'nucleus': "", 'satellite': "", 'relation': ""}
            cur_relation = relation_map[num_indents]
            cur_relation['doc_name'] = doc_name

            cur_relation[nuclearity] = text
            # if relation in RELATIONS:
            if relation != "span":
                cur_relation['relation'] = relation
        else:
            cur_relation = relation_map[num_indents]
            if len(cur_relation[nuclearity].strip()):
                assert nuclearity == 'nucleus'
                nuclearity = 'satellite'

            cur_relation[nuclearity] = text
            if relation != "span":
                cur_relation['relation'] = relation
            else:
                try: 
                    assert len(cur_relation['relation'].strip()) != 0
                except:
                    print(cur_relation)
            
            if "it would set off a political earthquake" in cur_relation['nucleus']:
                print(cur_relation)
            final_relations.loc[len(final_relations)] = cur_relation
            # remove dictionary of this indents after processing the full relation
            del relation_map[num_indents]

    return final_relations

In [98]:
# test run
with open(os.path.join(data_path, '0609.out.edus'), 'r') as edu_file: 
    with open(os.path.join(data_path, '0609.out.dis'), 'r') as dis_file:
        edu_list = get_edus_from_edu_files(edu_file)
        print("\nGOT EDUS!\n")
        relation_df = read_dis_file("0609", dis_file, edu_list)
        print("\nGOT RELATIONS\n")       


GOT EDUS!

{'doc_name': '0609', 'nucleus': 'President Bush badly wants a line-item veto and has long called for a law giving it to the president. Now the White House is declaring that he might not rely on Congress -- which hasn\'t shown any willingness to surrender such authority -- to pass the line-item veto law he seeks. White House spokesmen last week said Mr. Bush is considering simply declaring that the Constitution gives him the power, exercising a line-item veto and inviting a court challenge to decide whether he has the right. Although that may sound like an arcane maneuver of little interest outside Washington, it would set off a political earthquake. "The ramifications are enormous," says Rep. Don Edwards, a California Democrat who is a senior member of the House Judiciary Committee. "It\'s a real face-to-face arm wrestling challenge to Congress." White House aides know it\'s a step that can\'t be taken lightly -- and for that reason, the president may back down from launchi

AssertionError: 

In [70]:
# actual run

edu_files = [file_name for file_name in os.listdir(data_path) if file_name.endswith('.edus')]
dis_files = [file_name for file_name in os.listdir(data_path) if file_name.endswith('.dis')]

total_edus = []
total_relations = pd.DataFrame()
file_cnt = 0

for edu_file in edu_files: # edu file to get EDUs
    if not edu_file.endswith('.edus'):
        print("Not .edu file at ", edu_file)
        break

    edu_file_path = os.path.join(data_path, edu_file)
    with open(edu_file_path, 'r') as edu_f:
        edu_list = get_edus_from_edu_files(edu_f)
        total_edus.append(edu_list)

    dis_file = edu_file.replace('.edus', '.dis')
    dis_file_path = os.path.join(data_path, dis_file)
    doc_name = dis_file[:-8]

    with open(dis_file_path, 'r') as dis_f:
        relation_df = read_dis_file(doc_name, dis_f, edu_list)
        total_relations = pd.concat([total_relations, relation_df])
        
    file_cnt += 1
    print("Finished processing", dis_file, ":", relation_df.shape)

Finished processing 0600.out.dis : (2, 4)
Finished processing 0603.out.dis : (6, 4)
Finished processing 0604.out.dis : (154, 4)
Finished processing 0605.out.dis : (6, 4)
Finished processing 0606.out.dis : (56, 4)
Finished processing 0608.out.dis : (6, 4)
{'doc_name': '0609', 'nucleus': 'it would set off a political earthquake. ', 'satellite': '"The ramifications are enormous," says Rep. Don Edwards, a California Democrat who is a senior member of the House Judiciary Committee. ', 'relation': ''}


AssertionError: 

In [102]:
total_relations[total_relations['doc_name'] == '0608']

Unnamed: 0,doc_name,nucleus,satellite,relation
0,608,THE YALE POLITICAL UNION doesn't pay an honorarium to speakers.,"In Thursday's edition, it was incorrectly indicated that the union had paid a fee to former House Speaker Jim Wright.",Contrast
1,608,"In Thursday's edition, it was incorrectly indicated",that the union had paid a fee to former House Speaker Jim Wright.,elaboration-additional
2,608,"THE YALE POLITICAL UNION doesn't pay an honorarium to speakers. In Thursday's edition, it was incorrectly indicated that the union had paid a fee to former House Speaker Jim Wright.","(See: ""In Nation's Capital, Scandal Needn't Cut One's Speaking Fee --- Jim Wright, Michael Deaver and Host of Others Find Careers After Disgrace"" -- WSJ Oct. 26, 1989)",explanation-argumentative
3,608,"""In Nation's Capital, Scandal Needn't Cut One's Speaking Fee --- Jim Wright, Michael Deaver and Host of Others Find Careers After Disgrace"" -- WSJ Oct. 26, 1989)",(See:,comment
4,608,"""In Nation's Capital, Scandal Needn't Cut One's Speaking Fee","--- Jim Wright, Michael Deaver and Host of Others Find Careers After Disgrace""",elaboration-additional
5,608,"""In Nation's Capital, Scandal Needn't Cut One's Speaking Fee --- Jim Wright, Michael Deaver and Host of Others Find Careers After Disgrace""","-- WSJ Oct. 26, 1989)",attribution
