In [2]:
import pandas as pd
import os

In [5]:
# Collect the transcript files that sit directly inside the raw-data folder.
# Only the first entry yielded by os.walk (the folder itself) is used, so files
# in nested sub-folders are ignored; a missing folder yields an empty list.
fdir = "raw_data/"
top_dir, _subdirs, top_files = next(os.walk(fdir), (fdir, [], []))
general_debates = [f"{top_dir}{name}" for name in top_files]

general_debates

['raw_data/november_30_2016.txt',
 'raw_data/may_10_2017.txt',
 'raw_data/october_19_2016.txt',
 'raw_data/may_24_2017.txt',
 'raw_data/march_15_2017.txt',
 'raw_data/august_9_2017.txt',
 'raw_data/september_21_2016.txt',
 'raw_data/august_16_2017.txt',
 'raw_data/august_2_2017.txt',
 'raw_data/june_21_2017.txt',
 'raw_data/september_14_2016.txt',
 'raw_data/july_26_2017.txt',
 'raw_data/november_2_2016.txt',
 'raw_data/december_7_2016.txt',
 'raw_data/november_9_2016.txt',
 'raw_data/may_3_2017.txt',
 'raw_data/april_5_2017.txt',
 'raw_data/july_5_2017.txt',
 'raw_data/june_28_2017.txt',
 'raw_data/march_22_2017.txt',
 'raw_data/june_7_2017.txt',
 'raw_data/november_16_2016.txt',
 'raw_data/april_12_2017.txt']

In [234]:
# Party labels that may appear in the parenthesised note after a speaker's name.
parties = {"GREEN", "LABOUR", "NATIONAL", "NZ FIRST", "OPPORTUNITIES", "UNITED FUTURE", "MĀORI", "ACT"}

# Lower-cased speaker label -> party label (or "HOST" for the chair / unattributed
# interjections).  process_debate() adds entries to this dict while it runs.
speaker_party_mapping = {
    "mr speaker": "HOST",
    "mrs speaker": "HOST",
    "mr deputy speaker": "HOST",
    "hon members": "HOST",  # these aren't attributed to a specific person
    "hon member": "HOST",  # same idea here
    "grant robertson": "LABOUR",
    "hon david cunliffe": "LABOUR",
    "hon paula bennett": "NATIONAL",
    "hon anne tolley": "NATIONAL",
    "phil twyford": "LABOUR",
    "tracey martin": "NZ FIRST",  # was "NZ_FIRST", which matched no label in `parties`
    "barbara kuriger": "NATIONAL",
    "chris bishop": "NATIONAL",
    "hon nikki kaye": "NATIONAL",
    "hon alfred ngaro": "NATIONAL",
    "louisa wall": "LABOUR",
    "hon hekia parata": "NATIONAL",
    "hon jo goodhew": "NATIONAL",
    "hon trevor mallard": "LABOUR",
    "hon louise upston": "NATIONAL",
    "hon maggie barry": "NATIONAL",
    "lindsay tisch": "NATIONAL",
    "andrew little": "LABOUR",
    "hon michael woodhouse": "NATIONAL",
    "hon jacqui dean": "NATIONAL",
    "rt hon winston peters": "NZ FIRST",
    "hon ruth dyson": "LABOUR",
    "hon nathan guy": "NATIONAL",
    "hon david bennett": "NATIONAL",
    "sue moroney": "LABOUR",
    "hon scott simpson": "NATIONAL",
    "hon bill english": "NATIONAL",
    "bill english": "NATIONAL",
    "rt hon bill english": "NATIONAL",
    "clayton mitchell": "NZ FIRST",
    "hon dr jonathan coleman": "NATIONAL",
    "hon simon bridges": "NATIONAL",
    "hon todd mcclay": "NATIONAL",
    "hon amy adams": "NATIONAL",
    "hon steven joyce": "NATIONAL",
    "hon paul goldsmith": "NATIONAL",
    "hon tim macindoe": "NATIONAL",
    "hon nicky wagner": "NATIONAL",
    "hon christopher finlayson": "NATIONAL",
    "dr megan woods": "LABOUR",
    "todd barclay": "NATIONAL",
    "hon judith collins": "NATIONAL",
    "hon craig foss": "NATIONAL",
    "tim macindoe": "NATIONAL",
}

# Guard against label typos: every seeded value must be a known party or "HOST".
assert set(speaker_party_mapping.values()) <= parties | {"HOST"}, "unknown party label in speaker_party_mapping"

# Lower-cased phrases that precede a colon near the start of a line without
# being a speaker label.  process_debate() matches them as substrings of the
# lower-cased first 50 characters of a line, so short entries such as "now"
# also match inside longer words.
exception_set = {
    "here are the reasons why it all got a bit twisted", "the challenge for that opposition is this", "conservation",
    "that is what this government is all about", "chris hipkins said", "let us make no mistake", "secondly", "now",
    "new zealanders are saying", "i remember that budget for another thing", "the labour party is saying",
    "my message is this to te ao māori", "there is something important i want to touch on", "so i want to say",
    "the people of new zealand are out there saying", "here is another point", "i will read the transcript of that phone call",
    "well, here is the challenge, government", "i want to quote from the report", "more on our positive vision",
    "statistics new zealand said", "look at any metric", "here is some good advice", "let me talk about",
    "i would like to ask", "some people ask", "people out there", "next, she says", "and then there is this",
    "so that is what we have got", "the reason is", "i do have a little message", "a quick update on",
    "here is a very interesting question", "then the other question", "leadership—leadership 101 says",
    "i just want to say one more thing", "the media were asking", "i want to stress one thing", "here is what he should have said",
    "here it is", "if we go down further", "here is another myth",
}

def process_debate(debate_file):
    """Split a debate transcript into consecutive ``(party, text)`` statements.

    A non-empty line starts a new speaker turn when a colon occurs in its first
    50 characters and no phrase from the module-level ``exception_set`` occurs
    (as a substring) in the lower-cased first 50 characters.  The part before
    the first colon is the speaker label; the rest of that line and every
    following non-speaker line form the speaker's text, one paragraph per line.

    Speaker labels are resolved through the module-level
    ``speaker_party_mapping``.  If the label has a parenthesised note that
    mentions one of ``parties``, the speaker is added to
    ``speaker_party_mapping`` as a side effect, so results can depend on the
    order in which files are processed.

    Args:
        debate_file: Path of the transcript text file (read as UTF-8).

    Returns:
        A list of ``(party, text)`` tuples in transcript order, where ``text``
        holds only that turn's paragraphs joined by newlines.  The last turn is
        always appended, so a file without any speaker line yields a single
        ``(None, text)`` entry.

    Raises:
        AssertionError: If a speaker label cannot be resolved to a party.
    """
    with open(debate_file, "r", encoding="utf-8") as f:
        lines = [raw.strip() for raw in f if raw.strip()]

    statements = []  # (speaker, text -- paragraph-separated)
    prev_speaker = None
    running_paragraphs = []
    for line in lines:
        text = line
        head = line[:50]
        if ":" in head and not any(phrase in head.lower() for phrase in exception_set):
            if prev_speaker is not None:  ## add old information
                statements.append((prev_speaker, "\n".join(running_paragraphs).strip()))
                # Start the new turn from scratch; without this reset every
                # statement would also contain all earlier speakers' text.
                # (Text seen before the first speaker line is still kept and
                # ends up in the first speaker's statement.)
                running_paragraphs = []

            # Only the first colon separates label from text; later colons
            # (e.g. times such as "10:30") stay in the text.
            speaker, _, text = line.partition(":")

            if "(" in speaker:
                # maxsplit=1 so a label with more than one "(" does not raise.
                speaker, party_info = speaker.split("(", 1)
                for party in parties:
                    if party.lower() in party_info.lower():
                        # NOTE(review): this looks up the raw (un-normalised) label while
                        # the keys are lower-cased and stripped, so it rarely checks
                        # anything.  Left as-is: enforcing it would reject any speaker
                        # whose seeded label differs from the transcript's.
                        assert speaker not in speaker_party_mapping or speaker_party_mapping[speaker] == party
                        speaker_party_mapping[speaker.lower().strip()] = party
                if speaker.lower().strip() not in speaker_party_mapping and "minister" not in party_info.lower():
                    for person in speaker_party_mapping:
                        if person in party_info.lower().strip():  # was clarifying a person in a named position
                            speaker = person
                            break
                    if speaker.lower().strip() not in speaker_party_mapping:  # still not there...
                        print(f"\tMISSED ADDING PARTY FOR {speaker} and {party_info}??")
                        raise AssertionError(f"no party found for speaker {speaker!r}")

            speaker_key = speaker.lower().strip()
            if speaker_key not in speaker_party_mapping:
                known_speakers = sorted(list(speaker_party_mapping.keys()))
                # Built on one line so it parses on Python versions before 3.12.
                print(f"\tMISSING PARTY INFORMATION FOR: {speaker_key} (have {known_speakers})")
                raise AssertionError(f"no party information for speaker {speaker_key!r}")
            prev_speaker = speaker_party_mapping[speaker_key]

        running_paragraphs.append(text.strip())

    # add what we have left
    statements.append((prev_speaker, "\n".join(running_paragraphs).strip()))

    return statements

In [235]:
# Parse every transcript in listing order.  process_debate can add entries to
# speaker_party_mapping as it goes, so the order is deliberately left as-is.
debate_statements = dict()
for transcript_path in general_debates:
    print(f"...{transcript_path}")
    debate_statements.update({transcript_path: process_debate(transcript_path)})

...raw_data/november_30_2016.txt
...raw_data/may_10_2017.txt
...raw_data/october_19_2016.txt
...raw_data/may_24_2017.txt
...raw_data/march_15_2017.txt
...raw_data/august_9_2017.txt
...raw_data/september_21_2016.txt
...raw_data/august_16_2017.txt
...raw_data/august_2_2017.txt
...raw_data/june_21_2017.txt
...raw_data/september_14_2016.txt
...raw_data/july_26_2017.txt
...raw_data/november_2_2016.txt
...raw_data/december_7_2016.txt
...raw_data/november_9_2016.txt
...raw_data/may_3_2017.txt
...raw_data/april_5_2017.txt
...raw_data/july_5_2017.txt
...raw_data/june_28_2017.txt
...raw_data/march_22_2017.txt
...raw_data/june_7_2017.txt
...raw_data/november_16_2016.txt
...raw_data/april_12_2017.txt


In [236]:
# Make sure the output folder exists; exist_ok avoids the check-then-create race
# and keeps the cell safe to re-run.
os.makedirs("processed_data/", exist_ok=True)

In [259]:
MAX_ADD_CONTEXT_TOKENS = 500  # default whitespace-token budget per example


def make_debate_dataset(statements, max_context_tokens=None):
    """Build one context-augmented text example per paragraph of a debate.

    Each paragraph is prefixed with as much preceding text as fits in the
    token budget (tokens are whitespace-separated words): first earlier
    paragraphs of the same statement, then earlier statements, nearest first.
    A paragraph that does not fit completely contributes only its last words.
    Every statement's text is wrapped in ``[START_<speaker>] ... [END_<speaker>]``.

    Args:
        statements: Sequence of ``(speaker, text)`` tuples, where ``text`` holds
            newline-separated paragraphs.
        max_context_tokens: Token budget per example, counting the paragraph
            itself.  Defaults to ``MAX_ADD_CONTEXT_TOKENS`` (read at call time).

    Returns:
        A list of strings, one per paragraph, in input order.
    """
    if max_context_tokens is None:
        max_context_tokens = MAX_ADD_CONTEXT_TOKENS

    # Split each statement into paragraphs once instead of on every look-back.
    split_statements = [(speaker, statement.split("\n")) for speaker, statement in statements]

    texts = []
    for dial_idx, (speaker, paragraphs) in enumerate(split_statements):   # by the same speaker
        for p_idx, para in enumerate(paragraphs):
            text_to_use = para
            remaining_tokens = max_context_tokens - len(para.split())

            # Earlier paragraphs of the same statement, newest first.
            prev_idx = p_idx - 1
            while remaining_tokens > 0 and prev_idx >= 0:
                prev_para = paragraphs[prev_idx]
                prev_words = prev_para.split()
                if len(prev_words) <= remaining_tokens:   # add everything
                    text_to_use = f"{prev_para}\n{text_to_use}"
                else:  # add a part
                    text_to_use = f"{' '.join(prev_words[-remaining_tokens:])}\n{text_to_use}"
                prev_idx -= 1
                # Goes negative after a partial add, which ends all look-back.
                remaining_tokens -= len(prev_words)

            # finished with this speaker...
            text_to_use = f"[START_{speaker}] {text_to_use} [END_{speaker}]"

            prev_dial = dial_idx - 1
            while remaining_tokens > 0 and prev_dial >= 0:  # go to previous speakers?
                prev_speaker, prev_paragraphs = split_statements[prev_dial]
                prev_idx = len(prev_paragraphs) - 1  # start with the last one!

                text_to_use = f"[END_{prev_speaker}] {text_to_use}"  # other things will be added before this
                while remaining_tokens > 0 and prev_idx >= 0:
                    prev_para = prev_paragraphs[prev_idx]
                    prev_words = prev_para.split()
                    if len(prev_words) <= remaining_tokens:   # add everything
                        text_to_use = f"{prev_para} {text_to_use}"
                    else:  # add a part
                        # NOTE(review): joins with "\n" while the full-paragraph branch
                        # above joins with " "; kept as-is so the output is unchanged --
                        # confirm which separator is intended.
                        text_to_use = f"{' '.join(prev_words[-remaining_tokens:])}\n{text_to_use}"
                    prev_idx -= 1
                    remaining_tokens -= len(prev_words)

                # broke out here -- done with this previous speaker
                text_to_use = f"[START_{prev_speaker}] {text_to_use}"
                prev_dial -= 1

            texts.append(text_to_use)
    return texts

In [264]:
# Write one parquet file of context-augmented paragraphs per debate.
# The statements are taken from debate_statements for each debate; the previous
# version passed an unbound `statements` name, which only worked through
# leftover kernel state and then wrote the same data to every file.
for debate, statements in debate_statements.items():
    print(f"...{debate}")
    texts = make_debate_dataset(statements)
    df = pd.DataFrame({"text": texts})
    # "raw_data/<name>.txt" -> "processed_data/<name>.parquet"
    out_name = os.path.splitext(os.path.basename(debate))[0]
    df.to_parquet(f"processed_data/{out_name}.parquet")

...raw_data/november_30_2016.txt
...raw_data/may_10_2017.txt
...raw_data/october_19_2016.txt
...raw_data/may_24_2017.txt
...raw_data/march_15_2017.txt
...raw_data/august_9_2017.txt
...raw_data/september_21_2016.txt
...raw_data/august_16_2017.txt
...raw_data/august_2_2017.txt
...raw_data/june_21_2017.txt
...raw_data/september_14_2016.txt
...raw_data/july_26_2017.txt
...raw_data/november_2_2016.txt
...raw_data/december_7_2016.txt
...raw_data/november_9_2016.txt
...raw_data/may_3_2017.txt
...raw_data/april_5_2017.txt
...raw_data/july_5_2017.txt
...raw_data/june_28_2017.txt
...raw_data/march_22_2017.txt
...raw_data/june_7_2017.txt
...raw_data/november_16_2016.txt
...raw_data/april_12_2017.txt
