In [4]:
import pandas as pd
import json

In [129]:
# Transcript files for the 2016 debates, listed in chronological order.
debates = [
    "september_26_2016_presidential_transcript.json",
    "october_4_2016_vice_presidential_transcript.json",
    "october_9_2016_presidential_transcript.json",
    "october_19_2016_presidential_transcript.json",
]

In [243]:
# Not referenced in this cell: the question guard in process_json checks a
# 300-character limit rather than this word count.
MAX_Q_LENGTH_WORDS = 56
MAX_TOTAL_CONTEXT_LENGTH_WORDS = 300  # may go slightly over with special token

speakers = ["CLINTON", "TRUMP", "KAINE", "PENCE", "HOST"]


def _split_turns(dialogue, section_idx, json_path):
    """Split a raw DIALOGUE string into ``(speaker, words)`` tuples, one per non-blank line."""
    turns = []
    for dial in dialogue.split("\n"):
        if not dial.strip():
            # A trailing newline or blank separator carries no turn; skip it
            # rather than tripping the speaker assertion below.
            continue
        speak = [s for s in speakers if f"{s}:" in dial[:10]]
        assert len(speak) == 1, f"Issue with speakers {speak} on section {section_idx} for dialogue {dial} in debate {json_path}."
        speak = speak[0]
        # Drops "<SPEAKER>: "; assumes the tag sits at the very start of the line.
        turns.append((speak, dial[len(speak) + 2:]))
    return turns


def _recent_history(turns, turn_idx, budget):
    """Render the turns preceding ``turn_idx`` within ``budget`` words, returned oldest first."""
    segments = []
    prev_idx = turn_idx - 1
    # Walk backwards so the most recent turns are the ones that survive the budget.
    while budget > 0 and prev_idx >= 0:
        prev_speaker, prev_words = turns[prev_idx]
        words = prev_words.split()
        if len(words) > budget:
            # Keep only the tail of a turn that does not fit entirely.
            prev_words = " ".join(words[-budget:])
        segments.append(f"[START_{prev_speaker}] {prev_words} [END_{prev_speaker}]")
        budget -= len(words)  # may go negative, which ends the walk
        prev_idx -= 1
    segments.reverse()
    return segments


def process_json(json_path):
    """Turn one debate transcript JSON file into per-turn text examples.

    The file is expected to hold ``"context"`` (a list whose first element is
    used as the shared scenario text) and ``"qa_pairs"`` (a list of sections,
    each with a ``"QUESTION"`` string and a newline-separated ``"DIALOGUE"``
    string whose lines start with ``"<SPEAKER>: "``).

    One example is produced for every non-HOST turn. It consists of the
    scenario, the question, as many preceding turns as fit into
    ``MAX_TOTAL_CONTEXT_LENGTH_WORDS`` (counted in whitespace-separated words,
    tags excluded), and the full turn itself. Pieces are joined by single
    spaces.

    Args:
        json_path: Path to the transcript JSON file (read as UTF-8).

    Returns:
        A list of dicts with keys ``"text"`` and ``"main_speaker"``.

    Raises:
        AssertionError: If a non-blank dialogue line does not match exactly one
            known speaker within its first 10 characters.
    """
    # JSON is UTF-8 by specification; do not rely on the platform default encoding.
    with open(json_path, "r", encoding="utf-8") as f:
        debate = json.load(f)

    context = debate["context"][0]  # this will be generally included
    context_length_tokens = len(context.split())
    texts = []
    for section_idx, section in enumerate(debate["qa_pairs"]):
        question = section["QUESTION"]
        if len(question) > 300:
            # NOTE(review): this stops processing the remaining sections of the
            # debate, not just this one -- confirm that is intended.
            print(f"issue -- question: {question}, {len(question)}")
            break

        # Words left for earlier turns once scenario and question are accounted for.
        history_budget = MAX_TOTAL_CONTEXT_LENGTH_WORDS - context_length_tokens - len(question.split())
        text_entry = f"Play along in the following scenario. {context} [START_Q] {question} [END_Q]"
        conversation_info = _split_turns(section["DIALOGUE"], section_idx, json_path)

        for turn_idx, (speaker, words) in enumerate(conversation_info):
            if speaker == "HOST":  # don't model this
                continue

            pieces = [
                text_entry,
                *_recent_history(conversation_info, turn_idx, history_budget),
                f"[START_{speaker}] {words} [END_{speaker}]",
            ]
            texts.append(
                {
                    "text": " ".join(pieces),
                    "main_speaker": speaker,
                }
            )
    return texts

In [255]:
# Collect the examples from every debate transcript into one flat list.
texts = []
for debate_path in debates:
    texts += process_json(debate_path)

In [256]:
# Total number of examples collected across all debates.
len(texts)

863

## Now, let's save these to result files

In [259]:
# Keep only the examples whose main speaker is CLINTON, as a one-column frame.
clinton_text = pd.DataFrame(
    {"text": [entry["text"] for entry in texts if entry["main_speaker"] == "CLINTON"]}
)
clinton_text.head()

Unnamed: 0,text
0,Play along in the following scenario. HOST: I ...
1,Play along in the following scenario. HOST: I ...
2,Play along in the following scenario. HOST: I ...
3,Play along in the following scenario. HOST: I ...
4,Play along in the following scenario. HOST: I ...


In [260]:
# Keep only the examples whose main speaker is TRUMP, as a one-column frame.
trump_text = pd.DataFrame(
    {"text": [entry["text"] for entry in texts if entry["main_speaker"] == "TRUMP"]}
)
trump_text.head()

Unnamed: 0,text
0,Play along in the following scenario. HOST: I ...
1,Play along in the following scenario. HOST: I ...
2,Play along in the following scenario. HOST: I ...
3,Play along in the following scenario. HOST: I ...
4,Play along in the following scenario. HOST: I ...


In [261]:
# Keep only the examples whose main speaker is KAINE, as a one-column frame.
kaine_text = pd.DataFrame(
    {"text": [entry["text"] for entry in texts if entry["main_speaker"] == "KAINE"]}
)
kaine_text.head()

Unnamed: 0,text
0,Play along in the following scenario. HOST: We...
1,Play along in the following scenario. HOST: We...
2,Play along in the following scenario. HOST: We...
3,Play along in the following scenario. HOST: We...
4,Play along in the following scenario. HOST: We...


In [263]:
# Keep only the examples whose main speaker is PENCE, as a one-column frame.
pence_text = pd.DataFrame(
    {"text": [entry["text"] for entry in texts if entry["main_speaker"] == "PENCE"]}
)
pence_text.head()

Unnamed: 0,text
0,Play along in the following scenario. HOST: We...
1,Play along in the following scenario. HOST: We...
2,Play along in the following scenario. HOST: We...
3,Play along in the following scenario. HOST: We...
4,Play along in the following scenario. HOST: We...


In [264]:
# Example counts per candidate, in the order (Clinton, Trump, Kaine, Pence).
tuple(len(frame) for frame in (clinton_text, trump_text, kaine_text, pence_text))

(213, 298, 169, 183)

In [265]:
# Write each candidate's frame to its own parquet file in the working directory.
for frame, parquet_path in (
    (clinton_text, "clinton_dataset.parquet"),
    (trump_text, "trump_dataset.parquet"),
    (kaine_text, "kaine_dataset.parquet"),
    (pence_text, "pence_dataset.parquet"),
):
    frame.to_parquet(parquet_path)

## Load that data

In [12]:
# Reload one of the saved datasets for a sanity check.
# NOTE(review): the save cell above writes "pence_dataset.parquet" to the working
# directory, while this reads from "processed_data/" -- confirm the files are moved
# there before this cell runs.
ds = pd.read_parquet("processed_data/pence_dataset.parquet")

In [13]:
# Sanity check: every stored example stays under 1000 whitespace-separated words.
assert all(len(example.split()) < 1000 for example in ds["text"].tolist())