In [7]:
from convokit import Corpus, download
import re
import unicodedata
import csv
import codecs

In [8]:
def unicodeToAscii(s):
    """
    Strip diacritics from a Unicode string.

    The string is decomposed with NFD normalization and every character in
    the Unicode category 'Mn' (nonspacing mark) is dropped. All other
    characters are kept as they are, so the result is only pure ASCII when
    the input consists of ASCII letters plus combining accents.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept_chars = []
    for ch in decomposed:
        # 'Mn' marks are the accents split off from their base letters by NFD.
        if unicodedata.category(ch) == 'Mn':
            continue
        kept_chars.append(ch)
    return ''.join(kept_chars)

In [9]:
def normalizeString(s):
    """
    Normalize a raw utterance into a lowercase, punctuation-spaced string.

    Steps, in order:
      1. lowercase, trim, and strip diacritics via ``unicodeToAscii``;
      2. collapse any ellipsis (the single character U+2026, or a run of
         three or more dots with optional whitespace between them) into a
         single full stop;
      3. insert a space before each '.', '!' and '?';
      4. replace every run of characters other than ASCII letters, '.', '!',
         '?' and the apostrophe with one space;
      5. squeeze whitespace and trim.

    Returns the normalized string. It can be empty when ``s`` contains none
    of the characters kept in step 4.
    """
    # Convert to ASCII-friendly form (accents removed)
    s = unicodeToAscii(s.lower().strip())
    # NFD normalization leaves the one-character ellipsis (U+2026) intact,
    # so spell it out as dots to let the next substitution handle it.
    s = s.replace("\u2026", "...")
    # Replace ellipses with a single full stop. `{3,}` also swallows longer
    # runs such as "....", which would otherwise leave stray extra stops.
    s = re.sub(r"(?:\.\s*){3,}", ". ", s)
    # Space out punctuation
    s = re.sub(r"([.!?])", r" \1", s)
    # Remove non-letter characters except for basic punctuation
    s = re.sub(r"[^a-zA-Z.!?']+", r" ", s)
    # Replace multiple spaces with a single space
    s = re.sub(r"\s+", r" ", s).strip()
    return s

In [10]:
def extractSentencePairs(corpus):
    """
    Build [input, target] training pairs from adjacent utterances.

    For every conversation in ``corpus``, the text of each utterance is
    normalized with ``normalizeString`` and each utterance is paired with
    the one that follows it in ``conversation.get_utterance_ids()``.
    NOTE(review): this assumes that method returns ids in spoken order;
    confirm against the ConvoKit version in use.

    A pair is kept only when both sides are non-empty *after*
    normalization, so utterances made up solely of characters that
    normalization removes (digits, symbols, ...) never produce a pair with
    an empty side.

    Returns a list of two-element lists ``[input_line, target_line]``.
    """
    qa_pairs = []
    for conv_id in corpus.get_conversation_ids():
        conversation = corpus.get_conversation(conv_id)
        # Normalize each utterance once per conversation; every interior
        # utterance is used both as a target and as the next input.
        lines = [
            normalizeString(corpus.get_utterance(utt_id).text)
            for utt_id in conversation.get_utterance_ids()
        ]
        for input_line, target_line in zip(lines, lines[1:]):
            if input_line and target_line:  # Filter out empty lines
                qa_pairs.append([input_line, target_line])
    return qa_pairs

In [11]:
# Load the Friends corpus using ConvoKit
corpus = Corpus(filename=download("friends-corpus"))

qa_pairs = extractSentencePairs(corpus)

# Save the sentence pairs as tab-separated text, one pair per line
outputfile = 'preprocessed_pairs.txt'
# '\t' in a regular string literal is already a real tab character,
# so no unescaping step is needed before handing it to csv.
delimiter = '\t'

print("\nWriting newly formatted file...")
# newline='' lets the csv writer control line endings itself, so the
# lineterminator below is written verbatim on every platform.
# The handle gets its own name so `outputfile` keeps holding the path.
with open(outputfile, 'w', encoding='utf-8', newline='') as out_fh:
    writer = csv.writer(out_fh, delimiter=delimiter, lineterminator='\n')
    writer.writerows(qa_pairs)

print("Done writing to file")

Dataset already exists at C:\Users\cathe\.convokit\downloads\friends-corpus

Writing newly formatted file...
Done writing to file


In [12]:
# Sanity check on the preprocessing result: every entry of qa_pairs must
# hold exactly two items (input line, target line). Stops at the first
# offending entry and reports it.
for pair in qa_pairs:
    is_well_formed = len(pair) == 2
    assert is_well_formed, f"Found a pair with length {len(pair)}: {pair}"