# Imports

In [1]:
import os

from convokit import Corpus, download

from utils import extractSentencePairs, split_data, addPairsToVoc, trimRareWords
from vocabulary import Voc

# Loading Data & Preprocessing

In [2]:
# Fetch the Friends corpus via ConvoKit; the value returned by download()
# is handed to Corpus as its `filename` argument
corpus_path = download("friends-corpus")
corpus = Corpus(filename=corpus_path)

# Build the sentence pairs from the loaded corpus
qa_pairs = extractSentencePairs(corpus)

# Divide the pairs into a training set and a validation set.
# NOTE(review): 0.9 is presumably the training fraction, and if split_data
# shuffles it should be seeded for reproducibility -- confirm in utils.
training_pairs, validation_pairs = split_data(qa_pairs, 0.9)

Dataset already exists at C:\Users\cathe\.convokit\downloads\friends-corpus


# Creating Vocabulary

In [3]:
# Integer ids for the special tokens handed to Voc
# (presumably padding / start-of-sentence / end-of-sentence -- confirm in vocabulary.py)
PAD_token, SOS_token, EOS_token = 0, 1, 2

# Build the vocabulary object, then register the training pairs with it
voc = Voc("FriendsCorpus", PAD_token=PAD_token, SOS_token=SOS_token, EOS_token=EOS_token)
addPairsToVoc(voc, training_pairs)

# Trim rare words; training_pairs is rebound to the trimmed result.
# NOTE(review): because the name is reused, re-running this cell on its own
# feeds the already-trimmed pairs back in -- re-run the loading cell first.
training_pairs = trimRareWords(voc, training_pairs)

keep_words 6128 / 15881 = 0.3859
Trimmed from 49941 pairs to 36316, 0.7272 of total


In [4]:
def _in_vocabulary(sentence):
    """Return True when every whitespace-separated token of `sentence` is a key of voc.word2index."""
    return all(token in voc.word2index for token in sentence.split())


# Keep only the validation pairs whose two sentences consist entirely of
# words present in the vocabulary
validation_pairs = [
    pair
    for pair in validation_pairs
    if _in_vocabulary(pair[0]) and _in_vocabulary(pair[1])
]

# Report the resulting dataset sizes
print(f"Training pairs: {len(training_pairs)}")
print(f"Validation pairs: {len(validation_pairs)}")

Training pairs: 36316
Validation pairs: 3848


In [9]:
# Save the validation pairs to a file, one pair per line with the two
# sentences separated by a tab
filename = 'data/validation_pairs.txt'

# Create the output directory if it is missing; open(..., 'w') does not
# create parent directories and would raise FileNotFoundError otherwise
os.makedirs(os.path.dirname(filename), exist_ok=True)

with open(filename, 'w', encoding='utf-8') as file:
    for pair in validation_pairs:
        file.write(pair[0] + '\t' + pair[1] + '\n')

In [10]:
# Save the training pairs to a file, one pair per line with the two
# sentences separated by a tab
filename = 'data/training_pairs.txt'

# Create the output directory if it is missing; open(..., 'w') does not
# create parent directories and would raise FileNotFoundError otherwise
os.makedirs(os.path.dirname(filename), exist_ok=True)

with open(filename, 'w', encoding='utf-8') as file:
    for pair in training_pairs:
        file.write(pair[0] + '\t' + pair[1] + '\n')

In [11]:
# Save the vocabulary to a file as the text form of the Voc object's
# attribute dictionary
filename = 'data/voc.txt'

# Create the output directory if it is missing; open(..., 'w') does not
# create parent directories and would raise FileNotFoundError otherwise
os.makedirs(os.path.dirname(filename), exist_ok=True)

with open(filename, 'w', encoding='utf-8') as file:
    # NOTE(review): this writes str() of voc.__dict__, i.e. a Python dict
    # repr rather than JSON; whatever reloads it must parse that format.
    file.write(str(voc.__dict__))