In [None]:
import os

download_name = "frankenstein.txt.bz2"
if not os.path.exists(download_name):
    import requests
    response = requests.get(f"https://raw.githubusercontent.com/bzitko/nlp_repo/main/assignments/a04/{download_name}")
    with open(download_name, "wb") as fp:
        fp.write(response.content)
    response.close()

name = "frankenstein.txt"
if not os.path.exists(name):
    import bz2
    with open(download_name, 'rb') as bzf, open(name, 'wb') as fp:
        fp.write(bz2.decompress(bzf.read()))        

In [1]:
import os

from argparse import Namespace
import collections
import nltk.data
import numpy as np
import pandas as pd
import re
import string
from tqdm.notebook import tqdm

In [2]:
args = Namespace(
    raw_dataset_txt="frankenstein.txt",
    window_size=5,
    train_proportion=0.7,
    val_proportion=0.15,
    test_proportion=0.15,
    output_munged_csv="frankenstein_with_splits.csv",
    seed=1337
)

In [3]:
# Split the raw text book into sentences
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
with open(args.raw_dataset_txt) as fp:
    book = fp.read()
sentences = tokenizer.tokenize(book)

In [None]:
sentences

In [4]:
print (len(sentences), "sentences")
print ("Sample:", sentences[100])

3427 sentences
Sample: No incidents have hitherto befallen us that would make a figure in a
letter.


In [10]:
# Clean sentences
def preprocess_text(text):
    text = ' '.join(word.lower() for word in text.split(" "))
    text = re.sub(r"([.,!?])", r" \1 ", text)
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    return text

In [11]:
cleaned_sentences = [preprocess_text(sentence) for sentence in sentences]

In [12]:
# Global vars
MASK_TOKEN = "<MASK>"

In [14]:
# Create windows
flatten = lambda outer_list: [item for inner_list in outer_list for item in inner_list]
windows = flatten([list(nltk.ngrams([MASK_TOKEN] * args.window_size + sentence.split(' ') + \
    [MASK_TOKEN] * args.window_size, args.window_size * 2 + 1)) \
    for sentence in tqdm(cleaned_sentences)])

# Create cbow data
data = []
for window in tqdm(windows):
    target_token = window[args.window_size]
    context = []
    for i, token in enumerate(window):
        if token == MASK_TOKEN or i == args.window_size:
            continue
        else:
            context.append(token)
    data.append([' '.join(token for token in context), target_token])
    
            
# Convert to dataframe
cbow_data = pd.DataFrame(data, columns=["context", "target"])

  0%|          | 0/3427 [00:00<?, ?it/s]

  0%|          | 0/90698 [00:00<?, ?it/s]

In [15]:
cbow_data

Unnamed: 0,context,target
0,", or the modern prometheus",frankenstein
1,frankenstein or the modern prometheus by,","
2,"frankenstein , the modern prometheus by mary",or
3,"frankenstein , or modern prometheus by mary wo...",the
4,"frankenstein , or the prometheus by mary wolls...",modern
...,...,...
90693,our email newsletter to hear new ebooks .,about
90694,email newsletter to hear about ebooks .,new
90695,newsletter to hear about new .,ebooks
90696,to hear about new ebooks,.


In [9]:
# Create split data
n = len(cbow_data)
def get_split(row_num):
    if row_num <= n*args.train_proportion:
        return 'train'
    elif (row_num > n*args.train_proportion) and (row_num <= n*args.train_proportion + n*args.val_proportion):
        return 'val'
    else:
        return 'test'
cbow_data['split']= cbow_data.apply(lambda row: get_split(row.name), axis=1)

In [10]:
cbow_data.head()

Unnamed: 0,context,target,split
0,", or the modern prometheus",frankenstein,train
1,frankenstein or the modern prometheus by,",",train
2,"frankenstein , the modern prometheus by mary",or,train
3,"frankenstein , or modern prometheus by mary wo...",the,train
4,"frankenstein , or the prometheus by mary wolls...",modern,train


In [11]:
# Write split data to file
cbow_data.to_csv(args.output_munged_csv, index=False)