In [1]:
import csv
import json
import random
import re
from typing import Dict, List

In [3]:
def clean_sentence(sentence: str):
    """Return *sentence* lower-cased with punctuation fragments removed.

    Every occurrence of ',', '.', ';', ':', "'s" and '?' is deleted, one
    fragment after the other in exactly that order.  The order matters:
    removing a punctuation character can bring an apostrophe and an 's'
    together, and that newly formed "'s" is then removed as well.
    """
    cleaned = sentence.lower()

    # Sequential passes on purpose -- see the docstring for why a single
    # combined pass would not be equivalent.
    for fragment in (',', '.', ';', ':', "'s", '?'):
        cleaned = cleaned.replace(fragment, '')

    return cleaned


def tokenize_sentence(sentence: str) -> List[str]:
    """Split *sentence* into word tokens.

    Words are separated by any run of spaces, hyphens and newlines.  A run
    of several delimiters counts as a single separator, and delimiters at
    the start or end of the sentence do not produce tokens, so the result
    never contains empty strings.  An empty (or delimiter-only) sentence
    yields an empty list.

    Args:
        sentence: The text to tokenize.

    Returns:
        The non-empty tokens in their original order.
    """
    # The previous pattern ' |  |-|\n' split on each delimiter individually
    # (the double-space alternative could never match because the single
    # space comes first), so "a  b" produced an empty token between the two
    # spaces.  Matching a whole run of delimiters avoids that.
    tokens = re.split(r'[ \-\n]+', sentence)

    # re.split still emits '' when the string starts or ends with a
    # delimiter (or is empty); drop those.
    return [token for token in tokens if token]


def create_vocab(corpus: List[List[str]]) -> Dict[str, int]:
    """Build a word -> index mapping from a tokenized corpus.

    Index 0 is reserved for the '_blank' entry.  Every other distinct word
    receives the next free index in order of first appearance.

    Args:
        corpus: Sentences, each given as a list of word tokens.

    Returns:
        A dict mapping each word (plus '_blank') to a unique integer index.
    """
    vocab_dict: Dict[str, int] = {'_blank': 0}

    for sentence in corpus:
        for word in sentence:
            # Indices are dense and start at 0, so the current size of the
            # dict is always the next unused index.  setdefault leaves words
            # that are already present untouched.
            vocab_dict.setdefault(word, len(vocab_dict))

    return vocab_dict


def calc_vocab_freq_dist(corpus):
    """Compute absolute and relative frequencies for every vocabulary word.

    The vocabulary is obtained from ``create_vocab(corpus)``, so the result
    always contains the '_blank' entry in addition to the words that occur
    in the corpus.

    Args:
        corpus: Sentences, each given as a sequence of word tokens.  It is
            iterated twice (once to build the vocabulary, once to count),
            so it must be re-iterable.

    Returns:
        A dict mapping each word to a ``(count, count / total_words)``
        tuple.  When the corpus contains no words at all, every relative
        frequency is reported as 0.0.
    """
    vocab_freqs = {word: 0 for word in create_vocab(corpus)}
    word_counter = 0

    for sentence in corpus:
        for word in sentence:
            word_counter += 1
            vocab_freqs[word] += 1

    # '_blank' is always present, so an empty corpus would otherwise divide
    # 0 by 0 and raise ZeroDivisionError.
    return {
        word: (freq, freq / word_counter if word_counter else 0.0)
        for word, freq in vocab_freqs.items()
    }
    
    
def build_samples_from_tokenized_sentence(
    sentence: List[str],
    window_length: int = 2,
) -> List[dict]:
    """Build one (word, context) sample for every word of a sentence.

    For the word at position ``t`` the context consists of the words at
    positions ``t - window_length`` .. ``t + window_length`` (excluding
    ``t`` itself), in that order.  Positions that fall outside the sentence
    are filled with '_blank', so every context has exactly
    ``2 * window_length`` entries.

    Args:
        sentence: The sentence as a list of word tokens.
        window_length: How many neighbours to take on each side of the
            word.  Defaults to 2.

    Returns:
        A list with one dict per word, each of the form
        ``{'word': <word>, 'context': [<neighbour or '_blank'>, ...]}``.
        An empty sentence yields an empty list.
    """
    FILLER_IDX = '_blank'

    n_words = len(sentence)
    samples = []

    for t, word in enumerate(sentence):
        context = []

        for dt in range(-window_length, window_length + 1):
            if dt == 0:
                # The word itself is not part of its own context.
                continue

            _t = t + dt
            # Valid indices are 0 .. n_words - 1 inclusive.  The previous
            # check (0 < _t < n_words - 1) wrongly treated the first and
            # last word of the sentence as out of range.
            if 0 <= _t < n_words:
                context.append(sentence[_t])
            else:
                context.append(FILLER_IDX)

        samples.append({'word': word, 'context': context})

    return samples
            
            

In [4]:
# Read the CSV and turn the 't' column of every row into a cleaned,
# tokenized sentence.
corpus = []

with open('../data/t_bbe.csv') as f:
    csv_reader = csv.DictReader(f)

    corpus.extend(
        tokenize_sentence(clean_sentence(line['t'])) for line in csv_reader
    )


In [5]:
# Word -> integer index mapping over the whole tokenized corpus.
vocab = create_vocab(corpus)

In [6]:
# Word -> (absolute count, relative frequency) over the whole corpus.
vocab_dist = calc_vocab_freq_dist(corpus)

In [7]:
# Flatten the per-sentence sample lists into one list covering the corpus.
samples = [
    sample
    for sentence in corpus
    for sample in build_samples_from_tokenized_sentence(sentence)
]
    

In [8]:
# Shuffle in place, then cut at a fixed position: the first 676290 samples
# form the training set and everything after that the validation set.
# NOTE(review): no random seed is set in the visible cells, so the split is
# presumably different on every run -- confirm whether that is intended.
random.shuffle(samples)

train, val = samples[:676290], samples[676290:]

In [9]:
# Persist the two sample splits, the vocabulary and the frequency table as
# JSON files next to the input data.
with open('../data/train.json', 'w') as f:
    json.dump(train, f)

with open('../data/val.json', 'w') as f:
    json.dump(val, f)

with open('../data/vocab.json', 'w') as f:
    json.dump(vocab, f)

with open('../data/vocab_freq.json', 'w') as f:
    json.dump(vocab_dist, f)