In [75]:
import matplotlib.pyplot as plt
%matplotlib inline
import os
import json
import pickle
import torch
import numpy as np
import re
from tqdm.notebook import tqdm
from sklearn.utils import shuffle
from transformers import AutoTokenizer
import warnings
warnings.filterwarnings("ignore")




In [3]:
model_type = 'albert-base-v1' #albert-base-v1, bert-base-cased, bert-base-uncased
data_path = "D:/Data/neural-punctuator/ted-talks/"

with open(data_path + 'train_texts.txt', 'r', encoding='utf-8') as f:
    train_text = f.readlines()
with open(data_path + 'dev_texts.txt', 'r', encoding='utf-8') as f:
    valid_text = f.readlines()
with open(data_path + 'test_texts_2012.txt', 'r', encoding='utf-8') as f:
    test_text = f.readlines()

In [4]:
len(train_text), len(valid_text), len(test_text)

(1029, 8, 11)

In [6]:
# data_path = "D:/Data/neural-punctuator/ted_dataset/"
# file_path = data_path + "ted_talks-25-Apr-2012.json"

# with open(file_path, 'r', encoding='utf-8') as f:
#     data = json.load(f)
    
# transcripts = [d['transcript'] for d in data]
# len(transcripts)

# raw_text = '\n'.join(transcripts)
# len(raw_text.split(' '))

# action_words = re.findall('\([a-zA-Z]+\)', raw_text)
# action_words = list(set(action_words))
# len(action_words)

# with open(data_path + 'action_words.pkl', 'wb') as f:
#     pickle.dump(action_words, f)

# ts = []

# # Keep ordering
# for t in transcripts:
#     t = t.replace('\n', ' ')
    
#     if t not in ts and len(t) > 0:
#         ts.append(t)

# transcripts = ts
# len(transcripts)

In [7]:
def clean_text(text):
    escape_words = action_words

    for ew in escape_words:
        text = text.replace(ew, '')

    text = text.replace('!', '.')
    text = text.replace(':', ',')
    text = text.replace('--', ',')
    
    reg = "(?<=[a-zA-Z])-(?=[a-zA-Z]{2,})"
    r = re.compile(reg, re.DOTALL)
    text = r.sub(' ', text)
    
    text = re.sub(r'\s-\s', ' , ', text)
    
#     text = text.replace('-', ',')
    text = text.replace(';', '.')
    text = text.replace(' ,', ',')
    text = text.replace('♫', '')
    text = text.replace('...', '')
    text = text.replace('.\"', ',')
    text = text.replace('"', ',')

    text = re.sub(r'--\s?--', '', text)
    text = re.sub(r'\s+', ' ', text)
    
    text = re.sub(r',\s?,', ',', text)
    text = re.sub(r',\s?\.', '.', text)
    text = re.sub(r'\?\s?\.', '?', text)
    text = re.sub(r'\s+', ' ', text)
    
    text = re.sub(r'\s+\?', '?', text)
    text = re.sub(r'\s+,', ',', text)
    text = re.sub(r'\.[\s+\.]+', '. ', text)
    text = re.sub(r'\s+\.', '.', text)
    
    return text.strip().lower()

In [282]:
ts = []
for t in tqdm(transcripts):
    ts.append(clean_text(t))
transcripts = ts

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1068.0), HTML(value='')))




In [283]:
transcripts = [t for t in transcripts if len(t)>0]
len(transcripts)

1066

In [284]:
len(' '.join(transcripts).split(' '))

2402325

In [285]:
tokenizer = AutoTokenizer.from_pretrained(model_type)

In [286]:
target_ids = tokenizer.encode(".?,")[1:-1]
target_ids

[13, 9, 60, 15]

In [287]:
target_token2id = {t: tokenizer.encode(t)[-2] for t in ".?,"}
target_token2id

{'.': 9, '?': 60, ',': 15}

In [288]:
target_ids = list(target_token2id.values())
target_ids

[9, 60, 15]

In [291]:
id2target = {
    0: 0,
    -1: -1,
}
for i, ti in enumerate(target_ids):
    id2target[ti] = i+1
target2id = {value: key for key, value in id2target.items()}

def create_target(text):
    encoded_words, targets = [], []
    
    words = text.split(' ')

    for word in words:
        target = 0
        for target_token, target_id in target_token2id.items():
            if word.endswith(target_token):
                word = word.rstrip(target_token)
                target = id2target[target_id]

        encoded_word = tokenizer.encode(word, add_special_tokens=False)
        
        for w in encoded_word:
            encoded_words.append(w)
        for _ in range(len(encoded_word)-1):
            targets.append(-1)
        targets.append(target)
        
#         print([tokenizer._convert_id_to_token(ew) for ew in encoded_word], target)
        assert(len(encoded_word)>0)

    return encoded_words, targets

In [292]:
encoded_texts, targets = create_target(transcripts[164])

In [293]:
# print(transcripts[164])

In [294]:
encoded_texts, targets = zip(*(create_target(ts) for ts in tqdm(transcripts)))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1066.0), HTML(value='')))




In [295]:
# encoded_words, targets
for te, ta in zip(encoded_texts[0], targets[0]):
    print(f"{tokenizer._convert_id_to_token(te):15}\t{ta}")

▁shall         	0
▁i             	0
▁ask           	0
▁for           	0
▁a             	0
▁show          	0
▁of            	0
▁hands         	0
▁or            	0
▁a             	0
▁clapping      	0
▁of            	0
▁people        	0
▁in            	0
▁different     	0
▁generations   	2
▁i             	-1
'              	-1
m              	0
▁interested    	0
▁in            	0
▁how           	0
▁many          	0
▁are           	0
▁three         	0
▁to            	0
▁12            	0
▁years         	0
▁old           	1
▁none          	3
▁huh           	2
▁all           	0
▁right         	1
▁i             	-1
'              	-1
m              	0
▁going         	0
▁to            	0
▁talk          	0
▁about         	0
▁dinosaurs     	1
▁do            	0
▁you           	0
▁remember      	0
▁dinosaurs     	0
▁when          	0
▁you           	0
▁were          	0
▁that          	0
▁age           	2
▁dinosaurs     	0
▁are           	0
▁kind          	0
▁of            	0
▁funny         	3
▁you  

▁a             	0
▁dinosaur      	0
▁is            	0
▁hard          	0
▁to            	0
▁do            	3
▁as            	0
▁you           	0
▁can           	0
▁imagine       	3
▁because       	0
▁in            	0
▁museums       	0
▁bones         	0
▁are           	0
▁precious      	1
▁you           	0
▁go            	0
▁into          	0
▁a             	0
▁museum        	0
▁and           	0
▁they          	0
▁take          	0
▁really        	0
▁good          	0
▁care          	0
▁of            	0
▁them          	1
▁they          	0
▁put           	0
▁them          	0
▁in            	0
▁foam          	3
▁little        	0
▁containers    	1
▁they          	-1
'              	-1
re             	0
▁very          	0
▁well          	0
▁taken         	0
▁care          	0
▁of            	1
▁they          	0
▁don           	-1
'              	-1
t              	0
▁like          	0
▁it            	0
▁if            	0
▁you           	0
▁come          	0
▁in            	0
▁and           	0
▁want 

▁tri           	-1
cerat          	-1
ops            	1
▁and           	0
▁before        	0
▁the           	0
▁year          	0
▁2000          	3
▁now           	0
▁remember      	3
▁tri           	-1
cerat          	-1
ops            	0
▁was           	0
▁first         	0
▁found         	0
▁in            	0
▁the           	0
▁1800          	-1
s              	3
▁before        	0
▁2000          	3
▁no            	0
▁one           	0
▁had           	0
▁ever          	0
▁seen          	0
▁a             	0
▁juvenile      	0
▁tri           	-1
cerat          	-1
ops            	1
▁there         	-1
'              	-1
s              	0
▁a             	0
▁tri           	-1
cerat          	-1
ops            	0
▁in            	0
▁every         	0
▁museum        	0
▁in            	0
▁the           	0
▁world         	3
▁but           	0
▁no            	0
▁one           	0
▁had           	0
▁ever          	0
▁collected     	0
▁a             	0
▁juvenile      	1
▁and           	0
▁we            	0

▁up            	0
▁their         	0
▁jaw           	-1
s              	0
▁and           	0
▁it            	0
▁turned        	0
▁out           	0
▁the           	0
▁biggest       	0
▁one           	0
▁had           	0
▁12            	0
▁teeth         	0
▁and           	0
▁the           	0
▁next          	0
▁smallest      	0
▁one           	0
▁had           	0
▁13            	0
▁and           	0
▁the           	0
▁next          	0
▁smallest      	0
▁had           	0
▁14            	1
▁and           	0
▁of            	0
▁course        	3
▁nano          	0
▁has           	0
▁17            	1
▁and           	0
▁we            	0
▁just          	0
▁went          	0
▁out           	0
▁and           	0
▁looked        	0
▁at            	0
▁other         	0
▁people        	-1
'              	-1
s              	0
▁collections   	0
▁and           	0
▁we            	0
▁found         	0
▁one           	0
▁that          	0
▁has           	0
▁sort          	0
▁of            	0
▁15            	0
▁teeth 

In [296]:
n = -120
test_n = -6#int(.96*len(texts))
train_texts = encoded_texts[:n]
train_targets = targets[:n]
valid_texts = encoded_texts[n:test_n]
valid_targets = targets[n:test_n]
test_texts = encoded_texts[test_n:]
test_targets = targets[test_n:]
len(train_texts), len(valid_texts), len(test_texts)

(946, 114, 6)

In [297]:
len(' '.join(raw_text.split('\n')[:n]).split(' ')), \
len(' '.join(raw_text.split('\n')[n:test_n]).split(' ')), \
len(' '.join(raw_text.split('\n')[test_n:]).split(' '))

(2264426, 195701, 19486)

In [298]:
with open(data_path + f'{model_type}/train_data.pkl', 'wb') as f:
    pickle.dump((train_texts, train_targets), f)
with open(data_path + f'{model_type}/valid_data.pkl', 'wb') as f:
    pickle.dump((valid_texts, valid_targets), f)
with open(data_path + f'{model_type}/test_data.pkl', 'wb') as f:
    pickle.dump((test_texts, test_targets), f)

In [299]:
from collections import Counter

for ds in (train_targets, valid_targets, test_targets):
    c = Counter((t for targets in ds for t in targets))
    print('\t'.join([str(c[i]) for i in (1,2,3,0,-1)]))

128608	9634	176027	1847785	255496
13740	879	15578	197567	27699
829	31	935	10712	1390


In [300]:
e = []
i = 0
raw_words = transcripts[30].split(' ')

for te, ta in zip(train_texts[30], train_targets[30]):
    if ta == -1:
        e.append(te)
    else:
        e.append(te)
        print(f"{tokenizer.decode(e):15}\t{tokenizer.decode(target2id[ta])}\t{raw_words[i]}")
        e = []
        i += 1

hi             	<pad>	hi
everyone       	.	everyone.
i'm            	<pad>	i'm
sirena         	.	sirena.
i'm            	<pad>	i'm
11             	<pad>	11
years          	<pad>	years
old            	<pad>	old
and            	<pad>	and
from           	<pad>	from
connecticut    	.	connecticut.
well           	,	well,
i'm            	<pad>	i'm
not            	<pad>	not
really         	<pad>	really
sure           	<pad>	sure
why            	<pad>	why
i'm            	<pad>	i'm
here           	.	here.
i              	<pad>	i
mean           	,	mean,
what           	<pad>	what
does           	<pad>	does
this           	<pad>	this
have           	<pad>	have
to             	<pad>	to
do             	<pad>	do
with           	<pad>	with
technology     	,	technology,
entertainment  	<pad>	entertainment
and            	<pad>	and
design         	?	design?
well           	,	well,
i              	<pad>	i
count          	<pad>	count
my             	<pad>	my
ipod           	,	ipod,
cellphone      	<pad>	

In [301]:
sum((len(t.split(' ')) for t in transcripts))

2402325

In [302]:
print(tokenizer.decode(valid_texts[5]))

well we all know the world wide web has absolutely transformed publishing broadcasting commerce and social connectivity but where did it all come from and i'll quote three people vannevar bush doug engelbart and tim berners lee so let's just run through these guys this is vannevar bush vannevar bush was the u.s government's chief scientific adviser during the war and in 1945 he published an article in a magazine called atlantic monthly and the article was called,as we may think and what vannevar bush was saying was the way we use information is broken we don't work in terms of libraries and catalog systems and so forth the brain works by association with one item in its thought it snaps instantly to the next item and the way information is structured is totally incapable of keeping up with this process and so he suggested a machine and he called it the memex and the memex would link information one piece of information to a related piece of information and so forth now this was in 1945