In [1]:
import pandas as pd
import re
from string import punctuation
import numpy as np
from bpe_tokenizer import BPETokenizer
from tqdm.auto import tqdm

# initalize tokenizer
tokenizer = BPETokenizer()

# load and consolidate data
reddit = pd.read_csv('/home/bibek/Downloads/reddit.csv',encoding='latin-1').full_text
twitter = pd.read_csv('/home/bibek/notebooks/tweet_16.csv',encoding='latin-1').sample(100000)
twitter = twitter[twitter.columns[-1]]
imdb_reviews = pd.read_csv('IMDB Dataset.csv').review
movie_plots = pd.read_csv('wiki_movie_plots_deduped.csv',encoding='latin-1').Plot
tweet1 = pd.read_csv('/home/bibek/notebooks/train.csv',encoding='latin-1').text
tweet2 = pd.read_csv('/home/bibek/notebooks/test.csv',encoding='latin-1').text



text = []

text.extend(reddit)
text.extend(twitter)
text.extend(imdb_reviews)
text.extend(movie_plots)
text.extend(tweet1)
text.extend(tweet2)

In [2]:
def preprocess_text(x):
    """
    cleaning tweets -> 
    @user -> USER
    urls -> LINK
    keep only !,.:#? and add space 
    """
    x = str(x).replace('&amp;','and').replace('<br />','').replace('&quot;','')
    x = x.lower()
    x = re.sub(r'[^\x00-\x7F]+', "'",x)
    url_pattern = r'http\S+|www\S+'
    x = re.sub(url_pattern, 'LINK', x).split()
    for i,w in enumerate(x):
        if '@' in w:
            x[i] = 'USER'
    x = ' '.join(x)
    punct_to_keep = """!,.:#?"-;/%$'"""
    punct = ''.join([p for p in punctuation if p not in punct_to_keep])
    trans = str.maketrans(punct, ' ' * len(punct))
    x = x.translate(trans)
    x = ''.join(x)
    x = re.sub(r'([!"#$%&\()*+,-./:;<=>?@\\^_`{|}~])\s*\1+', r'\1', x)
    x = re.sub(r'([!"#$%&\()*+,-./:;<=>?@\\^_`{|}~])', r' \1 ', x)
    x = re.sub(r'\s+', ' ', x).strip().replace("'s "," 's ")
    return x

# clean the text

clean_text = [preprocess_text(x) for x in tqdm(text)]

  0%|          | 0/259789 [00:00<?, ?it/s]

In [3]:
# cleaning sample

idx = np.random.randint(0,len(text),10)


for i in idx:
    original = text[i]
    cleaned = preprocess_text(original)
    print(original,'\n--------------------------------------------------\n',cleaned)
    print('===============================================================')

@OfficialAshleyG Haha. You and @KC_Lutz (or &quot;Uncle Kellan&quot;) are too cute!  
--------------------------------------------------
 USER haha . you and USER or uncle kellan are too cute !
its rain  
--------------------------------------------------
 its rain
After his mother is killed by Man, Bambi stumbles upon his father, the Great Prince of the Forest, who takes him back to his den. The Great Prince asks Friend Owl to find a doe to raise Bambi, since his duties are to his herd, but Owl informs him that, because of the harsh winters, the does can barely feed themselves and their young. The Great Prince has no choice but to look after Bambi until the spring.
Sometime later, the Great Prince allows Bambi to accompany his friends, Thumper and Flower, to see the Groundhog, whose shadow will foretell if winter will end soon. At the Groundhog ceremony, Bambi meets up with his crush Faline. The Groundhog is coaxed out of his hole, only to be scared back in by Ronno, an older fawn. R

In [4]:
# train tokenizer

tokenizer(clean_text,num_tok=20000)
tokenizer.train(iterations=3,min_pair_freq=400)

In [5]:
# store oov split combinations

for seq in tqdm(clean_text):
    for w in seq.split():
        tokenizer._split_oov(w)

  0%|          | 0/259789 [00:00<?, ?it/s]

In [6]:
# sampled subword-splits

idx = np.random.randint(0,len(text),10)

for i in idx:
    original = text[i]
    cleaned = preprocess_text(original)
    subworded = ' '.join([tokenizer._split_oov(w) for w in cleaned.split()])

    print('\n',original,'\n------------------------\n',cleaned,'\n----------------------\n',subworded)
    print('===============================================')


 The film was apparently spawned from an idea one of the writers had when he 'saw' one of his creations in a supermarket. The inhabitants of Royston Vasey head into 'our' world to persuade the writers not to stop writing about them and thus destroy their world.<br /><br />If that sounds a bit too serious, don't be put off. Within the first few minutes we get: Bernice (the vile female vicar) letting rip at an unfortunate penitent during confession; Chinnery (the vet who inadvertently destroys every animal he touches) attempting to collect semen from a giraffe; Mickey (thick beyond belief) being, ah, thick; and Tubbs (inbred sister-wife and local shopkeeper) being sweet as ever - but still disgusting.<br /><br />Some of the regular characters are missing, but a new idea by the Gents introduces some 16th-Century characters - and we have the Gents themselves in the action too. If you're new to The League of Gentlemen, this is an easy introduction and a lot of fun. If you're a long-standin

In [7]:
print('vocab size :',len(tokenizer.i2w))
print('num of oov splits :',len(tokenizer.oov_splits))

vocab size : 20365
num of oov splits : 200247


In [8]:
import pickle

# save tokenizer class

with open('tokenizer.pkl','wb') as f:
    pickle.dump(tokenizer,f)