In [2]:
# Reload the IPython `autoreload` extension and switch it to mode 2, so that
# edits to imported modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [3]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd
from fastai import *        # Quick accesss to most common functionality
from fastai.text import *   # Quick accesss to NLP functionality
import html

In [4]:
from fastai.text import data

Taken from here:  
https://gist.github.com/bearpelican/48cd4c505aea7c94e7c1e6e5e24bfac0

# Wikitext 103
This notebook is for training the language model on most of Wikipedia.  
The idea is to create a generalized language model before we fine-tune it on a specialized task.

## Data - Wikitext-103

Download the dataset [here](https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip) and unzip it so that the `wiki.*.tokens` files end up in the folder `data/wikitext-103` (the `PATH` used below).

Blog:
https://einstein.ai/research/blog/the-wikitext-long-term-dependency-language-modeling-dataset

Original notebook:
https://github.com/fastai/fastai_docs/blob/master/dev_nb/007_wikitext_2.ipynb

Small helper function to read the tokens.

In [15]:
# Folder the Wikitext-103 `wiki.{train,valid,test}.tokens` files are read from below.
PATH=Path('data/wikitext-103')

In [7]:
# download_url('https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip', PATH)

In [6]:
def read_file(fn:'PathOrStr', enc='utf-8'):
    """Read the text in `fn` and return its non-blank lines as token lists.

    Each line is split on whitespace; lines that are empty or contain only
    whitespace are skipped.

    Args:
        fn: Path of the text file to read.
        enc: Text encoding used to open the file.

    Returns:
        A 1-D `numpy.ndarray` of dtype `object` with one entry per non-blank
        line, each entry being that line's `list` of `str` tokens.
    """
    with open(fn, 'r', encoding=enc) as f:
        # `str.split()` with no argument ignores leading/trailing whitespace and
        # returns [] for blank lines, so no separate `strip()` is needed.
        # `splitlines()` is kept so lines break on the same boundaries as before.
        token_lists = [toks for toks in (line.split() for line in f.read().splitlines()) if toks]

    # Fill an object array element by element. `np.array(token_lists)` raises on
    # ragged input in recent NumPy releases, and would silently build a 2-D
    # string array whenever every line happens to have the same token count.
    out = np.empty(len(token_lists), dtype=object)
    for i, toks in enumerate(token_lists):
        out[i] = toks
    return out

In [16]:
# Read each split under PATH into an array of per-line token lists.
train_tok = read_file(PATH.joinpath('wiki.train.tokens'))
valid_tok = read_file(PATH.joinpath('wiki.valid.tokens'))
test_tok = read_file(PATH.joinpath('wiki.test.tokens'))

In [17]:
# Number of entries (non-blank lines) in the train / valid / test splits.
tuple(map(len, (train_tok, valid_tok, test_tok)))

(1165029, 2461, 2891)

In [18]:
# Preview the first two tokenized training lines.
train_tok[:2]

array([list(['=', 'Valkyria', 'Chronicles', 'III', '=']),
       list(['Senjō', 'no', 'Valkyria', '3', ':', '<unk>', 'Chronicles', '(', 'Japanese', ':', '戦場のヴァルキュリア3', ',', 'lit', '.', 'Valkyria', 'of', 'the', 'Battlefield', '3', ')', ',', 'commonly', 'referred', 'to', 'as', 'Valkyria', 'Chronicles', 'III', 'outside', 'Japan', ',', 'is', 'a', 'tactical', 'role', '@-@', 'playing', 'video', 'game', 'developed', 'by', 'Sega', 'and', 'Media.Vision', 'for', 'the', 'PlayStation', 'Portable', '.', 'Released', 'in', 'January', '2011', 'in', 'Japan', ',', 'it', 'is', 'the', 'third', 'game', 'in', 'the', 'Valkyria', 'series', '.', 'Employing', 'the', 'same', 'fusion', 'of', 'tactical', 'and', 'real', '@-@', 'time', 'gameplay', 'as', 'its', 'predecessors', ',', 'the', 'story', 'runs', 'parallel', 'to', 'the', 'first', 'game', 'and', 'follows', 'the', '"', 'Nameless', '"', ',', 'a', 'penal', 'military', 'unit', 'serving', 'the', 'nation', 'of', 'Gallia', 'during', 'the', 'Second', 'Europan', 'War'

In [19]:
# Assemble the language-model databunch from the token arrays loaded above.
path = PATH
# Disabled alternative: data._get_processor(tokenizer=tok, vocab=None, max_vocab=60000)
# NOTE(review): `_get_processor` is a private fastai helper; confirm its
# behaviour with `vocab=None` against the installed fastai version.
processor = data._get_processor(vocab=None)
src = ItemLists(
    path,
    TextList(train_tok, path=path, processor=processor),
    TextList(valid_tok, path=path, processor=processor),
).label_for_lm()
# The test split is attached only when it was loaded.
if test_tok is not None:
    src.add_test(TextList(test_tok, path=path))
text_data = src.databunch()

In [20]:
# text_data = TextLMDataBunch.from_tokens(
#     PATH, train_tok, valid_tok, test_tok, None, None)

In [21]:
# Keep a handle on the vocabulary attached to the training dataset.
vocab = text_data.train_ds.vocab

In [22]:
# Persist the processed databunch via its `save()` method.
# NOTE(review): the destination is not visible here; presumably it is written
# under `PATH`, which is what the loading cell reads from — confirm.
text_data.save()

### Wikitext-2

In [7]:
# Rebinds PATH to the Wikitext-2 folder: every later cell that uses PATH
# now operates on Wikitext-2 rather than Wikitext-103.
PATH=Path('data/wikitext-2')

In [None]:
# download_url('https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip', PATH)

In [8]:
# Read each split under PATH into an array of per-line token lists
# (this rebinds the *_tok names used by the cells that follow).
train_tok = read_file(PATH.joinpath('wiki.train.tokens'))
valid_tok = read_file(PATH.joinpath('wiki.valid.tokens'))
test_tok = read_file(PATH.joinpath('wiki.test.tokens'))

In [9]:
# Preview the first ten tokenized training lines.
train_tok[:10]

array([list(['=', 'Valkyria', 'Chronicles', 'III', '=']),
       list(['Senjō', 'no', 'Valkyria', '3', ':', '<unk>', 'Chronicles', '(', 'Japanese', ':', '戦場のヴァルキュリア3', ',', 'lit', '.', 'Valkyria', 'of', 'the', 'Battlefield', '3', ')', ',', 'commonly', 'referred', 'to', 'as', 'Valkyria', 'Chronicles', 'III', 'outside', 'Japan', ',', 'is', 'a', 'tactical', 'role', '@-@', 'playing', 'video', 'game', 'developed', 'by', 'Sega', 'and', 'Media.Vision', 'for', 'the', 'PlayStation', 'Portable', '.', 'Released', 'in', 'January', '2011', 'in', 'Japan', ',', 'it', 'is', 'the', 'third', 'game', 'in', 'the', 'Valkyria', 'series', '.', '<unk>', 'the', 'same', 'fusion', 'of', 'tactical', 'and', 'real', '@-@', 'time', 'gameplay', 'as', 'its', 'predecessors', ',', 'the', 'story', 'runs', 'parallel', 'to', 'the', 'first', 'game', 'and', 'follows', 'the', '"', 'Nameless', '"', ',', 'a', 'penal', 'military', 'unit', 'serving', 'the', 'nation', 'of', 'Gallia', 'during', 'the', 'Second', 'Europan', 'War', 'w

In [10]:
# Number of entries (non-blank lines) in the train / valid / test splits.
tuple(map(len, (train_tok, valid_tok, test_tok)))

(23767, 2461, 2891)

In [11]:
# Re-join the first four tokens of the fifth training line into a string.
' '.join(train_tok[4][:4])

'= = Gameplay = ='

In [12]:
# Assemble the language-model databunch from the token arrays loaded above.
path = PATH
# NOTE(review): `_get_processor` is a private fastai helper; confirm its
# behaviour with `vocab=None` against the installed fastai version.
processor = data._get_processor(vocab=None)
src = ItemLists(
    path,
    TextList(train_tok, path=path, processor=processor),
    TextList(valid_tok, path=path, processor=processor),
).label_for_lm()
# The test split is attached only when it was loaded.
if test_tok is not None:
    src.add_test(TextList(test_tok, path=path))
text_data = src.databunch()

In [13]:
# text_data = TextLMDataBunch.from_tokens(
#     PATH, train_tok, valid_tok, test_tok, None, None)

In [14]:
# Persist the processed databunch via its `save()` method.
# NOTE(review): the destination is not visible here; presumably it is written
# under `PATH`, which is what the loading cell reads from — confirm.
text_data.save()

## Loading data

In [None]:
# Load a previously saved databunch from PATH (PATH is whichever dataset
# folder was assigned last in this session).
# NOTE(review): this rebinds `data`, which shadows the `fastai.text.data`
# module imported near the top of the notebook; after this cell runs, cells
# calling `data._get_processor(...)` can no longer be re-run without
# re-importing the module. Consider a different variable name.
data = TextLMDataBunch.load(PATH)