In [2]:
# Reload imported modules automatically before each cell executes, so edits
# to library source are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [3]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd
from fastai import *        # Quick access to most common functionality
from fastai.text import *   # Quick access to NLP functionality
import html

In [14]:
from fastai.text import data

Taken from here:  
https://gist.github.com/bearpelican/48cd4c505aea7c94e7c1e6e5e24bfac0

# Wikitext 103
This notebook is for training a language model on WikiText-103, a large corpus drawn from Wikipedia articles.  
The idea is to create a generalized language model before we fine-tune it on a specialized task.

## Data - Wikitext-103

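Download the dataset [here](https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip) and unzip it so the files end up in `data/wikitext-103-raw`, the folder `PATH` points to below. Note that the cells below read the `wiki.*.raw` files, which should come from the raw variant of the archive (`wikitext-103-raw-v1.zip`) rather than the tokenized one.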

Blog:
https://einstein.ai/research/blog/the-wikitext-long-term-dependency-language-modeling-dataset

Original notebook:
https://github.com/fastai/fastai_docs/blob/master/dev_nb/007_wikitext_2.ipynb

Small helper function to read the tokens.

In [17]:
# Folder that holds the extracted WikiText-103 `wiki.*.raw` files read below.
PATH=Path('data/wikitext-103-raw')

In [5]:
# download_url('https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip', PATH)

In [18]:
def read_file(fn:PathOrStr, enc='utf-8'):
    """Read the text in `fn` and split every non-blank line into tokens.

    Args:
        fn: path of the text file to read.
        enc: encoding used to decode the file (default ``'utf-8'``).

    Returns:
        A 1-D numpy object array with one entry per non-blank line; each entry
        is the list of whitespace-separated tokens found on that line.
    """
    with open(fn, 'r', encoding=enc) as f:
        # `line.strip()` is empty exactly for blank/whitespace-only lines, and
        # `str.split()` without arguments already ignores surrounding whitespace.
        lines = [line.split() for line in f.read().splitlines() if line.strip()]
    # Fill a pre-allocated object array element by element. `np.array(lines)`
    # is shape-dependent: token lists of differing lengths raise on recent
    # NumPy releases, and equal-length lists silently become a 2-D string
    # array instead of an array of lists.
    tokens = np.empty(len(lines), dtype=object)
    for i, toks in enumerate(lines):
        tokens[i] = toks
    return tokens

In [19]:
# Tokenise the train, validation and test files (in that order) found under PATH.
train_tok, valid_tok, test_tok = [
    read_file(PATH/fname)
    for fname in ('wiki.train.raw', 'wiki.valid.raw', 'wiki.test.raw')
]

In [22]:
# Number of non-blank lines kept in each split: (train, valid, test).
tuple(map(len, (train_tok, valid_tok, test_tok)))

(1165029, 2461, 2891)

In [23]:
# Peek at the first two tokenised lines (slicing the same two-element slice again is a no-op).
train_tok[:2]

array([list(['=', 'Valkyria', 'Chronicles', 'III', '=']),
       list(['Senjō', 'no', 'Valkyria', '3', ':', 'Unrecorded', 'Chronicles', '(', 'Japanese', ':', '戦場のヴァルキュリア3', ',', 'lit', '.', 'Valkyria', 'of', 'the', 'Battlefield', '3', ')', ',', 'commonly', 'referred', 'to', 'as', 'Valkyria', 'Chronicles', 'III', 'outside', 'Japan', ',', 'is', 'a', 'tactical', 'role', '@-@', 'playing', 'video', 'game', 'developed', 'by', 'Sega', 'and', 'Media.Vision', 'for', 'the', 'PlayStation', 'Portable', '.', 'Released', 'in', 'January', '2011', 'in', 'Japan', ',', 'it', 'is', 'the', 'third', 'game', 'in', 'the', 'Valkyria', 'series', '.', 'Employing', 'the', 'same', 'fusion', 'of', 'tactical', 'and', 'real', '@-@', 'time', 'gameplay', 'as', 'its', 'predecessors', ',', 'the', 'story', 'runs', 'parallel', 'to', 'the', 'first', 'game', 'and', 'follows', 'the', '"', 'Nameless', '"', ',', 'a', 'penal', 'military', 'unit', 'serving', 'the', 'nation', 'of', 'Gallia', 'during', 'the', 'Second', 'Europan', 

In [None]:
path = PATH
# Processor(s) from fastai's private `_get_processor` helper, requested without a pre-built vocab.
processor = data._get_processor(vocab=None)
# Train and validation token lists share the same processor; label the pair for language modelling.
src = ItemLists(
    path,
    TextList(train_tok, path=path, processor=processor),
    TextList(valid_tok, path=path, processor=processor),
).label_for_lm()
# Attach the test split when one was loaded.
if test_tok is not None:
    src.add_test(TextList(test_tok, path=path))
text_data = src.databunch()

In [17]:
# text_data = TextLMDataBunch.from_tokens(
#     PATH, train_tok, valid_tok, test_tok, None, None)

In [18]:
# Keep a handle on the vocabulary attached to the training dataset.
vocab = text_data.train_ds.vocab

In [19]:
# Persist the databunch built above so it can be reloaded without reprocessing.
# NOTE(review): presumably written under the databunch's `path` — confirm against the fastai version in use.
text_data.save()

### Wikitext-2

In [7]:
# Folder that holds the extracted WikiText-2 `wiki.*.raw` files.
# This rebinds PATH, so every later cell that uses PATH now targets WikiText-2.
PATH=Path('data/wikitext-2-raw')

In [8]:
# download_url('https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip', PATH)

In [9]:
# Tokenise the train, validation and test files (in that order) found under PATH.
train_tok, valid_tok, test_tok = [
    read_file(PATH/fname)
    for fname in ('wiki.train.raw', 'wiki.valid.raw', 'wiki.test.raw')
]

In [10]:
# Show the first 100 tokenised lines of the training split (the output is large).
train_tok[:100]

array([list(['=', 'Valkyria', 'Chronicles', 'III', '=']),
       list(['Senjō', 'no', 'Valkyria', '3', ':', 'Unrecorded', 'Chronicles', '(', 'Japanese', ':', '戦場のヴァルキュリア3', ',', 'lit', '.', 'Valkyria', 'of', 'the', 'Battlefield', '3', ')', ',', 'commonly', 'referred', 'to', 'as', 'Valkyria', 'Chronicles', 'III', 'outside', 'Japan', ',', 'is', 'a', 'tactical', 'role', '@-@', 'playing', 'video', 'game', 'developed', 'by', 'Sega', 'and', 'Media.Vision', 'for', 'the', 'PlayStation', 'Portable', '.', 'Released', 'in', 'January', '2011', 'in', 'Japan', ',', 'it', 'is', 'the', 'third', 'game', 'in', 'the', 'Valkyria', 'series', '.', 'Employing', 'the', 'same', 'fusion', 'of', 'tactical', 'and', 'real', '@-@', 'time', 'gameplay', 'as', 'its', 'predecessors', ',', 'the', 'story', 'runs', 'parallel', 'to', 'the', 'first', 'game', 'and', 'follows', 'the', '"', 'Nameless', '"', ',', 'a', 'penal', 'military', 'unit', 'serving', 'the', 'nation', 'of', 'Gallia', 'during', 'the', 'Second', 'Europan', 

In [11]:
# Number of non-blank lines kept in each split: (train, valid, test).
tuple(map(len, (train_tok, valid_tok, test_tok)))

(23767, 2461, 2891)

In [12]:
# Re-join up to the first 20 tokens of the fifth kept line to eyeball the text.
' '.join(train_tok[4][:20])

'= = Gameplay = ='

In [15]:
path = PATH
# Processor(s) from fastai's private `_get_processor` helper, requested without a pre-built vocab.
processor = data._get_processor(vocab=None)
# Train and validation token lists share the same processor; label the pair for language modelling.
src = ItemLists(
    path,
    TextList(train_tok, path=path, processor=processor),
    TextList(valid_tok, path=path, processor=processor),
).label_for_lm()
# Attach the test split when one was loaded.
if test_tok is not None:
    src.add_test(TextList(test_tok, path=path))
text_data = src.databunch()

In [10]:
# text_data = TextLMDataBunch.from_tokens(
#     PATH, train_tok, valid_tok, test_tok, None, None)

In [16]:
# Persist the databunch built above so it can be reloaded without reprocessing.
# NOTE(review): presumably written under the databunch's `path` — confirm against the fastai version in use.
text_data.save()

## Loading data

In [20]:
# Load a language-model databunch from PATH (presumably the one written by `text_data.save()`).
# NOTE(review): PATH is whichever dataset folder was assigned most recently —
# `data/wikitext-2-raw` when the notebook is run top to bottom.
text_data = TextLMDataBunch.load(PATH)

In [21]:
# Display a few example rows from the loaded databunch as a sanity check.
text_data.show_batch()

idx,text
0,"xxbos xxmaj loch xxmaj leven xxmaj castle was given in to state care in 1939 , and is now managed by xxmaj historic xxmaj scotland . xxmaj today , the castle can be reached by a xxunk ferry operated from xxmaj kinross by xxmaj historic xxmaj scotland during the summer months . xxmaj the remains of the castle are protected as a xxmaj scheduled xxmaj ancient xxmaj monument , and"
1,"raise volunteers to fight with the xxmaj german xxmaj schutzstaffel ( xxup ss ) . xxmaj in xxmaj january , xxup ss head xxmaj heinrich xxmaj himmler travelled to xxmaj norway to oversee preparations . xxmaj quisling clearly believed that if xxmaj norway supported xxmaj nazi xxmaj germany on the battlefield , there would be no reason for xxmaj germany to annex her . xxmaj to this end , he"
2,", through his experimentation with equipment and recording techniques . xxmaj xxunk , assisted by studio manager xxmaj david xxmaj harris , set up microphones an hour before the sessions began . xxmaj xxunk 's microphone choices were mostly different from those used by xxmaj smith to record the xxmaj beatles ' xxup emi sessions . xxmaj because of xxmaj barrett 's quiet voice , he was placed in a"
3,"retreating xxmaj germans after the xxmaj first xxmaj battle of the xxmaj marne . xxbos xxmaj empire gave the film 4 out of 5 stars with the verdict , "" xxmaj it ’ s certainly a xxmaj spike xxmaj lee film , but no xxmaj spike xxmaj lee xxmaj joint . xxmaj still , he ’ s delivered a xxunk , vigorous and frequently masterful take on a well -"
4,"purposes expressed as hcl , is estimated at 20 xxmaj mt / year , with 3 xxmaj mt / year from direct synthesis , and the rest as secondary product from organic and similar xxunk . xxmaj by far , most hydrochloric acid is consumed xxunk by the producer . xxmaj the open world market size is estimated at 5 xxmaj mt / year . xxbos xxmaj the failure of"
