In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd
from fastai import *        # Quick accesss to most common functionality
from fastai.text import *   # Quick accesss to NLP functionality
import html

In [3]:
from fastai.text import data

In [4]:
def read_file(fn:PathOrStr, enc='utf-8'):
    "Read the text in `fn`."
#     with open(fn,'r', encoding = enc) as f: return ''.join(f.read().splitlines())
    tokens = []
    with open(fn,'r', encoding = enc) as f: 
        for line in f.read().splitlines():
            l = line.strip()
            if len(l) == 0: continue
            tokens.append(l)#.split())
    return np.array(tokens)

# Wikitext 103
This notebook is for training the language model on most of Wikipedia.  
The idea is to create a generalized language model before we fine tune it on a specialized task

## Data - Wikitext-103

Blog:
https://einstein.ai/research/blog/the-wikitext-long-term-dependency-language-modeling-dataset

Original fastai [notebook](https://github.com/fastai/fastai_docs/blob/master/dev_nb/007_wikitext_2.ipynb)

#### Download data 

Download the dataset [here](https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip) and unzip it so it's in the folder wikitext.

```bash
cd ~/data
wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip
wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip
wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
unzip wikitext-2-v1.zip
unzip wikitext-2-raw-v1.zip
unzip wikitext-103-v1.zip
unzip wikitext-103-raw-v1.zip
```

### Note: This current notebook runs out of memory when tokenizing wikitext-103.

Please use an instance with a lot of ram: `m5.24xlarge`  
Or follow: `notebooks/DATA-2-PrepareWikitext-FromRaw-MemoryFix.ipynb`

In [5]:
PATH=PATH=Path.home()/'data/wikitext-103-raw'

In [6]:
# defaults.text_post_rules = [defaults.text_post_rules[0]]

In [7]:
# defaults.text_spec_tok = [PAD,UNK,BOS,FLD,TK_MAJ,TK_UP,TK_REP,TK_WREP]

In [8]:
# download_url('https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip', PATH)

In [9]:
train_tok = read_file(PATH/'wiki.train.raw')
valid_tok = read_file(PATH/'wiki.valid.raw')
test_tok = read_file(PATH/'wiki.test.raw')

In [10]:
len(train_tok), len(valid_tok), len(test_tok)

(1165029, 2461, 2891)

In [16]:
train_tok[:2][:2]

array(['= Valkyria Chronicles III =',
       'Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven " .'],
      dtype='<U7063')

In [None]:
path = PATH
# processor = data._get_processor(tokenizer=tok, vocab=None, max_vocab=60000)
processor = data._get_processor(vocab=None)
src = ItemLists(path, TextList(train_tok, path=path, processor=processor),
                TextList(valid_tok, path=path, processor=processor))
src = src.label_for_lm()
if test_tok is not None: src.add_test(TextList(test_tok, path=path))
text_data = src.databunch()

In [None]:
# text_data = TextLMDataBunch.from_tokens(
#     PATH, train_tok, valid_tok, test_tok, None, None)

In [18]:
vocab = text_data.train_ds.vocab

In [19]:
text_data.save()

### Wikitext-2

In [5]:
PATH=Path.home()/'data/wikitext-2-raw'

In [6]:
# download_url('https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip', PATH)

In [7]:
train_tok = read_file(PATH/'wiki.train.raw')
valid_tok = read_file(PATH/'wiki.valid.raw')
test_tok = read_file(PATH/'wiki.test.raw')

In [8]:
train_tok[:4]

array(['= Valkyria Chronicles III =',
       'Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven " .',
       "The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more fo

In [9]:
len(train_tok), len(valid_tok), len(test_tok)

(23767, 2461, 2891)

In [10]:
' '.join(train_tok[4][:2])

'=  '

In [11]:
path = PATH
processor = data._get_processor(vocab=None)
src = ItemLists(path, TextList(train_tok, path=path, processor=processor),
                TextList(valid_tok, path=path, processor=processor))
src = src.label_for_lm()
if test_tok is not None: src.add_test(TextList(test_tok, path=path))
text_data = src.databunch()

In [12]:
# text_data = TextLMDataBunch.from_tokens(
#     PATH, train_tok, valid_tok, test_tok, None, None)

In [13]:
text_data.save()

## Loading data

In [14]:
text_data = load_data(PATH, bs=120)

In [15]:
text_data.show_batch()

idx,text
0,"for the playstation xxmaj portable . xxmaj released in xxmaj january 2011 in xxmaj japan , it is the third game in the xxmaj valkyria series . xxmaj employing the same fusion of tactical and real - time gameplay as its predecessors , the story runs parallel to the first game and follows the "" xxmaj nameless "" , a penal military unit serving the nation of xxmaj gallia during"
1,villa xxmaj belvedere with the xxmaj apostolic xxmaj palace in xxmaj vatican xxmaj city . xxmaj the tower was built between 1578 and 1580 to a design by the xxmaj xxunk architect xxmaj xxunk xxmaj xxunk ( who was credited with building the xxmaj apostolic xxmaj palace ) mainly to promote the study of astronomy for the xxmaj gregorian xxmaj calendar xxmaj reform which was commissioned by xxmaj pope xxmaj
2,"during the xxmaj new xxmaj kingdom , one man was accused of stealing clothes by an oracle supposed to communicate messages from xxmaj amun of xxmaj pe - xxmaj khenty . xxmaj he consulted two other local oracles of xxmaj amun hoping for a different judgment . xxmaj gods ' manifestations also differed according to their roles . xxmaj horus could be a powerful sky god or vulnerable child ,"
3,"demonstrated that fruit bodies were more likely to be found in areas that were heavily burned , compared to locations with light to moderate burning where the trees remained viable , or in xxunk areas . xxmaj fruiting was much denser in spruce forests — with up to 700 – 1000 fruitbodies per square meter — than in pine forests , where fruitbodies were sporadic . xxmaj fruitbodies grew by"
4,"action ; xxmaj eaton himself took part in the work , xxunk coal alongside his men . xxmaj on 25 xxmaj february 1941 , he made a flight north to reconnoitre xxmaj timor , xxmaj ambon , and xxmaj xxunk in xxmaj dutch xxmaj new xxmaj guinea for potential use by the xxup raaf in any xxmaj pacific conflict . xxmaj by xxmaj april , the total strength based at"
