In [1]:
from fastai.text import *
import html

In [2]:
PATH = Path('data/aclImdb')

## Get data in standard format

In [3]:
# Output folders for the classifier CSVs and the language-model CSVs.
clas_path = Path('data/imdb_clas/')
lm_path = Path('data/imdb_lm/')

# Create both folders (and any missing parents); existing folders are left as they are.
for out_dir in (clas_path, lm_path):
    out_dir.mkdir(parents=True, exist_ok=True)

In [32]:
classes = ['pos', 'neg', 'unsup']

def get_text(path):
    """Read every ``*.txt`` file found under ``path/<class>`` for each entry of ``classes``.

    Args:
        path: Directory (``Path`` or ``str``) holding one sub-folder per class name.

    Returns:
        ``(texts, labels)`` as two NumPy arrays of equal length; ``labels[i]`` is the
        index into ``classes`` of the folder that ``texts[i]`` was read from.
    """
    texts,labels = [],[]
    for idx,label in enumerate(classes):
        # Sort so the load order does not depend on the filesystem's directory order.
        for fname in sorted((Path(path)/label).glob('*.txt')):
            # read_text closes the file itself; decode explicitly rather than relying
            # on the platform's default encoding.
            texts.append(fname.read_text(encoding='utf-8'))
            labels.append(idx)
    return np.array(texts), np.array(labels)

# Load the 'train' and 'test' folders; each call returns a (texts, labels) pair of arrays.
trn_texts, trn_labels = get_text(PATH/'train')
val_texts, val_labels = get_text(PATH/'test')

In [34]:
len(trn_texts), len(val_texts)

(75000, 25000)

### Shuffle data

In [35]:
# Seed NumPy's global RNG first so the two permutations below are the same on every
# fresh run of the notebook.
np.random.seed(42)
trn_idxs = np.random.permutation(len(trn_texts))
val_idxs = np.random.permutation(len(val_texts))

In [36]:
# Index texts and labels with the same permutation so each review keeps its own label.
trn_texts = trn_texts[trn_idxs]
trn_labels = trn_labels[trn_idxs]

val_texts = val_texts[val_idxs]
val_labels = val_labels[val_idxs]

### Save to csv

In [44]:
cols = ['labels','text']

#### Classifier

In [37]:
df_trn = pd.DataFrame({'text': trn_texts, 'labels': trn_labels}, columns=cols)
df_val = pd.DataFrame({'text': val_texts, 'labels': val_labels}, columns=cols)

In [40]:
# Keep only labelled reviews for the classifier: drop every training row whose label is
# the index of 'unsup' in `classes` (looked up here instead of hard-coding 2).
unsup_idx = classes.index('unsup')
df_trn[df_trn['labels'] != unsup_idx].to_csv(clas_path/'train.csv', header=False, index=False)
df_val.to_csv(clas_path/'test.csv', header=False, index=False)

In [41]:
# save classes.txt, one class name per line; the with-block closes (and flushes) the file
with (clas_path/'classes.txt').open('w') as f:
    f.writelines(f'{o}\n' for o in classes)

#### Language Model  
The language model doesn't need any labels, so it can be trained on the combined train and validation texts (including the unlabelled `unsup` reviews). It only learns to predict the next word.

In [42]:
# Pool every review text (train + test) and re-split it 90/10 for language-model training.
# NOTE(review): this rebinds trn_texts/val_texts, so re-running this cell on its own
# re-splits the already pooled texts; no random_state is passed to the split here.
trn_texts,val_texts = sklearn.model_selection.train_test_split(np.concatenate([trn_texts,val_texts]), test_size=0.1)
len(trn_texts), len(val_texts)

(90000, 10000)

In [45]:
# Every row gets the placeholder label 0 so the files keep the labels,text column
# order given by `cols`.
df_trn = pd.DataFrame({'text': trn_texts, 'labels': [0]*len(trn_texts)}, columns=cols)
df_val = pd.DataFrame({'text': val_texts, 'labels': [0]*len(val_texts)}, columns=cols)

# Written without header or index, so the files contain only the two data columns.
df_trn.to_csv(lm_path/'train.csv', header=False, index=False)
df_val.to_csv(lm_path/'test.csv', header=False, index=False)

## Language Model

### Tokenize

In [65]:
chunksize=24000

BOS = 'xbos'  # beginning-of-sentence tag
FLD = 'xfld'  # data field tag

In [66]:
# Matches any run of two or more consecutive spaces.
re1 = re.compile(r'  +')

def fixup(x):
    """Clean up one raw text string.

    Applies a fixed sequence of literal substitutions (leftover entity fragments,
    ``<br />`` tags, escaped newlines and quotes, ``@.@``/``@-@`` markers, backslashes),
    then HTML-unescapes the result and collapses each run of spaces into one space.
    """
    # Order matters: every substitution operates on the output of the previous one.
    substitutions = (
        ('#39;', "'"), ('amp;', '&'), ('#146;', "'"), ('nbsp;', ' '),
        ('#36;', '$'), ('\\n', "\n"), ('quot;', "'"), ('<br />', "\n"),
        ('\\"', '"'), ('<unk>', 'u_n'), (' @.@ ', '.'), (' @-@ ', '-'),
        ('\\', ' \\ '),
    )
    for old, new in substitutions:
        x = x.replace(old, new)
    return re1.sub(' ', html.unescape(x))

In [67]:
def get_texts(df, n_lbls=1):
    """Tokenize the text columns of one DataFrame chunk.

    The first ``n_lbls`` columns are read as int64 labels. The remaining columns are
    joined into one string per row: the row starts with ``BOS``, and every column is
    introduced by ``FLD`` followed by its 1-based field number. Each joined string is
    cleaned with ``fixup`` before tokenization.

    Returns:
        ``(tokens, labels)``: the tokenizer output and a list with one label row per
        input row.
    """
    label_arr = df.iloc[:, list(range(n_lbls))].values.astype(np.int64)

    # The first text column opens the row; any further columns are appended as extra fields.
    joined = f'\n{BOS} {FLD} 1 ' + df[n_lbls].astype(str)
    for col in range(n_lbls+1, len(df.columns)):
        joined = joined + (f' {FLD} {col-n_lbls} ' + df[col].astype(str))
    cleaned = joined.apply(fixup).values.astype(str)

    # proc_all_mp: tokenizes using multiprocessing (parallelization)
    # partition_by_cores: splits the list into as many sublists as the machine has cores
    tokens = Tokenizer().proc_all_mp(partition_by_cores(cleaned))
    return tokens, list(label_arr)

In [68]:
# Runs get_texts over every chunk of a chunked read_csv (see the chunksize argument
# where the CSV files are opened).
def get_all(df, n_lbls):
    """Tokenize an iterable of DataFrame chunks and join the per-chunk results.

    Prints each chunk's index as a progress indicator. Returns ``(tok, labels)`` with
    the token lists and the label lists of all chunks concatenated in chunk order.
    """
    all_tok, all_labels = [], []
    for chunk_no, chunk in enumerate(df):
        print(chunk_no)
        chunk_tok, chunk_labels = get_texts(chunk, n_lbls)
        all_tok.extend(chunk_tok)
        all_labels.extend(chunk_labels)
    return all_tok, all_labels

In [69]:
# read csv one chunk at a time
df_trn = pd.read_csv(lm_path/'train.csv', header=None, chunksize=chunksize)
df_val = pd.read_csv(lm_path/'test.csv', header=None, chunksize=chunksize)

In [70]:
tok_trn, trn_labels = get_all(df_trn, 1)
tok_val, val_labels = get_all(df_val, 1)

0
1
2
3
0


In [71]:
' '.join(tok_trn[0])

'\n xbos xfld 1 before god awful pieces of trash like the punisher , there was another film that showed a man who \'s wife was murdered by a gang and wanted justice , but wanted to deliver it in his own way . that film was death wish . and instead of a hero that did n\'t seem to care too much about this killing ( though it was mostly just the actor , tom jane)we had the cool three dimensional charles bronson . you may ask yourself , " three dimensional ? " . and i say " sure " he cried about his wife before he went on to kill a great many scum bags on the streets of new york . the bottom line is that bronson was a bad ass in this film and all the rest of the death wish films which i also enjoy very much . we know that this movie will be great from the beginning when bronson cashes in a 20 dollar bill for two rolls of quarters so that he can stuff them in a sock and smash it across some dude \'s face . it \'s brilliant . they do n\'t do they kind of raw action in movies anymore , and i 

In [72]:
(lm_path/'tmp').mkdir(exist_ok=True)

# Documents have different token counts, so wrap the lists in an explicit object array;
# recent NumPy releases refuse to build a ragged array implicitly inside np.save.
np.save(lm_path/'tmp'/'tok_trn.npy', np.array(tok_trn, dtype=object))
np.save(lm_path/'tmp'/'tok_val.npy', np.array(tok_val, dtype=object))

In [11]:
# The token files store per-document token lists as an object array, which np.load only
# reads when pickled objects are explicitly allowed. Only use this on files you created.
tok_trn = np.load(lm_path/'tmp'/'tok_trn.npy', allow_pickle=True)
tok_val = np.load(lm_path/'tmp'/'tok_val.npy', allow_pickle=True)

### Numericalize

In [12]:
# o - text, p - word
freq = Counter(p for o in tok_trn for p in o)
freq.most_common(25)

[('the', 1208855),
 ('.', 992627),
 (',', 986565),
 ('and', 588519),
 ('a', 584215),
 ('of', 525176),
 ('to', 485727),
 ('is', 394350),
 ('it', 342020),
 ('in', 337720),
 ('i', 308300),
 ('this', 270423),
 ('that', 261431),
 ('"', 237416),
 ("'s", 221363),
 ('-', 188066),
 ('was', 180206),
 ('\n\n', 178828),
 ('as', 165606),
 ('with', 159370),
 ('for', 158626),
 ('movie', 157843),
 ('but', 150587),
 ('film', 144151),
 ('you', 124511)]

In [13]:
max_vocab=60000
min_freq=2

In [14]:
# Vocabulary: at most max_vocab of the most frequent tokens, filtered by frequency.
# NOTE(review): the comparison is strict (count > min_freq), so a token seen exactly
# min_freq times is dropped — confirm that this is intended.
itos = [word for word,count in freq.most_common(max_vocab) if count>min_freq]
# Reserve index 0 for the unknown-token entry and index 1 for the padding entry.
itos.insert(0, '_unk_')
itos.insert(1, '_pad_')
len(itos)

In [15]:
# Reverse lookup token -> index; a token missing from itos falls back to 0, the
# position where '_unk_' was inserted.
stoi = collections.defaultdict(lambda: 0, {v:k for k,v in enumerate(itos)})

In [93]:
def _numericalize(docs):
    """Map every tokenized document in ``docs`` to a list of vocabulary ids via ``stoi``.

    Returns a 1-D object array holding one id list per document. Filling a
    pre-allocated object array avoids asking NumPy to build an array from ragged
    nested lists, which recent NumPy releases reject, and keeps the result 1-D even
    if all documents happen to have the same length.
    """
    ids = np.empty(len(docs), dtype=object)
    for i, sentence in enumerate(docs):
        # call stoi for every word in the sentence
        ids[i] = [stoi[word] for word in sentence]
    return ids

trn_lm = _numericalize(tok_trn)
val_lm = _numericalize(tok_val)

In [94]:
np.save(lm_path/'tmp'/'trn_ids.npy', trn_lm)
np.save(lm_path/'tmp'/'val_ids.npy', val_lm)
# Close the pickle file deterministically so the vocabulary is fully written to disk
# before any later cell reads it back.
with open(lm_path/'tmp'/'itos.pkl', 'wb') as f:
    pickle.dump(itos, f)

In [4]:
# The id files hold one variable-length id list per document (an object array), so
# np.load needs allow_pickle=True. Only use this on files produced by this notebook.
trn_lm = np.load(lm_path/'tmp'/'trn_ids.npy', allow_pickle=True)
val_lm = np.load(lm_path/'tmp'/'val_ids.npy', allow_pickle=True)
# Read the vocabulary back and close the file handle straight away.
with open(lm_path/'tmp'/'itos.pkl', 'rb') as f:
    itos = pickle.load(f)

In [5]:
vs = len(itos)
vs, len(trn_lm)

(60002, 90000)

### load Wikitext103 pretrained model

In [6]:
em_sz,nh,nl = 400,1150,3

In [7]:
pre_path = PATH/'models'/'wt103'
pre_lm_path = pre_path/'fwd_wt103.h5'

In [8]:
wgts = torch.load(pre_lm_path, map_location=lambda storage, loc: storage )

In [24]:
enc_wgts = to_np(wgts['0.encoder.weight'])
row_mean = enc_wgts.mean(0)
enc_wgts.shape

(238462, 400)

In [10]:
# NOTE: unpickling can execute arbitrary code; only load itos_wt103.pkl from a source
# you trust.
with (pre_path/'itos_wt103.pkl').open('rb') as f:
    wiki_itos = pickle.load(f)
# Reverse lookup for the pretrained vocabulary; tokens it does not contain map to -1.
wiki_stoi = collections.defaultdict(lambda: -1, {v:k for k,v in enumerate(wiki_itos)})

In [25]:
# Build our embedding matrix row by row: reuse the pretrained wikitext103 vector when
# the token exists in the pretrained vocabulary, otherwise fall back to row_mean.
new_w = np.zeros((vs, em_sz), dtype=np.float32)
for row, token in enumerate(itos):
    src = wiki_stoi[token]
    if src >= 0:
        new_w[row] = enc_wgts[src]
    else:
        new_w[row] = row_mean

In [26]:
wgts['0.encoder.weight'] = T(new_w)
# weight tying
wgts['0.encoder_with_dropout.embed.weight'] = T(np.copy(new_w))
wgts['1.decoder.weight'] = T(np.copy(new_w))

### Train

In [27]:
wd = 1e-7
bptt = 70
bs = 52
opt_fn = partial(optim.Adam, betas=(0.8, 0.99))

In [28]:
trn_dl = LanguageModelLoader(np.concatenate(trn_lm), bs, bptt)
val_dl = LanguageModelLoader(np.concatenate(val_lm), bs, bptt)
md = LanguageModelData(PATH, 1, vs, trn_dl, val_dl, bs=bs, bptt=bptt)

In [29]:
drops = np.array([0.25, 0.1, 0.2, 0.02, 0.15])*0.7

In [30]:
learner = md.get_model(opt_fn, em_sz, nh, nl, 
    dropouti=drops[0], dropout=drops[1], wdrop=drops[2], dropoute=drops[3], dropouth=drops[4])

learner.metrics = [accuracy]
learner.freeze_to(-1)

In [31]:
learner.model.load_state_dict(wgts)

In [34]:
learner.lr_find()

A Jupyter Widget

  0%|          | 0/6874 [00:00<?, ?it/s]                       


Exception in thread Thread-4:
Traceback (most recent call last):
  File "/Users/adamschiller/anaconda3/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/Users/adamschiller/anaconda3/lib/python3.6/site-packages/tqdm/_tqdm.py", line 148, in run
    for instance in self.tqdm_cls._instances:
  File "/Users/adamschiller/anaconda3/lib/python3.6/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



  0%|          | 24/6874 [05:38<26:50:35, 14.11s/it, loss=5.58]


KeyboardInterrupt: 

In [32]:
lr=1e-3

In [33]:
learner.fit(lr/2, 1, wds=wd, use_clr=(32,2), cycle_len=1)

A Jupyter Widget

  0%|          | 24/6874 [05:22<25:32:40, 13.42s/it, loss=5.59]

KeyboardInterrupt: 

In [None]:
learner.save('lm_last_ft')

In [None]:
learner.unfreeze()

In [None]:
# `lrs` is never assigned anywhere in this notebook (NameError on a fresh kernel);
# train the unfrozen model with the learning rate `lr` defined earlier.
learner.fit(lr, 1, wds=wd, use_clr=(20,10), cycle_len=15)

In [None]:
learner.save('lm1')

In [None]:
learner.save_encoder('lm1_enc')