In [1]:
from fastai.text import *
import html

In [2]:
PATH = Path('data/aclImdb')

In [3]:
# Output locations: one directory for the classifier CSVs, one for the language-model CSVs.
clas_path = Path('data/imdb_clas/')
lm_path = Path('data/imdb_lm/')

# Create both directories (and any missing parents); already-existing ones are left alone.
for out_dir in (clas_path, lm_path):
    os.makedirs(out_dir, exist_ok=True)

## Get data in standard format

In [4]:
# Sub-directory names inside each split; a review's label is the index of its
# directory in this list (0 = 'pos', 1 = 'neg', 2 = 'unsup').
classes = ['pos', 'neg', 'unsup']

def get_text(path):
    """Read every ``*.txt`` file found under ``path/<class>`` for each entry of ``classes``.

    Args:
        path: a ``Path`` to a split directory containing one sub-directory per class.

    Returns:
        ``(texts, labels)`` as two NumPy arrays of equal length: the file contents and,
        for each file, the index of its class in ``classes``.
    """
    texts,labels = [],[]
    for idx,label in enumerate(classes):
        # sorted() makes the read order independent of the filesystem's listing order
        for fname in sorted(Path(path/label).glob('*.txt')):
            # Explicit encoding: do not depend on the platform's default locale.
            # The context manager closes each handle instead of leaking one per file.
            with fname.open('r', encoding='utf-8') as f:
                texts.append(f.read())
            labels.append(idx)
    return np.array(texts), np.array(labels)

trn_texts, trn_labels = get_text(PATH/'train')
val_texts, val_labels = get_text(PATH/'test')

In [5]:
len(trn_texts), len(val_texts)

(75000, 25000)

### Shuffle data

In [6]:
# Seed NumPy's global RNG so the shuffle is the same on every Restart & Run All.
np.random.seed(42)
# One random ordering per split, sized to that split.
trn_idxs = np.random.permutation(len(trn_texts))
val_idxs = np.random.permutation(len(val_texts))

In [7]:
# Index texts and labels with the same permutation so each text keeps its own label.
trn_texts, trn_labels = trn_texts[trn_idxs], trn_labels[trn_idxs]
val_texts, val_labels = val_texts[val_idxs], val_labels[val_idxs]

### Save to csv

In [8]:
cols = ['labels','text']

#### Classifier

In [9]:
df_trn = pd.DataFrame({'text': trn_texts, 'labels': trn_labels}, columns=cols)
df_val = pd.DataFrame({'text': val_texts, 'labels': val_labels}, columns=cols)

In [10]:
# remove unsup labels for classifier
is_labelled = df_trn['labels'] != 2  # 2 is the position of 'unsup' in `classes`
df_trn[is_labelled].to_csv(clas_path/'train.csv', header=False, index=False)
# the test frame is written in full
df_val.to_csv(clas_path/'test.csv', header=False, index=False)

In [11]:
# save classes.txt: one class name per line, in label-index order.
# The context manager guarantees the file is flushed and closed.
with (clas_path/'classes.txt').open('w') as f:
    f.writelines(f'{o}\n' for o in classes)

#### Language Model  
The language model doesn't need any labels, so it can be trained on the combination of the train and test texts (including the unlabelled reviews). It only learns to predict the next word.

In [12]:
# Pool every text from both splits, then hold out 10% of the pool for LM validation.
all_texts = np.concatenate([trn_texts,val_texts])
trn_texts,val_texts = sklearn.model_selection.train_test_split(all_texts, test_size=0.1)
len(trn_texts), len(val_texts)

(90000, 10000)

In [13]:
# Every row gets the constant label 0, which keeps the same two-column
# layout (`cols`) as the classifier CSVs.
df_trn = pd.DataFrame({'text': trn_texts, 'labels': [0]*len(trn_texts)}, columns=cols)
df_val = pd.DataFrame({'text': val_texts, 'labels': [0]*len(val_texts)}, columns=cols)

# Written without header or index, so the files contain only label,text rows.
df_trn.to_csv(lm_path/'train.csv', header=False, index=False)
df_val.to_csv(lm_path/'test.csv', header=False, index=False)

## Language Model

### Tokenize

In [14]:
chunksize=24000

BOS = 'xbos'  # beginning-of-sentence tag
FLD = 'xfld'  # data field tag

In [15]:
# matches any run of two or more spaces
re1 = re.compile(r'  +')

def fixup(x):
    """Clean one raw text string.

    Applies a fixed list of literal substitutions in order (each one sees the
    output of the previous one), then ``html.unescape``, and finally collapses
    every run of two or more spaces into a single space.
    """
    substitutions = (
        ('#39;', "'"),
        ('amp;', '&'),
        ('#146;', "'"),
        ('nbsp;', ' '),
        ('#36;', '$'),
        ('\\n', "\n"),
        ('quot;', "'"),
        ('<br />', "\n"),
        ('\\"', '"'),
        ('<unk>', 'u_n'),
        (' @.@ ', '.'),
        (' @-@ ', '-'),
        ('\\', ' \\ '),
    )
    for old, new in substitutions:
        x = x.replace(old, new)
    unescaped = html.unescape(x)
    return re1.sub(' ', unescaped)

In [16]:
def get_texts(df, n_lbls=1):
    """Turn one dataframe chunk into tokenized documents plus their label rows.

    The first ``n_lbls`` columns are read as int64 labels; every column after
    them is treated as a text field. Each document starts with a newline and
    the ``BOS`` marker, and every field is preceded by ``FLD`` and a field number.

    Returns:
        ``(tokens, labels)``: the tokenizer output and a list with one label row per input row.
    """
    label_rows = df.iloc[:,range(n_lbls)].values.astype(np.int64)

    # The first text column opens the document.
    docs = f'\n{BOS} {FLD} 1 ' + df[n_lbls].astype(str)
    # NOTE(review): col-n_lbls is 1 for the second text column as well, so two
    # fields share number 1 — confirm whether numbering was meant to continue at 2.
    for col in range(n_lbls+1, len(df.columns)):
        docs += f' {FLD} {col-n_lbls} ' + df[col].astype(str)
    cleaned = docs.apply(fixup).values.astype(str)

    # partition_by_cores splits the texts into one sublist per CPU core;
    # proc_all_mp then tokenizes those sublists in parallel.
    core_batches = partition_by_cores(cleaned)
    tokens = Tokenizer().proc_all_mp(core_batches)
    return tokens, list(label_rows)

In [17]:
# iterates over the chunks of a chunked dataframe reader (see *chunksize*)
def get_all(df, n_lbls):
    """Tokenize every chunk yielded by ``df`` and concatenate the results.

    Prints the index of each chunk as it is processed.

    Returns:
        ``(tok, labels)``: flat lists accumulated across all chunks.
    """
    all_tok, all_labels = [], []
    for chunk_no, chunk in enumerate(df):
        print(chunk_no)
        chunk_tok, chunk_labels = get_texts(chunk, n_lbls)
        all_tok.extend(chunk_tok)
        all_labels.extend(chunk_labels)
    return all_tok, all_labels

In [18]:
# read csv one chunk at a time
df_trn = pd.read_csv(lm_path/'train.csv', header=None, chunksize=chunksize)
df_val = pd.read_csv(lm_path/'test.csv', header=None, chunksize=chunksize)

In [19]:
tok_trn, trn_labels = get_all(df_trn, 1)
tok_val, val_labels = get_all(df_val, 1)

0
1
2
3
0


In [20]:
' '.join(tok_trn[0])

'\n xbos xfld 1 i thought that i was never going to find a horror movie as bad as " the return of the texas chainsaw massacre " , but this film compete with it . \n\n i´m not a person that get asleep when watching a movie , but i did it 15 minutes after the trance started . i woke up , and started to watching it agian . why did i deserve that ? all the movie was a torture , i have to use fast forward to watch it complete . \n\n i can´t stand why one of my favourites actors of all time ( mr . t_up walken ) could done this thing . i have to think that he made the director a favor , or he was really in the need of money , because film after film he is doing , he is ruining himself ; and so fast ... \n\n what about the movie ? it´s not scary , stupid plot , characters are awful ( but i really liked the one played by jared harris ) , effects are very poor , lack of deaths & blood , etc ; in three words , it has anything . and i mean it . can´t stand how a director can make a film like this 

In [21]:
# Cache the tokenized corpora so tokenization does not have to be repeated.
lm_tmp = lm_path/'tmp'
lm_tmp.mkdir(exist_ok=True)

np.save(lm_tmp/'tok_trn.npy', tok_trn)
np.save(lm_tmp/'tok_val.npy', tok_val)

In [11]:
# tok_trn = np.load(lm_path/'tmp'/'tok_trn.npy')
# tok_val = np.load(lm_path/'tmp'/'tok_val.npy')

### Numericalize

In [22]:
# Count how often each token occurs across all training documents.
freq = Counter()
for doc in tok_trn:
    freq.update(doc)
freq.most_common(25)

[('the', 1209039),
 ('.', 992282),
 (',', 986142),
 ('and', 587765),
 ('a', 583323),
 ('of', 524421),
 ('to', 485465),
 ('is', 393549),
 ('it', 341450),
 ('in', 337760),
 ('i', 308257),
 ('this', 270443),
 ('that', 261140),
 ('"', 237524),
 ("'s", 221377),
 ('-', 188391),
 ('was', 180478),
 ('\n\n', 178728),
 ('as', 165965),
 ('with', 159289),
 ('for', 158912),
 ('movie', 157674),
 ('but', 150343),
 ('film', 144112),
 ('you', 123964)]

In [23]:
max_vocab=60000
min_freq=2

In [24]:
# Take the `max_vocab` most frequent tokens and keep those whose count is strictly
# greater than `min_freq`; ids 0 and 1 are reserved for the two special tokens.
kept_words = [word for word,count in freq.most_common(max_vocab) if count>min_freq]
itos = ['_unk_', '_pad_'] + kept_words
len(itos)

60002

In [25]:
stoi = collections.defaultdict(lambda: 0, {v:k for k,v in enumerate(itos)})

In [26]:
def _to_ids(docs):
    """Map every token of every document to its id via `stoi` (unknown tokens get 0)."""
    return np.array([[stoi[token] for token in doc] for doc in docs])

trn_lm = _to_ids(tok_trn)
val_lm = _to_ids(tok_val)

In [27]:
# Persist the numericalized corpora and the id -> token list that produced them.
np.save(lm_path/'tmp'/'trn_ids.npy', trn_lm)
np.save(lm_path/'tmp'/'val_ids.npy', val_lm)
# Context manager: the pickle file is flushed and closed as soon as the dump finishes.
with open(lm_path/'tmp'/'itos.pkl', 'wb') as f:
    pickle.dump(itos, f)

In [4]:
# Reload the cached ids and vocabulary (lets the notebook resume from this point).
trn_lm = np.load(lm_path/'tmp'/'trn_ids.npy')
val_lm = np.load(lm_path/'tmp'/'val_ids.npy')
# Context manager: close the handle once the vocabulary has been read.
with open(lm_path/'tmp'/'itos.pkl', 'rb') as f:
    itos = pickle.load(f)

In [5]:
vs = len(itos)
vs, len(trn_lm)

(60002, 90000)

### Load the WikiText-103 pretrained model

In [6]:
em_sz,nh,nl = 400,1150,3

In [7]:
pre_path = PATH/'models'/'wt103'
pre_lm_path = pre_path/'fwd_wt103.h5'

In [8]:
wgts = torch.load(pre_lm_path, map_location=lambda storage, loc: storage )

In [9]:
enc_wgts = to_np(wgts['0.encoder.weight'])
row_mean = enc_wgts.mean(0)
enc_wgts.shape

(238462, 400)

In [10]:
# Vocabulary shipped with the pretrained weights.
# NOTE: unpickling can execute arbitrary code — only load this file from a trusted source.
with (pre_path/'itos_wt103.pkl').open('rb') as f:
    wiki_itos = pickle.load(f)
# token -> pretrained row index; tokens absent from the pretrained vocabulary map to -1
wiki_stoi = collections.defaultdict(lambda: -1, {v:k for k,v in enumerate(wiki_itos)})

In [11]:
# map weights from pretrained wikitext103 model onto our itos:
# start every row at the mean embedding, then overwrite the rows whose token
# also exists in the pretrained vocabulary
new_w = np.empty((vs, em_sz), dtype=np.float32)
new_w[:] = row_mean
for row, token in enumerate(itos):
    src_row = wiki_stoi[token]
    if src_row >= 0:
        new_w[row] = enc_wgts[src_row]

In [12]:
# Replace the pretrained embedding matrix with the one remapped to our vocabulary.
wgts['0.encoder.weight'] = T(new_w)
# weight tying: the `encoder_with_dropout.embed` and `decoder` entries each receive
# their own copy (np.copy) of the same matrix values
wgts['0.encoder_with_dropout.embed.weight'] = T(np.copy(new_w))
wgts['1.decoder.weight'] = T(np.copy(new_w))

In [13]:
torch.save(wgts, pre_path/'pretrained.h5')

In [8]:
# Map every tensor onto CPU storage while loading, matching how the original
# checkpoint was loaded earlier in this notebook; this keeps the reload independent
# of the device the checkpoint was saved from.
wgts = torch.load(pre_path/'pretrained.h5', map_location=lambda storage, loc: storage)

### Train

In [9]:
wd = 1e-7
bptt = 70
bs = 30
opt_fn = partial(optim.Adam, betas=(0.8, 0.99))

In [10]:
# Each split's per-document id lists are concatenated into one long id stream
# before being handed to the loader together with the batch size and bptt.
trn_dl = LanguageModelLoader(np.concatenate(trn_lm), bs, bptt)
val_dl = LanguageModelLoader(np.concatenate(val_lm), bs, bptt)
# NOTE(review): the literal 1 is presumably the padding index (itos[1] is '_pad_') —
# confirm against LanguageModelData's signature.
md = LanguageModelData(PATH, 1, vs, trn_dl, val_dl, bs=bs, bptt=bptt)

In [11]:
drops = np.array([0.25, 0.1, 0.2, 0.02, 0.15])*0.7

In [12]:
# Build the learner; each dropout rate is taken from the scaled `drops` array.
learner = md.get_model(opt_fn, em_sz, nh, nl, 
    dropouti=drops[0], dropout=drops[1], wdrop=drops[2], dropoute=drops[3], dropouth=drops[4])

learner.metrics = [accuracy]
# NOTE(review): presumably leaves only the last layer group trainable — confirm
# against fastai's freeze_to.
learner.freeze_to(-1)

In [13]:
learner.model.load_state_dict(wgts)

In [15]:
# http://forums.fast.ai/t/pytorch-internal-error-while-doing-the-imdb-notebook/16828
# need to use this when training only one layer:  learner.freeze_to(-1)
torch.backends.cudnn.enabled = False

In [16]:
learner.lr_find()

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

  2%|▏         | 294/11905 [02:01<1:19:46,  2.43it/s, loss=5.5] 

KeyboardInterrupt: 

In [14]:
lr=1e-3

In [18]:
learner.fit(lr/2, 1, wds=wd, use_clr=(32,2), cycle_len=1)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                         
    0      4.72632    4.468971   0.257368  


[4.468971221108451, 0.25736787981632253]

In [19]:
learner.save('lm_last_ft')

In [15]:
learner.load('lm_last_ft')

In [16]:
learner.unfreeze()

In [18]:
learner.fit(lr, 1, wds=wd, use_clr=(20,10), cycle_len=10)

HBox(children=(IntProgress(value=0, description='Epoch', max=10), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                         
    0      4.396989   4.213354   0.282952  
  0%|          | 12/11905 [00:05<1:25:33,  2.32it/s, loss=4.41]

KeyboardInterrupt: 

In [19]:
learner.save('lm1')

In [20]:
learner.save_encoder('lm1_enc')

## Classifier

### Tokenize

In [None]:
# read csv one chunk at a time
df_trn = pd.read_csv(clas_path/'train.csv', header=None, chunksize=chunksize)
df_val = pd.read_csv(clas_path/'test.csv', header=None, chunksize=chunksize)

In [None]:
tok_trn, trn_labels = get_all(df_trn, 1)
tok_val, val_labels = get_all(df_val, 1)

In [None]:
# Cache the classifier's tokenized texts and labels next to the classifier CSVs.
clas_tmp = clas_path/'tmp'
clas_tmp.mkdir(exist_ok=True)

for fname, payload in (
    ('tok_trn.npy', tok_trn),
    ('tok_val.npy', tok_val),
    ('trn_labels.npy', trn_labels),
    ('val_labels.npy', val_labels),
):
    np.save(clas_tmp/fname, payload)