<a href="https://colab.research.google.com/github/bcollister01/course-nlp/blob/master/Ben_nn_vietnamese_bwd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Vietnamese ULMFiT from scratch (backwards)

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai import *
from fastai.text import *

In [None]:
# Batch size used for every DataBunch in this notebook; the learning rates
# below are rescaled relative to it.
bs = 128

In [None]:
# Language code and the corpus folder name derived from it.
lang = 'vi'
name = f'{lang}wiki'

# Corpus root under the fastai data directory; the documents live in `docs`.
data_path = Config.data_path()
path = data_path/name
dest = path/'docs'

# File stems for the backwards wiki model: [weights, vocab].
lm_fns = [f'{lang}_wt_bwd', f'{lang}_wt_vocab_bwd']

## Vietnamese Wikipedia model

In [None]:
# Language-model DataBunch built from the files under `dest`: a random 10%
# validation split (fixed seed) and `backwards=True` for the backwards model.
data = (
    TextList.from_folder(dest)
    .split_by_rand_pct(valid_pct=0.1, seed=42)
    .label_for_lm()
    .databunch(bs=bs, num_workers=1, backwards=True)
)

# Save the DataBunch so the next cell can reload it by name.
data.save(f'{lang}_databunch_bwd')

In [None]:
# Reload the DataBunch saved under the same name in the previous cell.
data = load_data(
    dest,
    f'{lang}_databunch_bwd',
    bs=bs,
    backwards=True,
)



In [None]:
# AWD_LSTM language model with no pretrained weights (pretrained=False),
# switched to half precision via to_fp16().
learn = language_model_learner(
    data,
    AWD_LSTM,
    drop_mult=0.5,
    pretrained=False,
).to_fp16()

In [None]:
# Base rate of 3e-3 scaled by batch size (bs/48).
lr = 3e-3 * (bs/48)

In [None]:
# Train the whole model for 10 epochs on the one-cycle schedule.
learn.unfreeze()
learn.fit_one_cycle(cyc_len=10, max_lr=lr, moms=(0.8, 0.7))

epoch,train_loss,valid_loss,accuracy,time
0,3.445849,3.424579,0.401327,32:56
1,3.420865,3.383994,0.402841,33:31
2,3.374694,3.330634,0.4078,33:26
3,3.273197,3.257108,0.416047,32:54
4,3.223044,3.200649,0.422695,32:56
5,3.134357,3.132859,0.430725,31:35
6,3.135637,3.05703,0.439737,31:41
7,3.080461,2.992323,0.447939,31:45
8,3.075036,2.943683,0.454494,31:39
9,2.947997,2.929258,0.4565,31:46


In [None]:
# Store the wiki model under <path>/models using the stems in `lm_fns`:
# weights first (converted to fp32, no optimizer state), then the vocab pickle.
mdl_path = path/'models'
mdl_path.mkdir(exist_ok=True)
learn.to_fp32().save(mdl_path/lm_fns[0], with_opt=False)
learn.data.vocab.save(mdl_path/f'{lm_fns[1]}.pkl')

## Vietnamese sentiment analysis

### Language model

In [None]:
# Load the labelled training set and the test set.
train_df = pd.read_csv(path/'train.csv')
test_df = pd.read_csv(path/'test.csv')

# Comments that were read as NaN are replaced by the literal string 'NA'.
for frame in (train_df, test_df):
    frame.loc[frame['comment'].isna(), 'comment'] = 'NA'

# Give the test set a placeholder label of 0 before combining the frames.
test_df['label'] = 0

# Combined text of both sets, used for language-model fine-tuning.
df = pd.concat([train_df, test_df])

In [None]:
# Language-model DataBunch over the `comment` column of the combined frame,
# with a random 10% validation split and backwards=True.
data_lm = (
    TextList.from_df(df, path, cols='comment')
    .split_by_rand_pct(valid_pct=0.1, seed=42)
    .label_for_lm()
    .databunch(bs=bs, num_workers=1, backwards=True)
)

# Learner initialised from the backwards wiki weights/vocab named in `lm_fns`;
# the AWD_LSTM config is the library default with n_hid set to 1152.
learn_lm = language_model_learner(
    data_lm,
    AWD_LSTM,
    config=dict(awd_lstm_lm_config, n_hid=1152),
    pretrained_fnames=lm_fns,
    drop_mult=1.0,
)

In [None]:
# Base rate of 1e-3 scaled by batch size (bs/48).
lr = 1e-3 * (bs/48)

In [None]:
# Two epochs at ten times the base rate, before the learner is unfrozen.
learn_lm.fit_one_cycle(cyc_len=2, max_lr=lr*10, moms=(0.8, 0.7))

epoch,train_loss,valid_loss,accuracy,time
0,4.797052,4.025901,0.323326,00:07
1,4.275975,3.91445,0.333719,00:06


In [None]:
# Unfreeze and train for eight epochs at the base rate.
learn_lm.unfreeze()
learn_lm.fit_one_cycle(cyc_len=8, max_lr=lr, moms=(0.8, 0.7))

epoch,train_loss,valid_loss,accuracy,time
0,3.99677,3.809489,0.346052,00:09
1,3.856959,3.664919,0.363239,00:09
2,3.726143,3.584303,0.369685,00:09
3,3.608569,3.53139,0.375307,00:09
4,3.514265,3.500826,0.379701,00:09
5,3.446292,3.486931,0.380859,00:09
6,3.392542,3.479732,0.38252,00:09
7,3.357502,3.47893,0.38252,00:09


In [None]:
# Save the fine-tuned language model, then its encoder on its own; the
# classifier section loads the encoder by this name.
learn_lm.save(f'{lang}fine_tuned_bwd')
learn_lm.save_encoder(f'{lang}fine_tuned_enc_bwd')

### Classifier

In [None]:
# Classification DataBunch over the labelled training comments. It reuses the
# language model's vocab, holds out a random 10% for validation, and reads the
# targets from the `label` column.
data_clas = (
    TextList.from_df(train_df, path, vocab=data_lm.vocab, cols='comment')
    .split_by_rand_pct(valid_pct=0.1, seed=42)
    .label_from_df(cols='label')
    .databunch(bs=bs, num_workers=1, backwards=True)
)

# Save the DataBunch so the next cell can reload it by name.
data_clas.save(f'{lang}_textlist_class_bwd')

In [None]:
# Reload the classification DataBunch saved in the previous cell.
data_clas = load_data(
    path,
    f'{lang}_textlist_class_bwd',
    bs=bs,
    num_workers=1,
    backwards=True,
)

In [None]:
from sklearn.metrics import f1_score

@np_func
def f1(inp, targ):
    """Return the F1 score of the argmax predictions in `inp` against `targ`.

    The predicted class is the argmax over the last axis of `inp`.
    `f1_score` is called with its default arguments.
    """
    predicted = np.argmax(inp, axis=-1)
    return f1_score(targ, predicted)

In [None]:
# Classifier on top of the fine-tuned backwards encoder.
#  - pretrained=False: the encoder is loaded explicitly on the next line, so
#    the library's stock pretrained weights are not needed; skipping them
#    avoids loading weights that are immediately replaced.
#  - n_hid=1152: same hidden size as the language model the encoder was saved
#    from, so the encoder weights match regardless of the library default.
learn_c = text_classifier_learner(
    data_clas,
    AWD_LSTM,
    config={**awd_lstm_clas_config, 'n_hid': 1152},
    pretrained=False,
    drop_mult=0.5,
    metrics=[accuracy, f1],
).to_fp16()

# Load the encoder saved after language-model fine-tuning, then freeze
# explicitly before the first training stage.
learn_c.load_encoder(f'{lang}fine_tuned_enc_bwd')
learn_c.freeze()

In [None]:
# Base rate of 2e-2 scaled by batch size (bs/48).
lr = 2e-2 * (bs/48)

In [None]:
# Stage 1: two epochs with the learner frozen (see the freeze() call above).
learn_c.fit_one_cycle(cyc_len=2, max_lr=lr, moms=(0.8, 0.7))

epoch,train_loss,valid_loss,accuracy,f1,time
0,0.3693,0.363769,0.834577,0.826098,00:03
1,0.328192,0.278986,0.874378,0.851747,00:02


In [None]:
# Stage 2: freeze up to layer group -2 and train two epochs, with learning
# rates spread from lr/2.6**4 up to lr.
learn_c.freeze_to(-2)
learn_c.fit_one_cycle(
    cyc_len=2,
    max_lr=slice(lr/(2.6**4), lr),
    moms=(0.8, 0.7),
)

epoch,train_loss,valid_loss,accuracy,f1,time
0,0.337875,0.306132,0.876866,0.860107,00:03
1,0.276982,0.23726,0.906095,0.886427,00:03


In [None]:
# Stage 3: freeze up to layer group -3 and train two epochs at half the
# previous peak rate.
learn_c.freeze_to(-3)
learn_c.fit_one_cycle(
    cyc_len=2,
    max_lr=slice(lr/2/(2.6**4), lr/2),
    moms=(0.8, 0.7),
)

epoch,train_loss,valid_loss,accuracy,f1,time
0,0.292297,0.252393,0.896144,0.877916,00:04
1,0.255284,0.213655,0.912313,0.892551,00:04


In [None]:
# Stage 4: unfreeze everything and train one epoch at a tenth of the base rate.
learn_c.unfreeze()
learn_c.fit_one_cycle(
    cyc_len=1,
    max_lr=slice(lr/10/(2.6**4), lr/10),
    moms=(0.8, 0.7),
)

epoch,train_loss,valid_loss,accuracy,f1,time
0,0.167376,0.266633,0.904851,0.885386,00:04


In [None]:
# Save the trained backwards classifier.
learn_c.save(f'{lang}clas_bwd')