In [7]:
"""
Train a classifier on top of a language model trained with `pretrain_lm.py`.
Optionally fine-tune LM before.
"""
import numpy as np
import pickle

import torch
import torch.nn.functional as F
from fastai.text import TextLMDataBunch, TextClasDataBunch, language_model_learner, text_classifier_learner, LanguageLearner
from fastai import fit_one_cycle
from fastai_contrib.utils import PAD, UNK, read_clas_data, PAD_TOKEN_ID, DATASETS, TRN, VAL, TST, ensure_paths_exists, get_sentencepiece
from fastai.text.transform import Vocab
from fastai.metrics import accuracy, accuracy_thresh, fbeta

import fire
from collections import Counter
from pathlib import Path

import pandas as pd
import csv
from functools import partial
from sklearn.metrics import f1_score, precision_score, recall_score
import mlflow

In [8]:
%load_ext autoreload
%autoreload 2

# NTCIR-13 MedWeb
http://research.nii.ac.jp/ntcir/permission/ntcir-13/perm-ja-MedWeb.html

## copy data

In [9]:
!ls /home/ubuntu/dev/download/data/MedWeb_TestCollection/

NTCIR-13_MedWeb_en_test.xlsx	  NTCIR-13_MedWeb_ja_training.xlsx  readme.txt
NTCIR-13_MedWeb_en_training.xlsx  NTCIR-13_MedWeb_zh_test.xlsx
NTCIR-13_MedWeb_ja_test.xlsx	  NTCIR-13_MedWeb_zh_training.xlsx


In [10]:
!cat /home/ubuntu/dev/download/data/MedWeb_TestCollection/readme.txt

NTCIR-13 MedWeb task: Test set

Training data
-MedWeb Japanese subtask train data: NTCIR-13_MedWeb_ja_training.xlsx
-MedWeb English subtask train data: NTCIR-13_MedWeb_en_training.xlsx
-MedWeb Chinese subtask train data: NTCIR-13_MedWeb_zh_training.xlsx

Test data
-MedWeb Japanese subtask test data: NTCIR-13_MedWeb_ja_test.xlsx
-MedWeb English subtask test data: NTCIR-13_MedWeb_en_test.xlsx
-MedWeb Chinese subtask test data: NTCIR-13_MedWeb_zh_test.xlsx


Columns in each sheet are as follows. 
ID: Pseudo tweet ID that corresponds to the corpora of other language (e.g., the tweet of ``1en'' corresponds to the tweets of ``1ja'' and ``1zh'') 
Tweet: Pseudo tweet message 
Influenza: P (Positive) / N (Negative)
Diarrhea: P (Positive) / N (Negative)
Hayfever: P (Positive) / N (Negative)
Cough: P (Positive) / N (Negative)
Headache: P (Positive) / N (Negative)
Fever: P (Positive) / N (Negative)
Runnynose: P (Positive) / N (Negative)
Cold: P (Positive) / N (Negative)

F

In [11]:
!mkdir -p data/MedWeb

In [12]:
!cp /home/ubuntu/dev/download/data/MedWeb_TestCollection/*ja* data/MedWeb/

In [13]:
!ls data/MedWeb/

lbl_names.pkl  NTCIR-13_MedWeb_ja_test.xlsx	 tmp	    valid_ch.txt
models	       NTCIR-13_MedWeb_ja_training.xlsx  train.csv  valid.csv


## read data

### train

In [14]:
trn = pd.read_excel('data/MedWeb/NTCIR-13_MedWeb_ja_training.xlsx',sheet_name='ja_train')

In [15]:
trn.head(100)

Unnamed: 0,ID,Tweet,Influenza,Diarrhea,Hayfever,Cough,Headache,Fever,Runnynose,Cold
0,1ja,風邪を引くと全身がだるくなる。,n,n,n,n,n,n,n,p
1,2ja,花粉症の症状が出てきたのは久し振りだ。,n,n,p,n,n,n,p,n
2,3ja,花粉症のせいでずっと微熱でぼーっとしてる。眠い。,n,n,p,n,n,p,p,n
3,4ja,薬飲んだけど鼻水おさまる気配なし,n,n,n,n,n,n,p,n
4,5ja,ネパールに旅行に行った際に下痢になって大変だったよ。,n,n,n,n,n,n,n,n
5,6ja,咳くらいで休むのはゆとりだけだろ。どんなときでも仕事にでるのは大事だ。,n,n,n,p,n,n,n,n
6,7ja,鼻づまりで今日は休むわー,n,n,n,n,n,n,p,n
7,8ja,まさか花粉症になるとは。,n,n,p,n,n,n,p,n
8,9ja,熱は出てるけどお腹に来る風邪じゃなさそう。,n,n,n,n,n,p,n,p
9,10ja,痰に血が混じってきもい,n,n,n,p,n,n,n,n


In [16]:
# name of labels
lbl_names = trn.columns[2:]
lbl_names

Index(['Influenza', 'Diarrhea', 'Hayfever', 'Cough', 'Headache', 'Fever',
       'Runnynose', 'Cold'],
      dtype='object')

In [17]:
# Save the label names to disk. The `with` block closes the file handle as soon
# as the dump finishes instead of leaving an open handle behind.
with open('data/MedWeb/lbl_names.pkl', 'wb') as lbl_file:
    pickle.dump(lbl_names, lbl_file)

3列目以降がpの列名を列挙して、ラベルとする

trn_lbl = ['_'.join(row) for row in ((trn.iloc[:,2:] == 'p')*1).values.astype(str)]

trn_lbl = [','.join(row) for row in trn.columns[2:] + '_' +trn.iloc[:,2:]]

for i, row in trn.iterrows():
    print(','.join(lbl_names[row[2:]=='p']))
    if i > 5:
        break

trn_lbl = []
for i, row in trn.iterrows():
    trn_lbl.append(lbl_idx[(row[2:]=='p').values])

In [18]:
lbl_idx = np.asarray([i for i in range(len(lbl_names))])

In [19]:
lbl_idx

array([0, 1, 2, 3, 4, 5, 6, 7])

In [20]:
# For every training row, collect the names of the label columns whose value is 'p'
# (row[2:] skips the first two columns, ID and Tweet).
trn_lbl = [
    lbl_names[(tweet_row[2:]=='p')].tolist()
    for _, tweet_row in trn.iterrows()
]

In [21]:
trn_lbl[:5]

[['Cold'],
 ['Hayfever', 'Runnynose'],
 ['Hayfever', 'Fever', 'Runnynose'],
 ['Runnynose'],
 []]

In [22]:
# put data into df and save
trn_df = pd.DataFrame({'text':trn['Tweet'].values, 'labels':trn_lbl},columns=['labels', 'text'])
trn_df.to_csv('data/MedWeb/train.csv', header=False, index=False)


In [23]:
trn_df

Unnamed: 0,labels,text
0,[Cold],風邪を引くと全身がだるくなる。
1,"[Hayfever, Runnynose]",花粉症の症状が出てきたのは久し振りだ。
2,"[Hayfever, Fever, Runnynose]",花粉症のせいでずっと微熱でぼーっとしてる。眠い。
3,[Runnynose],薬飲んだけど鼻水おさまる気配なし
4,[],ネパールに旅行に行った際に下痢になって大変だったよ。
5,[Cough],咳くらいで休むのはゆとりだけだろ。どんなときでも仕事にでるのは大事だ。
6,[Runnynose],鼻づまりで今日は休むわー
7,"[Hayfever, Runnynose]",まさか花粉症になるとは。
8,"[Fever, Cold]",熱は出てるけどお腹に来る風邪じゃなさそう。
9,[Cough],痰に血が混じってきもい


### test

In [24]:
tst = pd.read_excel('data/MedWeb/NTCIR-13_MedWeb_ja_test.xlsx',sheet_name='ja_test')

In [25]:
tst.head(100)

Unnamed: 0,ID,Tweet,Influenza,Diarrhea,Hayfever,Cough,Headache,Fever,Runnynose,Cold
0,1921ja,旅行に行ったら、土産にインフルもらってきた。,p,n,n,n,n,p,n,n
1,1922ja,きつい上司、頭痛の種,n,n,n,n,n,n,n,n
2,1923ja,もう誰か翻訳してくれないときつい、宇宙の言葉ですかってくらい通じなくて頭痛してきた。,n,n,n,n,n,n,n,n
3,1924ja,インフル感染の危機。,p,n,n,n,n,p,n,n
4,1925ja,鼻づまりがひどいからスピーチは無理だ,n,n,n,n,n,n,p,n
5,1926ja,頭痛がしているので帰ることにする。,n,n,n,n,p,n,n,n
6,1927ja,インフルエンザになって部活のみんなから爆笑されたよ,p,n,n,n,n,p,n,n
7,1928ja,下痢って英語でなんていうんだろう。,n,n,n,n,n,n,n,n
8,1929ja,スペイン風邪、香港風邪っていうけど、日本風邪ってあるの？,n,n,n,n,n,n,n,n
9,1930ja,犬って鼻づまりとかするのかな？,n,n,n,n,n,n,n,n


In [26]:
# For every test row, collect the names of the label columns whose value is 'p'
# (row[2:] skips the first two columns, ID and Tweet).
tst_lbl = [
    lbl_names[(tweet_row[2:]=='p')].tolist()
    for _, tweet_row in tst.iterrows()
]

In [27]:
tst_lbl[:5]

[['Influenza', 'Fever'], [], [], ['Influenza', 'Fever'], ['Runnynose']]

In [28]:
# name of labels
lbl_names = tst.columns[2:]
lbl_names

Index(['Influenza', 'Diarrhea', 'Hayfever', 'Cough', 'Headache', 'Fever',
       'Runnynose', 'Cold'],
      dtype='object')

In [29]:
# put data into df and save
tst_df = pd.DataFrame({'text':tst['Tweet'].values, 'labels':tst_lbl},columns=['labels', 'text'])
# change name from test to valid (fastai lm expects valid)
tst_df.to_csv('data/MedWeb/valid.csv', header=False, index=False)


In [30]:
tst_df.head()

Unnamed: 0,labels,text
0,"[Influenza, Fever]",旅行に行ったら、土産にインフルもらってきた。
1,[],きつい上司、頭痛の種
2,[],もう誰か翻訳してくれないときつい、宇宙の言葉ですかってくらい通じなくて頭痛してきた。
3,"[Influenza, Fever]",インフル感染の危機。
4,[Runnynose],鼻づまりがひどいからスピーチは無理だ


## prepare data for lm fine tuning and classification
The steps below are largely copied from `train_cls.py`.

In [31]:
# Root data directory; a later cell asserts that its final path component is 'data'.
data_dir='data'
# Language tag; in this notebook it is only used in a status message.
lang='ja' 
# GPU index handed to torch.cuda.set_device in a later cell.
cuda_id=0 
# Suffix used to build the pretrained LM file names
# ('<qrnn|lstm>_<pretrain_name>' and 'itos_<pretrain_name>').
pretrain_name='wt-100' 
# Directory holding the model files; its parent/name are passed to the learners as path/model_dir.
model_dir='data/wiki/ja-100/models'
# Passed as vocab_size to get_sentencepiece.
max_vocab=16000
# Suffix of the saved classifier's file name ('<model_name>_<name>').
name='MedWeb-clas'
# Sub-directory of data_dir that holds this dataset's files.
dataset='MedWeb' 
# Fraction of training rows to sample; assigned again in the parameters cell further down.
frac_ds=1.0
# Directory passed to get_sentencepiece when loading the trained SentencePiece model.
spm_dir = 'data/wiki/ja/'

In [33]:
data_dir = Path(data_dir)
assert data_dir.name == 'data',\
    f'Error: Name of data directory should be data, not {data_dir.name}.'
dataset_dir = data_dir / dataset
model_dir = Path(model_dir)


In [34]:
if not torch.cuda.is_available():
    print('CUDA not available. Setting device=-1.')
    cuda_id = -1
torch.cuda.set_device(cuda_id)

print(f'Dataset: {dataset}. Language: {lang}.')

Dataset: MedWeb. Language: ja.


In [35]:
# here we're just loading the trained spm model
sp = get_sentencepiece(spm_dir, None, 'wt-all', vocab_size=max_vocab)

In [36]:
# load train, valid in df
train_df = pd.read_csv(dataset_dir/'train.csv',header=None)
valid_df = pd.read_csv(dataset_dir/'valid.csv',header=None)

In [37]:
train_df.head()

Unnamed: 0,0,1
0,['Cold'],風邪を引くと全身がだるくなる。
1,"['Hayfever', 'Runnynose']",花粉症の症状が出てきたのは久し振りだ。
2,"['Hayfever', 'Fever', 'Runnynose']",花粉症のせいでずっと微熱でぼーっとしてる。眠い。
3,['Runnynose'],薬飲んだけど鼻水おさまる気配なし
4,[],ネパールに旅行に行った際に下痢になって大変だったよ。


In [38]:
def strip_brackets(s):
    """Convert the string form of a label list into a comma-separated string.

    Example: "['Hayfever', 'Runnynose']" -> "Hayfever,Runnynose"; "[]" -> "".

    Leading/trailing '[' and ']' characters are stripped, then every single
    quote and every space is removed (including any inside a label).
    """
    cleaned = s.strip('[]')
    for unwanted in ("'", " "):
        cleaned = cleaned.replace(unwanted, "")
    return cleaned

In [39]:
train_df[0] = train_df[0].apply(strip_brackets)

In [40]:
valid_df[0] = valid_df[0].apply(strip_brackets)

In [41]:
train_df.head()

Unnamed: 0,0,1
0,Cold,風邪を引くと全身がだるくなる。
1,"Hayfever,Runnynose",花粉症の症状が出てきたのは久し振りだ。
2,"Hayfever,Fever,Runnynose",花粉症のせいでずっと微熱でぼーっとしてる。眠い。
3,Runnynose,薬飲んだけど鼻水おさまる気配なし
4,,ネパールに旅行に行った際に下痢になって大変だったよ。


### lm data

In [42]:
# set label_delim for multilabel (MultiCategory) data
data_lm = TextLMDataBunch.from_df(path=dataset_dir,train_df=train_df,valid_df=valid_df,label_delim=',',**sp)

### classification data

In [43]:
# set label_delim for multilabel (MultiCategory) data
data_clas = TextClasDataBunch.from_df(path=dataset_dir,train_df=train_df,valid_df=valid_df,label_delim=',',**sp)

In [44]:
data_clas.valid_ds.y.c2i

{'Cold': 0,
 'Cough': 1,
 'Diarrhea': 2,
 'Fever': 3,
 'Hayfever': 4,
 'Headache': 5,
 'Influenza': 6,
 'Runnynose': 7}

# LM fine tuning & Classification

In [363]:
def lm_fine_tuning( data_lm, lm_lr, bptt, emb_sz, nh, nl, qrnn,
                pad_token,
                pretrained_fnames, 
                model_dir,
                lm_enc_finetuned,
                lm_drop_mult, use_pretrained_lm,fine_tune_lm):
    """Build a language-model learner, optionally fine-tune it, and save its encoder.

    Args:
        data_lm: data object passed as the first argument to `language_model_learner`.
        lm_lr: upper learning rate; fine-tuning uses slice(lm_lr/100, lm_lr).
        bptt, emb_sz, nh, nl, qrnn, pad_token: forwarded unchanged to
            `language_model_learner`.
        pretrained_fnames: forwarded as `pretrained_fnames` only when
            `use_pretrained_lm` is truthy; otherwise None is forwarded.
        model_dir: object with `.parent` and `.name` (a pathlib.Path in this
            notebook); they become the learner's `path` and `model_dir`.
        lm_enc_finetuned: name handed to `learn.save_encoder`.
        lm_drop_mult: forwarded as `drop_mult`.
        use_pretrained_lm: whether to pass `pretrained_fnames` on.
        fine_tune_lm: when truthy, unfreeze and fit for 10 epochs.

    Returns:
        The learner. Its encoder is saved whether or not fine-tuning ran.
    """
    # Pretrained file names are only handed over when explicitly requested.
    weights_to_load = pretrained_fnames if use_pretrained_lm else None

    learner_kwargs = dict(
        bptt=bptt, emb_sz=emb_sz, nh=nh, nl=nl, qrnn=qrnn,
        pad_token=pad_token,
        pretrained_fnames=weights_to_load,
        path=model_dir.parent, model_dir=model_dir.name,
        drop_mult=lm_drop_mult,
    )
    learn = language_model_learner(data_lm, **learner_kwargs)

    if not fine_tune_lm:
        print('Skipping fine tuning')
    else:
        print('Fine-tuning the language model...')
        learn.unfreeze()
        lr_range = slice(lm_lr/(10**2), lm_lr)
        learn.fit(10, lr_range)

    # The encoder is written in both cases so that a later step can load it by name.
    print(f"Saving models at {learn.path / learn.model_dir}")
    learn.save_encoder(lm_enc_finetuned)

    return learn

In [364]:
def classify_multilabel(data_clas, clas_lr, bptt, 
                        pad_token,
                        model_dir,
                        qrnn, emb_sz, nh, nl,clas_drop_mult,
                        fine_tune_lm, use_lm,cls_weights):
    """Train a multi-label text classifier and return validation targets and predictions.

    Builds a `text_classifier_learner`, optionally loads a saved encoder, trains
    with gradual unfreezing (last group, last two, last three, then everything),
    prints validation metrics, saves the classifier and predicts via
    `learn.get_preds()`.

    NOTE: besides its arguments, this function reads three module-level names
    that must exist before it is called: `lm_enc_finetuned` (encoder name to
    load when `use_lm` is truthy) and `model_name` / `name` (joined as
    '<model_name>_<name>' for the saved classifier).

    Args:
        data_clas: data object passed to `text_classifier_learner`; its
            `train_ds.y.classes` gives the class order used for `cls_weights`.
        clas_lr: peak learning rate; once more than the last layer group is
            trainable, slice(clas_lr / 2.6**4, clas_lr) is used.
        bptt, pad_token, qrnn, emb_sz, nh, nl: forwarded unchanged to
            `text_classifier_learner`.
        model_dir: object with `.parent` and `.name` (a pathlib.Path in this
            notebook); they become the learner's `path` and `model_dir`.
        clas_drop_mult: forwarded as `drop_mult`.
        fine_tune_lm: not used by this function; kept so existing calls still work.
        use_lm: when truthy, load the encoder named by `lm_enc_finetuned`.
        cls_weights: None to keep the learner's default loss, or an object
            indexable by a list of class names and exposing `.values`
            (e.g. a pandas Series) holding per-class positive weights.

    Returns:
        (y_true, y_pred): numpy arrays built from the second and first element
        of `learn.get_preds()`; `y_pred` is 1.0 where the prediction is >= 0.5
        and 0.0 elsewhere.
    """
    # Multi-label metrics: thresholded accuracy and F-beta (beta=1), both with sigmoid=True.
    accuracy_multi =  partial(accuracy_thresh,thresh=0.5,sigmoid=True)
    f1 = partial(fbeta,thresh=0.5,beta=1,sigmoid=True) # corresponds to f1 score in sklearn with 'samples' option?

    print("Starting classifier training")
    learn = text_classifier_learner(data_clas, bptt=bptt, pad_token=pad_token,
                                  path=model_dir.parent, model_dir=model_dir.name,
                                  qrnn=qrnn, emb_sz=emb_sz, nh=nh, nl=nl,drop_mult=clas_drop_mult
                                    )
    learn.model.reset()

    if use_lm:
        print('Loading language model')
        # `lm_enc_finetuned` is a module-level name, not a parameter of this function.
        lm_enc = lm_enc_finetuned
        learn.load_encoder(lm_enc)
    else:
        print('Training from scratch without language model')

    # Replace the learner's metrics with the multi-label ones.
    learn.metrics = [accuracy_multi,f1]

    # CRITICAL STEP
    # - the pos_weight of the loss function needs adjusting to get the model working
    if cls_weights is not None:
        # Re-order the weights to the learner's class order and create the tensor on
        # the model's own device, so this also works when no GPU is available.
        device = next(learn.model.parameters()).device
        pos_weight = torch.tensor(cls_weights[data_clas.train_ds.y.classes].values,
                                  dtype=torch.float, device=device)
        learn.loss_func = partial(F.binary_cross_entropy_with_logits, pos_weight=pos_weight)

    # Gradual unfreezing: one cycle per stage, two cycles once everything is trainable.
    learn.freeze_to(-1)
    learn.fit_one_cycle(1, clas_lr, moms=(0.8, 0.7), wd=1e-7)

    learn.freeze_to(-2)
    learn.fit_one_cycle(1, slice(clas_lr / (2.6 ** 4), clas_lr), moms=(0.8, 0.7), wd=1e-7)

    learn.freeze_to(-3)
    learn.fit_one_cycle(1, slice(clas_lr / (2.6 ** 4), clas_lr), moms=(0.8, 0.7), wd=1e-7)

    learn.unfreeze()
    learn.fit_one_cycle(2, slice(clas_lr / (2.6 ** 4), clas_lr), moms=(0.8, 0.7), wd=1e-7)

    # Validate once and read both metrics from the same result
    # (index 0 is skipped; 1 and 2 follow the order of learn.metrics).
    val_scores = learn.validate()
    results={}
    results['accuracy_multi'] = val_scores[1]
    results['F1'] = val_scores[2]
    print(results)

    # Save the classifier; `model_name` and `name` are module-level names.
    print(f"Saving models at {learn.path / learn.model_dir}")
    learn.save(f'{model_name}_{name}')

    # Predictions and targets from learn.get_preds().
    preds=learn.get_preds()

    # NOTE(review): the 0.5 cut-off assumes get_preds() returns probabilities
    # rather than raw logits — confirm for the installed fastai version.
    p = np.asarray(preds[0].tolist())
    y_pred = (p>=0.5)*1.0

    y_true = np.asarray(preds[1].tolist())

    return y_true, y_pred

In [365]:
def evaluation(y_true,y_pred):
    """Print and return micro/macro averaged F1, precision and recall.

    Args:
        y_true: targets, passed unchanged to the scoring functions.
        y_pred: predictions, passed unchanged to the scoring functions.

    Returns:
        dict with the keys f1_micro, f1_macro, precision_micro, precision_macro,
        recall_micro and recall_macro (in that order).
    """
    # (result-key prefix, printed heading, scoring function)
    metric_table = (
        ('f1', 'F1 score', f1_score),
        ('precision', '\nPrecision score', precision_score),
        ('recall', '\nRecall score', recall_score),
    )

    results = {}
    for prefix, heading, scorer in metric_table:
        micro = scorer(y_true, y_pred, average='micro')
        macro = scorer(y_true, y_pred, average='macro')

        print(heading)
        print('micro: {}'.format(micro))
        print('macro: {}'.format(macro))

        results[f'{prefix}_micro'] = micro
        results[f'{prefix}_macro'] = macro

    return results

###  Parameters

In [476]:
# Settings shared by LM fine-tuning and classification.
qrnn=True
# NOTE(review): `bs` is logged to mlflow but is not passed to any DataBunch or
# learner call in this notebook — confirm whether it is meant to take effect.
bs=20 
bptt=70
pad_token=PAD_TOKEN_ID

# Model size: the two branches differ only in the hidden size `nh`.
if qrnn:
    emb_sz, nh, nl = 400, 1550, 3
else:
    emb_sz, nh, nl = 400, 1150, 3
    
# LM fine-tuning settings.
fine_tune_lm=True 
lm_lr=2e-2
lm_drop_mult=0.5
use_pretrained_lm=False

# Classifier settings.
clas_lr=2e-2
clas_drop_mult=0.1
use_lm=True

# Fraction of the training rows to sample (1.0 keeps all rows); this re-assigns
# the `frac_ds` already set in the earlier configuration cell.
frac_ds = 1.0

In [477]:
# class weights for classification loss function
# cls_weights = trn.shape[0]/(trn.iloc[:,2:]=='p').sum()/5
cls_weights = pd.Series(np.ones(8,)*1.,index=trn.columns[2:])
cls_weights

Influenza    1.0
Diarrhea     1.0
Hayfever     1.0
Cough        1.0
Headache     1.0
Fever        1.0
Runnynose    1.0
Cold         1.0
dtype: float64

In [478]:
# Create Params dictionary
class Params(object):
    """Plain container for one run's hyper-parameters.

    Every constructor argument is stored under an attribute of the same name,
    so `vars(instance)` yields one entry per parameter. `cls_weights` is stored
    as the plain list returned by `cls_weights.values.tolist()`.
    """
    def __init__(self, qrnn,bs,bptt,emb_sz, nh, nl,fine_tune_lm,lm_lr,lm_drop_mult,
                 use_pretrained_lm,clas_lr,clas_drop_mult,use_lm,cls_weights,frac_ds):
        # Order matters: it is the order in which vars(self) lists the attributes.
        fields = (
            ('qrnn', qrnn),
            ('fine_tune_lm', fine_tune_lm),
            ('bs', bs),
            ('bptt', bptt),
            ('emb_sz', emb_sz),
            ('nh', nh),
            ('nl', nl),
            ('lm_lr', lm_lr),
            ('lm_drop_mult', lm_drop_mult),
            ('use_pretrained_lm', use_pretrained_lm),
            ('clas_lr', clas_lr),
            ('clas_drop_mult', clas_drop_mult),
            ('use_lm', use_lm),
            ('cls_weights', cls_weights.values.tolist()),
            ('frac_ds', frac_ds),
        )
        for attr, value in fields:
            setattr(self, attr, value)

In [479]:
mlflow.set_experiment('20181129_MedWeb')

In [480]:
# One tracked run: log parameters, build the data, run LM step + classifier,
# evaluate, then log metrics and model files.
with mlflow.start_run():
    ## Log our parameters into mlflow
    # NOTE(review): `bs` is logged here but is not passed to either from_df call
    # below — confirm whether the batch size should be forwarded.
    args = Params(qrnn,bs,bptt,emb_sz, nh, nl,fine_tune_lm,lm_lr,lm_drop_mult,
                 use_pretrained_lm,clas_lr,clas_drop_mult,use_lm,cls_weights,frac_ds)
    for key, value in vars(args).items():
        mlflow.log_param(key, value)
    
    ## create data set
    # Both data objects sample the training frame with the same frac and
    # random_state, i.e. they are built from the same rows.
    data_lm = TextLMDataBunch.from_df(path=dataset_dir,train_df=train_df.sample(frac=frac_ds,random_state=42),
                                      valid_df=valid_df,label_delim=',',**sp)
    data_clas = TextClasDataBunch.from_df(path=dataset_dir,train_df=train_df.sample(frac=frac_ds,random_state=42),
                                          valid_df=valid_df,label_delim=',',**sp)
    
    ## preprocess
    if qrnn:
        print('Using QRNNs...')
    # `model_name` is also read as a module-level name by classify_multilabel
    # when it builds the saved classifier's file name.
    model_name = 'qrnn' if qrnn else 'lstm'
    lm_name = f'{model_name}_{pretrain_name}'
    pretrained_fnames = (lm_name, f'itos_{pretrain_name}')

    # The pretrained .pth/.pkl paths are passed here regardless of use_pretrained_lm.
    # NOTE(review): presumably this fails when a path is missing — confirm in fastai_contrib.utils.
    ensure_paths_exists(data_dir,
                        dataset_dir,
                        model_dir,
                        model_dir/f"{pretrained_fnames[0]}.pth",
                        model_dir/f"{pretrained_fnames[1]}.pkl")
    # `lm_enc_finetuned` is also read as a module-level name by classify_multilabel
    # when use_lm is truthy.
    lm_enc_finetuned  = f"{lm_name}_{dataset}_enc"

    ## fine tune lm
    
    lm=lm_fine_tuning( data_lm, lm_lr, bptt, emb_sz, nh, nl, qrnn,
                pad_token,
                pretrained_fnames, 
                model_dir,
                lm_enc_finetuned,
                lm_drop_mult, use_pretrained_lm,fine_tune_lm)


    ## create classifier
    y_true, y_pred = classify_multilabel(data_clas, clas_lr, bptt, 
                        pad_token,
                        model_dir,
                        qrnn, emb_sz, nh, nl,clas_drop_mult,
                        fine_tune_lm, use_lm,cls_weights)

    ## Evaluation
    results = evaluation(y_true,y_pred)
    
    ## Log metrics
    for key, value in results.items():
        mlflow.log_metric(key, value)
        
    ## Save artifacts
    # pretrained lm (logged unconditionally, even when use_pretrained_lm is False)
    mlflow.log_artifact(model_dir/f"{pretrained_fnames[0]}.pth")
    mlflow.log_artifact(model_dir/f"{pretrained_fnames[1]}.pkl")
    
    # fine tuned lm: only logged when fine-tuning ran, although lm_fine_tuning
    # saves the encoder in both cases
    if fine_tune_lm:
         mlflow.log_artifact(model_dir / f'{lm_enc_finetuned}.pth')
    
    # classifier
    mlflow.log_artifact(model_dir / f'{model_name}_{name}.pth')

F1 score
micro: 0.8531249999999999
macro: 0.8348653953505185

Precision score
micro: 0.7994143484626647
macro: 0.7842473023889733

Recall score
micro: 0.914572864321608
macro: 0.8963490267059729
