In [1]:
import glob

from fastai.text import *
# from fastai.callbacks import SaveModelCallback
import sentencepiece as spm #https://github.com/google/sentencepiece

from prod.sp_tok import *

In [2]:
# Working directory for the classifier artefacts and the SentencePiece model prefix.
PATH = Path('./data/rnn')
sp_model = './__all_tweets_es_0521'

# Register the new-line marker as a special token. The membership check keeps
# the cell idempotent: re-running it must not append NL to the shared
# `defaults.text_spec_tok` list a second time.
if NL not in defaults.text_spec_tok:
    defaults.text_spec_tok.append(NL)

# Vocabulary and tokenizer are both derived from the same SentencePiece model.
sp_vocab = Vocab( get_itos(sp_model) )
mycust_tok = CustomTokenizer(SPTokenizer,sp_model,pre_rules=default_rules)

In [3]:
test_df = pd.read_csv('./data/haha_2019_test.csv')

# Add a dummy target column: all zeros with the first row set to 1, so both
# label values occur (presumably so the DataBunch sees two classes -- confirm).
# A single .loc assignment is used instead of chained `df[col].iloc[0] = ...`,
# which triggers SettingWithCopyWarning and is a no-op under copy-on-write.
test_df['is_humor'] = 0
test_df.loc[test_df.index[0], 'is_humor'] = 1

## apply the rules
raw_text = test_df.loc[:,'text']

print("Default Rules:\n",[x.__name__ for x in default_rules],"\n\n")
for rule in default_rules:
    # Bind `rule` explicitly so the lambda does not rely on late binding.
    raw_text = raw_text.apply(lambda x, rule=rule: rule(str(x)))
test_df['new_text'] =  raw_text #tokenizer adds xxbos

# The same frame is passed as both the train and the valid split, so the
# valid set of `data_tst` is the pre-processed test text.
data_tst = TextClasDataBunch.from_df(PATH, test_df, test_df,
                               tokenizer=mycust_tok, vocab=sp_vocab,
                               text_cols='new_text', label_cols='is_humor')

In [4]:
# Collect the file names (not full paths) of every saved classifier checkpoint.
# `Path.name` is independent of the OS path separator (the previous
# `split('/')` was not), and sorting makes the ensemble order reproducible.
model_wt_fnames = [p.name for p in sorted((PATH/'models').glob('haha_clas_*.pth'))]
len(model_wt_fnames)

199

In [5]:
# One array of class probabilities per checkpoint, in `model_wt_fnames` order.
ensemble_preds = []
for wt_fname in model_wt_fnames:
    # A fresh copy of the default classifier config is made for every model so
    # the shared `awd_lstm_clas_config` dict is never modified.
    config = awd_lstm_clas_config.copy()
    config['qrnn'] = True
    config['n_hid'] = 2304

    learn = text_classifier_learner(data_tst, AWD_LSTM, drop_mult=0.7,pretrained=False,
                               config=config)
    # Strip only the final extension. `split('.')[0]` would cut at the first
    # dot and pick the wrong name for files such as `haha_clas_0.5.pth`.
    learn = learn.load(Path(wt_fname).stem) #load weights
    # ordered=True is requested so predictions line up with the input rows;
    # the targets and losses come from dummy labels and are not used below.
    preds_t,y_t,losses_t = learn.get_preds(DatasetType.Valid,with_loss=True, ordered=True)
    ensemble_preds.append(to_np(preds_t))

In [6]:
# Sanity check: dimensions of the stacked per-model predictions.
np.asarray(ensemble_preds).shape

(199, 6000, 2)

In [7]:
# Eyeball the stacked per-model prediction values.
np.asarray(ensemble_preds)

array([[[0.299484, 0.700516],
        [0.964773, 0.035227],
        [0.078358, 0.921642],
        [0.956461, 0.043539],
        ...,
        [0.891351, 0.108649],
        [0.95695 , 0.04305 ],
        [0.432712, 0.567288],
        [0.231802, 0.768198]],

       [[0.213403, 0.786598],
        [0.901597, 0.098403],
        [0.059899, 0.940101],
        [0.943502, 0.056498],
        ...,
        [0.953848, 0.046152],
        [0.970879, 0.029121],
        [0.249986, 0.750014],
        [0.130831, 0.869169]],

       [[0.332257, 0.667743],
        [0.899529, 0.100471],
        [0.08617 , 0.91383 ],
        [0.957049, 0.042951],
        ...,
        [0.962774, 0.037226],
        [0.960516, 0.039484],
        [0.492404, 0.507596],
        [0.135533, 0.864467]],

       [[0.286082, 0.713918],
        [0.643556, 0.356444],
        [0.068208, 0.931792],
        [0.936208, 0.063791],
        ...,
        [0.423314, 0.576686],
        [0.947833, 0.052167],
        [0.300313, 0.699687],
        [0.2

In [8]:
# Ensemble by simple averaging: stack the per-model predictions and take the
# mean over the leading axis.
ens_mean = (
    np.array(ensemble_preds)
    .mean(axis=0)
)
ens_mean

array([[0.274937, 0.725063],
       [0.796563, 0.203436],
       [0.064306, 0.935694],
       [0.936562, 0.063438],
       ...,
       [0.783576, 0.216424],
       [0.952769, 0.047231],
       [0.364324, 0.635676],
       [0.187843, 0.812157]], dtype=float32)

In [9]:
# Submission frame: the tweet ids plus the ensemble's predictions.
out_df = test_df[['id']].copy()
# Predicted class = column index with the highest averaged value.
out_df['is_humor'] = np.argmax(ens_mean, axis=1)
# Constant value for every row (presumably a placeholder score -- confirm).
out_df['funniness_average'] = 3.

In [10]:
# Preview the first five rows of the submission frame.
out_df.head(n=5)

Unnamed: 0,id,is_humor,funniness_average
0,tweet1,1,3.0
1,tweet2,0,3.0
2,tweet3,1,3.0
3,tweet4,0,3.0
4,tweet5,0,3.0


In [11]:
# Write the submission with a header row and without the index column
# (the header row is pandas' default).
out_df.to_csv('sub0523_1.csv', index=False)

In [12]:
# Shell escape: add the submission CSV to the zip archive sub0523_1.zip.
!zip  sub0523_1.zip sub0523_1.csv 

updating: sub0523_1.csv (deflated 84%)


In [13]:
# Shell escape: copy the zipped submission to a mounted directory.
# NOTE(review): the destination is a hard-coded absolute path specific to one
# machine -- this cell fails wherever that mount is absent.
!cp sub0523_1.zip /mnt/awcnas4_AWX/tmp/