In [1]:
import glob

from fastai.text import *
# from fastai.callbacks import SaveModelCallback
import sentencepiece as spm #https://github.com/google/sentencepiece

from prod.sp_tok import *

In [2]:
# Working directory handed to the fastai databunch below; the saved model weights
# are looked up under PATH/'models' later in the notebook.
PATH = Path('./data/rnn')
# Path prefix of the SentencePiece model; passed to get_itos and to the tokenizer below.
sp_model = './__all_tweets_es_0521'

# Register the new-line token as a special token (add a New Line special char).
# Guarded so that re-running this cell does not append NL to the global list again.
if NL not in defaults.text_spec_tok:
    defaults.text_spec_tok.append(NL)
# Vocabulary built from the item-to-string list that get_itos returns for sp_model
# (get_itos / SPTokenizer / CustomTokenizer presumably come from prod.sp_tok -- verify).
sp_vocab = Vocab( get_itos(sp_model) )
mycust_tok = CustomTokenizer(SPTokenizer,sp_model,pre_rules=default_rules)

In [3]:
test_df = pd.read_csv('./data/haha_2019_test.csv')

# The databunch below is given 'funniness_average' as its label column, so the column
# is filled with float placeholders here. The first three rows get distinct non-zero
# values (presumably so the labels are treated as a continuous target -- TODO confirm).
# The values are written with one .loc call on the frame: the chained form
# `test_df[col].iloc[i] = v` assigns into a temporary Series, raises
# SettingWithCopyWarning, and is not guaranteed to reach the frame at all.
test_df['funniness_average'] = 0.0
test_df.loc[test_df.index[:3], 'funniness_average'] = [0.1, 1.2, 3.4]

## apply the rules
raw_text = test_df.loc[:,'text']

print("Default Rules:\n",[x.__name__ for x in default_rules],"\n\n")
# Run every pre-processing rule, in order, over the stringified tweet text.
for rule in default_rules:
    raw_text = raw_text.apply(lambda x: rule(str(x)))
test_df['new_text'] = raw_text #tokenizer adds xxbos

# The same frame is passed as both the train and the validation set; predictions
# are later read from the validation split.
data_tst = TextClasDataBunch.from_df(PATH, test_df, test_df,
                               tokenizer=mycust_tok, vocab=sp_vocab,
                               text_cols='new_text', label_cols='funniness_average')

In [4]:
# File names (no directory part) of every saved regression checkpoint under PATH/'models'.
# Path.glob + .name avoids splitting on a hard-coded '/' separator, and sorting makes the
# checkpoint order the same on every run (glob order is otherwise arbitrary).
model_wt_fnames = sorted(p.name for p in (PATH/'models').glob('haha_regr_*.pth'))
len(model_wt_fnames)

200

In [23]:
# Run every saved checkpoint over the same databunch and collect its predictions,
# one array per checkpoint.
ensemble_preds = []
for wt_fname in model_wt_fnames:
    # A fresh copy of the default classifier config is taken for each learner, so the
    # overrides below never touch the shared default dict.
    config = awd_lstm_clas_config.copy()
    config['qrnn'] = True
    config['n_hid'] = 2304

    learn = text_classifier_learner(data_tst, AWD_LSTM, drop_mult=0.7,pretrained=False,
                               config=config)
    # The checkpoint is loaded by its name without the extension. Only the final suffix
    # is stripped: split('.')[0] would cut a name containing another dot at the first one.
    learn = learn.load(Path(wt_fname).stem) #load weights
    # get_preds also returns targets and losses; they are derived from the placeholder
    # labels and were never used, so only the predictions are kept.
    preds_t = learn.get_preds(DatasetType.Valid,with_loss=True, ordered=True)[0]
    ensemble_preds.append(to_np(preds_t))

In [24]:
# Shape of the stacked per-checkpoint predictions.
np.shape(ensemble_preds)

(200, 6000, 1)

In [25]:
# Eyeball the raw predictions of all checkpoints as one array.
np.asarray(ensemble_preds)

array([[[ 1.366066],
        [ 0.189519],
        [ 2.35355 ],
        [ 0.092145],
        ...,
        [ 0.300147],
        [-0.016088],
        [ 1.218295],
        [ 1.771805]],

       [[ 1.774276],
        [ 0.32638 ],
        [ 2.367456],
        [ 0.044346],
        ...,
        [ 0.520407],
        [-0.024221],
        [ 1.696346],
        [ 1.747051]],

       [[ 1.524559],
        [ 0.259706],
        [ 2.562716],
        [ 0.139008],
        ...,
        [ 0.31596 ],
        [ 0.137315],
        [ 1.034351],
        [ 1.675185]],

       [[ 1.178282],
        [ 0.320009],
        [ 2.318033],
        [ 0.101329],
        ...,
        [ 0.604481],
        [ 0.085859],
        [ 1.097252],
        [ 0.92805 ]],

       ...,

       [[ 1.292455],
        [ 0.65805 ],
        [ 2.279075],
        [ 0.12204 ],
        ...,
        [ 0.690125],
        [ 0.124922],
        [ 1.647528],
        [ 1.66852 ]],

       [[ 1.055442],
        [ 0.129389],
        [ 2.38532 ],
        [

In [26]:
# Ensemble prediction: average over axis 0, which holds one entry per checkpoint.
ens_mean = np.mean(ensemble_preds, axis=0)
ens_mean

array([[1.522744],
       [0.351772],
       [2.398724],
       [0.138691],
       ...,
       [0.662723],
       [0.06224 ],
       [1.389218],
       [1.560528]], dtype=float32)

In [27]:
# Start from an existing submission file; its 'funniness_average' column is
# overwritten with the ensemble mean further down before a new file is written.
submission_template = './data/submission/sub0523_1.csv'
out_df = pd.read_csv(submission_template)
out_df.head(n=5)

Unnamed: 0,id,is_humor,funniness_average
0,tweet1,1,3.0
1,tweet2,0,3.0
2,tweet3,1,3.0
3,tweet4,0,3.0
4,tweet5,0,3.0


In [28]:
# Row counts of the submission frame and the ensemble mean should agree.
(out_df.shape, ens_mean.shape)

((6000, 3), (6000, 1))

In [29]:
# Replace the existing scores with the ensemble mean. Bracket assignment is used because
# attribute-style assignment only works when the column already exists (otherwise pandas
# sets a plain attribute instead of a column). ens_mean is flattened so the column
# receives an explicit 1-D vector rather than a single-column 2-D array.
out_df['funniness_average'] = np.asarray(ens_mean).ravel()

In [33]:
# Preview the first rows after the score column was replaced.
out_df.head(n=5)

Unnamed: 0,id,is_humor,funniness_average
0,tweet1,1,1.522744
1,tweet2,0,0.351772
2,tweet3,1,2.398724
3,tweet4,0,0.138691
4,tweet5,0,0.021646


In [30]:
# Write the submission with a header row (the to_csv default) and without the index column.
out_df.to_csv('sub0601_1.csv', index=False)

In [31]:
!zip  sub0601_1.zip sub0601_1.csv 

  adding: sub0601_1.csv (deflated 67%)


In [32]:
# Copy the zipped submission to a mounted directory via the shell.
# NOTE(review): the destination is a hard-coded absolute path that only exists on the
# author's machine -- consider a configurable output directory.
!cp sub0601_1.zip /mnt/awcnas4_AWX/tmp/