In [1]:
from fastai2.text.all import *
import math

In [2]:
path = Path.home()/'.fastai/data/nlp-getting-started'

In [3]:
df_train = pd.read_csv(path/'train.csv')
df_test = pd.read_csv(path/'test.csv')

We add 'keyword' and 'location' also to the sentences with prefixes. But there should be a better way, like handling them as categorical features, instead of like a part of the sentence. 

BTW, it did not lead to any improvement in the model performance. So the cell below can be commented also. 

In [4]:
df_train[['keyword','location']] = df_train[['keyword','location']].fillna('')
df_test[['keyword','location']] = df_test[['keyword','location']].fillna('')

# def add_data(r):
#     txt = r['text']
#     if r['keyword']:
#         txt = ' '.join(['xxkeyword',r['keyword'], txt])
    
#     if (r['location']):
#         txt = ' '.join(['xxlocation',r['location'], txt])
#     return txt

# df_train['text2'] = df_train.apply(add_data, axis=1)
# df_test['text2'] = df_test.apply(add_data, axis=1)

# for d in [df_train, df_test]:
#     d.rename({'text': 'text_org', 'text2': 'text'}, inplace=True)

In [5]:
df = pd.concat([df_train[['text']], df_test[['text']]], axis=0)
df.iloc[100:110].head(2)

Unnamed: 0,text
100,.@NorwayMFA #Bahrain police had previously died in a road accident they were not killed by explosion https://t.co/gFJfgTodad
101,I still have not heard Church Leaders of Kenya coming forward to comment on the accident issue and disciplinary measures#ArrestPastorNganga


# Approach A

We will follow https://github.com/fastai/fastbook/blob/master/10_nlp.ipynb 

In [43]:
text_col = 'text'
text_transform_block = TextBlock.from_df(text_cols=[text_col], is_lm=True)

# DataBlock is a "Generic container to quickly build `Datasets` and `DataLoaders`"
# In other words, it contains the 'blocks of transformations to x andy', how to get x, 
# how to get y and how to split them. We create a dataloader out of it by providing 
# the datasource and batch size. 

lm_dl = DataBlock(
    blocks=text_transform_block,
    get_x=attrgetter(text_col),
    splitter=RandomSplitter()
).dataloaders(df, bs=128, seq_len=72)

lm_dl.show_batch(max_n=2)

Unnamed: 0,text,text_
0,xxbos xxmaj sleeping xxmaj with xxmaj sirens - xxmaj iris ( goo xxmaj goo xxmaj dolls xxmaj cover ) http : / / t.co / xxunk xxbos xxmaj beyonce xxmaj is my pick for http : / / t.co / nnmqlz91o9 xxmaj fan xxmaj army # xxmaj beyhive http : / / t.co / o91f3cyy0r xxunk xxbos xxmaj the annihilation of xxmaj jeb xxmaj christie & & xxmaj xxunk is less than,xxmaj sleeping xxmaj with xxmaj sirens - xxmaj iris ( goo xxmaj goo xxmaj dolls xxmaj cover ) http : / / t.co / xxunk xxbos xxmaj beyonce xxmaj is my pick for http : / / t.co / nnmqlz91o9 xxmaj fan xxmaj army # xxmaj beyhive http : / / t.co / o91f3cyy0r xxunk xxbos xxmaj the annihilation of xxmaj jeb xxmaj christie & & xxmaj xxunk is less than 24
1,: ' people who have been forced to leave their country in order to escape war xxunk or natural disaster ' xxbos xxunk _ xxmaj my xxmaj xxunk will be devastated lol # xxunk xxbos xxmaj we rescued my dog at least 9 years ago ? ? she 's old but still sweet as ever ? ? @zak_bagans http : / / t.co / xxunk xxbos xxmaj police investigating after an e,' people who have been forced to leave their country in order to escape war xxunk or natural disaster ' xxbos xxunk _ xxmaj my xxmaj xxunk will be devastated lol # xxunk xxbos xxmaj we rescued my dog at least 9 years ago ? ? she 's old but still sweet as ever ? ? @zak_bagans http : / / t.co / xxunk xxbos xxmaj police investigating after an e -


In [14]:
print('vocab len: ', len(lm_dl.vocab))

vocab len:  5832


In [15]:
print(text_transform_block.type_tfms)
print(text_transform_block.item_tfms)
print(text_transform_block.batch_tfms)

(#2) [Tokenizer: (str,object) -> encodes
(Path,object) -> encodes (object,object) -> decodes,Numericalize: (object,object) -> encodes (object,object) -> decodes]
(#1) [<class 'fastai2.data.transforms.ToTensor'>]
(#0) []


In [16]:
learn = language_model_learner(lm_dl, AWD_LSTM, metrics=Perplexity())
learn.fit_one_cycle(8, 2e-2, moms=(0.8, 0.7, 0.8))
learn.save('epoch_8')

epoch,train_loss,valid_loss,perplexity,time
0,5.407336,4.23466,69.038216,00:18
1,4.607827,3.434359,31.011522,00:18
2,4.09323,3.223242,25.109388,00:18
3,3.743704,3.129892,22.871511,00:18
4,3.507817,3.088863,21.952103,00:18
5,3.321668,3.064706,21.428164,00:18
6,3.194085,3.057027,21.264252,00:18
7,3.109205,3.055584,21.233576,00:18


In [17]:
learn.unfreeze()
learn.fit_one_cycle(4, 2e-3)
learn.save_encoder('finetuned')

epoch,train_loss,valid_loss,perplexity,time
0,2.949537,2.986506,19.816313,00:20
1,2.867903,2.954794,19.197765,00:20
2,2.752208,2.929241,18.713421,00:20
3,2.671687,2.93193,18.763807,00:19


## training for classification

Let us add 'location' and 'keyword' to the text with unique keywords as prefixes

In [18]:
clf_tranform_blocks = [TextBlock.from_df(text_cols=['text'], vocab=lm_dl.vocab), CategoryBlock]

print(clf_tranform_blocks[0].type_tfms)
print(clf_tranform_blocks[0].item_tfms)
print(clf_tranform_blocks[0].batch_tfms)

(#2) [Tokenizer: (str,object) -> encodes
(Path,object) -> encodes (object,object) -> decodes,Numericalize: (object,object) -> encodes (object,object) -> decodes]
(#1) [<class 'fastai2.data.transforms.ToTensor'>]
(#0) []


In [19]:
clf_dl = DataBlock(
    blocks=clf_tranform_blocks,
    get_x=attrgetter('text'),
    get_y=attrgetter('target'),
    splitter=RandomSplitter(0.1)
).dataloaders(df_train, verbose=True)

Setting up after_item: Pipeline: ToTensor
Setting up before_batch: Pipeline: partial
Setting up after_batch: Pipeline: 


In [42]:
clf_dl.show_batch(max_n=2)

Unnamed: 0,text,category
0,xxbos _ \n▁ xxrep 5 ? xxup retweet \n▁ xxrep 7 ? \n▁ xxrep 5 ? xxup follow xxup all xxup who xxup rt \n▁ xxrep 7 ? \n▁ xxrep 5 ? xxup xxunk \n▁ xxrep 7 ? \n▁ xxrep 5 ? xxup gain xxup with \n▁ xxrep 7 ? \n▁ xxrep 5 ? xxup follow ? xxunk # xxup xxunk \n▁ # xxup ty,0
1,xxbos xxup info xxup s. xxup wnd : xxunk / 6 . xxup xxunk : xxup xxunk xxup xxunk . xxup exp xxup inst xxup apch . xxup rwy 05 . xxup curfew xxup in xxup oper xxup until 2030 xxup z. xxup taxiways xxup foxtrot 5 & & xxup foxtrot 6 xxup navbl . xxup tmp : 10 .,0


In [32]:
learn = text_classifier_learner(clf_dl, AWD_LSTM, path=path, metrics=accuracy)
learn = learn.load_encoder('finetuned')

learn.fit_one_cycle(2, 2e-2, moms=(0.8, 0.7, 0.8))

epoch,train_loss,valid_loss,accuracy,time
0,0.67269,0.492945,0.768725,00:05
1,0.625081,0.478833,0.777924,00:05


In [33]:
learn.freeze_to(-2)
learn.fit_one_cycle(2, slice(1e-2/(2.6**4),1e-2))

epoch,train_loss,valid_loss,accuracy,time
0,0.562895,0.478458,0.791064,00:05
1,0.464479,0.482533,0.792378,00:05


In [34]:
learn.freeze_to(-3)
learn.fit_one_cycle(2, slice(5e-3/(2.6**4),5e-3))

epoch,train_loss,valid_loss,accuracy,time
0,0.429314,0.470474,0.804205,00:07
1,0.352418,0.510408,0.797635,00:06


In [35]:
learn.unfreeze()
learn.fit_one_cycle(4, slice(1e-3/(2.6**4),1e-3))

epoch,train_loss,valid_loss,accuracy,time
0,0.3092,0.5217,0.793693,00:08
1,0.272631,0.567236,0.78975,00:08
2,0.23688,0.594942,0.805519,00:08
3,0.222788,0.607421,0.798949,00:08


In [36]:
learn.export('tuned_classifier.pkl')

In [37]:
learn = load_learner('/home/achinta/.fastai/data/nlp-getting-started/tuned_classifier.pkl')

In [38]:
test_dl = learn.dls.test_dl(df_test)
inp, preds, x, dec_preds = learn.get_preds(dl=test_dl, with_input=True, with_decoded=True)

**Note that the predictions are not the same order as in the dataframe** 

In [29]:
output = pd.DataFrame({
    'id': df_test.iloc[test_dl.get_idxs()]['id'],
    'target': dec_preds
}).sort_values('id')

output.to_csv('output.csv', index=False)

In [30]:
!kaggle competitions submit -c nlp-getting-started -f output.csv -m 'added location and target'

100%|##########| 22.2k/22.2k [00:09<00:00, 2.35kB/s]
Successfully submitted to Real or Not? NLP with Disaster Tweets

This gave a leaderboard score of 0.80572 (1251/3337)

## Approach B: Expanding abbreviations

In [6]:
# Thanks to https://www.kaggle.com/rftexas/text-only-kfold-bert
with open('abbreviations.json','r') as f:
    abbreviations = json.load(f)

In [7]:
class MySpacyTokenizer(SpacyTokenizer):
    def __init__(self, lang='en', special_toks=None, buf_sz=5000):
        super().__init__(lang, special_toks, buf_sz)
        self.abbr = abbreviations

    def __call__(self, items):
        # replace abbreviation with ites expansion fo 
        if self.abbr:
            items2 = [' '.join([abbreviations.get(o, o) for o in item.split()]) for item in items]
        else:
            items2 = items
        return super().__call__(items2)

In [20]:
text_col = 'text'
text_transform_block = TextBlock.from_df(text_cols=[text_col], tok_func=MySpacyTokenizer, is_lm=True)

# DataBlock is a "Generic container to quickly build `Datasets` and `DataLoaders`"
# In other words, it contains the 'blocks of transformations to x andy', how to get x, 
# how to get y and how to split them. We create a dataloader out of it by providing 
# the datasource and batch size. 

lm_dl = DataBlock(
    blocks=text_transform_block,
    get_x=attrgetter(text_col),
    splitter=RandomSplitter()
).dataloaders(df, bs=256, seq_len=72)
print(len(lm_dl.vocab))

5816


In [26]:
learn = language_model_learner(lm_dl, AWD_LSTM, metrics=Perplexity())
learn.fit_one_cycle(8, 2e-2, moms=(0.8, 0.7, 0.8))
learn.save('lm_1')

epoch,train_loss,valid_loss,perplexity,time
0,5.594144,4.666418,106.316193,00:20
1,4.945578,3.655969,38.705025,00:20
2,4.435383,3.391801,29.719433,00:20
3,4.091724,3.291511,26.883455,00:20
4,3.8607,3.234438,25.392101,00:20
5,3.685889,3.209291,24.761511,00:20
6,3.554682,3.195355,24.418839,00:20
7,3.459992,3.19269,24.353861,00:20


In [27]:
learn.load('lm_1')
learn.unfreeze()
learn.fit_one_cycle(4, 2e-3)
learn.save('lm_2')

epoch,train_loss,valid_loss,perplexity,time
0,3.075059,3.096713,22.125113,00:22
1,3.046757,3.105889,22.329067,00:22
2,2.97203,3.038295,20.869637,00:22
3,2.903151,3.030007,20.697374,00:22


In [28]:
learn.load('lm_2')
learn.fit_one_cycle(4, 1e-3)
learn.save('lm_3')

epoch,train_loss,valid_loss,perplexity,time
0,2.794242,3.019825,20.487705,00:22
1,2.782044,3.009561,20.278502,00:22
2,2.763209,3.00828,20.252531,00:22
3,2.727956,3.003245,20.150814,00:22


In [29]:
learn.save_encoder('abbr')

### Classification

In [30]:
len(lm_dl.vocab)

5816

In [31]:
clf_tranform_blocks = [TextBlock.from_df(text_cols=['text'], tok_func=MySpacyTokenizer, vocab=lm_dl.vocab), CategoryBlock]

print(clf_tranform_blocks[0].type_tfms)
print(clf_tranform_blocks[0].item_tfms)
print(clf_tranform_blocks[0].batch_tfms)

(#2) [Tokenizer: (str,object) -> encodes
(Path,object) -> encodes (object,object) -> decodes,Numericalize: (object,object) -> encodes (object,object) -> decodes]
(#1) [<class 'fastai2.data.transforms.ToTensor'>]
(#0) []


In [32]:
clf_dl = DataBlock(
    blocks=clf_tranform_blocks,
    get_x=attrgetter('text'),
    get_y=attrgetter('target'),
    splitter=RandomSplitter(0.05)
).dataloaders(df_train, verbose=True)

Setting up after_item: Pipeline: ToTensor
Setting up before_batch: Pipeline: partial
Setting up after_batch: Pipeline: 


In [33]:
clf_dl.show_batch(max_n=2)

Unnamed: 0,text,category
0,xxbos . : . : . : . : . : . : . : . : . : . : . : . : . : . : . : . : . : . : . : . : . : xxup retweet xxunk : # xxunk xxmaj indian xxmaj army xxunk _ http : / / t.co / xxunk g,0
1,xxbos xxmaj rare xxunk into # terror and xxmaj how to fight it http : / / t.co / xxunk # xxmaj cameroon # xxup usa # xxmaj xxunk # xxup xxunk # xxup fr # xxmaj nigeria # xxup uk # xxmaj africa # xxup de # xxup ca # xxup au # xxup jp,1


In [38]:
learn = text_classifier_learner(clf_dl, AWD_LSTM, path=path, metrics=accuracy)
learn = learn.load_encoder('abbr')

learn.fit_one_cycle(4, 2e-2, moms=(0.8, 0.7, 0.8))

epoch,train_loss,valid_loss,accuracy,time
0,0.719345,0.528575,0.760526,00:05
1,0.671253,0.500551,0.755263,00:05
2,0.649146,0.491686,0.773684,00:05
3,0.644446,0.497334,0.755263,00:05


In [35]:
learn.freeze_to(-2)
learn.fit_one_cycle(2, slice(1e-2/(2.6**4),1e-3))

epoch,train_loss,valid_loss,accuracy,time
0,0.567271,0.450082,0.786842,00:05
1,0.4754,0.44844,0.794737,00:05


In [39]:
learn.freeze_to(-3)
learn.fit_one_cycle(2, slice(5e-3/(2.6**4),5e-4))

epoch,train_loss,valid_loss,accuracy,time
0,0.562773,0.47138,0.773684,00:07
1,0.496642,0.46069,0.797368,00:07


In [40]:
learn.unfreeze()
learn.fit_one_cycle(4, slice(1e-3/(2.6**4),1e-4))

epoch,train_loss,valid_loss,accuracy,time
0,0.469011,0.457653,0.789474,00:09
1,0.450835,0.457642,0.792105,00:09
2,0.444488,0.458775,0.792105,00:09
3,0.433657,0.453173,0.8,00:09


In [41]:
learn.fit_one_cycle(4, slice(1e-3/(2.6**4),1e-4))
learn.export('tuned_classifier.pkl')

epoch,train_loss,valid_loss,accuracy,time
0,0.42851,0.45765,0.797368,00:09
1,0.407471,0.461325,0.797368,00:09
2,0.409867,0.448576,0.805263,00:09
3,0.395462,0.448021,0.802632,00:09


In [43]:
def save_preds():
    learn = load_learner('/home/achinta/.fastai/data/nlp-getting-started/tuned_classifier.pkl')
    test_dl = learn.dls.test_dl(df_test)
    inp, preds, x, dec_preds = learn.get_preds(dl=test_dl, with_input=True, with_decoded=True)
    output = pd.DataFrame({
        'id': df_test.iloc[test_dl.get_idxs()]['id'],
        'target': dec_preds
    }).sort_values('id')

    output.to_csv('output.csv', index=False)
    
save_preds()
!kaggle competitions submit -c nlp-getting-started -f output.csv -m 'added abbreviations'

100%|##########| 22.2k/22.2k [00:15<00:00, 1.44kB/s]
Successfully submitted to Real or Not? NLP with Disaster Tweets

### Approach C - other features

### Approach D - Glove embeddings

In [51]:
%%time
import bcolz
glove_path = Path.home()/'data/glove'

words = []
idx = 0
word2idx = {}
vectors = bcolz.carray(np.zeros(1), rootdir=f'{glove_path}/6B.300d.dat', mode='w')

with open(glove_path/'glove.6B.300d.txt', 'rb') as f:
    for idx, l in enumerate(f):
        line = l.decode().split()
        word = line[0]
        words.append(word)
        word2idx[word] = idx
        idx += 1
        vect = np.array(line[1:]).astype(np.float)
        vectors.append(vect)
        
vectors = bcolz.carray(vectors[1:].reshape((400000, 300)), rootdir=f'{glove_path}/6B.300.dat', mode='w')

CPU times: user 1min 56s, sys: 8.2 s, total: 2min 4s
Wall time: 2min 4s


In [55]:
# vectors.flush()
pickle.dump(words, open(f'{glove_path}/6B.300_words.pkl', 'wb'))
pickle.dump(word2idx, open(f'{glove_path}/6B.300_idx.pkl', 'wb'))

In [57]:
vectors = bcolz.open(f'{glove_path}/6B.300.dat')[:]
words = pickle.load(open(f'{glove_path}/6B.300_words.pkl', 'rb'))
word2idx = pickle.load(open(f'{glove_path}/6B.300_idx.pkl', 'rb'))

glove = {w: vectors[word2idx[w]] for w in words}

Lets create random numpy array and thenload it with glove embeddings

In [74]:
emb_np = np.random.rand(len(lm_dl.vocab), 300)
missing_vocab = []
for idx, word in enumerate(lm_dl.vocab):
    if word in glove:
        emb_np[idx,:] = glove[word]
    else:
        missing_vocab.append(word)
print('missing words in glove: ', len(missing_vocab))

missing words in glove:  450


In [79]:
emb = nn.Embedding.from_pretrained(torch.from_numpy(emb_np)); emb

Embedding(5832, 300)

As AWD_LSTM does not support pretrained embeddings, lets override it

In [83]:
class MyAWD_LSTM(AWD_LSTM):
    def __init__(self, vocab_sz, emb_sz, n_hid, n_layers, pad_token=1, hidden_p=0.2, input_p=0.6, embed_p=0.1,
             weight_p=0.5, bidir=False):
        super().__init__(vocab_sz, emb_sz, n_hid, n_layers, pad_token, hidden_p, input_p, embed_p, weight_p, bidir)
        self.encoder = emb
    
awd_lstm_lm_config['emb_sz'] = 300

In [84]:
text_col = 'text'
text_transform_block = TextBlock.from_df(text_cols=[text_col], tok_func=MySpacyTokenizer, is_lm=True)

lm_dl = DataBlock(
    blocks=text_transform_block,
    get_x=attrgetter(text_col),
    splitter=RandomSplitter()
).dataloaders(df, bs=256, seq_len=72)

In [85]:
learn = language_model_learner(lm_dl, AWD_LSTM, metrics=Perplexity())
learn.fit_one_cycle(8, 2e-2, moms=(0.8, 0.7, 0.8))
learn.save('epoch_8')

RuntimeError: Error(s) in loading state_dict for SequentialRNN:
	size mismatch for 0.encoder.weight: copying a param with shape torch.Size([5816, 400]) from checkpoint, the shape in current model is torch.Size([5816, 300]).
	size mismatch for 0.encoder_dp.emb.weight: copying a param with shape torch.Size([5816, 400]) from checkpoint, the shape in current model is torch.Size([5816, 300]).
	size mismatch for 0.rnns.0.module.weight_ih_l0: copying a param with shape torch.Size([4608, 400]) from checkpoint, the shape in current model is torch.Size([4608, 300]).
	size mismatch for 0.rnns.2.weight_hh_l0_raw: copying a param with shape torch.Size([1600, 400]) from checkpoint, the shape in current model is torch.Size([1200, 300]).
	size mismatch for 0.rnns.2.module.weight_ih_l0: copying a param with shape torch.Size([1600, 1152]) from checkpoint, the shape in current model is torch.Size([1200, 1152]).
	size mismatch for 0.rnns.2.module.weight_hh_l0: copying a param with shape torch.Size([1600, 400]) from checkpoint, the shape in current model is torch.Size([1200, 300]).
	size mismatch for 0.rnns.2.module.bias_ih_l0: copying a param with shape torch.Size([1600]) from checkpoint, the shape in current model is torch.Size([1200]).
	size mismatch for 0.rnns.2.module.bias_hh_l0: copying a param with shape torch.Size([1600]) from checkpoint, the shape in current model is torch.Size([1200]).
	size mismatch for 1.decoder.weight: copying a param with shape torch.Size([5816, 400]) from checkpoint, the shape in current model is torch.Size([5816, 300]).

## Playground

In [None]:
learn.validate(dl=dl)

In [None]:
print(len(df_train.location.unique()))
print(df_train.location.unique()[:20])

In [None]:
test_dl.get_idxs()

In [None]:
type(test_dl)

In [None]:
print(len(test_dl.get_idxs()))
test_dl.get_idxs()[:10]

In [None]:
_, _, _, dec_preds = learn.get_preds(dl=test_dl, with_input=True, with_decoded=True)

In [None]:
output_oneshot = pd.DataFrame({
    'id': df_test.iloc[test_dl.get_idxs()]['id'],
    'preds_oneshot': dec_preds
}).sort_values('id')

In [None]:
import pandas as pd

In [None]:
output_oneshot.iloc[100:120].head(20)

In [None]:
output.iloc[100:120].head(20)

In [None]:
combined = output.join(output_oneshot, on='id',lsuffix='_l', rsuffix='_r')

In [None]:
combined.head()

In [41]:
type(AWD_LSTM)

fastcore.foundation.PrePostInitMeta