In [1]:
from fastai2.text.all import *
import math

In [2]:
path = Path.home()/'.fastai/data/nlp-getting-started'

In [10]:
df_train = pd.read_csv(path/'train.csv')
df_test = pd.read_csv(path/'test.csv')

We add 'keyword' and 'location' also to the sentences with prefixes. But there should be a better way, like handling them as categorical features, instead of like a part of the sentence. 

BTW, it did not lead to any improvement in the model performance. So the cell below can be commented also. 

In [11]:
df_train[['keyword','location']] = df_train[['keyword','location']].fillna('')
df_test[['keyword','location']] = df_test[['keyword','location']].fillna('')

def add_data(r):
    txt = r['text']
    if r['keyword']:
        txt = ' '.join(['xxkeyword',r['keyword'], txt])
    
    if (r['location']):
        txt = ' '.join(['xxlocation',r['location'], txt])
    return txt

df_train['text2'] = df_train.apply(add_data, axis=1)
df_test['text2'] = df_test.apply(add_data, axis=1)

for d in [df_train, df_test]:
    d.rename({'text': 'text_org', 'text2': 'text'}, inplace=True)

In [12]:
df = pd.concat([df_train, df_test], axis=0)
df.iloc[100:110].head(2)

Unnamed: 0,id,keyword,location,text,target,text2
100,144,accident,UK,.@NorwayMFA #Bahrain police had previously died in a road accident they were not killed by explosion https://t.co/gFJfgTodad,1.0,xxlocation UK xxkeyword accident .@NorwayMFA #Bahrain police had previously died in a road accident they were not killed by explosion https://t.co/gFJfgTodad
101,145,accident,"Nairobi, Kenya",I still have not heard Church Leaders of Kenya coming forward to comment on the accident issue and disciplinary measures#ArrestPastorNganga,0.0,"xxlocation Nairobi, Kenya xxkeyword accident I still have not heard Church Leaders of Kenya coming forward to comment on the accident issue and disciplinary measures#ArrestPastorNganga"


# Approach A

We will follow https://github.com/fastai/fastbook/blob/master/10_nlp.ipynb 

In [43]:
text_col = 'text'
text_transform_block = TextBlock.from_df(text_cols=[text_col], is_lm=True)

# DataBlock is a "Generic container to quickly build `Datasets` and `DataLoaders`"
# In other words, it contains the 'blocks of transformations to x andy', how to get x, 
# how to get y and how to split them. We create a dataloader out of it by providing 
# the datasource and batch size. 

lm_dl = DataBlock(
    blocks=text_transform_block,
    get_x=attrgetter(text_col),
    splitter=RandomSplitter()
).dataloaders(df, bs=128, seq_len=72)

lm_dl.show_batch(max_n=2)

Unnamed: 0,text,text_
0,xxbos xxmaj sleeping xxmaj with xxmaj sirens - xxmaj iris ( goo xxmaj goo xxmaj dolls xxmaj cover ) http : / / t.co / xxunk xxbos xxmaj beyonce xxmaj is my pick for http : / / t.co / nnmqlz91o9 xxmaj fan xxmaj army # xxmaj beyhive http : / / t.co / o91f3cyy0r xxunk xxbos xxmaj the annihilation of xxmaj jeb xxmaj christie & & xxmaj xxunk is less than,xxmaj sleeping xxmaj with xxmaj sirens - xxmaj iris ( goo xxmaj goo xxmaj dolls xxmaj cover ) http : / / t.co / xxunk xxbos xxmaj beyonce xxmaj is my pick for http : / / t.co / nnmqlz91o9 xxmaj fan xxmaj army # xxmaj beyhive http : / / t.co / o91f3cyy0r xxunk xxbos xxmaj the annihilation of xxmaj jeb xxmaj christie & & xxmaj xxunk is less than 24
1,: ' people who have been forced to leave their country in order to escape war xxunk or natural disaster ' xxbos xxunk _ xxmaj my xxmaj xxunk will be devastated lol # xxunk xxbos xxmaj we rescued my dog at least 9 years ago ? ? she 's old but still sweet as ever ? ? @zak_bagans http : / / t.co / xxunk xxbos xxmaj police investigating after an e,' people who have been forced to leave their country in order to escape war xxunk or natural disaster ' xxbos xxunk _ xxmaj my xxmaj xxunk will be devastated lol # xxunk xxbos xxmaj we rescued my dog at least 9 years ago ? ? she 's old but still sweet as ever ? ? @zak_bagans http : / / t.co / xxunk xxbos xxmaj police investigating after an e -


In [14]:
print('vocab len: ', len(lm_dl.vocab))

vocab len:  5832


In [15]:
print(text_transform_block.type_tfms)
print(text_transform_block.item_tfms)
print(text_transform_block.batch_tfms)

(#2) [Tokenizer: (str,object) -> encodes
(Path,object) -> encodes (object,object) -> decodes,Numericalize: (object,object) -> encodes (object,object) -> decodes]
(#1) [<class 'fastai2.data.transforms.ToTensor'>]
(#0) []


In [16]:
learn = language_model_learner(lm_dl, AWD_LSTM, metrics=Perplexity())
learn.fit_one_cycle(8, 2e-2, moms=(0.8, 0.7, 0.8))
learn.save('epoch_8')

epoch,train_loss,valid_loss,perplexity,time
0,5.407336,4.23466,69.038216,00:18
1,4.607827,3.434359,31.011522,00:18
2,4.09323,3.223242,25.109388,00:18
3,3.743704,3.129892,22.871511,00:18
4,3.507817,3.088863,21.952103,00:18
5,3.321668,3.064706,21.428164,00:18
6,3.194085,3.057027,21.264252,00:18
7,3.109205,3.055584,21.233576,00:18


In [17]:
learn.unfreeze()
learn.fit_one_cycle(4, 2e-3)
learn.save_encoder('finetuned')

epoch,train_loss,valid_loss,perplexity,time
0,2.949537,2.986506,19.816313,00:20
1,2.867903,2.954794,19.197765,00:20
2,2.752208,2.929241,18.713421,00:20
3,2.671687,2.93193,18.763807,00:19


## training for classification

Let us add 'location' and 'keyword' to the text with unique keywords as prefixes

In [18]:
clf_tranform_blocks = [TextBlock.from_df(text_cols=['text'], vocab=lm_dl.vocab), CategoryBlock]

print(clf_tranform_blocks[0].type_tfms)
print(clf_tranform_blocks[0].item_tfms)
print(clf_tranform_blocks[0].batch_tfms)

(#2) [Tokenizer: (str,object) -> encodes
(Path,object) -> encodes (object,object) -> decodes,Numericalize: (object,object) -> encodes (object,object) -> decodes]
(#1) [<class 'fastai2.data.transforms.ToTensor'>]
(#0) []


In [19]:
clf_dl = DataBlock(
    blocks=clf_tranform_blocks,
    get_x=attrgetter('text'),
    get_y=attrgetter('target'),
    splitter=RandomSplitter(0.1)
).dataloaders(df_train, verbose=True)

Setting up after_item: Pipeline: ToTensor
Setting up before_batch: Pipeline: partial
Setting up after_batch: Pipeline: 


In [42]:
clf_dl.show_batch(max_n=2)

Unnamed: 0,text,category
0,xxbos _ \n▁ xxrep 5 ? xxup retweet \n▁ xxrep 7 ? \n▁ xxrep 5 ? xxup follow xxup all xxup who xxup rt \n▁ xxrep 7 ? \n▁ xxrep 5 ? xxup xxunk \n▁ xxrep 7 ? \n▁ xxrep 5 ? xxup gain xxup with \n▁ xxrep 7 ? \n▁ xxrep 5 ? xxup follow ? xxunk # xxup xxunk \n▁ # xxup ty,0
1,xxbos xxup info xxup s. xxup wnd : xxunk / 6 . xxup xxunk : xxup xxunk xxup xxunk . xxup exp xxup inst xxup apch . xxup rwy 05 . xxup curfew xxup in xxup oper xxup until 2030 xxup z. xxup taxiways xxup foxtrot 5 & & xxup foxtrot 6 xxup navbl . xxup tmp : 10 .,0


In [32]:
learn = text_classifier_learner(clf_dl, AWD_LSTM, path=path, metrics=accuracy)
learn = learn.load_encoder('finetuned')

learn.fit_one_cycle(2, 2e-2, moms=(0.8, 0.7, 0.8))

epoch,train_loss,valid_loss,accuracy,time
0,0.67269,0.492945,0.768725,00:05
1,0.625081,0.478833,0.777924,00:05


In [33]:
learn.freeze_to(-2)
learn.fit_one_cycle(2, slice(1e-2/(2.6**4),1e-2))

epoch,train_loss,valid_loss,accuracy,time
0,0.562895,0.478458,0.791064,00:05
1,0.464479,0.482533,0.792378,00:05


In [34]:
learn.freeze_to(-3)
learn.fit_one_cycle(2, slice(5e-3/(2.6**4),5e-3))

epoch,train_loss,valid_loss,accuracy,time
0,0.429314,0.470474,0.804205,00:07
1,0.352418,0.510408,0.797635,00:06


In [35]:
learn.unfreeze()
learn.fit_one_cycle(4, slice(1e-3/(2.6**4),1e-3))

epoch,train_loss,valid_loss,accuracy,time
0,0.3092,0.5217,0.793693,00:08
1,0.272631,0.567236,0.78975,00:08
2,0.23688,0.594942,0.805519,00:08
3,0.222788,0.607421,0.798949,00:08


In [36]:
learn.export('tuned_classifier.pkl')

In [37]:
learn = load_learner('/home/achinta/.fastai/data/nlp-getting-started/tuned_classifier.pkl')

In [38]:
test_dl = learn.dls.test_dl(df_test)
inp, preds, x, dec_preds = learn.get_preds(dl=test_dl, with_input=True, with_decoded=True)

**Note that the predictions are not the same order as in the dataframe** 

In [29]:
output = pd.DataFrame({
    'id': df_test.iloc[test_dl.get_idxs()]['id'],
    'target': dec_preds
}).sort_values('id')

output.to_csv('output.csv', index=False)

In [30]:
!kaggle competitions submit -c nlp-getting-started -f output.csv -m 'added location and target'

100%|##########| 22.2k/22.2k [00:09<00:00, 2.35kB/s]
Successfully submitted to Real or Not? NLP with Disaster Tweets

This gave a leaderboard score of 0.80572 (1251/3337)

## Playground

In [None]:
learn.validate(dl=dl)

In [None]:
print(len(df_train.location.unique()))
print(df_train.location.unique()[:20])

In [None]:
test_dl.get_idxs()

In [None]:
type(test_dl)

In [None]:
print(len(test_dl.get_idxs()))
test_dl.get_idxs()[:10]

In [None]:
_, _, _, dec_preds = learn.get_preds(dl=test_dl, with_input=True, with_decoded=True)

In [None]:
output_oneshot = pd.DataFrame({
    'id': df_test.iloc[test_dl.get_idxs()]['id'],
    'preds_oneshot': dec_preds
}).sort_values('id')

In [None]:
import pandas as pd

In [None]:
output_oneshot.iloc[100:120].head(20)

In [None]:
output.iloc[100:120].head(20)

In [None]:
combined = output.join(output_oneshot, on='id',lsuffix='_l', rsuffix='_r')

In [None]:
combined.head()

In [41]:
type(AWD_LSTM)

fastcore.foundation.PrePostInitMeta