### Import Data

In [1]:
import pandas as pd

In [21]:
path = '../disaster_npl_prediction/'

In [22]:
train = pd.read_csv(path + 'train.csv')
test = pd.read_csv(path + 'test.csv')

In [23]:
train = train[['text', 'target']]

In [24]:
train['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [25]:
train.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [8]:
train['text']

0       Our Deeds are the Reason of this #earthquake M...
1                  Forest fire near La Ronge Sask. Canada
2       All residents asked to 'shelter in place' are ...
3       13,000 people receive #wildfires evacuation or...
4       Just got sent this photo from Ruby #Alaska as ...
5       #RockyFire Update => California Hwy. 20 closed...
6       #flood #disaster Heavy rain causes flash flood...
7       I'm on top of the hill and I can see a fire in...
8       There's an emergency evacuation happening now ...
9       I'm afraid that the tornado is coming to our a...
10            Three people died from the heat wave so far
11      Haha South Tampa is getting flooded hah- WAIT ...
12      #raining #flooding #Florida #TampaBay #Tampa 1...
13                #Flood in Bago Myanmar #We arrived Bago
14      Damage to school bus on 80 in multi car crash ...
15                                         What's up man?
16                                          I love fruits
17            

### Remove URL, RT, mention(@)

In [47]:
train.text = train.text.str.replace(r'http(\S)+', r'')
train.text = train.text.str.replace(r'http ...', r'')

In [48]:
train.text[train.text.str.contains(r'http')]

121    Aftershock: Protect Yourself and Profit in the...
Name: text, dtype: object

In [49]:
train.text[121]

'Aftershock: Protect Yourself and Profit in the Next Global Financial Meltdown by David Wiedemer http '

In [43]:
train.text = train.text.str.replace(r'(RT|rt)[ ]*@[ ]*[\S]+',r'')

In [45]:
train.text[train.text.str.contains(r'RT[ ]?@')]

Series([], Name: text, dtype: object)

In [50]:
train.text = train.text.str.replace(r'@[\S]+', r'')

In [52]:
train.text[train.text.str.contains(r'@[\S]')]

Series([], Name: text, dtype: object)

### Remove extra space

In [55]:
train.text = train.text.str.replace(r'[ ]{2, }',r' ')

### &, < and >

In [56]:
train.text = train.text.str.replace(r'&amp;?', r'and')

In [57]:
train.text = train.text.str.replace(r'&lt;',r'<')
train.text = train.text.str.replace(r'&gt;',r'>')

### Insert space between words and punctuation marks

In [58]:
train.text = train.text.str.replace(r'([\w\d]+)([^\w\d ]+)', r'\1 \2')

In [59]:
train.text = train.text.str.replace(r'([^\w\d ]+)([\w\d]+)', r'\1 \2')

### Lowercased and strip

In [60]:
train.text = train.text.str.lower()
train.trext = train.text.str.strip()

  


In [61]:
train.head()

Unnamed: 0,text,target
0,our deeds are the reason of this # earthquake ...,1
1,forest fire near la ronge sask . canada,1
2,all residents asked to ' shelter in place ' ar...,1
3,"13 , 000 people receive # wildfires evacuation...",1
4,just got sent this photo from ruby # alaska as...,1


In [67]:
train['ProcessedText_length'] = [len(texts.split(' ')) for texts in train.text]


In [70]:
train.head()

Unnamed: 0,text,target,ProcessedText_length
0,our deeds are the reason of this # earthquake ...,1,14
1,forest fire near la ronge sask . canada,1,8
2,all residents asked to ' shelter in place ' ar...,1,25
3,"13 , 000 people receive # wildfires evacuation...",1,12
4,just got sent this photo from ruby # alaska as...,1,19


In [72]:
train['ProcessedText_length'].value_counts()

14    398
20    392
22    387
23    385
17    365
21    341
18    341
15    334
11    328
13    326
19    326
16    323
12    323
24    312
25    310
10    261
9     252
26    242
27    197
8     188
28    173
7     163
29    143
6     143
5     122
30    114
31     92
4      82
32     53
3      52
33     37
2      23
34     21
35     20
36     12
42      6
37      5
1       5
43      5
40      3
38      3
41      2
44      1
39      1
59      1
Name: ProcessedText_length, dtype: int64

### Drop texts with length <=3 and drop duplicates

In [75]:
train = train[train['ProcessedText_length']>3]

In [76]:
train = train.drop_duplicates(subset=['text'])

In [81]:
train = train.drop(columns=['ProcessedText_length'])

In [83]:
train.head()

Unnamed: 0,text,target
0,our deeds are the reason of this # earthquake ...,1
1,forest fire near la ronge sask . canada,1
2,all residents asked to ' shelter in place ' ar...,1
3,"13 , 000 people receive # wildfires evacuation...",1
4,just got sent this photo from ruby # alaska as...,1


In [82]:
train.shape

(6871, 2)

In [84]:
train.target.value_counts()

0    4037
1    2834
Name: target, dtype: int64

### BERT preprocess

In [85]:
train['ProcessedText_BERT'] = '[CLS] '+train.text


In [87]:
!pip install pytorch-pretrained-bert

Collecting pytorch-pretrained-bert
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K    100% |████████████████████████████████| 133kB 870kB/s ta 0:00:01
Collecting boto3 (from pytorch-pretrained-bert)
[?25l  Downloading https://files.pythonhosted.org/packages/3f/f9/9798c5221d45b637ae1f42f0e0467e3bdfc3af46769b6bc7a29d93b2ecf6/boto3-1.10.46-py2.py3-none-any.whl (128kB)
[K    100% |████████████████████████████████| 133kB 1.8MB/s ta 0:00:011
[?25hCollecting regex (from pytorch-pretrained-bert)
[?25l  Downloading https://files.pythonhosted.org/packages/fc/a6/2c153ced229ba51f941d15554a66293c9f79e2d5b9a18c1d1b0c52a31a1f/regex-2019.12.20.tar.gz (679kB)
[K    100% |████████████████████████████████| 686kB 15.3MB/s ta 0:00:01
Collecting botocore<1.14.0,>=1.13.46 (from boto3->pytorch-pretrained-bert)
[?25l  Downloading https://files.pythonhosted.org/packages

In [88]:
from pytorch_pretrained_bert import BertTokenizer

In [89]:
train.head()

Unnamed: 0,text,target,ProcessedText_BERT
0,our deeds are the reason of this # earthquake ...,1,[CLS] our deeds are the reason of this # earth...
1,forest fire near la ronge sask . canada,1,[CLS] forest fire near la ronge sask . canada
2,all residents asked to ' shelter in place ' ar...,1,[CLS] all residents asked to ' shelter in plac...
3,"13 , 000 people receive # wildfires evacuation...",1,"[CLS] 13 , 000 people receive # wildfires evac..."
4,just got sent this photo from ruby # alaska as...,1,[CLS] just got sent this photo from ruby # ala...


In [90]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train['ProcessedText_BERTbase_length'] = [len(tokenizer.tokenize(sent)) for sent in train.ProcessedText_BERT]

tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
train['ProcessedText_BERTlarge_length'] = [len(tokenizer.tokenize(sent)) for sent in train.ProcessedText_BERT]

100%|██████████| 231508/231508 [00:00<00:00, 2289891.92B/s]
100%|██████████| 231508/231508 [00:00<00:00, 2505120.92B/s]


### Int label for later use in softmax and cross entropy loss

In [91]:
label_dict = dict()
for i, l in enumerate(list(train.text.value_counts().keys())):
    label_dict.update({l: i})

train['InformationType_label'] = [label_dict[label] for label in train.text]

### Save data

In [92]:
train.to_csv('preprocessed_data.csv')