<a href="https://colab.research.google.com/github/bandpooja/Kaggle-Competitions/blob/master/NLP_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLP with disaster tweets

### unzipping the data

In [1]:
!unzip /content/nlp-getting-started.zip

Archive:  /content/nlp-getting-started.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


### importing libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Read train data file

In [3]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


## Install tensorflow nlp modules

In [4]:
!pip install -q -U tensorflow-text
!pip install -q tf-models-official

[K     |████████████████████████████████| 4.4 MB 5.2 MB/s 
[K     |████████████████████████████████| 1.8 MB 5.2 MB/s 
[K     |████████████████████████████████| 1.1 MB 40.3 MB/s 
[K     |████████████████████████████████| 596 kB 44.0 MB/s 
[K     |████████████████████████████████| 90 kB 9.3 MB/s 
[K     |████████████████████████████████| 37.1 MB 50 kB/s 
[K     |████████████████████████████████| 99 kB 9.5 MB/s 
[K     |████████████████████████████████| 352 kB 44.2 MB/s 
[K     |████████████████████████████████| 213 kB 47.7 MB/s 
[K     |████████████████████████████████| 43 kB 2.0 MB/s 
[K     |████████████████████████████████| 1.2 MB 33.1 MB/s 
[?25h  Building wheel for py-cpuinfo (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


### Importing tensorflow and nlp libraries

In [5]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization

### Split data for training and validation

In [6]:
train = df.sample(7300)
val = df.sample(313)

In [7]:
train.columns

Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')

In [8]:
DATA_COLUMN = 'text'
LABEL_COLUMN = 'target'
# label_list is the list of labels, i.e. True, False or 0, 1 or 'dog', 'cat'
label_list = [0, 1]

### selecting a pre-train bert model

In [9]:
bert_model_name = 'small_bert/bert_en_uncased_L-4_H-512_A-8' 

map_name_to_handle = {
    
    'small_bert/bert_en_uncased_L-4_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1'
}

map_model_to_preprocess = {
    
    'small_bert/bert_en_uncased_L-4_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    }

tfhub_handle_encoder = map_name_to_handle[bert_model_name]
tfhub_handle_preprocess = map_model_to_preprocess[bert_model_name]

print(f'BERT model selected           : {tfhub_handle_encoder}')
print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')


BERT model selected           : https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1
Preprocess model auto-selected: https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3


### Transfering the bert preprocess to Keras

In [10]:
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)

In [11]:
dummy = [train['text'].values.tolist()[50]]

In [12]:
text_preprocessed = bert_preprocess_model(dummy)

In [13]:
print(f'Keys       : {list(text_preprocessed.keys())}')
print(f'Shape      : {text_preprocessed["input_word_ids"].shape}')
print(f'Word Ids   : {text_preprocessed["input_word_ids"][0, :12]}')
print(f'Input Mask : {text_preprocessed["input_mask"][0, :12]}')
print(f'Type Ids   : {text_preprocessed["input_type_ids"][0, :12]}')

Keys       : ['input_mask', 'input_word_ids', 'input_type_ids']
Shape      : (1, 128)
Word Ids   : [  101  1037  7596  2000  1996  4231  1029  5777  2007 20675  1029  1037]
Input Mask : [1 1 1 1 1 1 1 1 1 1 1 1]
Type Ids   : [0 0 0 0 0 0 0 0 0 0 0 0]


In [14]:
def build_classifier_model():
  text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
  preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
  encoder_inputs = preprocessing_layer(text_input)
  encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
  outputs = encoder(encoder_inputs)
  net = outputs['pooled_output']
  net = tf.keras.layers.Dropout(0.1)(net)
  net = tf.keras.layers.Dense(1, activation=None, name='classifier')(net)
  return tf.keras.Model(text_input, net)

In [15]:
classifier_model = build_classifier_model()
bert_raw_result = classifier_model(tf.constant(['I am bad']))
print(tf.sigmoid(bert_raw_result))

tf.Tensor([[0.3439777]], shape=(1, 1), dtype=float32)


In [16]:
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
metrics = tf.metrics.BinaryAccuracy()


In [17]:
epochs = 20
steps_per_epoch = int(len(train)/epochs)
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1*num_train_steps)

init_lr = 3e-5
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')


In [18]:
classifier_model.compile(optimizer=optimizer,
                         loss=loss,
                         metrics=metrics)


In [19]:
print(f'Training model with {tfhub_handle_encoder}')
history = classifier_model.fit(x=train['text'].values.tolist(), y=train['target'].values.tolist(),
                              epochs=epochs)


Training model with https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [20]:
loss, accuracy = classifier_model.evaluate(val['text'].values.tolist(), val['target'].values.tolist())

print(f'Loss: {loss}')
print(f'Accuracy: {accuracy}')


Loss: 0.08151674270629883
Accuracy: 0.9840255379676819


In [30]:
test_df = pd.read_csv('test.csv')
ids = []
targets = []
import tqdm
for i,row in tqdm.tqdm(test_df.iterrows()):
  bert_raw_result = classifier_model(tf.constant([row['text']]))
  p = tf.sigmoid(bert_raw_result).numpy()[0]
  label = 1 if p > 0.5 else 0
  ids.append(row['id'])
  targets.append(label)


submission = pd.DataFrame()
submission['id'] = ids
submission['target'] = targets
submission.to_csv('submission.csv', index=False)


3263it [01:00, 53.51it/s]
