# Disaster tweet prediction by fine-tuning a pre-trained DistilBERT model from Hugging Face.

In [1]:
!pip install wandb # for callbacks

Installing collected packages: smmap, gitdb, yaspin, shortuuid, sentry-sdk, pathtools, GitPython, docker-pycreds, wandb
Successfully installed GitPython-3.1.26 docker-pycreds-0.4.0 gitdb-4.0.9 pathtools-0.1.2 sentry-sdk-1.5.4 shortuuid-1.0.8 smmap-5.0.0 wandb-0.12.10 yaspin-2.1.0


In [2]:
!pip install transformers -q # hugging face NLP library

[K     |████████████████████████████████| 3.5 MB 9.7 MB/s 
[K     |████████████████████████████████| 596 kB 63.6 MB/s 
[K     |████████████████████████████████| 6.8 MB 55.3 MB/s 
[K     |████████████████████████████████| 67 kB 6.0 MB/s 
[K     |████████████████████████████████| 895 kB 56.1 MB/s 
[?25h

In [3]:
!pip install unidecode -q

[?25l[K     |█▍                              | 10 kB 31.3 MB/s eta 0:00:01[K     |██▉                             | 20 kB 35.0 MB/s eta 0:00:01[K     |████▏                           | 30 kB 39.2 MB/s eta 0:00:01[K     |█████▋                          | 40 kB 41.8 MB/s eta 0:00:01[K     |███████                         | 51 kB 31.5 MB/s eta 0:00:01[K     |████████▍                       | 61 kB 34.4 MB/s eta 0:00:01[K     |█████████▊                      | 71 kB 24.4 MB/s eta 0:00:01[K     |███████████▏                    | 81 kB 25.4 MB/s eta 0:00:01[K     |████████████▌                   | 92 kB 27.3 MB/s eta 0:00:01[K     |██████████████                  | 102 kB 26.3 MB/s eta 0:00:01[K     |███████████████▎                | 112 kB 26.3 MB/s eta 0:00:01[K     |████████████████▊               | 122 kB 26.3 MB/s eta 0:00:01[K     |██████████████████              | 133 kB 26.3 MB/s eta 0:00:01[K     |███████████████████▌            | 143 kB 26.3 MB/s eta 0:

In [4]:
pip install datasets -q # hugging face dataset format

[K     |████████████████████████████████| 311 kB 17.6 MB/s 
[K     |████████████████████████████████| 133 kB 53.6 MB/s 
[K     |████████████████████████████████| 243 kB 51.4 MB/s 
[K     |████████████████████████████████| 1.1 MB 54.8 MB/s 
[K     |████████████████████████████████| 94 kB 2.4 MB/s 
[K     |████████████████████████████████| 271 kB 51.8 MB/s 
[K     |████████████████████████████████| 144 kB 59.1 MB/s 
[?25h

In [5]:
import datasets 

In [104]:
import tensorflow as tf
import pandas as pd
import numpy as np

import re
import unidecode
import html

import transformers
from transformers import TFDistilBertForSequenceClassification,  DistilBertTokenizer

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [7]:
import wandb
from wandb.keras import WandbCallback

**Data Load and transform**

In [8]:
# Load the labelled training tweets; the `target` column is read as int8.
# NOTE(review): absolute Drive path — mounting the drive is not shown in this notebook; confirm it is mounted before running.
tweets_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Cours Deep Learning/Projet NLP/train.csv', dtype={'target':np.int8})

In [37]:
def text_prepare(tweets):
  """Normalise raw tweet text before tokenisation.

  Args:
    tweets: pandas Series of strings, one tweet per entry.

  Returns:
    A pandas Series with the same index in which every tweet has been
    HTML-unescaped, transliterated with unidecode, had its t.co links
    replaced by the token "URL", had most punctuation replaced by spaces
    ("&" becomes "and"; "." and "#" are left in place) and had every run of
    whitespace collapsed to a single space.
  """

  def clean(tweet):
    tweet = html.unescape(tweet)  # decode HTML entities such as &amp;
    tweet = unidecode.unidecode(tweet)  # transliterate accents and special characters
    # t.co short links exist in both http and https form. The dot is escaped so
    # only the literal host matches. Without the https case the link would be
    # shredded into "https t.co xxxx" by the punctuation pass below.
    tweet = re.sub(r"https?://t\.co/\w+", " URL", tweet)
    # "&", "." and "#" are intentionally absent from this class: the authors
    # report better results when those characters are kept.
    tweet = re.sub(r"[/\[_:'\"\-<>()!?{}@%*\]]", ' ', tweet)
    tweet = re.sub(r"&", ' and ', tweet).strip()
    # strip() only trims the two ends; the substitutions above also leave runs
    # of spaces inside the text, which are collapsed here.
    return re.sub(r'\s+', ' ', tweet)

  return tweets.apply(clean)


In [38]:
# Clean the training text; `tweets` is the Series used by the length check and the split below.
tweets = text_prepare(tweets_df.text)

In [39]:
# Longest cleaned tweet, measured in space-separated words (not tokenizer tokens).
tweet_length = tweets.apply(lambda tweet: len(tweet.split(' ')))
print(tweet_length.max())
# 35 words + 2 (presumably the two special tokens added by the tokenizer) = 37;
# rounded up to 50 in case the test file contains longer sequences.
length = 50

35


In [40]:
# Stratified hold-out split (no test_size given, so scikit-learn's default applies).
# The labels are passed first, so the label splits are returned before the text splits.
y = tweets_df.target.values
y_train, y_val, X_train, X_val = train_test_split(
    y,
    tweets.values,
    stratify=y,
    random_state=0,
)

In [78]:
#######################################################################################################################################################################################################
#######################################################################################################################################################################################################

## Prepare data for distilbert

In [41]:
# Tokenizer for the same "distilbert-base-uncased" checkpoint that is fine-tuned below.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
# Collator passed as collate_fn to to_tf_dataset(); built on the tokenizer and set to return TensorFlow tensors.
data_collator = transformers.DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

In [42]:
# Wrap each split in a Hugging Face Dataset with a text column ('sentence1')
# and a label column ('labels').
train_ds = datasets.Dataset.from_dict({'sentence1': X_train, 'labels': y_train})
val_ds = datasets.Dataset.from_dict({'sentence1': X_val, 'labels': y_val})

In [43]:
# Tokenize the training tweets in batches. Every sequence is truncated/padded to
# `length` tokens (set in the tweet-length cell) instead of repeating the literal 50.
train_ds = train_ds.map(
    lambda batch: tokenizer(batch['sentence1'], truncation=True, padding='max_length', max_length=length),
    batched=True,
)

  0%|          | 0/6 [00:00<?, ?ba/s]

In [44]:
# Tokenize the validation tweets in batches. Every sequence is truncated/padded to
# `length` tokens (set in the tweet-length cell) instead of repeating the literal 50.
val_ds = val_ds.map(
    lambda batch: tokenizer(batch['sentence1'], truncation=True, padding='max_length', max_length=length),
    batched=True,
)

  0%|          | 0/2 [00:00<?, ?ba/s]

In [47]:
# Training input pipeline: the token ids and attention mask are the model
# inputs, the "labels" column is the target; rows are shuffled and served in
# batches of 32 through the padding collator.
train_ds1 = train_ds.to_tf_dataset( 
  columns=['input_ids', 'attention_mask'],
  label_cols=["labels"],
  shuffle=True,
  batch_size=32,
  collate_fn=data_collator,
)

In [48]:
# Validation input pipeline: same columns, label column and batch size as the
# training pipeline, but without shuffling. Row order does not affect the
# validation metrics, and a fixed order makes batch inspection reproducible.
val_ds1 = val_ds.to_tf_dataset(
  columns=['input_ids', 'attention_mask'],
  label_cols=["labels"],
  shuffle=False,
  batch_size=32,
  collate_fn=data_collator,
)

In [73]:
# Sanity check: show element 0 of one validation batch — the dict of model inputs displayed below.
next(iter(val_ds1))[0]

{'attention_mask': <tf.Tensor: shape=(32, 50), dtype=int64, numpy=
 array([[1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        ...,
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0]])>,
 'input_ids': <tf.Tensor: shape=(32, 50), dtype=int64, numpy=
 array([[  101,  1015, 14766, ...,     0,     0,     0],
        [  101,  2092,  2008, ...,     0,     0,     0],
        [  101,  3857,  2115, ...,     0,     0,     0],
        ...,
        [  101, 13710, 20868, ...,     0,     0,     0],
        [  101, 18178,  4877, ...,     0,     0,     0],
        [  101,  2009,  1055, ...,     0,     0,     0]])>}

## Model fit and training

In [95]:
# Pre-trained DistilBERT with a sequence-classification head for the two target
# classes; attention maps and hidden states are not requested in the outputs.
model = TFDistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_transform', 'vocab_projector', 'vocab_layer_norm', 'activation_13']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier', 'pre_classifier', 'dropout_98']
You should probably TRAIN this model on a down-stream task to be able to use i

In [96]:
# Adam optimizer; the loss is sparse categorical cross-entropy on raw logits
# (from_logits=True) and sparse categorical accuracy is tracked as the metric.
# NOTE(review): a 1e-6 learning rate is very small for fine-tuning — confirm it
# is the value behind the results recorded further down.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-6),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=tf.metrics.SparseCategoricalAccuracy(),
)

In [97]:
# Start a Weights & Biases run before training.
# NOTE(review): the project and entity names are hard-coded — adjust them for your own account.
wandb.init(project="disaster_distilbert", entity="padda")


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▂▂▃▄▄▅▅▆▇▇█▁▁▁
loss,█▆▄▃▃▃▂▂▂▂▂▁▁▁▁
sparse_categorical_accuracy,▁▆▆▇▇▇▇▇▇▇█████
val_loss,█▄▂▂▁▁▁▁▁▁▁▁▁▁▁
val_sparse_categorical_accuracy,▁▅▆▇▇▇█████████

0,1
best_epoch,8.0
best_val_loss,0.38335
epoch,0.0
loss,0.30494
sparse_categorical_accuracy,0.88553
val_loss,0.38999
val_sparse_categorical_accuracy,0.84057


In [100]:
# Train for one epoch, evaluating on the validation pipeline and reporting through the wandb Keras callback.
# NOTE(review): the per-epoch notes below suggest this cell was re-run by hand for each additional epoch — confirm.
model.fit(train_ds1, validation_data = val_ds1, epochs=1, callbacks=[WandbCallback()])









<keras.callbacks.History at 0x7f67dac8ae90>

model 1:   
  1: loss: 0.4470 - sparse_categorical_accuracy: 0.8018 - val_loss: 0.3892 - val_sparse_categorical_accuracy: 0.8369

  2: loss: 0.3098 - sparse_categorical_accuracy: 0.8750 - val_loss: 0.4617 - val_sparse_categorical_accuracy: 0.8358

model 2 (with !? and # kept):   
  loss: 0.4366 - sparse_categorical_accuracy: 0.8083 - val_loss: 0.3924 - val_sparse_categorical_accuracy: 0.8379  
  loss: 0.3523 - sparse_categorical_accuracy: 0.8545 - val_loss: 0.3837 - val_sparse_categorical_accuracy: 0.8432


model 3 (with all special characters kept):   
     loss: 0.4401 - sparse_categorical_accuracy: 0.8053 - val_loss: 0.3831 - val_sparse_categorical_accuracy: 0.8374


In [101]:
# Save the fine-tuned model with save_pretrained() so it can be reloaded later with from_pretrained().
model.save_pretrained('/content/drive/MyDrive/Colab Notebooks/Cours Deep Learning/Projet NLP/distilbert_0')

## Load model and try it on test data

In [58]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [103]:
# Reload the fine-tuned model from the directory written by save_pretrained() above.
model_load = TFDistilBertForSequenceClassification.from_pretrained('/content/drive/MyDrive/Colab Notebooks/Cours Deep Learning/Projet NLP/distilbert_0')

Some layers from the model checkpoint at /content/drive/MyDrive/Colab Notebooks/Cours Deep Learning/Projet NLP/distilbert_0 were not used when initializing TFDistilBertForSequenceClassification: ['dropout_98']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/Colab Notebooks/Cours Deep Learning/Projet NLP/distilbert_0 and are newly initialized: ['dropout_118']
You should probably TRAIN this model on a down-strea

In [105]:
def cm_metrics(true_label, pred_label, labels_):
  """Print accuracy and per-class recall / precision / F1, then plot the confusion matrix.

  Works for any number of classes (the accuracy sums the whole diagonal).

  Args:
    true_label: ground-truth class labels.
    pred_label: predicted class labels, in the same order as true_label.
    labels_: list of class labels; fixes the row/column order of the confusion
      matrix and is used as the display labels of the plot.

  Returns:
    None. The metrics are printed and the confusion matrix is shown with
    matplotlib. The printed dictionaries are keyed by class position in
    labels_ (0, 1, ...), not by the label values themselves.
  """
  cm = confusion_matrix(true_label, pred_label, labels=labels_)
  cmDisp = ConfusionMatrixDisplay(cm, display_labels=labels_)
  n_classes = len(cm)
  # Row i holds the samples whose true class is i, column i those predicted as i.
  recall = {i: cm[i][i] / cm[i].sum() for i in range(n_classes)}
  precision = {i: cm[i][i] / cm[:, i].sum() for i in range(n_classes)}
  # F1 is the harmonic mean of recall and precision.
  F1_score = {i: 2 / (1 / recall[i] + 1 / precision[i]) for i in range(n_classes)}
  # Correct predictions are the whole diagonal. The previous hard-coded
  # cm[0][0] + cm[1][1] + cm[2][2] raised IndexError for two classes and
  # ignored classes beyond the third.
  accuracy = np.trace(cm) / cm.sum()
  print('accuracy: ', accuracy)
  print('Recall: \n',recall,'\nPrecision: \n', precision,'\nF1-score: \n',F1_score)
  cmDisp.plot()
  plt.show()
  return None

In [106]:
# Load the test tweets.
# NOTE(review): the dtype mapping for 'target' mirrors the training load — confirm test.csv actually contains that column.
tweets_test_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Cours Deep Learning/Projet NLP/test.csv', dtype={'target':np.int8})

In [107]:
# Apply the same cleaning as for the training text.
# Note: this rebinds `tweets`, which until here held the cleaned training tweets.
tweets = text_prepare(tweets_test_df.text)

In [121]:
# Text-only Dataset for the test tweets (no label column is added here).
test_ds = datasets.Dataset.from_dict({'sentence1' : tweets.values})


In [122]:
# Tokenize the test tweets the same way as the train/validation data: batched
# map (faster than one example at a time) with every sequence truncated/padded
# to `length` tokens (set in the tweet-length cell).
test_ds = test_ds.map(
    lambda batch: tokenizer(batch['sentence1'], truncation=True, padding='max_length', max_length=length),
    batched=True,
)

0ex [00:00, ?ex/s]

In [123]:
# Serve the whole test set as one batch, unshuffled so predictions stay aligned
# with the tweets. The batch size is taken from the Dataset itself (evaluated
# before `test_ds` is rebound) rather than the hard-coded 3263, so a test file
# of a different size is not silently truncated.
test_ds = test_ds.to_tf_dataset(
  columns=['input_ids', 'attention_mask'],
  shuffle=False,
  batch_size=len(test_ds),
  collate_fn=data_collator,
)

In [124]:
# Predict with the model reloaded from disk (`model_load`), so this section does
# not depend on the training model still being in memory. The test pipeline was
# built with a single full-size batch, hence next(iter(...)).
y_test_pred = model_load.predict(next(iter(test_ds)))

In [130]:
# Inspect column 0 of the first element of the prediction output (presumably the class-0 logits — see the values below).
y_test_pred[0][:,0]

array([-0.43074745, -1.2097101 , -1.5050466 , ..., -1.7923384 ,
       -1.7224505 , -1.0387433 ], dtype=float32)

In [135]:
# Put each cleaned test tweet next to its two raw output scores (columns 0 and 1).
logits = y_test_pred[0]
per_class_scores = [pd.Series(logits[:, k]) for k in (0, 1)]
results = pd.concat([tweets, *per_class_scores], axis = 1)

In [137]:
# Show up to the first 30 rows: cleaned text alongside the two score columns.
results.head(30)

Unnamed: 0,text,0,1
0,Just happened a terrible car crash,-0.430747,0.536187
1,"Heard about #earthquake is different cities, s...",-1.20971,1.309152
2,"there is a forest fire at spot pond, geese are...",-1.505047,1.500758
3,Apocalypse lighting. #Spokane #wildfires,-1.864865,1.832608
4,Typhoon Soudelor kills 28 in China and Taiwan,-1.65357,1.654951
5,We re shaking...It s an earthquake,-1.242789,1.319532
6,They d probably still show more life than Arse...,1.309688,-1.197088
7,Hey How are you,1.388582,-1.255024
8,What a nice hat,1.227165,-1.090086
9,Fuck off,1.227662,-1.234658
