# NLP with Disaster Tweets #

### I- RidgeClassifier

We will start by assuming that the words present in each tweet are a strong indicator of disasters. Therefore, in the code below we will use CountVectorizer to count the number of times each token appears in each tweet, and then use a linear model (scikit-learn's RidgeClassifier) to classify whether a tweet describes a real disaster or not. In other words, we assume that the two classes are linearly separable.

In [2]:
import numpy as np
import pandas as pd
from sklearn import feature_extraction, linear_model, model_selection, metrics

In [4]:
# load train.csv and test.csv from the working directory into DataFrames
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

In [5]:
# displaying the first 5 rows of train_df (DataFrame.head() shows 5 rows by default)
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
# Bag-of-words features: CountVectorizer tokenizes every tweet and builds a sparse
# document-term matrix with one row per tweet and one column per distinct token;
# each cell holds the number of times that token occurs in that tweet.

countvectorizer = feature_extraction.text.CountVectorizer()
train_texts = train_df["text"]
train_countvectorizer_matrix = countvectorizer.fit_transform(train_texts)
print(f"Resulting countvectorizer matrix shape: {train_countvectorizer_matrix.shape}")


Resulting countvectorizer matrix shape: (7613, 21637)


In [7]:
# baseline model: scikit-learn's RidgeClassifier (a linear classifier)
classifier = linear_model.RidgeClassifier()

# 5-fold cross-validation on the bag-of-words matrix, scored with F1;
# `score` holds one F1 value per fold and is displayed as the cell output
score = model_selection.cross_val_score(
    estimator=classifier,
    X=train_countvectorizer_matrix,
    y=train_df["target"],
    scoring="f1",
    cv=5,
)
score


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is

array([0.6025641 , 0.50168919, 0.56985004, 0.50781969, 0.67275495])

The F1 scores obtained with the Ridge classifier average around 0.57 across the five folds.
To improve this score, we will next take into account the context of each tweet instead of only counting the number of times certain tokens appear.

In order to do so, we will fine-tune a pretrained BERT model.

### II- Fine-tuning a pretrained BERT model

In [46]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
import re

In [47]:
# hold out 30% of the labelled tweets, then cut that hold-out in half:
# 70% train / 15% validation / 15% test (fixed seed so the split is reproducible)
X_train, X_temp, y_train, y_temp = model_selection.train_test_split(
    train_df["text"],
    train_df["target"],
    test_size=0.3,
    random_state=42,
)
X_val, X_test, y_val, y_test = model_selection.train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

In [53]:
import preprocessor as p
from gensim.parsing.preprocessing import remove_stopwords
def preprocess_tweet(text):
    """Normalize a raw tweet before it is handed to the BERT tokenizer.

    The text is lowercased, stop words are removed, and every character that
    is neither a word character nor whitespace is stripped.

    Args:
        text: raw tweet text.

    Returns:
        The cleaned, lowercased text.
    """
    # Lowercase BEFORE removing stop words: gensim compares tokens
    # case-sensitively against a lowercase stop-word list, so a capitalised
    # stop word such as "The" would otherwise be kept.
    text = text.lower()
    text = remove_stopwords(text)
    # Raw string so that \w and \s reach the regex engine untouched instead of
    # being parsed (with a warning) as invalid string escapes.
    text = re.sub(r'[^\w\s]', '', text)
    return text

In [54]:
# text data preprocessing
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def preprocessing(X):
    """Clean and tokenize a collection of tweets for BERT.

    Each text is passed through preprocess_tweet and then encoded with the
    module-level tokenizer. All rows are padded or truncated to one common
    length.

    Args:
        X: iterable of raw tweet strings (e.g. a pandas Series).

    Returns:
        A tuple (input_ids, attention_masks) of tensors with one row per text.

    Raises:
        ValueError: if X contains no texts.
    """
    raw_texts = list(X)
    if not raw_texts:
        raise ValueError("preprocessing() requires at least one text")

    # The padded length is the character count of the longest raw text, which
    # is a generous upper bound on its token count. It is capped at the
    # tokenizer's model_max_length so that an unusually long text cannot
    # produce sequences longer than the tokenizer reports as supported.
    longest_text = max(len(text) for text in raw_texts)
    max_length = min(longest_text, tokenizer.model_max_length)

    cleaned_texts = [preprocess_tweet(text) for text in raw_texts]

    # One batched tokenizer call returns already-stacked tensors, replacing
    # the per-text encode_plus calls followed by torch.cat.
    encoded = tokenizer(
        cleaned_texts,
        add_special_tokens=True,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )
    return encoded["input_ids"], encoded["attention_mask"]


train_input_ids, train_attention_masks = preprocessing(X_train)
validation_input_ids, validation_attention_masks = preprocessing(X_val)


In [55]:
# convert the target Series of the train and validation splits to tensors
train_labels, validation_labels = (
    torch.tensor(targets.values) for targets in (y_train, y_val)
)

In [56]:
# create a data loader
def create_data_loader(input_ids, attention_masks, labels, batch_size=32, shuffle=True):
    """Bundle encoded inputs and labels into a batched DataLoader.

    Args:
        input_ids: tensor of token ids, one row per example.
        attention_masks: tensor of attention masks aligned with input_ids.
        labels: tensor of targets, one entry per example.
        batch_size: number of examples per batch (default 32).
        shuffle: whether the examples are reshuffled on every pass
            (default True). Evaluation loaders may pass False, since the
            order of examples does not need to be randomized there.

    Returns:
        A DataLoader yielding (input_ids, attention_mask, labels) batches.
    """
    dataset = TensorDataset(input_ids, attention_masks, labels)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

# batched loaders over the encoded training and validation splits
train_loader = create_data_loader(
    input_ids=train_input_ids,
    attention_masks=train_attention_masks,
    labels=train_labels,
)
validation_loader = create_data_loader(
    input_ids=validation_input_ids,
    attention_masks=validation_attention_masks,
    labels=validation_labels,
)

In [57]:
# define model: pretrained bert-base-uncased with a two-label classification head
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# replace the default head with a small MLP: hidden_size -> 256 -> 2
classification_head = nn.Sequential(
    nn.Linear(model.config.hidden_size, 256),
    nn.ReLU(),
    nn.Dropout(0.1),
    nn.Linear(256, 2),
)
model.classifier = classification_head

# freeze parameters of the BERT encoder; the new head stays trainable
for encoder_param in model.bert.parameters():
    encoder_param.requires_grad_(False)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [58]:
# loss: cross-entropy over the two class logits
# optimizer: plain SGD over the classification head's parameters only
# NOTE(review): lr=1e-5 looks very small for SGD on a freshly initialised head — confirm it is intended
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(params=model.classifier.parameters(), lr=1e-5)

In [59]:
# fine tune the model
num_epochs = 6
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

train_losses, valid_losses = [], []
# np.inf (lowercase): the np.Inf alias was removed in NumPy 2.0
valid_loss_min = np.inf


def _batch_loss(batch):
    """Run one forward pass on a batch and return (loss, number of examples)."""
    input_ids, attention_mask, labels = (t.to(device) for t in batch)
    # Labels are deliberately not passed to the model: doing so makes it
    # compute its own loss internally, which would be discarded in favour of
    # `criterion` below.
    logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
    return criterion(logits, labels), labels.size(0)


for epoch in range(num_epochs):
    train_loss, valid_loss = 0.0, 0.0

    # training pass
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        loss, n_examples = _batch_loss(batch)
        loss.backward()
        optimizer.step()
        # the batch loss is a mean; weight by batch size so the epoch
        # average below is per example
        train_loss += loss.item() * n_examples

    # validation pass: no gradients are needed
    model.eval()
    with torch.no_grad():
        for batch in validation_loader:
            loss, n_examples = _batch_loss(batch)
            valid_loss += loss.item() * n_examples

    # per-example average loss over each split
    train_loss /= len(train_loader.sampler)
    valid_loss /= len(validation_loader.sampler)
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    print('epoch: {} \ttraining Loss: {:.6f} \tvalidation Loss: {:.6f}'.format(epoch+1, train_loss, valid_loss))

    # checkpoint whenever the validation loss reaches a new minimum
    if valid_loss <= valid_loss_min:
        print('validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min,
        valid_loss))
        torch.save(model.state_dict(), 'model.pt')
        valid_loss_min = valid_loss

KeyboardInterrupt: 

In [None]:
# testing the model and evaluating it using F1 score
test_input_ids, test_attention_masks = preprocessing(X_test)
test_labels = torch.tensor(y_test.values)
test_loader = create_data_loader(test_input_ids, test_attention_masks, test_labels)

true_labels, predicted_labels = [], []

# restore the weights stored in model.pt before predicting
saved_state = torch.load('model.pt', map_location=device)
model.load_state_dict(saved_state)

model.eval()
for batch in test_loader:
    input_ids, attention_mask, labels = (t.to(device) for t in batch)
    with torch.no_grad():
        output = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    # predicted class = index of the largest logit in each row
    predicted = output.logits.argmax(dim=1)
    true_labels.extend(labels.cpu().numpy())
    predicted_labels.extend(predicted.cpu().numpy())



In [None]:
# F1 score of the predictions collected on the held-out test split
f1 = metrics.f1_score(y_true=true_labels, y_pred=predicted_labels)
print(f"F1 Score: {f1}")