In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv
/kaggle/input/nlp-with-disaster-tweets-cleaning-data/test_data_cleaning.csv
/kaggle/input/nlp-with-disaster-tweets-cleaning-data/train_data_cleaning.csv
/kaggle/input/nlp-with-disaster-tweets-cleaning-data/test_data_cleaning2.csv
/kaggle/input/nlp-with-disaster-tweets-cleaning-data/train_data_cleaning2.csv
/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


# NLP with Disaster Tweets #

## I- RigidClassifier

We will start by assuming that words present in each tweet are a great indicator of disasters. Therefore, in the code below we will use Count Vectorizer to count the number of times each token appears in each tweet, and then we will use a linear model (scikit-learn Rigid Classifier) to classify whether there is a real disaster or not. In fact, we will suppose that there is a linear separation between the two classes.

In [2]:
import numpy as np
import pandas as pd
from sklearn import feature_extraction, linear_model, model_selection, metrics



In [3]:
# reading train and test files
train_df = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test_df = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

In [4]:
# displaying the 5 first columns of train_df
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
# using CountVectorizer: tokenize all the texts and create a sparse matrix where each row represents a document,
# and each column represents a unique token. The cell values indicate how many times each token appears in each document.

countvectorizer = feature_extraction.text.CountVectorizer()
train_countvectorizer_matrix = countvectorizer.fit_transform(train_df["text"])
print(f"Resulting countvectorizer matrix shape: {train_countvectorizer_matrix.shape}")

Resulting countvectorizer matrix shape: (7613, 21637)


In [6]:
# using scikit learn RidgeClassifier as a model
classifier = linear_model.RidgeClassifier()

# apply cross validation on the model and display the score
score = model_selection.cross_val_score(estimator= classifier, X= train_countvectorizer_matrix, y=train_df["target"], cv = 5, scoring = "f1")
score

array([0.6025641 , 0.50168919, 0.56985004, 0.50781969, 0.67275495])

The score obtained with Ridge classifier is around 0.55.
To improve this score we will next take into account the context in each tweet instead of only counting the number of times certain tokens appear.

In order to do so, we will fine-tune Bert pretrained models.

## II- Fine-tuning Bert pretrained model

In [7]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
import re
from sklearn import feature_extraction, linear_model, model_selection, metrics

In [8]:
## fill NA values in keyword and location columns
train_df.keyword = train_df.keyword.fillna('')
train_df.location = train_df.location.fillna('')

## Combine location and keyword with the main text 
train_df['full_text'] = train_df.location + ' ' + train_df.keyword + ' ' + train_df.text

### Splitting data into train, validation and test

In [9]:
# split the data into train, test and validation sets
X_train, X_temp, y_train, y_temp = model_selection.train_test_split(train_df["full_text"], train_df["target"], test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = model_selection.train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [10]:
!pip install tweet-preprocessor

Collecting tweet-preprocessor
  Downloading tweet_preprocessor-0.6.0-py3-none-any.whl (27 kB)
Installing collected packages: tweet-preprocessor
Successfully installed tweet-preprocessor-0.6.0


In [11]:
!pip install gensim



### Data Preprocessing

In [12]:
import preprocessor as p
def preprocess_tweet(text):
    text = text.lower()
    text = re.sub('[^\w\s]','',text)
    re.sub(r'http\S+', '', text)
    return text

In [13]:
# text data preprocessing
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
def preprocessing(X):
    max_length = max(len(text) for text in X)
    input_ids = []
    attention_masks = []
    for text in X:
        text = preprocess_tweet(text)
        encoded_dict = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt"
        )
        input_ids.append(encoded_dict["input_ids"])
        attention_masks.append(encoded_dict["attention_mask"])
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    return input_ids, attention_masks

train_input_ids, train_attention_masks = preprocessing(X_train)
validation_input_ids, validation_attention_masks = preprocessing(X_val)

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [14]:
train_labels = torch.tensor(y_train.values)
validation_labels = torch.tensor(y_val.values)

### Data Loaders for training and validation

In [15]:
# create a data loader
def create_data_loader(input_ids, attention_masks, labels):
    dataset = TensorDataset(input_ids, attention_masks, labels)
    batch_size = 32
    train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    return train_loader

train_loader = create_data_loader(train_input_ids, train_attention_masks, train_labels)
validation_loader = create_data_loader(validation_input_ids, validation_attention_masks, validation_labels)

### Defining the model, optimizer, and loss function 

In [16]:
# define model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.classifier = nn.Sequential(
    nn.Linear(model.config.hidden_size, 256),
    nn.ReLU(),
    nn.Dropout(0.1),
    nn.Linear(256, 2),
)

# freeze parameters
for param in model.bert.parameters():
    param.requires_grad = False

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# defining loss function and optimizer
from transformers import get_linear_schedule_with_warmup
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.classifier.parameters(), lr=0.001)
total_steps = len(train_loader) * 30
# Create a learning rate scheduler
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

### Fine tuning the model

In [18]:
# fine tune the model
num_epochs = 30
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

train_losses, valid_losses = [], []
valid_loss_min = np.Inf

for epoch in range(num_epochs):
    train_loss, valid_loss = 0 , 0
    model.train()
    for batch in train_loader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {
            "input_ids": batch[0],
            "attention_mask": batch[1],
            "labels": batch[2]
        }
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = criterion(outputs.logits, inputs["labels"])
        loss.backward()
        optimizer.step()
        scheduler.step()
        train_loss += loss.item() * inputs["labels"].size(0)
    
    model.eval()
    for batch in validation_loader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {
            "input_ids": batch[0],
            "attention_mask": batch[1],
            "labels": batch[2]
        }
        with torch.no_grad():
              output = model(**inputs)
        loss = criterion(output.logits,inputs["labels"])
        valid_loss += loss.item() * inputs["labels"].size(0)

    train_loss /= len(train_loader.sampler)
    valid_loss /= len(validation_loader.sampler)
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    print('epoch: {} \ttraining Loss: {:.6f} \tvalidation Loss: {:.6f}'.format(epoch+1, train_loss, valid_loss))

    if valid_loss <= valid_loss_min:
        print('validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min,
        valid_loss))
        torch.save(model.state_dict(), 'model.pt')
        valid_loss_min = valid_loss

epoch: 1 	training Loss: 0.621891 	validation Loss: 0.651734
validation loss decreased (inf --> 0.651734).  Saving model ...
epoch: 2 	training Loss: 0.573217 	validation Loss: 0.547575
validation loss decreased (0.651734 --> 0.547575).  Saving model ...
epoch: 3 	training Loss: 0.550110 	validation Loss: 0.578565
epoch: 4 	training Loss: 0.549973 	validation Loss: 0.526134
validation loss decreased (0.547575 --> 0.526134).  Saving model ...
epoch: 5 	training Loss: 0.538517 	validation Loss: 0.525436
validation loss decreased (0.526134 --> 0.525436).  Saving model ...
epoch: 6 	training Loss: 0.530203 	validation Loss: 0.494742
validation loss decreased (0.525436 --> 0.494742).  Saving model ...
epoch: 7 	training Loss: 0.527712 	validation Loss: 0.502406
epoch: 8 	training Loss: 0.529782 	validation Loss: 0.495369
epoch: 9 	training Loss: 0.521471 	validation Loss: 0.496375
epoch: 10 	training Loss: 0.523993 	validation Loss: 0.717052
epoch: 11 	training Loss: 0.525722 	validation Lo

### Testing the model on unseen data and evaluation

In [19]:
# testing the model and evaluating it using F1 score
test_input_ids, test_attention_masks = preprocessing(X_test)
test_labels = torch.tensor(y_test.values)
test_loader = create_data_loader(test_input_ids, test_attention_masks, test_labels)

true_labels = []
predicted_labels = []

model.load_state_dict(torch.load('model.pt', map_location=device))

model.eval()
for batch in test_loader:
    batch = tuple(t.to(device) for t in batch)
    inputs = {
        "input_ids": batch[0],
        "attention_mask": batch[1],
        "labels": batch[2]
    }
    with torch.no_grad():
        output = model(**inputs)
    _, predicted = torch.max(output.logits, 1)
    true_labels.extend(inputs["labels"].cpu().numpy())
    predicted_labels.extend(predicted.cpu().numpy())



In [20]:
f1 = metrics.f1_score(true_labels, predicted_labels)
print(f"F1 Score: {f1}")

F1 Score: 0.7500000000000001


We can see an improvement in the obtained F1 score. Lets see how we can improve it even more by displaying examples of bad predictions.

### Interpretation of the score

In [31]:
# displaying examples of bad predictions
bad_pred_index = [i for i in range(len(predicted_labels)) if predicted_labels[i] != true_labels[i]]
test_index_list = X_test.index.tolist()
test_set_array = train_df.iloc[test_index_list]
test_set_array["full_text"] = [preprocess_tweet(text) for text in test_set_array["full_text"]] 
test_set_array["predicted labels"] = predicted_labels
test_set_array["true labels"] = true_labels
test_set_bad_pred= test_set_array.iloc[bad_pred_index]
pd.set_option('display.max_colwidth', None)
test_set_bad_pred[1:10]





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_set_array["full_text"] = [preprocess_tweet(text) for text in test_set_array["full_text"]]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_set_array["predicted labels"] = predicted_labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_set_array["true labels"] = true_labels


Unnamed: 0,id,keyword,location,text,target,full_text,predicted labels,true labels
7561,10810,wrecked,6,@Tunes_WGG lol. U got wrecked,0,6 wrecked tunes_wgg lol u got wrecked,1,0
5386,7686,panic,Milwaukee WI,Someone asked me about a monkey fist about 2 feet long with a panic snap like the one pictured to be used as a... http://t.co/Yi9BBbx3FE,0,milwaukee wi panic someone asked me about a monkey fist about 2 feet long with a panic snap like the one pictured to be used as a httptcoyi9bbbx3fe,0,1
3541,5063,famine,"New York, USA",'Food crematoria' provoke outrage amid crisis famine memories... http://t.co/fABVlvN5MS,1,new york usa famine food crematoria provoke outrage amid crisis famine memories httptcofabvlvn5ms,0,1
5498,7847,quarantined,,Top link: Reddit's new content policy goes into effect many horrible subreddits banned or quarantined http://t.co/u9ao3A4oGC,0,quarantined top link reddits new content policy goes into effect many horrible subreddits banned or quarantined httptcou9ao3a4ogc,0,1
640,929,blaze,California,@Kaotix_Blaze craving u,0,california blaze kaotix_blaze craving u,0,1
3021,4336,dust%20storm,CA via Brum,Wall of noise is one thing - but a wall of dust? Moving at 60MPH? http://t.co/9NwAJLi9cr How to not get blown away! http://t.co/j4NI4N0yFZ,1,ca via brum dust20storm wall of noise is one thing but a wall of dust moving at 60mph httptco9nwajli9cr how to not get blown away httptcoj4ni4n0yfz,1,0
1215,1753,buildings%20burning,somewhere over a rainbow,@DoctorFluxx @StefanEJones @spinnellii @themermacorn No burning buildings and rob during a riot. That's embarrassing &amp; ruining this nation.,1,somewhere over a rainbow buildings20burning doctorfluxx stefanejones spinnellii themermacorn no burning buildings and rob during a riot thats embarrassing amp ruining this nation,0,1
4771,6789,lightning,Reddit,Lightning strike in the distance via /r/pics http://t.co/iDmhSwewQw #pics,1,reddit lightning lightning strike in the distance via rpics httptcoidmhswewqw pics,0,1
3098,4448,electrocuted,,When I was cooking earlier I got electrocuted some crucial ?????? now I'm psychic lol,0,electrocuted when i was cooking earlier i got electrocuted some crucial now im psychic lol,1,0


In [32]:
# displaying examples of good predictions
good_pred_index = [i for i in range(len(predicted_labels)) if predicted_labels[i] == true_labels[i]]
test_set_good_pred= test_set_array.iloc[good_pred_index]
pd.set_option('display.max_colwidth', None)
test_set_good_pred[1:10]

Unnamed: 0,id,keyword,location,text,target,full_text,predicted labels,true labels
2662,3824,detonate,Worldwide,52.214904 5.139055 Nuke please. Target Hilversum please detonate 800 meters below surface.,1,worldwide detonate 52214904 5139055 nuke please target hilversum please detonate 800 meters below surface,0,0
5378,7675,panic,"Elsewhere, NZ",Lose bus card.\nPanic.\nKind bus driver.\nReplace bus card.\nFind bus card.\nHeaddesk.,0,elsewhere nz panic lose bus card\npanic\nkind bus driver\nreplace bus card\nfind bus card\nheaddesk,1,1
4740,6742,lava,USA,Check This Deal : http://t.co/uOoYgBb6aZ Sivan Health and Fitness Basalt Lava Hot Stone Massage Kit with 36 PieceÛ_ http://t.co/JJxcnwBp15,0,usa lava check this deal httptcouooygbb6az sivan health and fitness basalt lava hot stone massage kit with 36 pieceû_ httptcojjxcnwbp15,0,0
4013,5699,floods,,Who is bringing the tornadoes and floods. Who is bringing the climate change. God is after America He is plaguing her\n \n#FARRAKHAN #QUOTE,0,floods who is bringing the tornadoes and floods who is bringing the climate change god is after america he is plaguing her\n \nfarrakhan quote,0,0
7278,10418,whirlwind,"London, Sydney",Two hours to get to a client meeting. Whirlwind of emotions with this #tubestrike,1,london sydney whirlwind two hours to get to a client meeting whirlwind of emotions with this tubestrike,0,0
3912,5563,flood,United States,JKL cancels Flash Flood Warning for Bell Harlan Knox [KY] http://t.co/4rY6zhcPOQ #WX,1,united states flood jkl cancels flash flood warning for bell harlan knox ky httptco4ry6zhcpoq wx,0,0
2484,3567,desolate,,If the Taken movies took place in India 2 (Vine by @JusReign) https://t.co/hxM8C8e33D,0,desolate if the taken movies took place in india 2 vine by jusreign httpstcohxm8c8e33d,0,0
7486,10708,wreck,,I'm an emotional wreck right now.,0,wreck im an emotional wreck right now,0,0
4864,6926,mass%20murderer,Hemel Hempstead,If your friends really were your friends they'd support you regardless of your decisions.\n\nUnless you become a mass-murderer or something,0,hemel hempstead mass20murderer if your friends really were your friends theyd support you regardless of your decisions\n\nunless you become a massmurderer or something,1,1


We can see through the examples, that the preprocessing of the dataset can be improved by : not removing the punctuation, removing the links to websites, and finding a better way to combine keyword and location to the text that will not change the meaning of the text.