In [1]:
import numpy as np 
import pandas as pd


In [2]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [3]:
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [5]:
df_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [6]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


In [7]:
def get_text_target_lists(dataframe):
    """Pull the ``text`` column (and ``target``, when present) out as lists.

    Returns:
        ``(texts, targets)`` when ``dataframe`` has a ``target`` column,
        otherwise only the list of texts.
    """
    texts = list(dataframe.text)
    if 'target' not in dataframe.columns:
        # Unlabelled frame: there is nothing to pair the texts with.
        return texts
    return texts, list(dataframe.target)

In [8]:
train_text, train_labels = get_text_target_lists(df_train)
test_text = get_text_target_lists(df_test) 

In [9]:
from transformers import DistilBertTokenizer

In [10]:
model_checkpoint = "distilbert-base-uncased"
distilbert_tokenizer = DistilBertTokenizer.from_pretrained(model_checkpoint)

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [11]:
distilbert_tokenizer.vocab_size

30522

In [12]:
distilbert_tokenizer.model_max_length

512

In [13]:
distilbert_tokenizer.model_input_names

['input_ids', 'attention_mask']

In [14]:
distilbert_tokenizer.special_tokens_map

{'unk_token': '[UNK]',
 'sep_token': '[SEP]',
 'pad_token': '[PAD]',
 'cls_token': '[CLS]',
 'mask_token': '[MASK]'}

In [15]:
for token, _id in zip(distilbert_tokenizer.all_special_tokens, distilbert_tokenizer.all_special_ids):
    print(token, _id)

[UNK] 100
[SEP] 102
[PAD] 0
[CLS] 101
[MASK] 103


In [16]:
distilbert_tokenizer.is_fast

False

In [17]:
distilbert_tokenizer.do_lower_case

True

In [18]:
sample_text1 = df_train.text[0]
sample_text1

'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'

In [19]:
encoded_text = distilbert_tokenizer(sample_text1)
print("input_ids: ", encoded_text['input_ids'])
print("attention_mask: ",encoded_text['attention_mask'])

input_ids:  [101, 2256, 15616, 2024, 1996, 3114, 1997, 2023, 1001, 8372, 2089, 16455, 9641, 2149, 2035, 102]
attention_mask:  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [20]:
tokens = distilbert_tokenizer.convert_ids_to_tokens(encoded_text['input_ids'])
print(tokens)

['[CLS]', 'our', 'deeds', 'are', 'the', 'reason', 'of', 'this', '#', 'earthquake', 'may', 'allah', 'forgive', 'us', 'all', '[SEP]']


In [21]:
def print_tokenizer_outputs(i):
    """Print row ``i`` of ``df_train.text`` with its ids, attention mask and tokens."""
    tweet = df_train.text[i]
    print("Sample Text: ", tweet)
    encoding = distilbert_tokenizer(tweet)
    ids = encoding['input_ids']
    print("input_ids: ", ids)
    print("attention_mask: ", encoding['attention_mask'])
    # Map the ids back to word pieces so the sub-word splits are visible.
    print("Tokens: ", distilbert_tokenizer.convert_ids_to_tokens(ids))


# Show the tokenizer output for the first six training rows.
for row_idx in range(6):
    print_tokenizer_outputs(row_idx)
    print("---")

Sample Text:  Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all
input_ids:  [101, 2256, 15616, 2024, 1996, 3114, 1997, 2023, 1001, 8372, 2089, 16455, 9641, 2149, 2035, 102]
attention_mask:  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Tokens:  ['[CLS]', 'our', 'deeds', 'are', 'the', 'reason', 'of', 'this', '#', 'earthquake', 'may', 'allah', 'forgive', 'us', 'all', '[SEP]']
---
Sample Text:  Forest fire near La Ronge Sask. Canada
input_ids:  [101, 3224, 2543, 2379, 2474, 6902, 3351, 21871, 2243, 1012, 2710, 102]
attention_mask:  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Tokens:  ['[CLS]', 'forest', 'fire', 'near', 'la', 'ron', '##ge', 'sas', '##k', '.', 'canada', '[SEP]']
---
Sample Text:  All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected
input_ids:  [101, 2035, 3901, 2356, 2000, 1005, 7713, 1999, 2173, 1005, 2024, 2108, 19488, 2011, 3738, 1012, 2053, 2060, 13982, 2030, 7713, 1999, 217

In [22]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation.
# - The inputs are read from `df_train` directly rather than from
#   `train_labels`, because this cell rebinds `train_labels`; re-running the
#   cell would otherwise pass lists of different lengths.
# - `random_state` makes the split reproducible, and `stratify` keeps the
#   `target` class ratio the same in the training and validation parts.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    list(df_train.text),
    list(df_train.target),
    test_size=.2,
    random_state=42,
    stratify=list(df_train.target),
)

In [23]:
# Tokenize the three text splits with identical settings (truncation and
# padding both enabled), in train / validation / test order.
train_encodings, val_encodings, test_encodings = (
    distilbert_tokenizer(texts, truncation=True, padding=True)
    for texts in (train_texts, val_texts, test_text)
)

In [24]:
len(train_encodings), len(test_encodings), len(val_encodings)

(2, 2, 2)

In [25]:
print(train_encodings['input_ids'][:2])

[[101, 1037, 6925, 1997, 2048, 13433, 2595, 1011, 2303, 22812, 8299, 1024, 1013, 1013, 1056, 1012, 2522, 1013, 1059, 2475, 7646, 2102, 2487, 2243, 2692, 7875, 1001, 7865, 1001, 16514, 10521, 19500, 2015, 1001, 16012, 3334, 29165, 2964, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1001, 2088, 2638, 9333, 5357, 2373, 12735, 2006, 1043, 1024, 4957, 12517, 1024, 10651, 1024, 2543, 10604, 2031, 13377, 2039, 2000, 2382, 5467, 2040, 2020, 19817, 1012, 1012, 1012, 8299, 1024, 1013, 1013, 1056, 1012, 2522, 1013, 1041, 7274, 2615, 2615, 4143, 2581, 4160, 2213, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]


In [26]:
import torch

class NaturalDisastersData(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with optional labels.

    Args:
        encodings: Mapping from feature name (e.g. ``input_ids``) to a
            sequence holding one entry per example.
        labels: Optional sequence with one label per example. When left as
            ``None`` (unlabelled data such as a test set), the returned items
            carry no ``labels`` entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors keyed by feature name."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples in the dataset."""
        if self.labels is not None:
            return len(self.labels)
        # Without labels, size the dataset from the first encoded feature;
        # each feature is expected to hold one entry per example.
        for _, column in self.encodings.items():
            return len(column)
        return 0

train_dataset = NaturalDisastersData(train_encodings, train_labels)
val_dataset = NaturalDisastersData(val_encodings, val_labels)

In [27]:
len(train_dataset.encodings), len(train_dataset.encodings['input_ids']), len(train_dataset.encodings['attention_mask']), len(train_dataset.labels)

(2, 6090, 6090, 6090)

In [28]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [29]:
from sklearn.metrics import classification_report

In [30]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

# Hyperparameters and output locations for the fine-tuning run.
training_args = TrainingArguments(
    output_dir='./results',            # checkpoints are written here
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,                  # warmup steps for the learning-rate schedule
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,                  # log the training loss every 10 steps
    # Turn off experiment-tracking integrations explicitly; the
    # WANDB_DISABLED environment variable is deprecated in favour of this.
    report_to="none",
)

# Load the same checkpoint the tokenizer was built from, so the two cannot
# drift apart, and state the number of target classes explicitly.
model = DistilBertForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

  device: Optional[torch.device] = torch.device("cuda"),
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Downloading pytorch_model.bin:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classi

Step,Training Loss
10,0.6981
20,0.6938
30,0.6886
40,0.6873
50,0.6779
60,0.668
70,0.6447
80,0.604
90,0.5694
100,0.5088


Saving model checkpoint to ./results\checkpoint-500
Configuration saved in ./results\checkpoint-500\config.json
Model weights saved in ./results\checkpoint-500\pytorch_model.bin
Saving model checkpoint to ./results\checkpoint-1000
Configuration saved in ./results\checkpoint-1000\config.json
Model weights saved in ./results\checkpoint-1000\pytorch_model.bin
Saving model checkpoint to ./results\checkpoint-1500
Configuration saved in ./results\checkpoint-1500\config.json
Model weights saved in ./results\checkpoint-1500\pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1905, training_loss=0.2645924522886126, metrics={'train_runtime': 6315.2107, 'train_samples_per_second': 4.822, 'train_steps_per_second': 0.302, 'total_flos': 661767797426400.0, 'train_loss': 0.2645924522886126, 'epoch': 5.0})

In [41]:
predictions = trainer.predict(val_dataset)

***** Running Prediction *****
  Num examples = 1523
  Batch size = 64


RuntimeError: Numpy is not available

In [42]:
predictions.metrics

NameError: name 'predictions' is not defined

In [None]:
y_preds = np.argmax(predictions.predictions, axis=1)

In [None]:
# classification_report takes (y_true, y_pred): the ground-truth labels go
# first, otherwise precision and recall are reported swapped.
print(classification_report(val_dataset.labels, y_preds))

In [None]:
# Placeholder zero labels stand in for the missing test targets. They are
# built as a standalone list rather than a `target_dummy` column, so `df_test`
# is not modified and this cell can be re-run safely.
test_dataset = NaturalDisastersData(test_encodings, [0] * len(df_test))
predictions_test = trainer.predict(test_dataset)

In [None]:
# Pick the highest-scoring class for every test row and pair it with the
# row's id to form the submission table.
y_preds_test = np.argmax(predictions_test.predictions, axis=1)
df_submission = pd.DataFrame({
    'id': df_test['id'],
    'target': y_preds_test,
})

In [None]:
df_submission.info()

In [None]:
df_submission.to_csv('submission.csv', index=False)