<a href="https://www.kaggle.com/code/aadishchopra/predictingdisastertweets?scriptVersionId=125700501" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Install relevant packages

In [1]:
import numpy as np 
import pandas as pd
import torch
import datasets
#import transformers
from transformers import AutoTokenizer,AutoModelForSequenceClassification,Trainer,TrainingArguments
from transformers import TextClassificationPipeline
from sklearn.model_selection import train_test_split
from datasets import load_metric
from sklearn.metrics import roc_auc_score,f1_score,accuracy_score
# need to install evaluate as it is not present in base kaggle image
!pip install evaluate
import evaluate

Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.0
[0m

In [2]:
torch.cuda.is_available()

True

In [3]:
pd.set_option('display.max_colwidth', None)
df=pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')

In [4]:
#uncomment for quick debugging/troubleshooting
#df=df[1:500]

In [5]:
df.sample()

Unnamed: 0,id,keyword,location,text,target
2481,3560,desolate,,Me watching Law &amp; Order (IB: @sauldale305) (Vine by @NaturalExample) https://t.co/tl29LnU44O,1


# Remove non-ascii characters




In [6]:
#1. Remove characters other than ascii characters
df['new_text']=df['text'].apply(lambda x : x.encode('ascii',errors='ignore').decode())

In [7]:
pd.set_option('display.max_colwidth', None)
df_test=pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
df_test=pd.DataFrame(df_test,columns=['id','text'])
df_test['new_text']=df_test['text'].apply(lambda x : x.encode('ascii',errors='ignore').decode())
df_test.head()

Unnamed: 0,id,text,new_text
0,0,Just happened a terrible car crash,Just happened a terrible car crash
1,2,"Heard about #earthquake is different cities, stay safe everyone.","Heard about #earthquake is different cities, stay safe everyone."
2,3,"there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all","there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all"
3,9,Apocalypse lighting. #Spokane #wildfires,Apocalypse lighting. #Spokane #wildfires
4,11,Typhoon Soudelor kills 28 in China and Taiwan,Typhoon Soudelor kills 28 in China and Taiwan


### Need to convert to dataset dict format to apply tokenizer and pass labels

In [8]:
df_test_dict=datasets.Dataset.from_dict(df_test)

In [9]:
df=df[['new_text','target']]
df.head()
df_train,df_eval=train_test_split(df,test_size=0.1)

In [10]:
df_dict=datasets.Dataset.from_dict(df_train)
df_eval_dict=datasets.Dataset.from_dict(df_eval)

#### Using pre-trained models . Benefit of using pre-trained models is that they have pre-trained weights. 

In [11]:
tokenizer=AutoTokenizer.from_pretrained('bert-base-uncased')

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [12]:
#tokenizer.encode_plus('try this string')
#>{'input_ids': [101, 3046, 2023, 5164, 102], 'token_type_ids': [0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1]}

In [13]:
def letstokenize(text):
    text_t=text['new_text']
    target=text['target']
    encoding=tokenizer(text_t,padding="max_length",truncation=True,max_length=128)
    encoding["labels"]=target
    return encoding

encoded_df=df_dict.map(letstokenize)
encoded_eval_dict=df_eval_dict.map(letstokenize)

  0%|          | 0/6851 [00:00<?, ?ex/s]

  0%|          | 0/762 [00:00<?, ?ex/s]

In [14]:
disaster_bert=AutoModelForSequenceClassification.from_pretrained(
                                                                'bert-base-uncased',
                                                                 num_labels=2
                                                                )

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

#### Huggingface implementation requires training arguments to be supplied as shown below :-

In [15]:
# can upload it to huggingface as well. I have currently turned the flag off
batch_size=8
metric_name='f1'
repo_name='kaggle-disaster-tweets'

args=TrainingArguments(
                  output_dir=repo_name,
                  evaluation_strategy='epoch',
                  metric_for_best_model=metric_name,
                  per_device_train_batch_size=batch_size,
                  num_train_epochs=5,
                  load_best_model_at_end=True,
                  #no_cuda=True,
                  per_device_eval_batch_size=batch_size,
                  weight_decay=0.05,
                  save_strategy='epoch',
                  learning_rate=5e-5,
                  report_to="none"
                  #push_to_hub=True
                    )

In [16]:
def compute_metrics (eval_pred):
    metric = evaluate.load('f1')
    logits, labels = eval_pred
    preds = np.argmax(logits, axis = -1)
    return metric.compute(predictions=preds, references = labels)

In [17]:
trainer = Trainer(
    disaster_bert,
    args,
    train_dataset=encoded_df,
    eval_dataset=encoded_eval_dict,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
    )

In [18]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1
1,0.4831,0.447873,0.789308
2,0.3727,0.636467,0.798742
3,0.2936,0.71362,0.7744
4,0.2165,0.865474,0.777251
5,0.1306,1.013612,0.766296


Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

TrainOutput(global_step=4285, training_loss=0.2869514670088244, metrics={'train_runtime': 598.6471, 'train_samples_per_second': 57.221, 'train_steps_per_second': 7.158, 'total_flos': 2253217300339200.0, 'train_loss': 0.2869514670088244, 'epoch': 5.0})

In [19]:
trainer.evaluate()

{'eval_loss': 0.6364667415618896,
 'eval_f1': 0.7987421383647799,
 'eval_runtime': 4.0806,
 'eval_samples_per_second': 186.739,
 'eval_steps_per_second': 23.526,
 'epoch': 5.0}

# Prediction

In [20]:
pipe = TextClassificationPipeline(model=disaster_bert, tokenizer=tokenizer, return_all_scores=False,device=0)
# outputs a list of dicts like [[{'label': 'NEGATIVE', 'score': 0.0001223755971295759},  {'label': 'POSITIVE', 'score': 0.9998776316642761}]]




In [21]:
df_test['result']=pipe(df_test['new_text'].tolist())

In [22]:
# adjusting the output format to be in sync with the submission file
df_test['target']=df_test['result'].apply(lambda x: int(x['label'][-1]))

In [23]:
df_test

Unnamed: 0,id,text,new_text,result,target
0,0,Just happened a terrible car crash,Just happened a terrible car crash,"{'label': 'LABEL_1', 'score': 0.9663482308387756}",1
1,2,"Heard about #earthquake is different cities, stay safe everyone.","Heard about #earthquake is different cities, stay safe everyone.","{'label': 'LABEL_1', 'score': 0.9963130354881287}",1
2,3,"there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all","there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all","{'label': 'LABEL_1', 'score': 0.9950481057167053}",1
3,9,Apocalypse lighting. #Spokane #wildfires,Apocalypse lighting. #Spokane #wildfires,"{'label': 'LABEL_1', 'score': 0.995076596736908}",1
4,11,Typhoon Soudelor kills 28 in China and Taiwan,Typhoon Soudelor kills 28 in China and Taiwan,"{'label': 'LABEL_1', 'score': 0.9953733086585999}",1
...,...,...,...,...,...
3258,10861,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTENERS XrWn,EARTHQUAKE SAFETY LOS ANGELES SAFETY FASTENERS XrWn,"{'label': 'LABEL_1', 'score': 0.995026171207428}",1
3259,10865,Storm in RI worse than last hurricane. My city&amp;3others hardest hit. My yard looks like it was bombed. Around 20000K still without power,Storm in RI worse than last hurricane. My city&amp;3others hardest hit. My yard looks like it was bombed. Around 20000K still without power,"{'label': 'LABEL_1', 'score': 0.9946268200874329}",1
3260,10868,Green Line derailment in Chicago http://t.co/UtbXLcBIuY,Green Line derailment in Chicago http://t.co/UtbXLcBIuY,"{'label': 'LABEL_1', 'score': 0.9951019287109375}",1
3261,10874,MEG issues Hazardous Weather Outlook (HWO) http://t.co/3X6RBQJHn3,MEG issues Hazardous Weather Outlook (HWO) http://t.co/3X6RBQJHn3,"{'label': 'LABEL_1', 'score': 0.9951803684234619}",1


In [24]:
submit=df_test[['id','target']]

In [25]:
submit.to_csv('submission.csv',index=False)