In [1]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AdamW
import torch

In [2]:
import nltk
nltk.download('stopwords')
import re

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [4]:
import pandas as pd
df = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')

In [5]:
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
from nltk.corpus import stopwords
sw = stopwords.words('english')

def clean_text(text):

    text = text.lower()

    text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text)
    text = re.sub(r"http\S+", "",text)
    html=re.compile(r'<.*?>')
    text = html.sub(r'',text)

    punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
    for p in punctuations:
        text = text.replace(p,'')

    text = [word.lower() for word in text.split() if word.lower() not in sw]

    text = " ".join(text)
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"
                           u"\U0001F300-\U0001F5FF"
                           u"\U0001F680-\U0001F6FF"
                           u"\U0001F1E0-\U0001F1FF"
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)
    return text

In [7]:
df['text'] = df['text'].apply(clean_text)

In [8]:
df = df.drop(columns=['id','keyword','location'], axis=1)

In [9]:
df

Unnamed: 0,text,target
0,deeds reason earthquake may allah forgive us,1
1,forest fire near la ronge sask canada,1
2,residents asked shelter place notified officer...,1
3,", people receive wildfires evacuation orders c...",1
4,got sent photo ruby alaska smoke wildfires pou...,1
...,...,...
7608,two giant cranes holding bridge collapse nearb...,1
7609,aria ahrary thetawniest control wild fires cal...,1
7610,utc km volcano hawaii http tco zdtoyd ebj,1
7611,police investigating e bike collided car littl...,1


In [10]:
!pip install -q datasets

In [11]:
from datasets import Dataset
dataset = Dataset.from_pandas(df[['text', 'target']])

In [12]:
dataset

Dataset({
    features: ['text', 'target'],
    num_rows: 7613
})

In [13]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [14]:
max_length=0
for tweet in dataset['text']:
    length = len(tweet)
    if length > max_length:
        max_length = length
print(length)

74


In [15]:
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        pad_to_max_length=True,
        padding='max_length',
        truncation=True,
        max_length=45,
        return_token_type_ids=False,
        return_attention_mask=True,
        return_tensors='pt'
    )
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/7613 [00:00<?, ? examples/s]

In [16]:
train_test_split = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
val_dataset = train_test_split['test']

In [17]:
train_dataset = train_dataset.rename_column("target", "labels")
val_dataset = val_dataset.rename_column("target", "labels")

In [18]:
train_dataset

Dataset({
    features: ['text', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 6090
})

In [19]:
val_dataset

Dataset({
    features: ['text', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 1523
})

In [20]:
train_dataset = train_dataset.remove_columns(['text'])
val_dataset = val_dataset.remove_columns(['text'])

In [21]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [22]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

In [23]:
def model_init(trial):
    return BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2, output_attentions=False,
    output_hidden_states=False)

In [24]:
training_args = TrainingArguments(
    output_dir='./results',
    warmup_steps=500,
    logging_dir='./logs',
    logging_steps=100,
    eval_strategy='steps',
    eval_steps=100,
    save_steps=100,
    save_total_limit=2,
    load_best_model_at_end=True,
    report_to='none'
)

In [25]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1, 
        'precision': precision,
        'recall': recall
    }


In [26]:
trainer = Trainer(
    model=None,
    model_init=model_init,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
!pip install optuna

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [28]:
def optuna_hp_space(trial):
    return {
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 5e-5, log=True),
        'per_device_train_batch_size': trial.suggest_categorical('per_device_train_batch_size', [16, 32]),
        'per_device_eval_batch_size': trial.suggest_categorical('per_device_eval_batch_size', [16, 32]),
        'num_train_epochs': trial.suggest_int('num_train_epochs', 2, 4),
        'weight_decay': trial.suggest_float('weight_decay', 0.1, 0.3)
    }

In [29]:
def compute_objective(metrics):
    return metrics["eval_accuracy"] 

In [30]:
best_trials = trainer.hyperparameter_search(
    direction='maximize',
    backend="optuna",
    hp_space=optuna_hp_space,
    n_trials=5,
    compute_objective=compute_objective,
)

[I 2024-10-08 17:01:15,753] A new study created in memory with name: no-name-dfb24c6f-c026-4883-9f32-45212613199b
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
100,0.6345,0.525384,0.751149,0.726748,0.687585,0.770642
200,0.4628,0.442042,0.804334,0.769706,0.778125,0.761468
300,0.4063,0.43685,0.820749,0.779305,0.826758,0.737003
400,0.3794,0.464576,0.822718,0.780488,0.833333,0.733945
500,0.3234,0.42827,0.820092,0.78357,0.810458,0.75841
600,0.3277,0.59103,0.816809,0.764954,0.851782,0.69419
700,0.2182,0.495416,0.822062,0.783026,0.821849,0.747706


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
[I 2024-10-08 17:04:54,697] Trial 0 finished with value: 0.8220617202889035 and parameters: {'learning_rate': 4.171479503407097e-05, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 32, 'num_train_epochs': 4, 'weight_decay': 0.1395962005551185}. Best is trial 0 with value: 0.8220617202889035.
Some weight

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
100,0.6491,0.557748,0.732108,0.706897,0.666667,0.752294
200,0.4806,0.462656,0.794485,0.76694,0.74746,0.787462
300,0.4126,0.421935,0.826658,0.782537,0.848214,0.7263
400,0.3793,0.450321,0.820092,0.775777,0.834507,0.724771
500,0.3284,0.430183,0.826001,0.786118,0.832479,0.744648
600,0.3275,0.511029,0.820749,0.776046,0.837168,0.723242
700,0.2245,0.509201,0.813526,0.773885,0.807309,0.743119


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
[I 2024-10-08 17:08:32,198] Trial 1 finished with value: 0.8135259356533159 and parameters: {'learning_rate': 3.106056060729316e-05, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 32, 'num_train_epochs': 4, 'weight_decay': 0.22702882612090408}. Best is trial 0 with value: 0.8220617202889035.
Some weigh

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
100,0.6625,0.58004,0.726855,0.68437,0.679217,0.689602
200,0.5056,0.450428,0.80499,0.748092,0.84,0.674312


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
[I 2024-10-08 17:10:24,698] Trial 2 finished with value: 0.8049901510177282 and parameters: {'learning_rate': 1.6811650114424644e-05, 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 16, 'num_train_epochs': 3, 'weight_decay': 0.17264794730000776}. Best is trial 0 with value: 0.8220617202889035.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
100,0.6517,0.554487,0.740643,0.697318,0.698925,0.695719
200,0.4898,0.447127,0.803677,0.762887,0.792422,0.735474


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
[I 2024-10-08 17:12:17,427] Trial 3 finished with value: 0.8036769533814839 and parameters: {'learning_rate': 2.1732852859885136e-05, 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 16, 'num_train_epochs': 3, 'weight_decay': 0.26574147870442477}. Best is trial 0 with value: 0.8220617202889035.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
100,0.6291,0.52267,0.756402,0.727406,0.700141,0.756881
200,0.4635,0.443832,0.806303,0.772902,0.778295,0.767584
300,0.4076,0.42456,0.826658,0.77665,0.869318,0.701835
400,0.3794,0.463897,0.821405,0.772194,0.853704,0.704893
500,0.3216,0.433273,0.822062,0.781275,0.82735,0.740061


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
[I 2024-10-08 17:15:04,762] Trial 4 finished with value: 0.8220617202889035 and parameters: {'learning_rate': 4.841185304730115e-05, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 16, 'num_train_epochs': 3, 'weight_decay': 0.26655700779931013}. Best is trial 0 with value: 0.8220617202889035.


In [31]:
best_params = best_trials.hyperparameters
print(best_params)

{'learning_rate': 4.171479503407097e-05, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 32, 'num_train_epochs': 4, 'weight_decay': 0.1395962005551185}


In [32]:
training_args = TrainingArguments(
    output_dir='./model',
    warmup_steps=500,
    logging_dir='./logs',
    logging_steps=100,
    eval_strategy='steps',
    eval_steps=100,
    save_steps=100,
    save_total_limit=2,
    load_best_model_at_end=True,
    report_to='none',
    **best_params )

trainer = Trainer(
    model=model_init(best_trials),
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
trainer.train()

trainer.save_model('./best_model')

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
100,0.6496,0.494417,0.773473,0.726841,0.753695,0.701835
200,0.4541,0.439334,0.812869,0.775059,0.800979,0.750765
300,0.412,0.411703,0.817466,0.772131,0.832155,0.720183
400,0.3744,0.455324,0.820749,0.775309,0.839572,0.720183
500,0.3198,0.434426,0.818779,0.766102,0.859316,0.691131
600,0.324,0.535977,0.815496,0.768724,0.832442,0.714067
700,0.2157,0.547293,0.819435,0.776241,0.829565,0.729358


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


In [35]:
best_model = BertForSequenceClassification.from_pretrained('./best_model')

In [36]:
test_df = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

In [37]:
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [38]:
test_df['text'] = test_df['text'].apply(clean_text)

In [39]:
test_df = test_df.drop(columns=['keyword','location'], axis=1)
test_df

Unnamed: 0,id,text
0,0,happened terrible car crash
1,2,"heard earthquake different cities, stay safe e..."
2,3,"forest fire spot pond, geese fleeing across st..."
3,9,apocalypse lighting spokane wildfires
4,11,typhoon soudelor kills china taiwan
...,...,...
3258,10861,earthquake safety los angeles safety fasteners...
3259,10865,storm ri worse last hurricane city amp others ...
3260,10868,green line derailment chicago http tco utbxlcbiuy
3261,10874,meg issues hazardous weather outlook hwo http ...


In [67]:
text = test_df['text'].values

In [68]:
test_input_ids = []
test_attention_masks = []
for tweet in text:
    encoded_dict =tokenizer(tweet,
                add_special_tokens = True,
                return_tensors='pt',
                return_attention_mask=True,
                max_length=45,
                pad_to_max_length=True
                )
    test_input_ids.append(encoded_dict['input_ids'])
    test_attention_masks.append(encoded_dict['attention_mask'])
test_input_ids = torch.cat(test_input_ids, dim=0)
test_attention_masks = torch.cat(test_attention_masks, dim=0)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [79]:
test_input_ids

tensor([[  101,  3047,  6659,  ...,     0,     0,     0],
        [  101,  2657,  8372,  ...,     0,     0,     0],
        [  101,  3224,  2543,  ...,     0,     0,     0],
        ...,
        [  101,  2665,  2240,  ...,     0,     0,     0],
        [  101, 12669,  3314,  ...,     0,     0,     0],
        [  101,  2103, 11253,  ...,     0,     0,     0]])

In [80]:
from torch.utils.data import DataLoader, TensorDataset, SequentialSampler
test_dataset = TensorDataset(test_input_ids,test_attention_masks)
test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset),batch_size=32)

In [84]:
import numpy as np
predictions = []
for index,batch in enumerate(test_dataloader):
    b_input_ids = batch[0]
    b_input_mask = batch[1]
    with torch.no_grad():
        output = best_model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        logits = output.logits
        logits = logits.detach().numpy()
        pred_flat = np.argmax(logits, axis=1).flatten()
        predictions.extend(list(pred_flat))

In [89]:
df = pd.DataFrame()
df['id'] = test_df['id']
df['target'] = predictions
df.to_csv('submission.csv',index=False)