In [None]:
import numpy as np
import pandas as pd
import time
import datetime
import gc
from nltk.corpus import stopwords
import re
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, random_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import transformers
from transformers import BertForSequenceClassification, AdamW, BertConfig, BertTokenizer, get_linear_schedule_with_warmup

In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [None]:
df = pd.read_csv('/content/train.csv')
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [None]:
def contains_url(text):
    """Report whether ``text`` contains an ``http://`` or ``https://`` prefix."""
    url_scheme_match = re.search(r'http[s]?://', text)
    return url_scheme_match is not None

def contains_html_tags(text):
    """Report whether ``text`` has a ``<`` followed by a ``>`` on the same line."""
    tag_match = re.search(r'<.*?>', text)
    return tag_match is not None

def contains_punctuation(text):
    """Report whether ``text`` holds any character that is neither a word character nor whitespace."""
    symbol_match = re.search(r'[^\w\s]', text)
    return symbol_match is not None

# Cache for the NLTK stop-word set so it is loaded once, not once per row.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only on first use."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def contains_stopwords(text, stop_words=None):
    """Report whether any whitespace-separated word of ``text`` is a stop word.

    Args:
        text: Text to inspect. Words are compared case-insensitively
            (each word is lower-cased before the lookup).
        stop_words: Optional collection of lower-case stop words. When omitted,
            NLTK's English stop-word list is used (cached after the first call).

    Returns:
        True if at least one word is a stop word, otherwise False.
    """
    if stop_words is None:
        stop_words = _english_stop_words()
    return any(word.lower() in stop_words for word in text.split())


# Map each boolean flag column to the predicate that fills it.
text_checks = {
    'contains_url': contains_url,
    'contains_html_tags': contains_html_tags,
    'contains_punctuation': contains_punctuation,
    'contains_stopwords': contains_stopwords,
}
for column_name, check in text_checks.items():
    df[column_name] = df['text'].apply(check)

# Keep the rows for which at least one of the flags is set.
matches_any_check = df[list(text_checks)].any(axis=1)
filtered_df = df[matches_any_check]

print(f"Bu koşulları sağlayan veri sayısı: {len(filtered_df)}")

Bu koşulları sağlayan veri sayısı: 7581


In [None]:
df.shape

(7613, 9)

In [None]:
# English stop-word list from NLTK; clean_text drops any token found in it.
sw = stopwords.words('english')

def clean_text(text, stop_words=None):
    """Normalise a raw tweet into lower-case, space-separated word tokens.

    Steps, in order: lower-case; remove URLs and HTML tags; replace every run
    of characters outside ``a-z ? . ! , ¿`` with one space; delete the
    punctuation characters listed below; drop stop words.

    Args:
        text: Raw tweet text.
        stop_words: Optional collection of lower-case words to drop. When
            omitted, the module-level ``sw`` list is used.

    Returns:
        The cleaned text, tokens joined by single spaces.
    """
    stop_set = frozenset(sw if stop_words is None else stop_words)

    text = text.lower()

    # URLs and HTML tags have to go BEFORE the character whitelist below. The
    # whitelist turns ':', '/', '<' and '>' into spaces, after which
    # "http://t.co/abc" is only recognisable as the bare word "http" and its
    # fragments ("t", "co", "abc") leak into the cleaned text.
    text = re.sub(r"http\S+", " ", text)
    text = re.sub(r"<.*?>", " ", text)

    # Keep only lower-case ASCII letters and a few punctuation marks. This also
    # strips digits, emoji and every other non-ASCII symbol, so a separate
    # emoji-removal pass would never find anything to remove.
    text = re.sub(r"[^a-z?.!,¿]+", " ", text)

    # Of the whitelisted marks, '?', '.' and '!' are deleted here; ',' and '¿'
    # are not in this list and therefore survive as tokens.
    punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
    for p in punctuations:
        text = text.replace(p, '')

    # The text is already lower-case, so tokens can be compared directly.
    return " ".join(word for word in text.split() if word not in stop_set)

In [None]:
# Overwrite each raw tweet with its cleaned form.
df['text'] = df['text'].apply(clean_text)

In [None]:
tweets = df.text.values
tweets

array(['deeds reason earthquake may allah forgive us',
       'forest fire near la ronge sask canada',
       'residents asked shelter place notified officers evacuation shelter place orders expected',
       ..., 'utc km volcano hawaii http tco zdtoyd ebj',
       'police investigating e bike collided car little portugal e bike rider suffered serious non life threatening injuries',
       'latest homes razed northern california wildfire abc news http tco ymy rskq'],
      dtype=object)

In [None]:
labels = df.target.values
labels

array([1, 1, 1, ..., 1, 1, 1])

In [None]:
df['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,4342
1,3271


In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
print('Original: ',tweets[0])
print('Tokenized: ',tokenizer.tokenize(tweets[0]))
print('Token IDs: ',tokenizer.convert_tokens_to_ids(tokenizer.tokenize(tweets[0])))

Original:  deeds reason earthquake may allah forgive us
Tokenized:  ['deeds', 'reason', 'earthquake', 'may', 'allah', 'forgive', 'us']
Token IDs:  [15616, 3114, 8372, 2089, 16455, 9641, 2149]


In [None]:
tokenizer.encode(tweets[0], add_special_tokens=False)

[15616, 3114, 8372, 2089, 16455, 9641, 2149]

In [None]:
# Length of the longest tokenised tweet, special tokens included (0 if there are no tweets).
max_len = max(
    (len(tokenizer.encode(sent, add_special_tokens=True)) for sent in tweets),
    default=0,
)

print('Max tweet length: ', max_len)

Max tweet length:  45


In [None]:
# One (1, max_len) tensor of token ids and one attention mask per tweet.
input_ids = []
attention_masks = []

for tweet in tweets:
    encoded_dict = tokenizer.encode_plus(
        tweet,
        add_special_tokens=True,
        max_length=max_len,
        # `pad_to_max_length` is deprecated; `padding='max_length'` is the
        # supported spelling for padding every sequence to `max_length`.
        padding='max_length',
        # Truncate explicitly instead of relying on the implicit fallback the
        # tokenizer warns about when only `max_length` is given.
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
type(input_ids)

list

In [None]:
# Each list entry is one encoded tweet (encode_plus was called with
# return_tensors='pt'); concatenate along dim 0 into one tensor per field.
# NOTE(review): the names are rebound from lists to tensors here, so this cell
# is presumably not safe to re-run without re-running the encoding cell first.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
# Wrap the label array taken from df.target.values in a tensor.
labels = torch.tensor(labels)

In [None]:
attention_masks.shape

torch.Size([7613, 45])

In [None]:
input_ids.shape

torch.Size([7613, 45])

In [None]:
print('Original: ', tweets[0])
print('Token IDs:', input_ids[0])

Original:  deeds reason earthquake may allah forgive us
Token IDs: tensor([  101, 15616,  3114,  8372,  2089, 16455,  9641,  2149,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0])


In [None]:
# Bundle ids, masks and labels so they are indexed and split together.
dataset = TensorDataset(input_ids, attention_masks, labels)

# 80 % of the examples for training, the remainder for validation.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# Use a dedicated, seeded generator: this cell runs before the notebook seeds
# the global RNGs, so without it the train/validation split (and every
# validation number derived from it) changes on each run.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=split_generator
)
print('Train size: ',len(train_dataset))

Train size:  6090


In [None]:
batch_size = 32
# Training batches are drawn in random order.
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
# Validation needs no shuffling. A fixed order keeps the batch composition, and
# therefore the mean of per-batch accuracies reported during training,
# identical from one evaluation to the next.
validation_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)

In [None]:
# Load bert-base-uncased with a 2-label sequence-classification head
# (attention maps and hidden states are not returned), then move it to `device`.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2, output_attentions=False, output_hidden_states=False)
model = model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Use PyTorch's AdamW: the AdamW class shipped with `transformers` is
# deprecated in favour of it. `weight_decay=0.0` is passed explicitly because
# torch's default (0.01) differs from the transformers default (0.0); this
# keeps the update rule the same as before.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)



In [None]:
# Number of full passes over the training data.
epochs = 4
# The training loop steps the scheduler once per batch, so the schedule spans
# (batches per epoch) * epochs steps.
total_steps = len(train_dataloader)*epochs
# Linear learning-rate schedule over total_steps with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,num_training_steps=total_steps)

In [None]:
def flat_accuracy(preds, labels):
    """Fraction of examples whose highest-scoring class equals the label.

    Args:
        preds: NumPy array of per-class scores; the arg-max is taken along axis 1.
        labels: NumPy array of class indices; it is flattened before comparison.

    Returns:
        Number of matching positions divided by the number of labels.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    n_correct = np.sum(predicted_classes == true_classes)
    return n_correct / len(true_classes)

In [None]:
def format_time(elapsed):
    """Render a duration given in seconds as an ``h:mm:ss`` string.

    The value is rounded to whole seconds before formatting.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
import random
# Seed every RNG in use (Python, NumPy, PyTorch CPU and all CUDA devices) with
# the same value.
# NOTE(review): in top-to-bottom order this cell runs after random_split and
# after the model is constructed, so those steps are presumably not covered by
# this seed; the training cell below repeats the same seeding.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
# Per-epoch metrics; the training loop appends one dict per epoch.
training_stats = []

In [None]:
# Re-seed all RNGs and reset the collected stats at the start of training.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
training_stats = []
# Best validation accuracy seen so far. This must be initialised OUTSIDE the
# epoch loop: resetting it every epoch makes each epoch look like a new best,
# so the checkpoint would always end up being the last epoch, not the best one.
best_eval_accuracy = 0
total_t0 = time.time()

for epoch_i in range(0, epochs):
    # ---------------- Training pass ----------------
    print("")
    print('======== Epoch {} / {} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    t0 = time.time()
    total_train_loss = 0
    model.train()
    for batch in train_dataloader:
        # Batch fields follow the TensorDataset order: ids, attention mask, labels.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        optimizer.zero_grad()
        # Passing `labels` makes the model return the loss alongside the logits.
        output = model(b_input_ids,
                       token_type_ids=None,
                       attention_mask=b_input_mask,
                       labels=b_labels)
        loss = output.loss
        total_train_loss += loss.item()
        loss.backward()
        # Clip the gradient norm to 1.0 before the optimizer update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # The learning-rate schedule advances once per batch.
        scheduler.step()
    avg_train_loss = total_train_loss / len(train_dataloader)
    training_time = format_time(time.time() - t0)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {}".format(training_time))

    # ---------------- Validation pass ----------------
    print("")
    print("Running Validation...")
    t0 = time.time()
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        with torch.no_grad():
            output = model(b_input_ids,
                           token_type_ids=None,
                           attention_mask=b_input_mask,
                           labels=b_labels)
        loss = output.loss
        total_eval_loss += loss.item()
        logits = output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        total_eval_accuracy += flat_accuracy(logits, label_ids)
    # Unweighted mean of the per-batch accuracies: a smaller final batch counts
    # as much as a full one.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    validation_time = format_time(time.time() - t0)

    # Checkpoint only when this epoch beats every previous one. The whole model
    # object is saved (not just a state_dict) to the file 'bert_model'.
    if avg_val_accuracy > best_eval_accuracy:
        torch.save(model, 'bert_model')
        best_eval_accuracy = avg_val_accuracy
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )
print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...

  Average training loss: 0.48
  Training epcoh took: 0:00:47

Running Validation...
  Accuracy: 0.84

Training...

  Average training loss: 0.36
  Training epcoh took: 0:00:49

Running Validation...
  Accuracy: 0.83

Training...

  Average training loss: 0.28
  Training epcoh took: 0:00:48

Running Validation...
  Accuracy: 0.84

Training...

  Average training loss: 0.23
  Training epcoh took: 0:00:48

Running Validation...
  Accuracy: 0.84

Training complete!
Total training took 0:03:46 (h:mm:ss)


In [None]:
print(training_stats)

[{'epoch': 1, 'Training Loss': 0.47939148662290026, 'Valid. Loss': 0.38535459867368144, 'Valid. Accur.': 0.8383018092105262, 'Training Time': '0:00:47', 'Validation Time': '0:00:04'}, {'epoch': 2, 'Training Loss': 0.35982049252662357, 'Valid. Loss': 0.4086890642841657, 'Valid. Accur.': 0.8276795504385964, 'Training Time': '0:00:49', 'Validation Time': '0:00:04'}, {'epoch': 3, 'Training Loss': 0.28469531815401544, 'Valid. Loss': 0.45347599995632965, 'Valid. Accur.': 0.8404605263157895, 'Training Time': '0:00:48', 'Validation Time': '0:00:04'}, {'epoch': 4, 'Training Loss': 0.22583140520603245, 'Valid. Loss': 0.45519067098697025, 'Valid. Accur.': 0.8393983004385964, 'Training Time': '0:00:48', 'Validation Time': '0:00:04'}]


In [None]:
# 'bert_model' holds a fully pickled model object (it was written with
# torch.save(model, ...)), not a state_dict, so it has to be loaded with
# weights_only=False; stating it explicitly avoids depending on the library
# default. Unpickling executes code from the file, so only load checkpoints
# you created yourself. map_location lets the checkpoint load on a machine
# without the GPU it was saved from.
model = torch.load('bert_model', map_location=device, weights_only=False)

  model = torch.load('bert_model')


In [None]:
df_test = pd.read_csv('/content/test.csv')

In [None]:
# Clean the test tweets with the same function used for the training text,
# then take the underlying array of cleaned strings.
df_test['text'] = df_test['text'].apply(clean_text)
test_tweets = df_test['text'].values

In [None]:
# Encode the test tweets exactly like the training tweets, with the same max_len.
test_input_ids = []
test_attention_masks = []
for tweet in test_tweets:
    encoded_dict = tokenizer.encode_plus(
        tweet,
        add_special_tokens=True,
        return_tensors='pt',
        return_attention_mask=True,
        max_length=max_len,
        # `pad_to_max_length` is deprecated; `padding='max_length'` is the
        # supported spelling for padding every sequence to `max_length`.
        padding='max_length',
        # max_len was measured on the training tweets, so a test tweet can be
        # longer. Truncate explicitly so every row has length max_len and the
        # torch.cat calls below cannot hit mismatched shapes.
        truncation=True,
    )
    test_input_ids.append(encoded_dict['input_ids'])
    test_attention_masks.append(encoded_dict['attention_mask'])
# Stack the per-tweet tensors into one tensor per field.
test_input_ids = torch.cat(test_input_ids, dim=0)
test_attention_masks = torch.cat(test_attention_masks, dim=0)



In [None]:
len(test_attention_masks)
print(test_attention_masks)

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])


In [None]:
test_dataset = TensorDataset(test_input_ids, test_attention_masks)
# Iterate the test set in its original order. A RandomSampler would shuffle the
# rows, and predictions collected batch by batch could then no longer be
# matched back to the rows of df_test.
test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=batch_size)

In [None]:
# Put the model in evaluation mode explicitly instead of relying on whatever
# mode an earlier cell left it in; this disables dropout for inference.
model.eval()

# Predicted class index per test example, appended in the order the dataloader
# yields its batches.
predictions = []
for batch in test_dataloader:
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    with torch.no_grad():
        output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    logits = output.logits.detach().cpu().numpy()
    # The arg-max over the class axis is the predicted label.
    predictions.extend(np.argmax(logits, axis=1).flatten())