In [39]:
kaggle=False
kaggle_path='/kaggle/input/nlp-getting-started/train.csv'
local_path='train.csv'
import os
if kaggle:
    os.system('pip install neptune')
    
from transformers import DistilBertTokenizer, DistilBertModel
import torch
import pandas as pd 
import numpy as np
from tqdm.notebook import tqdm
# models: https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')


df=pd.read_csv(kaggle_path if kaggle else local_path)
df



Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [40]:
df.keyword.value_counts()

fatalities               45
deluge                   42
armageddon               42
sinking                  41
damage                   41
                         ..
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: keyword, Length: 221, dtype: int64

In [41]:
df.keyword.unique()

array([nan, 'ablaze', 'accident', 'aftershock', 'airplane%20accident',
       'ambulance', 'annihilated', 'annihilation', 'apocalypse',
       'armageddon', 'army', 'arson', 'arsonist', 'attack', 'attacked',
       'avalanche', 'battle', 'bioterror', 'bioterrorism', 'blaze',
       'blazing', 'bleeding', 'blew%20up', 'blight', 'blizzard', 'blood',
       'bloody', 'blown%20up', 'body%20bag', 'body%20bagging',
       'body%20bags', 'bomb', 'bombed', 'bombing', 'bridge%20collapse',
       'buildings%20burning', 'buildings%20on%20fire', 'burned',
       'burning', 'burning%20buildings', 'bush%20fires', 'casualties',
       'casualty', 'catastrophe', 'catastrophic', 'chemical%20emergency',
       'cliff%20fall', 'collapse', 'collapsed', 'collide', 'collided',
       'collision', 'crash', 'crashed', 'crush', 'crushed', 'curfew',
       'cyclone', 'damage', 'danger', 'dead', 'death', 'deaths', 'debris',
       'deluge', 'deluged', 'demolish', 'demolished', 'demolition',
       'derail', 'der

In [42]:
# str replaye %20 to space

def prepare_df(df):
    df=df.copy()
    df.keyword=df.keyword.str.replace('%20',' ')
    
    # enrich text with location and text, when they are not null
    df['text']='location: ' + df['location'].fillna('') + ' keyword: ' +  df['keyword'].fillna('') + ' text: ' + df['text']
    return df

df=prepare_df(df)
df.sample(10)

Unnamed: 0,id,keyword,location,text,target
3849,5477,flames,houstn,location: houstn keyword: flames text: that ne...,0
2357,3393,demolition,everywhere,location: everywhere keyword: demolition text:...,0
5989,8553,screams,"San Juan, Puerto Rico","location: San Juan, Puerto Rico keyword: screa...",0
1719,2480,collided,,location: keyword: collided text: 16 dead in ...,1
443,642,arsonist,United States,location: United States keyword: arsonist text...,0
3020,4335,dust storm,"Atlanta, GA","location: Atlanta, GA keyword: dust storm text...",1
3616,5162,fatalities,,location: keyword: fatalities text: EXCLUSIVE...,1
7282,10422,whirlwind,Florida,location: Florida keyword: whirlwind text: Set...,0
3789,5381,fire truck,"Saipan, CNMI","location: Saipan, CNMI keyword: fire truck tex...",1
2726,3916,devastated,,location: keyword: devastated text: Obama Dec...,1


In [5]:
from sklearn.model_selection import train_test_split
train_df, valid_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['target'])

In [6]:
# define dataset and dataloader

SEQ_LEN = 50
KEYWORD_MAX_SEQ_LEN=3

class Dataset(torch.utils.data.Dataset):
    def __init__(self, df, test=False):
        self.df = df
        self.test = test

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text=self.df.iloc[idx]['text']

        input_ids = tokenizer.encode(text, add_special_tokens=True)
        attention_mask = [1] * len(input_ids)

        if (len(input_ids)<SEQ_LEN):
            input_ids=input_ids+[0]*(SEQ_LEN-len(input_ids))
            attention_mask=attention_mask+[0]*(SEQ_LEN-len(attention_mask))
        elif (len(input_ids)>SEQ_LEN):
            input_ids=input_ids[:SEQ_LEN]
            attention_mask=attention_mask[:SEQ_LEN]

        
        keyword=self.df.iloc[idx]['keyword']
        keyword_ids = tokenizer.encode(keyword, add_special_tokens=True)
        if (len(keyword_ids)<KEYWORD_MAX_SEQ_LEN):
            keyword_ids=keyword_ids+[0]*(KEYWORD_MAX_SEQ_LEN-len(keyword_ids))
        elif (len(keyword_ids)>KEYWORD_MAX_SEQ_LEN):
            keyword_ids=keyword_ids[:KEYWORD_MAX_SEQ_LEN]

        if self.test:
            return torch.tensor(input_ids), torch.tensor(attention_mask), torch.tensor(keyword_ids)
        else:
            label=self.df.iloc[idx]['target']
            return torch.tensor(input_ids), torch.tensor(label), torch.tensor(attention_mask), torch.tensor(keyword_ids)
        
train_dataset = Dataset(train_df)
valid_dataset = Dataset(valid_df)

In [7]:

class ClassifierModel(torch.nn.Module):
    def __init__(self, p_dropout=0.5):
        super().__init__()
        self.model = DistilBertModel.from_pretrained('distilbert-base-cased')
        self.linear1 = torch.nn.Linear(768, 2)
        
    def forward(self, x, attention_mask=None, keywords=None):
        output = self.model(x, attention_mask=attention_mask)["last_hidden_state"]
        pooled_output = torch.mean(output, dim=1)
        output = self.linear1(pooled_output)
        return output
    
my_model=ClassifierModel(p_dropout=0.5)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
for xb, yb, att_mask, keywords in train_dataloader:
    print("xb.shape", xb.shape)
    print("yb.shape", yb.shape)
    print("att_mask.shape", att_mask.shape)
    print("keywords.shape", keywords.shape)
    print(my_model(xb, attention_mask=att_mask, keywords=keywords).shape)
    break



Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


xb.shape torch.Size([32, 30])
yb.shape torch.Size([32])
att_mask.shape torch.Size([32, 30])
keywords.shape torch.Size([32, 3])
torch.Size([32, 2])


In [8]:
optimizer=torch.optim.AdamW(my_model.parameters(), lr=0.0001)
loss_fn=torch.nn.CrossEntropyLoss()
import neptune

class Learner():
    def __init__(self, model, optimizer, loss_fn, scheduler, batch_size=32):
        self.model=model
        self.optimizer=optimizer
        self.loss_fn=loss_fn
        self.scheduler=scheduler
        self.device=torch.device("cpu")
        if torch.cuda.is_available():
            self.device=torch.device("cuda")
        #elif torch.backends.mps.is_available():
        #    self.device=torch.device("mps")

        self.model.to(self.device)
        self.run=neptune.init_run(
            project="bernd.heidemann/clickbait-classification",
            api_token="eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiIzNjBlYzVkNi0zZTUwLTQ1ODYtODhlNC02NDUxNDg0MDdjNzUifQ==",
        )
        self.batch_size=batch_size
        self.train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)
        self.valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=self.batch_size, shuffle=True)

    def fit(self, lr=0.001, epochs=10):
        self.run["parameters"] = {
            "lr": lr,
            "epochs": epochs,
            "batch_size": self.batch_size,
        }
        bar = tqdm(total=len(self.train_dataloader) * epochs, desc="Training")
        bar.set_description("Epoch 0/{}".format(epochs))

        for epoch in range(epochs):
            self.model.train()            
            for xb, yb, att_mask, keywords in self.train_dataloader:
                
                xb=xb.to(self.device)
                yb=yb.to(self.device)
                att_mask=att_mask.to(self.device)
                keywords=keywords.to(self.device)
                pred=self.model(xb, attention_mask=att_mask, keywords=keywords)
                loss=self.loss_fn(pred, yb)
                self.run["train_loss"].log(loss.item())
                loss.backward()
                self.optimizer.step()
                self.optimizer.zero_grad()
                bar.update(1)
            self.scheduler.step()
            self.model.eval()
            # log current state to neptune
            metrics=self.get_accuracy()
            self.run["valid_accuracy"].log(metrics["accuracy"])
            self.run["valid_loss"].log(metrics["loss"])
            
                
    def get_accuracy(self):
        self.model.eval()
        with torch.no_grad():
            correct=0
            losses=[]
            for xb, yb, att_mask, keywords in self.valid_dataloader:
                xb=xb.to(self.device)
                yb=yb.to(self.device)
                att_mask=att_mask.to(self.device)
                keywords=keywords.to(self.device)
                pred=self.model(xb, attention_mask=att_mask, keywords=keywords)
                loss=self.loss_fn(pred, yb)
                losses.append(loss.item())
                pred=torch.argmax(pred, dim=1)
                correct+=torch.sum(pred==yb).item()
            return {
                "accuracy": correct/len(valid_dataset),
                "loss": np.mean(losses)
            }
    

In [9]:
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, 5, eta_min=0.0001)

my_model=ClassifierModel(p_dropout=0.5)
learner=Learner(my_model, optimizer, loss_fn, scheduler, batch_size=128)
learner.fit(lr=0.0001, epochs=2)

  self.run=neptune.init_run(


https://app.neptune.ai/bernd.heidemann/clickbait-classification/e/CLIC-79


Training:   0%|          | 0/96 [00:00<?, ?it/s]

In [10]:
learner.get_accuracy()

{'accuracy': 0.8266579120157583, 'loss': 0.4035767888029416}

In [11]:
local_test_path='test.csv'
kaggle_test_path='/kaggle/input/nlp-getting-started/test.csv'

df_submission_test_data=pd.read_csv(kaggle_test_path if kaggle else local_test_path)
df_submission_test_data.sample(5)

Unnamed: 0,id,keyword,location,text
513,1680,bridge%20collapse,,New: Two giant cranes holding a bridge collaps...
1619,5451,first%20responders,"Tennessee, USA",Please pray for employees residents and first...
1391,4586,emergency%20plan,In erotic world,Calgary takes another beating from summer stor...
2281,7620,pandemonium,"Toronto, Canada",On the Christie Hillside: Game 4 - Pandemonium...
2561,8548,screams,,one of my fav lydia screams is in 4x11 when sh...


In [12]:
df_submission_test_data=prepare_df(df_submission_test_data)

In [13]:
test_dataset=Dataset(df_submission_test_data, test=True)
test_loader=torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False)

for xb, att_mask, keywords in test_loader:
    xb=xb.to(learner.device)
    att_mask=att_mask.to(learner.device)
    keywords=keywords.to(learner.device)
    print(xb.shape)
    print(att_mask.shape)
    print(my_model(xb, attention_mask=att_mask, keywords=keywords).shape)
    break


torch.Size([32, 30])
torch.Size([32, 30])
torch.Size([32, 2])


In [14]:
predictions=[]

for xb, att_mask, keywords in test_loader:
    xb=xb.to(learner.device)
    att_mask=att_mask.to(learner.device)
    keywords=keywords.to(learner.device)
    pred=my_model(xb, attention_mask=att_mask, keywords=keywords)
    pred=torch.argmax(pred, dim=1)
    predictions+=pred.tolist()

df_submission_test_data['target']=predictions
df_submission_test_data[['id', 'target']].to_csv('submission.csv', index=False)

In [15]:
# 0.797