In [1]:
kaggle=False
kaggle_path='/kaggle/input/nlp-getting-started/train.csv'
local_path='train.csv'
import os
if kaggle:
    os.system('pip install neptune')
    
from transformers import DistilBertTokenizer, DistilBertModel
import torch
import pandas as pd 
import numpy as np
from tqdm.notebook import tqdm
# models: https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')


df=pd.read_csv(kaggle_path if kaggle else local_path)


def prepare_df(df):
    df=df.copy()
    df.keyword=df.keyword.str.replace('%20',' ')
    
    # enrich text with location and text, when they are not null
    df['text']='location: ' + df['location'].fillna('') + ' | keyword: ' +  df['keyword'].fillna('') + ' | text: ' + df['text'].fillna('')
    # drop location and keyword
    df=df.drop(columns=['location','keyword'])
    # drop id
    df=df.drop(columns=['id'])
    return df

df=prepare_df(df)
df.sample(10)



Unnamed: 0,text,target
1853,location: | keyword: crush | text: Ina Buted ...,1
4319,"location: Benicia, CA | keyword: hellfire | t...",0
6999,location: | keyword: twister | text: @carolin...,0
6337,location: USA | keyword: structural failure | ...,1
1742,location: | keyword: collision | text: Beat:G...,1
5091,location: Inexpressible Island | keyword: nuc...,1
6823,location: New York City | keyword: trapped | t...,0
5122,location: | keyword: nuclear reactor | text: ...,1
2242,"location: College Station, TX | keyword: delug...",0
5494,"location: San Diego, CA | keyword: quarantine ...",0


In [2]:
from sklearn.model_selection import train_test_split
train_df, valid_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['target'])

In [3]:
# define dataset and dataloader

SEQ_LEN = 50

class Dataset(torch.utils.data.Dataset):
    def __init__(self, df, test=False):
        self.df = df
        self.test = test

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text=self.df.iloc[idx]['text']

        input_ids = tokenizer.encode(text, add_special_tokens=True)
        attention_mask = [1] * len(input_ids)

        if (len(input_ids)<SEQ_LEN):
            input_ids=input_ids+[0]*(SEQ_LEN-len(input_ids))
            attention_mask=attention_mask+[0]*(SEQ_LEN-len(attention_mask))
        elif (len(input_ids)>SEQ_LEN):
            input_ids=input_ids[:SEQ_LEN]
            attention_mask=attention_mask[:SEQ_LEN]

        if self.test:
            return torch.tensor(input_ids), torch.tensor(attention_mask)
        else:
            label=self.df.iloc[idx]['target']
            return torch.tensor(input_ids), torch.tensor(label), torch.tensor(attention_mask)
        
train_dataset = Dataset(train_df)
valid_dataset = Dataset(valid_df)






In [4]:

class ClassifierModel(torch.nn.Module):
    def __init__(self, p_dropout=0.5):
        super().__init__()
        #self.model = DistilBertModel.from_pretrained('distilbert-base-multilingual-cased')

        self.model = DistilBertModel.from_pretrained('distilbert-base-cased')

        #self.freeze()
        self.linear1 = torch.nn.Linear(768, 2)
        #self.linear2=torch.nn.Linear(350, 2)
        #self.maxpool=torch.nn.MaxPool1d(SEQ_LEN)
        self.dropout=torch.nn.Dropout(p_dropout)
        #self.batchnorm=torch.nn.BatchNorm1d(350)
        #self.relu=torch.nn.ReLU()
        

    def freeze(self):
        for param in self.model.parameters():
            param.requires_grad = False

    def forward(self, x, attention_mask=None):
        output = self.model(x, attention_mask=attention_mask)["last_hidden_state"]
        pooled_output = torch.mean(output, dim=1)
        pooled_output = self.dropout(pooled_output)
        output = self.linear1(pooled_output)
        #output = self.relu(output)
        #output = self.dropout(output)
        #output = self.batchnorm(output)
        #output = self.linear2(output)
        return output
    
my_model=ClassifierModel(p_dropout=0.5)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
for xb, yb, att_mask in train_dataloader:
    print(xb.shape)
    print(yb.shape)
    print(att_mask.shape)
    print(my_model(xb, attention_mask=att_mask).shape)
    break



torch.Size([32, 50])
torch.Size([32])
torch.Size([32, 50])
torch.Size([32, 2])


In [5]:

import neptune

class Learner():
    def __init__(self, model, train_dataloader, valid_dataloader, batch_size=32):
        self.model=model
        self.loss_fn=torch.nn.CrossEntropyLoss()
        self.device=torch.device("cpu")
        if torch.cuda.is_available():
            self.device=torch.device("cuda")
        #elif torch.backends.mps.is_available():
        #    self.device=torch.device("mps")

        self.model.to(self.device)
        self.run=neptune.init_run(
            project="bernd.heidemann/clickbait-classification",
            api_token="eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiIzNjBlYzVkNi0zZTUwLTQ1ODYtODhlNC02NDUxNDg0MDdjNzUifQ==",
        )
        self.batch_size=batch_size
        self.train_dataloader = train_dataloader
        self.valid_dataloader = valid_dataloader

    def fit(self, lr=0.001, epochs=10):
        self.run["parameters"] = {
            "lr": lr,
            "epochs": epochs,
            "batch_size": self.batch_size,
        }
        optimizer=torch.optim.AdamW(self.model.parameters(), lr=lr)
        scheduler=scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
        bar = tqdm(total=len(self.train_dataloader) * epochs, desc="Training")
        bar.set_description("Epoch 0/{}".format(epochs))

        for epoch in range(epochs):
            self.model.train()            
            for xb, yb, att_mask in self.train_dataloader:
                
                xb=xb.to(self.device)
                yb=yb.to(self.device)
                att_mask=att_mask.to(self.device)
                pred=self.model(xb, attention_mask=att_mask)
                loss=self.loss_fn(pred, yb)
                self.run["train_loss"].log(loss.item())
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                bar.update(1)
            scheduler.step()
            self.model.eval()
            # log current state to neptune
            if self.valid_dataloader is not None:
                metrics=self.get_accuracy()
                self.run["valid_accuracy"].log(metrics["accuracy"])
                self.run["valid_loss"].log(metrics["loss"])
                bar.set_description("Epoch {}/{} validAccuracy: {:.2f} validLoss: {:.2f}".format(epoch+1, epochs, metrics["accuracy"], metrics["loss"]))
            
                
    def get_accuracy(self):
        self.model.eval()
        with torch.no_grad():
            correct=0
            losses=[]
            for xb, yb, att_mask in self.valid_dataloader:
                xb=xb.to(self.device)
                yb=yb.to(self.device)
                att_mask=att_mask.to(self.device)
                pred=self.model(xb, attention_mask=att_mask)
                loss=self.loss_fn(pred, yb)
                losses.append(loss.item())
                pred=torch.argmax(pred, dim=1)
                correct+=torch.sum(pred==yb).item()
            return {
                "accuracy": correct/len(valid_dataset),
                "loss": np.mean(losses)
            }
    

In [6]:

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=32, shuffle=False)

learner=Learner(my_model, train_dataloader, valid_dataloader, batch_size=128)
learner.fit(lr=0.0001, epochs=2)



https://app.neptune.ai/bernd.heidemann/clickbait-classification/e/CLIC-105


Training:   0%|          | 0/382 [00:00<?, ?it/s]

In [7]:
learner.get_accuracy()

{'accuracy': 0.7806959947472094, 'loss': 0.5223116160680851}

In [19]:
my_model=ClassifierModel(p_dropout=0.5)
full_dataset = Dataset(df, test=False)
full_dataloader = torch.utils.data.DataLoader(full_dataset, batch_size=32, shuffle=False)

learner=Learner(my_model, full_dataloader, None, batch_size=128)
learner.fit(lr=0.0001, epochs=1)

https://app.neptune.ai/bernd.heidemann/clickbait-classification/e/CLIC-108


Training:   0%|          | 0/238 [00:00<?, ?it/s]

In [20]:
local_test_path='test.csv'
kaggle_test_path='/kaggle/input/nlp-getting-started/test.csv'

df_submission_test_data=pd.read_csv(kaggle_test_path if kaggle else local_test_path)
df_submission_test_data.sample(5)

Unnamed: 0,id,keyword,location,text
2653,8870,smoke,your mom,would definitely have way more money if i didn...
1966,6633,inundated,That London,.@38_degrees Hello. I have been inundated by p...
40,125,accident,"Frankfurt, Germany",@DaveOshry @Soembie So if I say that I met her...
271,887,bioterrorism,Amsterdam NL or Greenwich USA,Is it time to hedge against catastrophic risks...
1962,6615,inundated,Chicagoland,WARNING: This string will be inundated with wi...


In [21]:
df_submission_test_data=prepare_df(df_submission_test_data)
df_submission_test_data.sample(5)

Unnamed: 0,text
2516,location: | keyword: ruin | text: To respect ...
2091,location: | keyword: mayhem | text: RETWEET #...
2749,location: | keyword: suicide bomber | text: q...
289,location: Mo.City | keyword: blaze | text: @_A...
2254,location: | keyword: oil spill | text: Refugi...


In [22]:
test_dataset=Dataset(df_submission_test_data, test=True)
test_loader=torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False)

for xb, att_mask in test_loader:
    xb=xb.to(learner.device)
    att_mask=att_mask.to(learner.device)
    print(xb.shape)
    print(att_mask.shape)
    print(my_model(xb, attention_mask=att_mask).shape)
    break


torch.Size([32, 50])
torch.Size([32, 50])
torch.Size([32, 2])


In [23]:
predictions=[]
my_model.eval()

for xb, att_mask in test_loader:
    xb=xb.to(learner.device)
    att_mask=att_mask.to(learner.device)
    pred=my_model(xb, attention_mask=att_mask)
    pred=torch.argmax(pred, dim=1)
    predictions+=pred.tolist()


df_submission_test_data['target']=predictions
df_submission_test_data['id']=pd.read_csv(local_test_path)['id']



df_submission_test_data[['id', 'target']].to_csv('submission.csv', index=False)

In [24]:
df_submission_test_data[['id', 'target']].sample(10)

Unnamed: 0,id,target
38,123,0
275,901,0
1064,3507,1
3117,10326,0
2074,6969,1
883,2914,0
1826,6176,0
2176,7279,1
1800,6080,0
2805,9327,0


In [None]:
#0.7768