In [7]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch
import pandas as pd
import numpy as np
# models: https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

df=pd.read_csv('clickbait_data.csv')
df

Unnamed: 0,headline,clickbait
0,Should I Get Bings,1
1,Which TV Female Friend Group Do You Belong In,1
2,"The New ""Star Wars: The Force Awakens"" Trailer...",1
3,"This Vine Of New York On ""Celebrity Big Brothe...",1
4,A Couple Did A Stunning Photo Shoot With Their...,1
...,...,...
31995,"To Make Female Hearts Flutter in Iraq, Throw a...",0
31996,"British Liberal Democrat Patsy Calton, 56, die...",0
31997,Drone smartphone app to help heart attack vict...,0
31998,"Netanyahu Urges Pope Benedict, in Israel, to D...",0


In [8]:
from sklearn.model_selection import train_test_split
train_df, valid_df = train_test_split(df, test_size=0.2, random_state=42)

In [9]:
# define dataset and dataloader

SEQ_LEN = 30

class Dataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text=self.df.iloc[idx]['headline']

        input_ids = tokenizer.encode(text, add_special_tokens=True)
        if (len(input_ids)<SEQ_LEN):
            input_ids=input_ids+[0]*(SEQ_LEN-len(input_ids))
        elif (len(input_ids)>SEQ_LEN):
            input_ids=input_ids[:SEQ_LEN]
        label=self.df.iloc[idx]['clickbait']
        return torch.tensor(input_ids), torch.tensor(label)
    
train_dataset = Dataset(train_df)
valid_dataset = Dataset(valid_df)






In [10]:

class Classifier(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.model = DistilBertModel.from_pretrained('distilbert-base-multilingual-cased')
        self.linear = torch.nn.Linear(768, 2)

    def forward(self, x):
        output = self.model(x)["last_hidden_state"]   # shape (batch_size, seq_len, 768)
        pooled_output = torch.mean(output, dim=1)     # shape (batch_size, 768)
        output = self.linear(pooled_output)           # shape (batch_size, 2)
        return output
    
my_model=Classifier()
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
for xb, yb in train_dataloader:
    print(xb.shape)
    print(yb.shape)
    print(my_model(xb).shape)
    break

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


torch.Size([32, 30])
torch.Size([32])
torch.Size([32, 2])


In [11]:
optimizer=torch.optim.AdamW(my_model.parameters(), lr=0.0001)
loss_fn=torch.nn.CrossEntropyLoss()
import neptune

class Learner():
    def __init__(self, model, optimizer, loss_fn, batch_size=32):
        self.model=model
        self.optimizer=optimizer
        self.loss_fn=loss_fn

        self.device=torch.device("cpu")
        if torch.cuda.is_available():
            self.device=torch.device("cuda")
        #elif torch.backends.mps.is_available():
        #    self.device=torch.device("mps")

        self.model.to(self.device)
        self.run=neptune.init_run(
            project="bernd.heidemann/clickbait-classification",
            api_token="eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiIzNjBlYzVkNi0zZTUwLTQ1ODYtODhlNC02NDUxNDg0MDdjNzUifQ==",
        )
        self.batch_size=batch_size
        self.train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)
        self.valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=self.batch_size, shuffle=True)

    def fit(self, lr=0.001, epochs=10):
        self.run["parameters"] = {
            "lr": lr,
            "epochs": epochs,
            "batch_size": self.batch_size,
        }
        for epoch in range(epochs):
            self.model.train()
            batch_count=0
            for xb, yb in self.train_dataloader:
                batch_count+=1
                xb=xb.to(self.device)
                yb=yb.to(self.device)
                pred=self.model(xb)
                loss=self.loss_fn(pred, yb)
                self.run["train_loss"].log(loss.item())
                loss.backward()
                self.optimizer.step()
                self.optimizer.zero_grad()
            self.model.eval()
            # log current state to neptune
            with torch.no_grad():
                valid_loss=0
                for xb, yb in self.valid_dataloader:
                    xb=xb.to(self.device)
                    yb=yb.to(self.device)
                    pred=self.model(xb)
                    loss=self.loss_fn(pred, yb)
                    valid_loss+=loss.item()
                print("epoch: {}, valid_loss: {}".format(epoch, valid_loss/len(self.valid_dataloader)))
                self.run["valid_loss"].log(valid_loss/len(self.valid_dataloader))
                
    def get_accuracy(self):
        self.model.eval()
        with torch.no_grad():
            correct=0
            for xb, yb in self.valid_dataloader:
                xb=xb.to(self.device)
                yb=yb.to(self.device)
                pred=self.model(xb)
                pred=torch.argmax(pred, dim=1)
                correct+=torch.sum(pred==yb).item()
            return correct/len(valid_dataset)

        

In [12]:
learner=Learner(my_model, optimizer, loss_fn, batch_size=128)
learner.fit(lr=0.0001, epochs=1)

https://app.neptune.ai/bernd.heidemann/clickbait-classification/e/CLIC-122
batches:  200
batch: 1
batch: 2
batch: 3
batch: 4
batch: 5
batch: 6
batch: 7
batch: 8
batch: 9
batch: 10
batch: 11
batch: 12
batch: 13
batch: 14
batch: 15
batch: 16
batch: 17
batch: 18
batch: 19
batch: 20
batch: 21
batch: 22
batch: 23
batch: 24
batch: 25
batch: 26
batch: 27
batch: 28
batch: 29
batch: 30
batch: 31
batch: 32
batch: 33
batch: 34
batch: 35
batch: 36
batch: 37
batch: 38
batch: 39
batch: 40
batch: 41
batch: 42
batch: 43
batch: 44
batch: 45
batch: 46
batch: 47
batch: 48
batch: 49
batch: 50
batch: 51
batch: 52
batch: 53
batch: 54
batch: 55
batch: 56
batch: 57
batch: 58
batch: 59
batch: 60
batch: 61
batch: 62
batch: 63
batch: 64
batch: 65
batch: 66
batch: 67
batch: 68
batch: 69
batch: 70
batch: 71
batch: 72
batch: 73
batch: 74
batch: 75
batch: 76
batch: 77
batch: 78


KeyboardInterrupt: 

In [13]:
learner.get_accuracy()

0.975

In [None]:
learner.run.stop()

Shutting down background jobs, please wait a moment...
Done!
All 0 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/bernd.heidemann/clickbait-classification/e/CLIC-17/metadata
