In [151]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/best-model/best_model/config.json
/kaggle/input/best-model/best_model/model.safetensors


In [None]:
pip install peft


In [2]:
from datasets import load_dataset
from torch.utils.data import Dataset,DataLoader
from transformers import AutoTokenizer,AutoModelForTokenClassification
from peft import get_peft_model,TaskType,LoraConfig

# social_text_dataset

In [6]:
dataset=load_dataset("conll2003") 

The repository for conll2003 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/conll2003.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [8]:
train_data=dataset['train']
test_data=dataset["test"]
validation_data=dataset['validation']

In [9]:
train_data['ner_tags'][0]

[3, 0, 7, 0, 0, 0, 7, 0, 0]

In [10]:
dataset['train'][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [123]:
labels=dataset['train'].features['ner_tags'].feature.names


In [124]:
print("NER Labels:", labels)

NER Labels: ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']


In [125]:
def labels_to_idx(label_list):
    dic={}
    for i in range(len(label_list)):
        key=label_list[i]
        dic[key]=i
    return dic
    

In [126]:
labels_to_idx=labels_to_idx(labels)
labels_to_idx

{'O': 0,
 'B-PER': 1,
 'I-PER': 2,
 'B-ORG': 3,
 'I-ORG': 4,
 'B-LOC': 5,
 'I-LOC': 6,
 'B-MISC': 7,
 'I-MISC': 8}

In [127]:
def idx_to_labels(labels_list):
    dic={}
    for i in range(len(labels_list)):
        dic[i]=labels[i]
    return dic
        
        

In [128]:
idx2labels=idx_to_labels(labels)
idx2labels

{0: 'O',
 1: 'B-PER',
 2: 'I-PER',
 3: 'B-ORG',
 4: 'I-ORG',
 5: 'B-LOC',
 6: 'I-LOC',
 7: 'B-MISC',
 8: 'I-MISC'}

In [130]:
from transformers import AutoModelForTokenClassification


model = AutoModelForTokenClassification.from_pretrained(
    'distilbert-base-cased',
    num_labels=9,
    id2label=idx2labels
)


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [131]:
lora_config = LoraConfig(
    task_type=TaskType.TOKEN_CLS, 
    r=16,                          
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=["q_lin","k_lin","v_lin"]
)

In [132]:
peftmodel=get_peft_model(base_model,lora_config)

wow it's nearly 0.7% of all parameters 

In [133]:
trainable_parameters=sum(p.numel() for p in peftmodel.parameters() if p.requires_grad)
all_parameters=sum(p.numel() for p in loaded_model.parameters())
print(f"trainable_parameters : {trainable_parameters}")
print(f"all_parameters : {all_parameters}")

print(f"ratio is {(trainable_parameters/all_parameters)*100}")


trainable_parameters : 449289
all_parameters : 65197833
ratio is 0.6891164618922226


In [111]:
# for name, p in model.named_parameters():
#     print(f"Layer: {name}")
#     print(f"Shape: {p.shape}")
#     print(f"Parameters: {p.numel():,}")
#     print(f"Trainable: {p.requires_grad}")
#     print("-" * 50)

now let's make our customdaset class

In [134]:
import torch
class collndataset(Dataset):
    def __init__(self,data,tokenizer):
        self.text=data['tokens']
        self.labels=data['ner_tags']
        self.tokenizer=tokenizer
    def __len__(self):
        return len(self.labels)
    def __getitem__(self,idx):
        text=self.text[idx]
        labels=self.labels[idx]
        inputs=self.tokenizer(text,
                              is_split_into_words=True,
                             padding="max_length",
                             max_length=512,
                             truncation=True,
                             return_tensors="pt")
        word_ids = inputs.word_ids()
        prev_word_idx = None
        label_ids = []  
        
        for word_idx in word_ids:
            if word_idx is None:  
                label_ids.append(-100)
            elif word_idx != prev_word_idx: 
                label_ids.append(labels[word_idx])
            else:
                label_ids.append(-100)
            prev_word_idx = word_idx  
    
            
                
                
        return {
            "input_ids":inputs["input_ids"].squeeze(0),
            "attention_mask":inputs["attention_mask"].squeeze(0),
            "labels":torch.tensor(label_ids,dtype=torch.long)
        }

In [135]:
traindata=collndataset(train_data,tokenizer)
validationdata=collndataset(validation_data,tokenizer)
testdata=collndataset(test_data,tokenizer)


In [136]:
traindataloader=DataLoader(traindata,batch_size=16,shuffle=True)
validationdataloader=DataLoader(validationdata,batch_size=16,shuffle=True)
testdataloader=DataLoader(testdata,batch_size=16,shuffle=True)

In [None]:
# batch=next(iter(traindataloader))
# print(batch.keys())
# print(train_data['tokens'][0])
# print(batch['input_ids'].shape)
# print(batch['labels'].shape)
# break


In [137]:
from transformers import get_linear_schedule_with_warmup

epochs=2

num_training_steps=len(traindataloader)*epochs

optimizer = torch.optim.AdamW(model.parameters(),lr=2e-5)

scheduler=get_linear_schedule_with_warmup(optimizer,
                                        num_warmup_steps=0,
                                        num_training_steps=num_training_steps)


In [138]:
def evaluate(model, eval_dataloader):
    model.eval()
    total_eval_loss = 0
    device = "cuda" if torch.cuda.is_available() else 'cpu'
    with torch.no_grad():
        for batch in eval_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            
            total_eval_loss += outputs.loss.item()
    
    avg_eval_loss = total_eval_loss / len(eval_dataloader)
    return avg_eval_loss

i aleardy trianed the model and load it with this method :

**Note:i will use (model) the already trained one ,but if you will run this notebook you will replace (model) with (peftmode)**



In [156]:
# model.load_state_dict(torch.load("model_state.pt"))

In [92]:
import tqdm
device = "cuda" if torch.cuda.is_available() else 'cpu'
peftmodel.to(device)

best_loss=float('inf')

for i in range(epochs):
    peftmodel.train()

    total_loss=0
    
    #progress_bar = tqdm(traindataloader, desc=f"Epoch {i+1}")
    
    for batch_idx,batch in enumerate( traindataloader ) :
        
        
        input_ids=batch['input_ids'].to(device)
        attention_mask=batch['attention_mask'].to(device)
        labels=batch['labels'].to(device)

        optimizer.zero_grad()
        
        output=peftmodel(
            input_ids = input_ids,
            attention_mask = attention_mask,
            labels = labels)
                    
        loss = output.loss
        
        total_loss += loss.item()
        
        loss.backward()
        # torch.nn.utils.clip_grad_norm(model.parameters(),max_norm=1.00)
        
        optimizer.step()
        scheduler.step()
        if (batch_idx+1)%100==0:
            avg_loss=total_loss/(batch_idx+1)
            print(f"epoch:{i+1}-batch:{batch_idx+1}-loss : {avg_loss}")
        # progress_bar.set_postfix({'loss':loss.item()})
        
    peftmodel.eval()
    eval_loss=evaluate(peftmodel,validationdataloader)
    avg_loss = total_loss/len(traindataloader)
    print(f"epoch num {i+1} with error = {avg_loss:.4f}")
    print(f"validation_loss : {eval_loss:.4f}")    


if eval_loss<best_loss:
    best_loss=eval_loss
    torch.save(peftmodel.state_dict(), "model_state.pt")

    

epoch:1-batch:100-loss : 0.4874436432123184
epoch:1-batch:200-loss : 0.3761404311656952
epoch:1-batch:300-loss : 0.3335909086465836
epoch:1-batch:400-loss : 0.3047833403199911
epoch:1-batch:500-loss : 0.2825640935227275
epoch:1-batch:600-loss : 0.2681832083687186
epoch:1-batch:700-loss : 0.25770818634756976
epoch:1-batch:800-loss : 0.2476762123592198
epoch num 1 with error = 0.2398
validation_loss : 0.1753
epoch:2-batch:100-loss : 0.14724009685218334
epoch:2-batch:200-loss : 0.1508475597575307
epoch:2-batch:300-loss : 0.15356309238821267
epoch:2-batch:400-loss : 0.15277369162067772
epoch:2-batch:500-loss : 0.15017040549218655
epoch:2-batch:600-loss : 0.14775292112181584
epoch:2-batch:700-loss : 0.14656131351100546
epoch:2-batch:800-loss : 0.14546042322646827
epoch num 2 with error = 0.1452
validation_loss : 0.1641


In [147]:
device = "cuda" if torch.cuda.is_available() else 'cpu'

In [148]:
device

'cuda'

In [149]:
    peftmodel.eval()
    peftmodel.to(device)
    eval_loss=evaluate(peftmodel,validationdataloader)
    print(f"validation_loss : {eval_loss:.4f}")   

validation_loss : 0.1605


In [143]:
def compute_metrics(predictions,labels):
    predictions = predictions.cpu().numpy().flatten()
    labels = labels.cpu().numpy().flatten()

    
    mask = labels != -100
    true_labels=labels[mask]
    true_predictions=predictions[mask]
    correct=np.sum(true_predictions==true_labels)
    total=len(true_labels)
    acc=correct/total
    return acc


    

In [152]:
total_acc=0
model.eval()
with torch.no_grad():
        for batch in testdataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels)
            predictions=torch.argmax(outputs.logits,dim=2)
            acc=compute_metrics(predictions,labels)
            total_acc+=acc
        avg_acc=total_acc/len(testdataloader)
        print(f"acc for test data is :{avg_acc:.4f} ")
            
            
    

acc for test data is :0.9420 


In [153]:
def inference(text,model,tokenizer,id2label):
  device = "cuda" if torch.cuda.is_available() else 'cpu'
  text=text.split()
  model.eval()
  model.to(device)
  inputs=tokenizer(text,
                   is_split_into_words=True,
                   padding="max_length",
                   max_length=512,
                   truncation=True,
                   return_tensors='pt').to(device)
                 
  with torch.no_grad():
    outputs=model(**inputs)
    predictions=torch.argmax(outputs.logits,dim=2)
  #handle_subwords
  results=[]

  word_ids = inputs.word_ids()
  prev_word_idx =None
  for idx, word_idx in enumerate(word_ids):
    if word_idx is None:
      continue
    elif word_idx  !=prev_word_idx:
      token = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][idx].item())
      pred_id = predictions[0][idx].item()
      label = id2label[pred_id]
      results.append({
          
          'token':token,
          'label':label } 
          )
    prev_word_idx=word_idx
  for i in results:
    print(f"token : {i['token']} /n label : {i['label']}")
  return results
  

In [172]:
text = "Amazon Seattle is good's  "
results = inference(text, model, tokenizer, idx2labels)

token : Amazon /n label : B-ORG
token : Seattle /n label : B-LOC
token : is /n label : O
token : good /n label : O


In [161]:
test_cases = [
    "Microsoft CEO Satya Nadella visited Apple in California",
    "The European Union met in Brussels, Belgium",
    "Amazon and Google announced partnership in Seattle"
]

for text in test_cases:
    print("\nTesting:", text)
    results = inference(text, model, tokenizer, idx2labels)


Testing: Microsoft CEO Satya Nadella visited Apple in California
token : Microsoft /n label : B-ORG
token : CEO /n label : O
token : Sa /n label : I-PER
token : Na /n label : I-PER
token : visited /n label : O
token : Apple /n label : B-ORG
token : in /n label : O
token : California /n label : B-ORG

Testing: The European Union met in Brussels, Belgium
token : The /n label : O
token : European /n label : B-MISC
token : Union /n label : I-ORG
token : met /n label : O
token : in /n label : O
token : Brussels /n label : B-LOC
token : Belgium /n label : B-LOC

Testing: Amazon and Google announced partnership in Seattle
token : Amazon /n label : B-ORG
token : and /n label : O
token : Google /n label : B-ORG
token : announced /n label : O
token : partnership /n label : O
token : in /n label : O
token : Seattle /n label : B-ORG
