In [1]:
import numpy as np
import pandas as pd
import transformers
from transformers import AutoTokenizer
import datasets

import nltk
from nltk.corpus import brown # POS data

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
nltk.download('brown')
nltk.download('universal_tagset')

In [None]:
def get_sentence_pos(sentences_pos_tuple):
    sentence = " ".join([word for word, pos in sentences_pos_tuple])
    pos = " ".join([pos for word, pos in sentences_pos_tuple])

    return (sentence, pos)



In [None]:
corpus = brown.tagged_sents(tagset="universal")

In [None]:
from tqdm import tqdm

In [None]:
distinct_pos = []

In [None]:
sentence_pos_dict = pd.DataFrame(columns=['sentence','pos'])
i=0
distinct_pos = []
for sentences_pos_tuple in tqdm(corpus, total=len(corpus)):
    sentence, pos = get_sentence_pos(sentences_pos_tuple)
    a = pos.split(" ")
    distinct_pos = distinct_pos + a
    distinct_pos = list(set(distinct_pos))
    sentence_pos_dict.loc[i,"sentence"]=sentence
    sentence_pos_dict.loc[i,"pos"]=pos
    i+=1
distinct_pos = list(set(distinct_pos))


In [None]:
distinct_pos = list(set(distinct_pos))

In [None]:
import pickle
from sklearn.model_selection import train_test_split
sentence_pos_dict.columns = ['sentence','label']
train,test = train_test_split(sentence_pos_dict)

train.to_csv("./data/train.csv",index=False)
test.to_csv("./data/test.csv",index=False)
with open('distinct_pos.pkl', 'wb') as file:
    pickle.dump(distinct_pos, file)

In [2]:
import pickle
with open('distinct_pos.pkl', 'rb') as file:
    # Step 3: Use pickle.load to deserialize the content
    distinct_pos = pickle.load(file)

In [3]:
# make id2label & label2id
id2label = {i:value for i, value in enumerate(distinct_pos)}
label2id = {value:key for key, value in id2label.items()}

In [4]:
# Tokenizer
checkpoint = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


In [7]:
t = tokenizer(sentence_pos_dict.loc[0,"sentence"])
t

NameError: name 'sentence_pos_dict' is not defined

In [8]:
t.word_ids()

NameError: name 't' is not defined

In [5]:
from datasets import load_dataset
# Load the datasets from CSV files
dataset = load_dataset(
    "csv",
    data_files={'train': './data/train.csv', 'test': './data/test.csv'}
)


In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 43005
    })
    test: Dataset({
        features: ['sentence', 'label'],
        num_rows: 14335
    })
})

In [7]:
tokenizer(dataset['train'][0]['sentence'], )

{'input_ids': [101, 1124, 1674, 1136, 1928, 1106, 1474, 1115, 3379, 1575, 1103, 27466, 3080, 12888, 10308, 1104, 1875, 1105, 1117, 25211, 1194, 1103, 2303, 132, 132, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [8]:


# Tokenize for both input and target(label)
def tokenize_fn(batch):
    # Tokenize the input seq first
    # It will populate inputs_ids, attention_mask etc
    tokenized_inputs = tokenizer(batch['sentence'], truncation=True)

    # Tokenized_output using pos mapping
    label_batch_str = batch['label']
    label_batch_int = []
    for label_str in label_batch_str:
        pos_int=[-100]
        for pos in label_str.split(" "):
            #print(pos,label_str)
            pos_int.append(label2id[pos])
        pos_int+[-100]
        label_batch_int.append(pos_int)

    tokenized_inputs['labels'] = label_batch_int
    return tokenized_inputs

In [9]:
tokenized_datasets = dataset.map(
    tokenize_fn,
    batched=True,
    remove_columns=dataset['train'].column_names # Removing column other than input_ids, attention_mask, labels
)

Map: 100%|██████████| 14335/14335 [00:00<00:00, 16061.53 examples/s]


In [10]:
tokenizer.decode(tokenized_datasets['train'][100]['input_ids'])

'[CLS] ) The sorry fact about this young man, who was barely of age when he broke into major - league baseball, was that he really was a better ball player than he was given credit for being - - never so good as he claimed, and always an irritant to his associates, but a good steady performer when he could fight down the temptation to orate on his skills or cut up in public. [SEP]'

In [11]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    id2label = id2label,
    label2id = label2id
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
from transformers import TrainingArguments

train_args = TrainingArguments(
    "distilbert-finetuned-pos",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.1,
)



In [13]:
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [20]:
from transformers import Trainer
trainer = Trainer(
    model = model,
    args = train_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    data_collator=data_collator,
    
    #compute_metrics=compute_metric,
    tokenizer=tokenizer
)

In [21]:
trainer.train()

  3%|▎         | 501/16128 [01:38<50:19,  5.18it/s]  

{'loss': 0.2288, 'grad_norm': 9.21403980255127, 'learning_rate': 1.937996031746032e-05, 'epoch': 0.09}


  6%|▌         | 1001/16128 [03:05<51:30,  4.89it/s] 

{'loss': 0.2232, 'grad_norm': 20.19095802307129, 'learning_rate': 1.8759920634920635e-05, 'epoch': 0.19}


  9%|▉         | 1501/16128 [04:43<44:19,  5.50it/s]  

{'loss': 0.2203, 'grad_norm': 3.8241851329803467, 'learning_rate': 1.8139880952380957e-05, 'epoch': 0.28}


 12%|█▏        | 2001/16128 [06:19<48:11,  4.89it/s]  

{'loss': 0.2112, 'grad_norm': 7.395877361297607, 'learning_rate': 1.751984126984127e-05, 'epoch': 0.37}


 16%|█▌        | 2501/16128 [07:42<32:03,  7.08it/s]  

{'loss': 0.2067, 'grad_norm': 1.5942310094833374, 'learning_rate': 1.689980158730159e-05, 'epoch': 0.47}


 19%|█▊        | 3002/16128 [09:14<28:21,  7.71it/s]

{'loss': 0.2014, 'grad_norm': 12.371731758117676, 'learning_rate': 1.6279761904761905e-05, 'epoch': 0.56}


 22%|██▏       | 3501/16128 [10:52<30:24,  6.92it/s]  

{'loss': 0.192, 'grad_norm': 5.4341535568237305, 'learning_rate': 1.5659722222222223e-05, 'epoch': 0.65}


 25%|██▍       | 4001/16128 [12:28<47:49,  4.23it/s]

{'loss': 0.1923, 'grad_norm': 2.3859901428222656, 'learning_rate': 1.5039682539682541e-05, 'epoch': 0.74}


 28%|██▊       | 4501/16128 [14:04<34:17,  5.65it/s]  

{'loss': 0.1941, 'grad_norm': 11.205224990844727, 'learning_rate': 1.441964285714286e-05, 'epoch': 0.84}


 31%|███       | 5001/16128 [15:39<32:59,  5.62it/s]

{'loss': 0.1875, 'grad_norm': 7.098023414611816, 'learning_rate': 1.3799603174603176e-05, 'epoch': 0.93}


                                                    
 33%|███▎      | 5376/16128 [18:12<29:57,  5.98it/s]

{'eval_loss': 0.1535341888666153, 'eval_runtime': 82.8866, 'eval_samples_per_second': 172.947, 'eval_steps_per_second': 21.62, 'epoch': 1.0}


 34%|███▍      | 5501/16128 [18:40<30:15,  5.85it/s]   

{'loss': 0.1663, 'grad_norm': 2.324141025543213, 'learning_rate': 1.3179563492063495e-05, 'epoch': 1.02}


 37%|███▋      | 6001/16128 [20:12<33:54,  4.98it/s]

{'loss': 0.1565, 'grad_norm': 18.149566650390625, 'learning_rate': 1.2559523809523811e-05, 'epoch': 1.12}


 40%|████      | 6501/16128 [21:47<32:12,  4.98it/s]

{'loss': 0.1527, 'grad_norm': 13.958889961242676, 'learning_rate': 1.193948412698413e-05, 'epoch': 1.21}


 43%|████▎     | 7001/16128 [23:29<28:45,  5.29it/s]

{'loss': 0.1536, 'grad_norm': 4.796503067016602, 'learning_rate': 1.1319444444444444e-05, 'epoch': 1.3}


 47%|████▋     | 7501/16128 [24:58<34:23,  4.18it/s]

{'loss': 0.1418, 'grad_norm': 6.247701168060303, 'learning_rate': 1.0699404761904764e-05, 'epoch': 1.4}


 50%|████▉     | 8001/16128 [26:43<26:18,  5.15it/s]

{'loss': 0.1483, 'grad_norm': 26.537111282348633, 'learning_rate': 1.007936507936508e-05, 'epoch': 1.49}


 53%|█████▎    | 8501/16128 [28:18<24:51,  5.11it/s]

{'loss': 0.1448, 'grad_norm': 22.16181755065918, 'learning_rate': 9.459325396825398e-06, 'epoch': 1.58}


 56%|█████▌    | 9001/16128 [29:55<23:22,  5.08it/s]

{'loss': 0.139, 'grad_norm': 6.327390670776367, 'learning_rate': 8.839285714285714e-06, 'epoch': 1.67}


 59%|█████▉    | 9501/16128 [31:31<24:13,  4.56it/s]

{'loss': 0.1369, 'grad_norm': 6.721161842346191, 'learning_rate': 8.219246031746033e-06, 'epoch': 1.77}


 62%|██████▏   | 10001/16128 [33:06<19:37,  5.21it/s]

{'loss': 0.1334, 'grad_norm': 3.040335178375244, 'learning_rate': 7.599206349206349e-06, 'epoch': 1.86}


 65%|██████▌   | 10501/16128 [34:45<17:22,  5.40it/s]

{'loss': 0.127, 'grad_norm': 4.702414035797119, 'learning_rate': 6.979166666666667e-06, 'epoch': 1.95}


                                                     
 67%|██████▋   | 10752/16128 [36:57<16:36,  5.39it/s]

{'eval_loss': 0.13161221146583557, 'eval_runtime': 82.7501, 'eval_samples_per_second': 173.232, 'eval_steps_per_second': 21.656, 'epoch': 2.0}


 68%|██████▊   | 11001/16128 [37:53<15:09,  5.63it/s]   

{'loss': 0.1157, 'grad_norm': 7.766083717346191, 'learning_rate': 6.359126984126984e-06, 'epoch': 2.05}


 71%|███████▏  | 11501/16128 [39:32<14:51,  5.19it/s]

{'loss': 0.1102, 'grad_norm': 0.4694015681743622, 'learning_rate': 5.7390873015873015e-06, 'epoch': 2.14}


 74%|███████▍  | 12001/16128 [41:13<11:30,  5.98it/s]

{'loss': 0.1159, 'grad_norm': 3.3039238452911377, 'learning_rate': 5.119047619047619e-06, 'epoch': 2.23}


 78%|███████▊  | 12501/16128 [42:55<12:37,  4.79it/s]

{'loss': 0.1164, 'grad_norm': 13.152817726135254, 'learning_rate': 4.499007936507937e-06, 'epoch': 2.33}


 81%|████████  | 13001/16128 [44:34<10:14,  5.09it/s]

{'loss': 0.1065, 'grad_norm': 2.5193512439727783, 'learning_rate': 3.878968253968255e-06, 'epoch': 2.42}


 84%|████████▎ | 13501/16128 [46:16<08:52,  4.93it/s]

{'loss': 0.1111, 'grad_norm': 11.42715835571289, 'learning_rate': 3.258928571428572e-06, 'epoch': 2.51}


 87%|████████▋ | 14001/16128 [47:53<06:27,  5.49it/s]

{'loss': 0.1091, 'grad_norm': 6.992559909820557, 'learning_rate': 2.6388888888888893e-06, 'epoch': 2.6}


 90%|████████▉ | 14501/16128 [49:31<05:39,  4.80it/s]

{'loss': 0.1039, 'grad_norm': 2.436715602874756, 'learning_rate': 2.0188492063492067e-06, 'epoch': 2.7}


 93%|█████████▎| 15001/16128 [51:07<03:52,  4.84it/s]

{'loss': 0.1002, 'grad_norm': 19.11127281188965, 'learning_rate': 1.398809523809524e-06, 'epoch': 2.79}


 96%|█████████▌| 15501/16128 [52:47<01:54,  5.47it/s]

{'loss': 0.1097, 'grad_norm': 1.2619457244873047, 'learning_rate': 7.787698412698413e-07, 'epoch': 2.88}


 99%|█████████▉| 16001/16128 [54:25<00:27,  4.70it/s]

{'loss': 0.1076, 'grad_norm': 11.24446964263916, 'learning_rate': 1.5873015873015874e-07, 'epoch': 2.98}


                                                     
100%|██████████| 16128/16128 [55:58<00:00,  7.97it/s]

{'eval_loss': 0.10742051154375076, 'eval_runtime': 71.3245, 'eval_samples_per_second': 200.983, 'eval_steps_per_second': 25.125, 'epoch': 3.0}


100%|██████████| 16128/16128 [56:02<00:00,  4.80it/s]

{'train_runtime': 3362.9097, 'train_samples_per_second': 38.364, 'train_steps_per_second': 4.796, 'train_loss': 0.15166743028731572, 'epoch': 3.0}





TrainOutput(global_step=16128, training_loss=0.15166743028731572, metrics={'train_runtime': 3362.9097, 'train_samples_per_second': 38.364, 'train_steps_per_second': 4.796, 'total_flos': 1664451124297704.0, 'train_loss': 0.15166743028731572, 'epoch': 3.0})

In [22]:
trainer.save_model("my_saved_model")

In [26]:
trainer.push_to_hub("amanpatkar/distilbert-finetuned-pos", token = "<>")


Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]
[A
[A
[A
training_args.bin: 100%|██████████| 5.11k/5.11k [00:00<00:00, 11.9kB/s]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A


CommitInfo(commit_url='https://huggingface.co/amanpatkar/distilbert-finetuned-pos/commit/b64a65980fed80e1cf9c3ed44213c8315eeca958', commit_message='amanpatkar/distilbert-finetuned-pos', commit_description='', oid='b64a65980fed80e1cf9c3ed44213c8315eeca958', pr_url=None, pr_revision=None, pr_num=None)

In [27]:
tokenizer.push_to_hub("amanpatkar/distilbert-finetuned-pos", token = "<>")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


CommitInfo(commit_url='https://huggingface.co/amanpatkar/distilbert-finetuned-pos/commit/b67e5711d20288b02e5c1cfdeeb8ab00ccb20ac8', commit_message='Upload tokenizer', commit_description='', oid='b67e5711d20288b02e5c1cfdeeb8ab00ccb20ac8', pr_url=None, pr_revision=None, pr_num=None)

In [23]:
from transformers import pipeline

In [28]:
pos = ner = pipeline(
    "token-classification",
    model = "amanpatkar/distilbert-finetuned-pos",
    aggregation_strategy = "simple",
    device = 0
)

In [29]:
s = "Aman Patkar owns the Honda KTM showroom in India. He is a boy."
pos(s)

[{'entity_group': 'NOUN',
  'score': np.float32(0.9963277),
  'word': 'Aman',
  'start': 0,
  'end': 4},
 {'entity_group': 'VERB',
  'score': np.float32(0.99982905),
  'word': 'Pat',
  'start': 5,
  'end': 8},
 {'entity_group': 'DET',
  'score': np.float32(0.99995303),
  'word': '##kar',
  'start': 8,
  'end': 11},
 {'entity_group': 'NOUN',
  'score': np.float32(0.99914724),
  'word': 'owns the Honda',
  'start': 12,
  'end': 26},
 {'entity_group': 'ADP',
  'score': np.float32(0.9999267),
  'word': 'K',
  'start': 27,
  'end': 28},
 {'entity_group': 'NOUN',
  'score': np.float32(0.99967134),
  'word': '##TM',
  'start': 28,
  'end': 30},
 {'entity_group': '.',
  'score': np.float32(0.99757427),
  'word': 'show',
  'start': 31,
  'end': 35},
 {'entity_group': 'PRON',
  'score': np.float32(0.930186),
  'word': '##room',
  'start': 35,
  'end': 39},
 {'entity_group': 'VERB',
  'score': np.float32(0.9140066),
  'word': 'in',
  'start': 40,
  'end': 42},
 {'entity_group': 'NOUN',
  'score':