## ENGLISH

In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# One checkpoint id feeds both the tokenizer and the token-classification model.
ENGLISH_NER_CHECKPOINT = "dslim/bert-base-NER"

tokenizer = AutoTokenizer.from_pretrained(ENGLISH_NER_CHECKPOINT)
model = AutoModelForTokenClassification.from_pretrained(ENGLISH_NER_CHECKPOINT)

# Token-level NER pipeline; later cells call it through the name `nlp`.
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

# Quick smoke test on a single sentence.
example = "My name is Wolfgang and I live in Berlin"
ner_results = nlp(example)
print(ner_results)


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


[{'entity': 'B-PER', 'score': 0.9990139, 'index': 4, 'word': 'Wolfgang', 'start': 11, 'end': 19}, {'entity': 'B-LOC', 'score': 0.999645, 'index': 9, 'word': 'Berlin', 'start': 34, 'end': 40}]


In [2]:
from datasets import load_dataset

# Load the "okite97/news-data" dataset; later cells read its 'train' split.
ds = load_dataset("okite97/news-data")


README.md:   0%|          | 0.00/3.51k [00:00<?, ?B/s]

train.csv:   0%|          | 0.00/980k [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/174k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4686 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/828 [00:00<?, ? examples/s]

In [6]:
# Display the train split summary (feature names and row count).
ds['train']

Dataset({
    features: ['Title', 'Excerpt', 'Category'],
    num_rows: 4686
})

In [15]:
import textwrap
from collections import defaultdict
import json

In [31]:
def chunk_text(text, max_length=1000000):
    """Split ``text`` into chunks of at most ``max_length`` characters.

    Splitting is delegated to ``textwrap.wrap``, so breaks fall on whitespace
    and textwrap's default whitespace handling applies to the returned chunks.

    Args:
        text: The string to split. ``None`` (e.g. a missing dataset field) is
            treated as empty text instead of raising inside ``textwrap``.
        max_length: Maximum chunk width in characters. With the default of
            1,000,000 any realistic article comes back as a single chunk, so
            pass a smaller value if real chunking is needed.

    Returns:
        A list of chunk strings; empty for ``None``, empty or whitespace-only
        input.
    """
    if text is None:
        return []
    return textwrap.wrap(text, width=max_length)

def extract_named_entities(text):
    """Run the module-level ``nlp`` pipeline over ``text`` and group entity strings by label.

    The text is split with ``chunk_text`` and each chunk is passed to ``nlp``.
    Every prediction is expected to carry a ``"word"`` key and either an
    ``"entity_group"`` or an ``"entity"`` key; predictions with neither are
    filed under ``"UNKNOWN"``.

    WordPiece continuation pieces (``"word"`` starting with ``"##"``) are glued
    back onto the immediately preceding predicted token, identified by
    consecutive ``"index"`` values, so a name split into several pieces is
    stored once as a whole word rather than as separate fragments. A
    continuation piece with no adjacent predecessor is kept on its own with
    the ``"##"`` marker stripped.

    Args:
        text: Raw article text.

    Returns:
        A plain ``dict`` mapping each label to a list of unique entity strings
        in order of first appearance. A merged word is filed under the label of
        its first piece.
    """
    # Pass 1: rebuild whole words. Each item is [label, text, last_token_index].
    merged_words = []
    for chunk in chunk_text(text):
        current = None  # word being assembled; never carried across chunks
        for entity in nlp(chunk):
            piece = entity["word"]
            token_index = entity.get("index")

            continues_current = (
                piece.startswith("##")
                and current is not None
                and token_index is not None
                and current[2] is not None
                and token_index == current[2] + 1
            )
            if continues_current:
                current[1] += piece.replace("##", "")
                current[2] = token_index
                continue

            entity_type = entity.get("entity_group", entity.get("entity", "UNKNOWN"))
            current = [entity_type, piece.replace("##", ""), token_index]
            merged_words.append(current)

    # Pass 2: de-duplicate per label while keeping first-seen order.
    structured_entities = defaultdict(list)
    seen = set()
    for entity_type, entity_text, _ in merged_words:
        if (entity_type, entity_text) not in seen:
            seen.add((entity_type, entity_text))
            structured_entities[entity_type].append(entity_text)

    return dict(structured_entities)

# Build one record per training article and collect them for export.
ner_results = []
for article in ds['train']:
    # Guard clause: report and skip rows that are not dicts with an "Excerpt" key.
    if not (isinstance(article, dict) and "Excerpt" in article):
        print(f"Skipping invalid article: {article}")
        continue

    record = {
        "title": article.get("Title", "No Title"),
        "source": article.get("source", "Unknown Source"),
        "content": article.get("Excerpt"),
        # "date_extracted": article.get("date_extracted", "Unknown Date"),
        "entities": extract_named_entities(article["Excerpt"]),
    }
    ner_results.append(record)

# Persist all records as pretty-printed, non-ASCII-escaped JSON.
with open("english.json", "w", encoding="utf-8") as out_file:
    json.dump(ner_results, out_file, ensure_ascii=False, indent=4)


## HINDI

In [50]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline



# One checkpoint id feeds both the tokenizer and the token-classification model.
# Note: this rebinds `tokenizer` and `model`; `nlp` is not rebuilt here.
INDIC_NER_CHECKPOINT = "ai4bharat/IndicNER"

tokenizer = AutoTokenizer.from_pretrained(INDIC_NER_CHECKPOINT)
model = AutoModelForTokenClassification.from_pretrained(INDIC_NER_CHECKPOINT)
# Pipeline construction left disabled:
# nlp = pipeline("ner", model=model, tokenizer=tokenizer, device='cpu')


In [51]:
import torch


In [52]:
def get_predictions( sentence, tokenizer, model ):
    """Predict one NER label for every space-separated word of ``sentence``.

    The sentence is split on single spaces (the same way the callers split it)
    and the resulting words are handed to the tokenizer as pre-split input, so
    the tokenizer's word ids refer to those words rather than to its own
    internal word boundaries (which can differ, e.g. around punctuation).
    Each word takes the label predicted for its first sub-token.

    Args:
        sentence: Text to tag.
        tokenizer: Tokenizer whose encoding exposes ``word_ids()`` and accepts
            ``is_split_into_words``, ``truncation`` and ``return_tensors``.
        model: Token-classification model exposing ``.logits`` on its output
            and ``config.id2label``.

    Returns:
        A list with exactly ``len(sentence.split(' '))`` labels. Positions that
        receive no prediction (empty strings produced by repeated spaces, words
        that yield no tokens, or words cut off by truncation) are labelled
        ``'O'``.
    """
    words = sentence.split(' ')

    # Repeated, leading or trailing spaces produce empty strings; these are
    # not sent to the tokenizer but keep their slot in the result.
    kept_positions = [position for position, word in enumerate(words) if word]

    # 'O' is assumed to be the model's "outside any entity" label -- TODO confirm
    # against model.config.id2label if a different checkpoint is used.
    predicted_labels = ['O'] * len(words)
    if not kept_positions:
        return predicted_labels

    # Tokenize the already-split words so word_ids() indexes into our word list.
    tok_sentence = tokenizer(
        [words[position] for position in kept_positions],
        is_split_into_words=True,
        truncation=True,
        return_tensors='pt',
    )

    with torch.no_grad():
        # Highest-scoring class id for every sub-token.
        class_ids = model(**tok_sentence).logits.argmax(-1)

    # Map class ids to their string labels (single sequence in the batch).
    token_labels = [model.config.id2label[t.item()] for t in class_ids[0]]

    # Assign each word the label of its first sub-token; special tokens have
    # word id None and later sub-tokens of the same word are skipped.
    labelled_words = set()
    for token_index, word_id in enumerate(tok_sentence.word_ids()):
        if word_id is None or word_id in labelled_words:
            continue
        labelled_words.add(word_id)
        predicted_labels[kept_positions[word_id]] = token_labels[token_index]

    return predicted_labels

In [53]:
# Sanity check: tag one example sentence and print "word<TAB>label" per word.
sentence = 'मेरा नाम आरिश शाह मोहसिन है'

predicted_labels = get_predictions(sentence=sentence, tokenizer=tokenizer, model=model)

# Split once; labels are looked up by the word's position in the split.
example_words = sentence.split(' ')
for position, word in enumerate(example_words):
    print(word + '\t' + predicted_labels[position])

मेरा	O
नाम	O
आरिश	B-PER
शाह	I-PER
मोहसिन	I-PER
है	O


In [60]:
import json
from datasets import load_dataset

def process_sentence(title, sentence, tokenizer, model):
    """Tag ``sentence`` and group its labelled words into entity spans.

    Labels come from ``get_predictions`` and are paired positionally with the
    space-separated words of ``sentence``. A ``B-`` label opens a new span, an
    ``I-`` label extends the span that is currently open, and any other label
    (or an ``I-`` label with no open span) closes the open span, if there is one.

    Returns:
        A dict with ``"title"``, ``"content"`` (the input sentence) and
        ``"entities"``: a list of dicts holding the span ``"text"``, its
        ``"label"`` (the ``B-`` prefix removed) and the inclusive ``"start"`` /
        ``"end"`` word positions.
    """
    tags = get_predictions(sentence=sentence, tokenizer=tokenizer, model=model)
    tokens = sentence.split(' ')

    spans = []
    open_span = None  # span currently being extended, if any

    for position, (token, tag) in enumerate(zip(tokens, tags)):
        # Continuation of the span that is already open.
        if open_span and tag.startswith("I-"):
            open_span["text"] += " " + token
            open_span["end"] = position
            continue

        # Every other label terminates whatever span was open.
        if open_span:
            spans.append(open_span)
            open_span = None

        if tag.startswith("B-"):
            open_span = {"text": token, "label": tag[2:], "start": position, "end": position}

    # Flush a span that runs to the end of the sentence.
    if open_span:
        spans.append(open_span)

    return {"title": title, "content": sentence, "entities": spans}





In [57]:
import pandas as pd 

In [58]:
# Read the Hindi news CSV from an absolute Kaggle input path.
# NOTE(review): this path only resolves where that dataset is mounted at this location.
df = pd.read_csv('/kaggle/input/hindi-news-category-dataset/HINDI DATASET - Sheet2.csv')

In [59]:
# Preview the first rows; later cells read the 'Headline' and 'Content' columns.
df.head()

Unnamed: 0,Headline,Content,News Categories
0,कांग्रेस नेता बलजिंदर सिंह की पंजाब में घर के ...,कांग्रेस नेता बलजिंदर सिंह की सोमवार को पंजाब ...,National
1,केंद्रीय मंत्री बोले- महिला आरक्षण लाने का साह...,केंद्रीय मंत्री प्रह्लाद पटेल ने लोकसभा और विध...,Politics
2,ओपीएस लागू करने से अस्थिर हो सकती है राज्यों क...,आरबीआई के 5 अधिकारियों ने एक लेख में लिखा है क...,Business
3,तमिलनाडु में शावरमा खाने से 14 वर्षीय छात्रा क...,नामक्कल (तमिलनाडु) में शावरमा खाने से सोमवार क...,National
4,मणिपुर में मुख्यमंत्री के आश्वासन के बाद मारे ...,मणिपुर के मुख्यमंत्री एन बीरेन सिंह के आश्वासन...,National


In [64]:

# Tag the leading rows of the frame; the loop stops once the row label exceeds 100.
results = []

for row_label, row in df.iterrows():
    print(row_label)
    if row_label > 100:
        break
    results.append(
        process_sentence(row['Headline'], row['Content'], tokenizer, model)
    )





0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101


In [65]:
# Write the collected records as indented JSON with non-ASCII text left unescaped.
with open("hindi.json", "w", encoding="utf-8") as hindi_file:
    json.dump(results, hindi_file, ensure_ascii=False, indent=2)