In [None]:
NER-Entity-Extraction

In [None]:
pip install requests transformers


In [None]:
pip install requests transformers

In [None]:
import json
import os
import re

import requests
from transformers import pipeline

# Local token-classification pipeline that categorize_entity() uses to label
# each extracted span. Results are read back through their 'entity_group' key.
ner_pipeline = pipeline(
    "ner",
    model="dbmdz/bert-large-cased-finetuned-conll03-english",
    aggregation_strategy="simple",
)

# Hugging Face Inference API settings.
# The access token is read from the HF_TOKEN environment variable instead of
# being hard-coded, so the secret never lands in the notebook or in version
# control. Set it before running, e.g. `export HF_TOKEN=hf_...`.
# NOTE: any token that was previously committed in this file should be treated
# as leaked and revoked.
API_KEY = os.environ.get('HF_TOKEN', '')
API_ENDPOINT = 'https://api-inference.huggingface.co/models/microsoft/Phi-3-mini-4k-instruct'

def create_prompt(task_description, examples, input_sentence):
    """Assemble the few-shot prompt sent to the text-generation model.

    The prompt is the task description, followed by one
    ``Input: ... / Output: ...`` pair per example, followed by the new
    sentence with an open ``Output:`` for the model to complete.

    Args:
        task_description: Instruction text placed at the top of the prompt.
        examples: Iterable of dicts, each with ``'input'`` and ``'output'`` keys.
        input_sentence: The sentence the model should label.

    Returns:
        The complete prompt string.
    """
    header = task_description + "\n\n"
    shots = [
        f"Input: {shot['input']}\nOutput: {shot['output']}\n\n"
        for shot in examples
    ]
    query = f"Input: {input_sentence}\nOutput:"
    return header + "".join(shots) + query

def extract_entities_with_categories(text):
    """Group every ``@@entity##`` span found in *text* by category.

    Each span is trimmed of surrounding whitespace and classified with
    ``categorize_entity``. Spans that are empty after trimming (for example
    ``@@##``) are skipped instead of being sent to the classifier.

    Args:
        text: Model output in which entities are wrapped as ``@@...##``.

    Returns:
        A dict with the keys ``'name'``, ``'location'``, ``'date'``,
        ``'organization'`` and ``'miscellaneous'``, each mapping to the list
        of entity strings assigned to that category, in order of appearance.
    """
    categories = {'name': [], 'location': [], 'date': [], 'organization': [], 'miscellaneous': []}
    # Non-greedy so a span ends at the first '##' after its '@@'; DOTALL so a
    # span may cross a line break. An unterminated '@@' simply yields no match.
    for span in re.finditer(r'@@(.*?)##', text, flags=re.DOTALL):
        entity = span.group(1).strip()
        if not entity:
            continue
        categories[categorize_entity(entity)].append(entity)
    return categories

def categorize_entity(entity):
    """Return the category name for a single entity string.

    The entity is first run through the module-level ``ner_pipeline``; the
    first result whose label is recognised decides the category. If no result
    carries a recognised label, a small heuristic fallback is applied (known
    names, known locations, a numeric date pattern, organization keywords).

    Args:
        entity: The entity text to classify.

    Returns:
        One of ``'name'``, ``'location'``, ``'date'``, ``'organization'`` or
        ``'miscellaneous'``.
    """
    # NOTE(review): the configured CoNLL-03 model presumably emits only
    # PER/LOC/ORG/MISC; 'GPE' and 'DATE' are kept in case the model is swapped.
    label_to_category = {
        'PER': 'name',
        'LOC': 'location',
        'GPE': 'location',
        'DATE': 'date',
        'ORG': 'organization',
    }
    for result in ner_pipeline(entity):
        category = label_to_category.get(result['entity_group'])
        if category is not None:
            return category

    # Fallback to heuristic categorization
    names = {"Aman", "Emily", "Barack Obama", "Bill Gates", "Elon Musk", "Taylor Swift", "Jeff Bezos", "Larry Page", "Sergey Brin", "Soham"}
    locations = {"University of Sheffield", "Hawaii", "Cupertino", "Paris", "New York", "Great Wall of China", "Mount Everest"}
    # Accept both two- and four-digit years: the few-shot examples contain
    # "01/01/1644", which the previous two-digit-only pattern rejected.
    date_pattern = re.compile(r'\b\d{2}/\d{2}/(?:\d{4}|\d{2})\b')
    organization_keywords = ("Inc.", "Ltd.", "Corp.", "LLC", "Company", "University")

    if entity in names:
        return 'name'
    if entity in locations:
        return 'location'
    if date_pattern.match(entity):
        return 'date'
    if any(keyword in entity for keyword in organization_keywords):
        return 'organization'
    return 'miscellaneous'

def train_model():
    """Return the task description and few-shot examples used for prompting.

    Despite the name, nothing is trained here: the function only builds the
    static instruction text and a list of labelled example sentences in which
    entities are wrapped as ``@@entity##``.

    Returns:
        A ``(task_description, examples)`` tuple, where ``examples`` is a list
        of dicts with ``'input'`` and ``'output'`` keys.
    """
    # (plain sentence, same sentence with entities marked)
    labelled_pairs = (
        ("Barack Obama was born in Hawaii on 04/08/61.",
         "@@Barack Obama## was born in @@Hawaii## on @@04/08/61##."),
        ("Microsoft was founded by Bill Gates on 04/04/75.",
         "@@Microsoft## was founded by @@Bill Gates## on @@04/04/75##."),
        ("Elon Musk is the CEO of SpaceX since 01/03/02.",
         "@@Elon Musk## is the CEO of @@SpaceX## since @@01/03/02##."),
        ("The Eiffel Tower was completed on 31/03/89.",
         "The @@Eiffel Tower## was completed on @@31/03/89##."),
        ("Apple Inc. was founded on 01/04/76.",
         "@@Apple Inc.## was founded on @@01/04/76##."),
        ("Amazon was started by Jeff Bezos on 05/07/94.",
         "@@Amazon## was started by @@Jeff Bezos## on @@05/07/94##."),
        ("Google was created by Larry Page and Sergey Brin on 04/09/98.",
         "@@Google## was created by @@Larry Page## and @@Sergey Brin## on @@04/09/98##."),
        ("Taylor Swift was born on 13/12/89.",
         "@@Taylor Swift## was born on @@13/12/89##."),
        ("The Great Wall of China was completed on 01/01/1644.",
         "The @@Great Wall of China## was completed on @@01/01/1644##."),
        ("Mount Everest was first climbed on 29/05/53.",
         "@@Mount Everest## was first climbed on @@29/05/53##."),
    )
    examples = [
        {"input": sentence, "output": marked}
        for sentence, marked in labelled_pairs
    ]

    task_description = (
        "I am an excellent linguist. The task is to label named entities in the given sentence."
    )
    return task_description, examples

def predict_ner(input_sentence, task_description, examples, *, timeout=60):
    """Ask the hosted model to label entities in a sentence and print the result.

    Builds a few-shot prompt, posts it to ``API_ENDPOINT``, cuts the model's
    completion for ``input_sentence`` out of the returned text, extracts the
    ``@@entity##`` spans from it and prints them grouped by category. All
    outcomes, including failures, are reported with ``print``.

    Args:
        input_sentence: The sentence to label.
        task_description: Instruction text placed at the top of the prompt.
        examples: Few-shot examples, dicts with ``'input'`` and ``'output'`` keys.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        None.
    """
    prompt = create_prompt(task_description, examples, input_sentence)

    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": 250,
            "temperature": 0.7
        }
    }
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {API_KEY}'
    }

    # Without a timeout, requests can block forever on an unresponsive server.
    try:
        response = requests.post(
            API_ENDPOINT,
            headers=headers,
            data=json.dumps(payload),
            timeout=timeout,
        )
    except requests.RequestException as exc:
        print(f"Request failed: {exc}")
        return

    if response.status_code != 200:
        print(f"Request failed with status code: {response.status_code}")
        print(f"Response: {response.text}")
        return

    try:
        response_data = response.json()
    except ValueError:
        print("The response body was not valid JSON.")
        print(f"Response: {response.text}")
        return

    if not (isinstance(response_data, list) and response_data):
        print("The response did not contain any generated text.")
        return

    generated_output = response_data[0].get("generated_text", "").strip()

    # Locate the completion for this specific sentence. Slicing past the whole
    # marker (rather than splitting on the first "Output:") keeps this correct
    # even when the input sentence itself contains the text "Output:".
    marker = f"Input: {input_sentence}\nOutput:"
    start_idx = generated_output.find(marker)
    if start_idx != -1:
        completion = generated_output[start_idx + len(marker):]
        # Drop anything the model kept generating after this answer.
        completion = completion.split("Output:")[0]
        relevant_part = completion.split("Input:")[0].strip()
    else:
        relevant_part = ""

    categories = extract_entities_with_categories(relevant_part)

    print(f"Generated output: {relevant_part}")
    print(f"Extracted entities with categories: {categories}")

# Build the prompt material: the task description and the few-shot examples.
# (No model is actually trained; train_model() only returns static data.)
task_description, examples = train_model()

# Label the named entities of a longer, multi-sentence input via the hosted model.
predict_ner(
    "On October 15, 2023, Dr. Susan Thompson, a renowned scientist from Harvard University, gave a groundbreaking presentation on climate change at the United Nations headquarters in New York City. The event was attended by representatives from NASA, Google, and several European countries including Germany and France. Additionally, prominent figures like Bill Gates and Elon Musk participated in the discussions, highlighting the need for global cooperation. Later in the evening, a gala dinner was held at the Ritz-Carlton Hotel, featuring performances by Beyoncé and speeches by various dignitaries.",
    task_description,
    examples,
)

