In [None]:
#%pip install transformers wikipedia tensorflow torch tf-keras
#%pip install tf-keras

In [None]:
import os
import glob
from transformers import pipeline
import wikipedia
wikipedia.set_lang("hi")

In [None]:
def filter_entity(entity_text):
    """Decide whether an extracted entity string should be kept.

    Returns False for strings that contain the "##" subword marker and for strings
    shorter than two characters once surrounding whitespace is stripped; True otherwise.
    """
    # Skip subword artifacts first ...
    if "##" in entity_text:
        return False
    # ... then anything too short to be a meaningful entity.
    return len(entity_text.strip()) >= 2

In [None]:
# Build the "ner" pipeline for the ai4bharat/IndicNER model once, at cell scope, so that
# every process_file() call reuses the same object instead of reloading the model.
# NOTE(review): aggregation_strategy="simple" presumably merges sub-word tokens into
# whole-entity results (the code below reads each result's "word" field) — confirm
# against the transformers pipeline docs.
ner_pipeline = pipeline("ner", model="ai4bharat/IndicNER", aggregation_strategy="simple")

def get_wikipedia_link(entity_text):
    """
    Look up `entity_text` with the `wikipedia` library and report its page URL.

    Returns:
        The page URL when a page is found; an "Ambiguous (options: ...)" string listing up
        to three candidate titles when the lookup raises DisambiguationError; None when
        any other exception occurs.
    """
    try:
        return wikipedia.page(entity_text).url
    except wikipedia.DisambiguationError as ambiguous:
        # Several pages match the entity: surface the first few candidates instead of a URL.
        candidates = ', '.join(ambiguous.options[:3])
        return f"Ambiguous (options: {candidates}...)"
    except Exception:
        # Best-effort lookup: every other failure is reported as "no link".
        return None

def process_file(file_path):
    """
    Read a UTF-8 text file, extract named entities, and print each one with its Wikipedia link.

    Args:
        file_path: Path of the text file to process.

    Side effects:
        Prints a header containing the file's base name, followed by one
        "Entity: ...  |  Wikipedia: ..." line for every entity accepted by filter_entity(),
        in the order the NER pipeline returned them (repeated entities are printed each time).
    """
    # Read everything up front so the file handle is released before the slow
    # NER inference and Wikipedia lookups start.
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Extract entities from the text
    ner_results = ner_pipeline(text)
    print(f"\nProcessing file: {os.path.basename(file_path)}")

    # The same entity commonly recurs within a document; look each distinct string up
    # only once so repeated mentions do not trigger repeated network requests.
    link_cache = {}
    for entity in ner_results:
        entity_text = entity['word']
        if not filter_entity(entity_text):
            continue
        if entity_text not in link_cache:
            link_cache[entity_text] = get_wikipedia_link(entity_text)
        print(f"Entity: {entity_text}  |  Wikipedia: {link_cache[entity_text]}")



In [None]:
# Path to the folder containing your Hindi text files
folder_path = "data/"  # <-- update with your folder path

# Process each .txt file in the folder. glob returns matches in arbitrary order, so the
# paths are sorted to keep the printed output reproducible from run to run.
for file_path in sorted(glob.glob(os.path.join(folder_path, "*.txt"))):
    process_file(file_path)

## Relation Extraction using LLMs

In [None]:
import json
import os
import time

import pexpect
import requests

In [None]:
class OllamaSession:
    """Keeps one interactive `ollama run <model>` process alive and talks to it via pexpect.

    The session can be used directly (call close() when done) or as a context manager,
    which closes the child process on exit even if a query raises:

        with OllamaSession("llama3") as session:
            answer = session.query("...")

    NOTE(review): query() sends the prompt with a single sendline(); how the CLI treats a
    prompt that itself contains newlines is not visible from this code — confirm before
    sending multi-line prompts.
    """

    def __init__(self, model="llama3"):
        """Spawn the Ollama process and block until its first ">>>" prompt appears.

        Args:
            model: Model name handed to `ollama run`.

        Raises:
            Whatever pexpect raises while waiting for the prompt (the spawn uses a
            120-second timeout); the child process is closed before re-raising.
        """
        # Pass the arguments as a list so the model name always reaches the CLI as a
        # single argument, however it is spelled.
        self.child = pexpect.spawn("ollama", ["run", model], encoding="utf-8", timeout=120)
        try:
            # Wait for the initial prompt (adjust the pattern if needed).
            self.child.expect(">>>")
        except Exception:
            # Do not leave a half-started process behind when startup fails.
            self.child.close()
            raise

    def __enter__(self):
        """Return the session itself so it can be bound by `with ... as session`."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the child process; exceptions from the with-body are not suppressed."""
        self.close()

    def query(self, prompt):
        """Send `prompt` to the Ollama process and return the text printed before the next prompt.

        Returns:
            `child.before` (everything received before the next ">>>") with surrounding
            whitespace stripped.
        """
        self.child.sendline(prompt)
        # The next ">>>" marks the end of the response.
        self.child.expect(">>>")
        return self.child.before.strip()

    def close(self):
        """Close the underlying pexpect child process."""
        self.child.close()

In [None]:
def load_articles_from_folder(input_folder):
    """Read every ".txt" file directly inside `input_folder` into a dictionary.

    Returns:
        A dict mapping each file name, minus its ".txt" suffix, to the file's UTF-8 text.
    """
    suffix = ".txt"
    loaded = {}
    for name in os.listdir(input_folder):
        if not name.endswith(suffix):
            continue
        with open(os.path.join(input_folder, name), "r", encoding="utf-8") as handle:
            text = handle.read()
        # Key by the bare article name (file name without the extension).
        loaded[name[:-len(suffix)]] = text
    return loaded

In [None]:
def get_prompt_for_relation_extraction(text, max_triplets=5):
    """Generates a zero-shot Hindi prompt for LLM-based relation extraction.

    Args:
        text: The Hindi passage to analyse; it is embedded verbatim at the end of the prompt.
        max_triplets: Upper bound on the number of triplets requested in the instruction
            (defaults to 5).

    Returns:
        The prompt string: a Hindi instruction, an example of the expected JSON layout
        (a "triplets" list of subject/predicate/object objects), and the passage.
    """
    # Skeleton of the JSON the model is asked to produce; serialized into the prompt below.
    template = {
        "triplets": [
            {
                "subject": "Hindi Subject",
                "predicate": "Hindi Predicate",
                "object": "Hindi Object",
            }
        ],
    }

    # ensure_ascii=False keeps any non-ASCII text readable instead of \u-escaped.
    prompt = f"""
    निर्देश: निम्नलिखित हिंदी पाठ से ज्ञान त्रिक उत्पन्न करें। प्रत्येक त्रिक (विषय, विधेय, वस्तु) प्रारूप में होना चाहिए। अधिकतम {max_triplets} त्रिक निकालें और उन्हें JSON प्रारूप में प्रस्तुत करें। आउटपुट हिंदी में होना चाहिए और इस तरह दिखना चाहिए:
    {json.dumps(template, ensure_ascii=False, indent=4)}

    विश्लेषण करने के लिए यहाँ पाठ दिया गया है:
    {text}
    """
    return prompt

def _coerce_llm_payload(payload):
    """Return `payload` as a dict if it is one, or is text containing a JSON object; else None."""
    if isinstance(payload, dict):
        return payload
    if not isinstance(payload, str):
        return None

    # Try the whole string first, then the outermost {...} span, because a text answer
    # may wrap the JSON object in extra words.
    candidates = [payload]
    start, end = payload.find("{"), payload.rfind("}")
    if 0 <= start < end:
        candidates.append(payload[start:end + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except ValueError:
            continue
        if isinstance(parsed, dict):
            return parsed
    return None


def extract_relations_from_text_llm(text, endpoint="http://localhost:6000/query"):
    """
    Extracts relations (triplets) from Hindi text by sending the prompt to the Flask endpoint.

    Args:
        text: The Hindi passage to analyse.
        endpoint: URL that accepts a POST with JSON body {"prompt": ...} and answers with a
            JSON object carrying a "response" field.

    Returns:
        A `(triplets, processing_time)` tuple. `triplets` is the list found under
        response["triplets"], or [] when it is missing or malformed. `processing_time` is
        the request duration in seconds rounded to 3 decimals, or -1 when the request
        itself failed or the service reply was not valid JSON.

    The "response" field may be either a JSON object or a string containing one; a string
    is parsed before the "triplets" key is looked up.
    """
    prompt = get_prompt_for_relation_extraction(text)
    data = {"prompt": prompt}
    try:
        start_time = time.time()
        response = requests.post(endpoint, json=data, timeout=120)
        response.raise_for_status()
        processing_time = round(time.time() - start_time, 3)
        json_data = response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error during Flask service request: {e}")
        return [], -1
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON from Flask service: {e}")
        return [], -1

    if not isinstance(json_data, dict) or "response" not in json_data:
        print("Error: 'response' field missing in service output.")
        return [], processing_time

    result_json = json_data["response"]
    parsed = _coerce_llm_payload(result_json)

    # Only accept a real list: callers take len() of the result and serialize it.
    if parsed is not None and isinstance(parsed.get("triplets"), list):
        return parsed["triplets"], processing_time

    print(f"Warning: 'triplets' key not found in output. Raw response: {result_json}")
    return [], processing_time


In [None]:
def run_relation_extraction_pipeline(input_folder, output_folder="output_relations"):
    """Extract triplets for every article in `input_folder` and save one JSON file per article.

    Each article produces "<article_name>_relations.json" inside `output_folder` (created
    if missing), holding the article name, its triplets, and the extraction time.
    Progress is printed for every article.
    """
    articles = load_articles_from_folder(input_folder)
    os.makedirs(output_folder, exist_ok=True)

    for article_name in articles:
        print(f"Processing article: {article_name}...")
        triplets, extraction_time = extract_relations_from_text_llm(articles[article_name])

        record = {
            "article_name": article_name,
            "triplets": triplets,
            "extraction_time_llm": extraction_time,
        }
        output_filepath = os.path.join(output_folder, f"{article_name}_relations.json")
        with open(output_filepath, "w", encoding="utf-8") as outfile:
            # ensure_ascii=False keeps the Hindi text human-readable in the file.
            json.dump(record, outfile, indent=4, ensure_ascii=False)

        print(f"  Extracted {len(triplets)} triplets for {article_name}. Results saved to: {output_filepath}")

In [None]:
# Folder scanned for input articles (.txt files) and folder that receives one
# "<article_name>_relations.json" file per article.
input_articles_folder = "data" # Articles in .txt files will be placed here
output_relations_folder = "output_relations" # Extracted relations (JSON) will be saved here

# Run the extraction over every article, then report where the results were written.
run_relation_extraction_pipeline(input_articles_folder, output_relations_folder)
print(f"\nRelation extraction pipeline completed. Results are in '{output_relations_folder}' folder.")

## Using IndicBART

In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import json



In [2]:
# Load the IndicBART tokenizer and seq2seq model ("ai4bharat/IndicBART").
# The tokenizer keyword arguments below are passed straight through to from_pretrained().
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/IndicBART", 
                                            do_lower_case=False, 
                                            use_fast=False, 
                                            keep_accents=True)
model = AutoModelForSeq2SeqLM.from_pretrained("ai4bharat/IndicBART")

# Look up the ids of the special tokens that the generate() call later in the notebook uses.
# NOTE(review): _convert_token_to_id_with_added_voc is a private tokenizer method (leading
# underscore) — confirm it is still available when upgrading transformers.
bos_id = tokenizer._convert_token_to_id_with_added_voc("<s>")
eos_id = tokenizer._convert_token_to_id_with_added_voc("</s>")
pad_id = tokenizer._convert_token_to_id_with_added_voc("<pad>")
# Id of the "<2hi>" token, intended as the decoder start token for Hindi output (adjust if
# needed). It is computed here, but the generate() call in extract_relations_indicbart
# currently has the corresponding argument commented out.
decoder_start_token_id = tokenizer._convert_token_to_id_with_added_voc("<2hi>")

pytorch_model.bin:   0%|          | 0.00/976M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/976M [00:00<?, ?B/s]

In [10]:
def get_prompt_for_relation_extraction(text, max_triplets=5):
    """Build a two-shot Hindi prompt asking for at most `max_triplets` triplets as JSON.

    The prompt shows two worked examples (passage followed by the expected JSON), then an
    instruction naming the triplet limit, then `text` as the passage to analyse, and ends
    with the answer header so the model continues with the JSON.

    NOTE: this definition reuses the name of the earlier zero-shot prompt builder in this
    notebook, so once this cell has run the earlier function is shadowed.
    """
    answer_header = "आवश्यक JSON आउटपुट:"

    # Worked example 1
    first_shot_text = "पाठ:\nराम ने सीता को वन में देखा।"
    first_shot_json = """{
  "triplets": [
    {
      "subject": "राम",
      "predicate": "देखा",
      "object": "सीता"
    }
  ]
}"""

    # Worked example 2
    second_shot_text = "पाठ:\nमहात्मा गांधी भारत के राष्ट्रपिता कहलाते हैं।"
    second_shot_json = """{
  "triplets": [
    {
      "subject": "महात्मा गांधी",
      "predicate": "कहलाते हैं",
      "object": "भारत के राष्ट्रपिता"
    }
  ]
}"""

    # The actual request: the triplet limit and the caller's passage.
    instruction = f"अब निम्नलिखित पाठ के लिए समान JSON प्रारूप में अधिकतम {max_triplets} त्रिक निकालें:"
    target_text = f"पाठ:\n{text}"

    # One entry per prompt line group; empty strings are the blank separator lines.
    sections = [
        "उदाहरण:",
        "",
        first_shot_text,
        answer_header,
        first_shot_json,
        "",
        second_shot_text,
        answer_header,
        second_shot_json,
        "",
        instruction,
        "",
        target_text,
        answer_header,
    ]
    return "\n".join(sections)


In [11]:
def extract_relations_indicbart(text, max_triplets=5, max_length=256):
    """Prompt IndicBART for relation triplets and try to parse its answer as JSON.

    Args:
        text: The Hindi passage to analyse.
        max_triplets: Triplet limit forwarded to the prompt builder.
        max_length: `max_length` forwarded to model.generate() (defaults to 256).

    Returns:
        The parsed JSON value when the decoded model output is valid JSON; otherwise a
        dict {"raw_output": <decoded text>, "error": <parse error message>}.
    """
    prompt = get_prompt_for_relation_extraction(text, max_triplets)

    # The prompt is tokenized as-is: no "</s>" / language tag is appended and
    # add_special_tokens is off.
    inp = tokenizer(prompt, add_special_tokens=False,
                    return_tensors="pt", padding=True).input_ids

    # Set model to evaluation mode
    model.eval()
    # Plain beam search. No `temperature` is passed: it only affects sampling-based
    # decoding (do_sample=True), which is not enabled here.
    model_output = model.generate(
        inp,
        use_cache=True,
        num_beams=5,
        max_length=max_length,
        pad_token_id=pad_id,
        bos_token_id=bos_id,
        eos_token_id=eos_id,
        # decoder_start_token_id is intentionally not passed (the module-level
        # decoder_start_token_id is left unused).
    )

    # Decode the first (best) generated sequence.
    decoded_output = tokenizer.decode(model_output[0],
                                      skip_special_tokens=True,
                                      clean_up_tokenization_spaces=False)

    # Best-effort parse: the caller always gets a value back, with the raw text and the
    # error recorded when the model did not emit valid JSON.
    try:
        output_json = json.loads(decoded_output)
    except Exception as e:
        output_json = {"raw_output": decoded_output, "error": str(e)}

    return output_json

In [12]:
# Example usage: run the IndicBART extractor on a single Hindi sentence and print the result.
# NOTE: in the output recorded with this cell the model echoes the prompt instead of
# emitting valid JSON, so the printed value is the {"raw_output": ..., "error": ...} fallback.
sample_text = "राहुल गांधी ने कहा कि विकास को गति देने के लिए नई नीतियों की आवश्यकता है।"
extracted_relations = extract_relations_indicbart(sample_text)
print("Extracted relations:", extracted_relations)

Extracted relations: {'raw_output': 'आवश्यक JSON आउटपुट: राम ने सीता को वन में देखा। आवश्यक JSON आउटपुट: { "triplets": [ { "subject": "राम", "predicate": "देखा", "देखा", "object": "सीता" } ] } पाठ: महात्मा गांधी भारत के राष्ट्रपिता कहलाते हैं। आवश्यक JSON आउटपुट: { "triplets": { "triplets": [ { "subject": "महात्मा गांधी भारत के राष्ट्रपिता कहलाते हैं। आवश्यक JSON आउटपुट: { "triplets": { "triplets": [ { "subject": "महात्मा गांधी", "predicate": "कहलाते हैं", "कहलाते हैं", "object": "भारत के राष्ट्रपिता" } ] } अब निम्नलिखित पाठ के लिए समान JSON प्रारूप में अधिकतम 5 त्रिक निकालें: पाठ: राहुल गांधी ने कहा कि विकास को गति देने के लिए नई नीतियों की आवश्यकता है। आवश्यक JSON आउटपुट: { JSON आउटपुट: { "tritri : { "triplets: पाठ: राहुल गांधी ने कहा कि विकास को गति देने के लिए नई नीतियों की आवश्यकता है। आवश्यक JSON आउटपुट:। आवश्यक J', 'error': 'Expecting value: line 1 column 1 (char 0)'}
