In [None]:
import requests
import json
import csv
from langchain_text_splitters import RecursiveCharacterTextSplitter
import markdown
import os

In [None]:
# Markdown file holding the service data that was collected from the official
# websites. The value is left empty on purpose and has to be filled in before
# the notebook is run, for example: content/Kenya_Services_Info_0403.md
file_path = ""

In [None]:
# Connection settings for your own vLLM / TGI / or other inference endpoint.
#
# Both values are looked up in the environment first so that a real API key
# never has to be typed into (and saved with) the notebook. When a variable is
# not set, the placeholder text is used, so replacing the placeholder strings
# below by hand keeps working exactly as before.
api_key = os.environ.get("DENVR_API_KEY", "Denvr endpoint API")
endpoint = os.environ.get("DENVR_ENDPOINT", "LLM endpoint link")

In [None]:
# HTTP headers attached to every request sent to the inference endpoint:
# a bearer token built from `api_key`, and a JSON content type.
headers = {}
headers["Authorization"] = "Bearer {}".format(api_key)
headers["Content-Type"] = "application/json"

In [None]:
# System prompt passed as the "system" message of every extraction request.
# It instructs the model to answer with nothing but a JSON object that has a
# single "RESPONSE" key holding a list of {"head", "relation", "tail"} entries,
# and shows one worked input/output example.
# Note: despite the `_dict` suffix, this name is bound to a plain string.
system_prompt_dict = """
Your task is to identify the entities and relations from a given text, extract them into a structured JSON format for later building knowledge graphs.
The output should be a JSON with a single key "RESPONSE" containing a list of dictionaries, each with three keys - "head", "tail", and "relation" - with corresponding values.

Here is an input example:
""Adam is a software engineer in Microsoft since 2009, and last year he got an award as the Best Talent. Additionally ...""

Here is a corresponding output example:
{
  "RESPONSE":
  [
  {
    "head": "Adam",
    "relation": "works for",
    "tail": "Microsoft"
  },
  {
    "head": "Adam",
    "relation": "received",
    "tail": "Best Talent Award"
  },
  ...
  ]
}

Your output should always follow the above format. Do not include any other text in the output.

Make sure that you:
1. Extract as many entities and relations as you can, as long as the combination is unique;
2. Maintain Entity Consistency: When extracting entities, it's vital to ensure consistency. If an entity, such as "John Doe", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., "Joe", "he"), always use the most complete identifier for that entity.
"""

In [None]:
def chunk_by_header(file_path, max_chunk_size=1500):
    """Split a markdown file into header-delimited chunks of rendered HTML.

    The file is rendered to HTML with ``markdown.markdown`` and then walked
    line by line. A line that opens a heading element (``<h1>`` to ``<h6>``)
    closes the chunk collected so far and starts a new one with that heading.
    A chunk that reaches ``max_chunk_size`` characters is closed as well, so a
    long section is broken into several chunks.

    Args:
        file_path: Path of the markdown file to read. It is decoded as UTF-8.
        max_chunk_size: Length, in characters, at which a growing chunk is
            closed. The check runs after a line has been appended, so a chunk
            may exceed the limit by up to one line. Heading lines themselves
            are not checked against the limit.

    Returns:
        A list of HTML strings in document order. Chunks that contain only
        whitespace are dropped, so an empty file yields an empty list.
    """
    # Only real heading tags may start a new section. A plain '<h' prefix test
    # would also fire on other tags beginning with "h", such as '<hr />', and
    # cut a section in two at that point.
    heading_prefixes = tuple(f'<h{level}' for level in range(1, 7))

    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(file_path, 'r', encoding='utf-8') as f:
        md_content = f.read()

    html_content = markdown.markdown(md_content)

    chunks = []

    def close_chunk(chunk):
        # Blank chunks carry nothing to extract and would only cost a request.
        if chunk.strip():
            chunks.append(chunk)

    current_chunk = ''
    for line in html_content.split('\n'):
        if line.startswith(heading_prefixes):
            close_chunk(current_chunk)
            current_chunk = line + '\n'  # the heading opens the new chunk
        else:
            current_chunk += line + '\n'

            # Size limit reached: close the chunk and start an empty one.
            if len(current_chunk) >= max_chunk_size:
                close_chunk(current_chunk)
                current_chunk = ''

    close_chunk(current_chunk)  # whatever is left after the last line

    print(f'\nGenerated {len(chunks)} text splits')
    # Averaged over the chunks actually returned; 0 when there are none.
    total_length = sum(len(chunk) for chunk in chunks)
    average_length = total_length / len(chunks) if chunks else 0
    print(f'Average chunk length: {average_length} characters\n')

    return chunks

In [None]:
def _load_triplets(text):
    """Parse ``text`` as JSON and return ``(parsed, number_of_triplets)``.

    Any exception from ``json.loads`` or from reading ``parsed['RESPONSE']``
    is left to propagate; callers treat it as "not the expected format".
    """
    parsed = json.loads(text)
    return parsed, len(parsed['RESPONSE'])


def entity_and_relation_extraction(chunk_text, model="meta-llama/Llama-3.3-70B-Instruct", timeout=300):
    """Ask the LLM endpoint for the entity/relation triplets found in a text.

    The text is posted to ``endpoint`` together with ``system_prompt_dict`` as
    the system message. The answer is parsed as JSON; when that fails, the
    span from the first ``{`` to the last ``}`` of the answer is parsed
    instead, which copes with extra text placed around the JSON object.

    Args:
        chunk_text: Text to extract triplets from (sent as the user message).
        model: Model name sent in the request; use the name your endpoint
            provides.
        timeout: Seconds passed to ``requests.post`` as its ``timeout``, so a
            stalled endpoint cannot block the notebook forever.

    Returns:
        The parsed JSON object (it has a ``'RESPONSE'`` entry), or the string
        ``'Error'`` when the reply could not be turned into such an object.

    Raises:
        Whatever ``requests.post`` raises for transport problems (connection
        failure, timeout, ...). These are deliberately not converted into
        ``'Error'``.
    """
    data = {
        "model": model,
        "messages": [
            {"role": "system", "content": system_prompt_dict},
            {"role": "user", "content": chunk_text},
        ],
        "temperature": 0.1,
        "max_tokens": 2048,
    }

    request_response = requests.post(endpoint, headers=headers, json=data, timeout=timeout)

    try:
        response = request_response.json()['choices'][0]['message']['content']
    except Exception:
        # Body was not JSON or did not have the expected choices/message shape.
        print('Useless codes: Could not process llm output\n')
        return 'Error'

    # First attempt: the answer is exactly the JSON document that was asked for.
    try:
        triplets, found = _load_triplets(response)
    except Exception:
        print('Pre-processing llm output to convert to json')
    else:
        print(f"Identified {found} triplet(s)")
        return triplets       # "triplets" here is the formatted json output

    # Second attempt: cut out the outermost {...} span and parse only that.
    start = response.find('{') if isinstance(response, str) else -1
    end = response.rfind('}') if isinstance(response, str) else -1
    if start == -1 or end < start:
        # No JSON object in the answer at all (or the content is not text).
        print('Could not process llm output\n')
        return 'Error'

    try:
        triplets, found = _load_triplets(response[start:end + 1])
    except Exception:
        print('Could not process llm output\n')
        return 'Error'

    print(f"Identified {found} triplet(s)")
    return triplets

In [None]:
def entity_and_relation_extraction_pro(chunk_text, max_retries=5):
    """Run ``entity_and_relation_extraction`` again until it stops failing.

    Args:
        chunk_text: Text handed to ``entity_and_relation_extraction``.
        max_retries: Upper bound on the number of attempts. Without a bound, a
            chunk the model can never answer correctly would keep the loop
            (and the API usage) running forever. Pass ``None`` to retry
            without limit.

    Returns:
        The first result that is not the string ``'Error'``, or ``'Error'``
        when every allowed attempt failed.
    """
    attempts = 0
    returned = 'Error'
    while returned == 'Error' and (max_retries is None or attempts < max_retries):
        returned = entity_and_relation_extraction(chunk_text)
        attempts += 1

    if returned == 'Error':
        print(f'Giving up on this chunk after {attempts} failed attempt(s)\n')

    return returned

In [None]:
def write_to_csv(file_path, data):
    """Append extracted triplets to a CSV file with head/relation/tail columns.

    Args:
        file_path: CSV file to append to. It is created when missing, and the
            header row is written only when the file is missing or empty.
        data: Iterable of dictionaries with the keys ``'head'``,
            ``'relation'`` and ``'tail'``. A missing key is written as an
            empty field; other keys are ignored. Items that are not
            dictionaries are skipped.

    Returns:
        None. Progress and problems are reported with ``print``.
    """
    columns = ['head', 'relation', 'tail']

    # Build every row before touching the file. Values are picked by key so
    # each one lands under its own header no matter in which order the keys
    # appear in the dictionary.
    rows = []
    skipped = 0
    try:
        for record in data:
            if isinstance(record, dict):
                rows.append([record.get(column, '') for column in columns])
            else:
                skipped += 1
    except TypeError as e:
        # `data` is not iterable (e.g. None): nothing can be written.
        print(f'Could not write to csv: {e}')
        return

    # Check if the file exists and has content
    file_exists = os.path.exists(file_path) and os.path.getsize(file_path) > 0

    # 'a' appends; newline='' is what the csv module expects for its files.
    with open(file_path, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)

        # Write header only if the file is empty or doesn't exist
        if not file_exists:
            writer.writerow(columns)

        writer.writerows(rows)

    if skipped:
        print(f'Could not write to csv: skipped {skipped} item(s) that are not dictionaries')

    print('✅ Data written to CSV successfully!\n')

In [None]:
# Driver: split the source document, run the extraction on every chunk and
# append the triplets that come back to 'hard_core_kg.csv'.
content_splits = chunk_by_header(file_path)

count = 0  # stays 0 when there are no chunks at all
for count, c in enumerate(content_splits, start=1):
  print(f'Processing chunk {count} of length {len(c)}...')
  extracted_data = entity_and_relation_extraction_pro(c)
  try:
    # Fails here when the extraction result has no 'RESPONSE' entry.
    triplets = extracted_data['RESPONSE']
    write_to_csv('hard_core_kg.csv', triplets)
  except Exception as exc:
    # Report the problem together with the raw result, then carry on.
    print(exc)
    print(f'Displaying the llm output: {extracted_data} \n')
  print()