In [1]:
import json
import csv
import logging
import os
import re
from io import BytesIO

In [2]:
# pdfminer for PDF extraction
from pdfminer.high_level import extract_text as pdf_extract_text

# AzureOpenAI
from openai import AzureOpenAI

In [3]:
# Logging Configuration
# ---------------------------------------
# Request INFO-level log output via the standard logging module.
logging.basicConfig(level=logging.INFO)
# Module-level logger shared by the functions defined in the cells below.
logger = logging.getLogger(__name__)


In [4]:
# Validation Functions (Basic Examples)
# ---------------------------------------
def is_valid_ip(ip):
    """
    Return True if ``ip`` is a syntactically valid IPv4 or IPv6 address.

    IPv4 is checked with a dotted-quad regex (each octet 0-255,
    e.g. 185.23.76.19). IPv6 candidates are parsed with the standard-library
    ``ipaddress`` module, so strings that merely contain a colon (times such
    as "12:30", MAC addresses, "host:port") are rejected.

    Args:
        ip: Candidate address. Surrounding whitespace is ignored.
            Non-string values are treated as invalid instead of raising.

    Returns:
        bool: True for a valid IPv4/IPv6 address, False otherwise.
    """
    import ipaddress  # local import keeps this cell runnable on its own

    if not isinstance(ip, str):
        return False
    candidate = ip.strip()

    octet = r"(25[0-5]|2[0-4]\d|[01]?\d?\d)"
    ipv4_pattern = r"\.".join([octet] * 4)
    if re.fullmatch(ipv4_pattern, candidate):
        return True

    # Only attempt IPv6 parsing when a colon is present; the parser does the
    # real validation (group count, hex digits, "::" compression).
    if ":" in candidate:
        try:
            ipaddress.IPv6Address(candidate)
        except ValueError:
            return False
        return True

    return False


In [5]:
def is_valid_email(email):
    """
    Lightweight syntactic check for an email address.

    The value must look like ``local@domain.tld``: exactly one "@", no
    whitespace, and at least one "." in the domain part. This is a shape
    check only; it does not verify that the address exists.

    Args:
        email: Candidate address. Surrounding whitespace is ignored.
            Non-string values are treated as invalid instead of raising.

    Returns:
        bool: True if the value has the expected shape, False otherwise.
    """
    if not isinstance(email, str):
        return False
    pattern = r"[^@\s]+@[^@\s]+\.[^@\s]+"
    return re.fullmatch(pattern, email.strip()) is not None

In [6]:
def is_valid_iban(iban):
    """
    Structural check for an IBAN.

    After removing all whitespace the value must be 15-34 characters long
    and consist of a two-letter country code, two check digits, and an
    alphanumeric account part. A length check alone would accept arbitrary
    text (any sentence of suitable length once its spaces are removed).

    The mod-97 checksum is intentionally NOT verified here; use a dedicated
    library if strict validation is required.

    Args:
        iban: Candidate IBAN, with or without grouping spaces, any letter
            case. Non-string values are treated as invalid.

    Returns:
        bool: True if the value is shaped like an IBAN, False otherwise.
    """
    if not isinstance(iban, str):
        return False
    compact = "".join(iban.split()).upper()  # drop spaces, tabs, newlines
    if not 15 <= len(compact) <= 34:  # typical IBAN length range
        return False
    return re.fullmatch(r"[A-Z]{2}[0-9]{2}[A-Z0-9]+", compact) is not None

In [7]:
# PDF Extraction & Cleaning
# ---------------------------------------
def extract_text_from_pdf(pdf_file_path):
    """
    Read the text content of a PDF via pdfminer's high-level ``extract_text``.

    On success the extracted string is returned and its length is logged.
    Any exception raised along the way is logged as an error and an empty
    string is returned instead.
    """
    try:
        extracted = pdf_extract_text(pdf_file_path)
        char_count = len(extracted)
        logger.info(f"Extracted {char_count} characters from PDF.")
        return extracted
    except Exception as exc:
        logger.error(f"Error extracting text from {pdf_file_path}: {exc}")
        return ""

def clean_text(text):
    """
    Normalise whitespace in extracted text.

    Line breaks become spaces, every run of whitespace collapses to a single
    space, and leading/trailing whitespace is dropped. Falsy input (None or
    an empty string) yields an empty string.
    """
    if text:
        # split() without a separator breaks on any whitespace run, which
        # covers newlines and carriage returns as well as repeated spaces.
        return " ".join(text.split())
    return ""

In [8]:
# Chunking
# ---------------------------------------
def chunk_text(text, max_chars=2000):
    """
    Split text into consecutive chunks of at most ``max_chars`` characters.

    Where possible a chunk ends just after a space rather than in the middle
    of a token, so that an email address, IP or name is not cut in half at a
    chunk boundary. If a window contains no usable space, the text is cut
    hard at ``max_chars``. Concatenating the chunks reproduces the input.

    Args:
        text: Text to split. None is treated as empty text.
        max_chars: Maximum chunk length; must be a positive number.

    Returns:
        list[str]: The chunks in order (empty list for empty input).

    Raises:
        ValueError: If ``max_chars`` is not positive. A non-positive size
            would otherwise never advance and loop forever.
    """
    if max_chars <= 0:
        raise ValueError(f"max_chars must be a positive integer, got {max_chars!r}")

    text = text or ""
    text_len = len(text)
    chunks = []
    start = 0
    while start < text_len:
        end = min(start + max_chars, text_len)
        # If the cut would land inside a token, back up to the last space in
        # the window (keeping that space on the current chunk).
        if end < text_len and not text[end].isspace():
            last_space = text.rfind(" ", start, end)
            if last_space > start:
                end = last_space + 1
        chunks.append(text[start:end])
        start = end
    logger.info(f"Divided text into {len(chunks)} chunks.")
    return chunks


In [9]:
# Azure OpenAI Call
# ---------------------------------------
def call_azure_openai_api(prompt, openai_client, model_name="gpt35-turbo-16k"):
    """
    Send a prompt to Azure OpenAI through the chat completions endpoint.

    Args:
        prompt: User message content.
        openai_client: Client exposing ``chat.completions.create``.
        model_name: Deployment/model name passed as ``model``.

    Returns:
        str: The first choice's message content with surrounding whitespace
        removed, or "" when the response has no choices or the message
        carries no text content.
    """
    response = openai_client.chat.completions.create(
        model=model_name,
        messages=[
            {"role": "system", "content": "You are an AI that extracts and normalizes entities/relationships from text."},
            {"role": "user", "content": prompt},
        ],
        temperature=0
    )

    if not response.choices:
        return ""

    # 'message' is a ChatCompletionMessage object; its .content may be None
    # (for example when the reply was filtered), so guard before stripping.
    content = response.choices[0].message.content
    return content.strip() if content else ""


In [10]:
# Relationship Extraction
# ---------------------------------------
def extract_relationships_from_chunk(text_chunk, openai_client):
    """
    Extract relationships among entities from a text chunk.

    The prompt instructs the model to use standardized relationship labels
    and omit meaningless 'Unknown' relationships. The reply is parsed
    defensively, because model output is not guaranteed to follow the
    requested format:
      * JSON wrapped in a Markdown code fence is unwrapped before parsing;
      * a top-level JSON list is accepted as the relationship list itself;
      * any other non-object reply yields an empty result;
      * list items that are not JSON objects are dropped.

    Args:
        text_chunk: Text to analyze.
        openai_client: Client passed through to ``call_azure_openai_api``.

    Returns:
        list[dict]: Relationship dicts (possibly empty). Parse failures are
        logged and produce an empty list.
    """
    prompt = f"""
Analyze the following text and extract relationships between the entities mentioned.
Use these standard short forms for the 'relation' field where possible:
- HAS_EMAIL
- HAS_IP
- IS_ASSOCIATED_WITH
- IS_REGISTERED_AS
- HAS_WEBSITE
(You can add or choose from others, but keep them short.)

If a relationship is purely "Unknown - Unknown" or provides no meaningful information,
omit it from the output.

Return the results as valid JSON in this format:
{{
  "relationships": [
    {{
      "source": "Entity Name",
      "relation": "Short Relationship Label (e.g. HAS_EMAIL, HAS_IP)",
      "target": "Related Entity or Value",
      "details": "Additional details if available (optional)"
    }}
  ]
}}

Text to analyze:
{text_chunk}
    """
    logger.debug("Extracting relationships from chunk...")
    response_text = call_azure_openai_api(prompt, openai_client)
    logger.debug(f"Raw API response for relationships: {response_text}")
    try:
        # Unwrap a Markdown code fence if the model added one around the JSON.
        fenced = re.search(r"```(?:json)?\s*(.*?)\s*```", response_text, re.DOTALL | re.IGNORECASE)
        payload = json.loads(fenced.group(1) if fenced else response_text)
    except (json.JSONDecodeError, TypeError) as e:
        logger.error(f"Failed to parse JSON for relationships: {e}")
        return []

    if isinstance(payload, dict):
        relationships = payload.get("relationships", [])
    elif isinstance(payload, list):
        relationships = payload
    else:
        logger.error("Failed to parse JSON for relationships: unexpected top-level JSON type")
        return []

    if not isinstance(relationships, list):
        return []
    # Callers use dict access on every item, so discard anything else.
    return [rel for rel in relationships if isinstance(rel, dict)]


In [11]:
def aggregate_relationships(chunks, openai_client):
    """
    Aggregate relationships from all chunks, ensuring uniqueness and omitting
    meaningless or duplicate entries.

    A relationship is considered meaningless when both its source and its
    target are blank or "Unknown". Duplicates are detected on the
    (source, relation, target, details) fields after converting each to a
    stripped string, which also keeps the key hashable when the model
    returns a list or object for one of the fields.

    Args:
        chunks: Iterable of text chunks.
        openai_client: Client passed through to the per-chunk extractor.

    Returns:
        list[dict]: Unique relationship dicts in first-seen order.
    """
    relationships_list = []
    seen = set()
    placeholders = {"", "unknown"}
    fields = ("source", "relation", "target", "details")

    for chunk in chunks:
        for rel in extract_relationships_from_chunk(chunk, openai_client):
            if not isinstance(rel, dict):
                continue  # malformed item in the model output

            # String-normalised tuple key to check uniqueness.
            key = tuple(str(rel.get(field) or "").strip() for field in fields)
            source, _, target, _ = key
            if source.lower() in placeholders and target.lower() in placeholders:
                continue  # "Unknown - Unknown" carries no information

            if key not in seen:
                seen.add(key)
                relationships_list.append(rel)

    return relationships_list

In [12]:
# Entity Extraction & Validation
# ---------------------------------------
def extract_entities_from_chunk(text_chunk, openai_client):
    """
    Extract entities (IP addresses, emails, company names, person names, IBANs)
    from a text chunk using the Azure OpenAI API.
    Then validate each entity to discard malformed data.

    The reply is parsed defensively, because model output is not guaranteed
    to follow the requested format:
      * JSON wrapped in a Markdown code fence is unwrapped before parsing;
      * a reply that is not a JSON object is treated as "no entities";
      * a category that is missing, null or not a list yields an empty list
        (a bare string is treated as a one-item list);
      * items that are not non-empty strings are dropped.

    Args:
        text_chunk: Text to analyze.
        openai_client: Client passed through to ``call_azure_openai_api``.

    Returns:
        dict: Keys "ip_addresses", "emails", "company_names",
        "person_names" and "ibans", each mapping to a list of strings.
    """
    prompt = f"""
Extract the following entities from the text below:
- IP addresses
- Emails
- Company names
- Person names
- IBANs

Return them as valid JSON in the format:
{{
    "ip_addresses": [],
    "emails": [],
    "company_names": [],
    "person_names": [],
    "ibans": []
}}

If you see no valid data for a category, leave that list empty.

Text to analyze:
{text_chunk}
    """
    logger.debug("Extracting entities from chunk...")
    response_text = call_azure_openai_api(prompt, openai_client)
    logger.debug(f"Raw API response for entities: {response_text}")
    try:
        # Unwrap a Markdown code fence if the model added one around the JSON.
        fenced = re.search(r"```(?:json)?\s*(.*?)\s*```", response_text, re.DOTALL | re.IGNORECASE)
        entities = json.loads(fenced.group(1) if fenced else response_text)
    except (json.JSONDecodeError, TypeError) as e:
        logger.error(f"Failed to parse JSON for entities: {e}")
        entities = {}

    if not isinstance(entities, dict):
        logger.error("Failed to parse JSON for entities: unexpected top-level JSON type")
        entities = {}

    def _string_items(key):
        """Return the stripped, non-empty string items stored under ``key``."""
        values = entities.get(key)
        if isinstance(values, str):
            values = [values]
        if not isinstance(values, list):
            return []
        return [item.strip() for item in values if isinstance(item, str) and item.strip()]

    # Validate/normalize
    ip_addresses = [ip for ip in _string_items("ip_addresses") if is_valid_ip(ip)]
    emails       = [em for em in _string_items("emails") if is_valid_email(em)]
    ibans        = [iban for iban in _string_items("ibans") if is_valid_iban(iban)]
    # Company and person names are not syntactically validated,
    # but can be deduplicated/merged later.
    companies    = _string_items("company_names")
    persons      = _string_items("person_names")

    return {
        "ip_addresses": ip_addresses,
        "emails": emails,
        "company_names": companies,
        "person_names": persons,
        "ibans": ibans
    }

In [13]:
# Simple Substring-Based Deduplication
# ---------------------------------------
def unify_substring_entities(entity_set):
    """
    Given a collection of entity strings, drop every entity that is
    contained (case-insensitively) in another, keeping the longer one.

    Example: 'Crimson Viper' in 'Crimson Viper Group' => keep 'Crimson Viper Group'.

    Candidates are processed longest first, with ties broken by plain string
    order, so the result does not depend on set iteration order. For names
    differing only in letter case, the one that sorts first is kept. Because
    longer names are always handled first, a kept name never needs to be
    replaced by a later candidate for ordinary text.

    Args:
        entity_set: Iterable of entity strings. Items that are not strings,
            or are blank, are ignored.

    Returns:
        set[str]: The surviving entity strings.
    """
    candidates = sorted(
        {item for item in entity_set if isinstance(item, str) and item.strip()},
        key=lambda item: (-len(item), item),
    )

    kept = []        # surviving entities, original spelling
    kept_lower = []  # their lowercase forms, computed once
    for candidate in candidates:
        lowered = candidate.lower()
        # Skip the candidate if an already-kept (longer or equal) entity contains it.
        if any(lowered in existing for existing in kept_lower):
            continue
        kept.append(candidate)
        kept_lower.append(lowered)

    return set(kept)

In [14]:
def aggregate_entities(chunks, openai_client):
    """
    Collect the entities found in every chunk into one set per category,
    merge near-duplicate company and person names, and return each category
    as a sorted list.
    """
    categories = ("ip_addresses", "emails", "company_names", "person_names", "ibans")
    collected = {name: set() for name in categories}

    for piece in chunks:
        found = extract_entities_from_chunk(piece, openai_client)
        for name in categories:
            collected[name].update(found[name])

    # Only free-text names get the substring-based merge.
    for name in ("company_names", "person_names"):
        collected[name] = unify_substring_entities(collected[name])

    # Sorted lists read better than sets.
    result = {}
    for name, values in collected.items():
        result[name] = sorted(values)
    return result

In [15]:
# CSV Saving (with Encoding Fix)
# ---------------------------------------
def save_entities_to_csv(entities, csv_file_path):
    """
    Save extracted entities to a UTF-8 CSV file with columns
    'entity_type' and 'value'.

    Values that look like mojibake (UTF-8 bytes that were mis-decoded as
    Latin-1, e.g. 'MÃ¼ller') are repaired. Text that is already correct is
    written unchanged; an unconditional Latin-1 -> UTF-8 round trip would
    corrupt it instead (e.g. 'Müller' -> 'M\ufffdller').

    Args:
        entities: Mapping of entity type -> iterable of values.
        csv_file_path: Destination path; an existing file is overwritten.

    Returns:
        None. Failures are logged rather than raised.
    """
    def _repair_mojibake(value):
        """Undo a Latin-1 mis-decoding of UTF-8 text when, and only when, it applies."""
        text = "" if value is None else str(value)
        try:
            # Strict on both sides: this only succeeds when the characters
            # really are UTF-8 bytes read as Latin-1 (or plain ASCII).
            return text.encode("latin-1").decode("utf-8")
        except (UnicodeEncodeError, UnicodeDecodeError):
            return text

    try:
        with open(csv_file_path, mode="w", newline='', encoding="utf-8") as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(["entity_type", "value"])

            for entity_type, values in entities.items():
                for val in values:
                    writer.writerow([entity_type, _repair_mojibake(val)])

        logger.info(f"Entities saved to {csv_file_path}")
    except Exception as e:
        # Best effort by design: report the failure and let the caller continue.
        logger.error(f"Error saving entities to CSV: {e}")

In [16]:
def save_relationships_to_csv(relationships, csv_file_path):
    """
    Save extracted relationships to a UTF-8 CSV file with columns
    'source', 'relation', 'target', 'details'.

    Fields that look like mojibake (UTF-8 bytes that were mis-decoded as
    Latin-1, e.g. 'MÃ¼ller') are repaired. Text that is already correct is
    written unchanged; an unconditional Latin-1 -> UTF-8 round trip would
    corrupt it instead. Missing or None fields are written as empty strings
    and non-string fields are converted with ``str``, so one odd record
    does not abort the whole export.

    Args:
        relationships: Iterable of relationship dicts.
        csv_file_path: Destination path; an existing file is overwritten.

    Returns:
        None. Failures are logged rather than raised.
    """
    def _repair_mojibake(value):
        """Undo a Latin-1 mis-decoding of UTF-8 text when, and only when, it applies."""
        text = "" if value is None else str(value)
        try:
            # Strict on both sides: this only succeeds when the characters
            # really are UTF-8 bytes read as Latin-1 (or plain ASCII).
            return text.encode("latin-1").decode("utf-8")
        except (UnicodeEncodeError, UnicodeDecodeError):
            return text

    columns = ["source", "relation", "target", "details"]
    try:
        with open(csv_file_path, mode="w", newline='', encoding="utf-8") as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(columns)

            for rel in relationships:
                writer.writerow([_repair_mojibake(rel.get(column, "")) for column in columns])

        logger.info(f"Relationships saved to {csv_file_path}")
    except Exception as e:
        # Best effort by design: report the failure and let the caller continue.
        logger.error(f"Error saving relationships to CSV: {e}")

In [17]:
# Main Pipeline
# ---------------------------------------
def extract_data_from_pdf(pdf_file_path, openai_client, max_chars=1500):
    """
    Run the whole pipeline for one PDF:
      1. pull the text out of the PDF and clean it,
      2. cut it into chunks of at most ``max_chars`` characters,
      3. extract and aggregate entities and relationships per chunk.

    Returns an ``(entities, relationships)`` pair; when no text could be
    extracted the pair is ``({}, [])``.
    """
    logger.info("Starting PDF extraction process...")
    document_text = clean_text(extract_text_from_pdf(pdf_file_path))

    if document_text:
        pieces = chunk_text(document_text, max_chars=max_chars)
        found_entities = aggregate_entities(pieces, openai_client)
        found_relationships = aggregate_relationships(pieces, openai_client)
        return found_entities, found_relationships

    logger.error("No text extracted from PDF.")
    return {}, []


In [18]:
# Usage Example (if needed)
# ---------------------------------------
if __name__ == "__main__":
    # Credentials come from the environment, never from the notebook source.
    # NOTE: a key was previously committed in this cell; it should be
    # treated as compromised and rotated.
    api_key = os.environ.get("AZURE_OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "Set the AZURE_OPENAI_API_KEY environment variable before running this cell."
        )

    # Instantiate the AzureOpenAI client.
    openai_client = AzureOpenAI(
        # Override with AZURE_OPENAI_ENDPOINT to target a different gateway.
        azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT", "https://aoai.apim.mitre.org/api-key"),
        api_key=api_key,
        default_headers={"Content-Type": "application/json"},
        api_version="2023-03-15-preview",
    )

    # Path to your PDF file; override with PDF_FILE_PATH instead of editing the cell.
    pdf_file_path = os.environ.get("PDF_FILE_PATH", "/Users/bdowns/tflima/FakePDF.pdf")

    # Run the extraction pipeline
    entities, relationships = extract_data_from_pdf(pdf_file_path, openai_client, max_chars=1500)

    # Log and display results
    logger.info("Extracted Entities:")
    logger.info(json.dumps(entities, indent=2))

    logger.info("Extracted Relationships:")
    logger.info(json.dumps(relationships, indent=2))

    # Save results to CSV files (with encoding fixes + dedup)
    save_entities_to_csv(entities, "extracted_entities.csv")
    save_relationships_to_csv(relationships, "extracted_relationships.csv")

INFO:__main__:Starting PDF extraction process...
INFO:__main__:Extracted 8839 characters from PDF.
INFO:__main__:Divided text into 6 chunks.
INFO:httpx:HTTP Request: POST https://aoai.apim.mitre.org/api-key/openai/deployments/gpt35-turbo-16k/chat/completions?api-version=2023-03-15-preview "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://aoai.apim.mitre.org/api-key/openai/deployments/gpt35-turbo-16k/chat/completions?api-version=2023-03-15-preview "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://aoai.apim.mitre.org/api-key/openai/deployments/gpt35-turbo-16k/chat/completions?api-version=2023-03-15-preview "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://aoai.apim.mitre.org/api-key/openai/deployments/gpt35-turbo-16k/chat/completions?api-version=2023-03-15-preview "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://aoai.apim.mitre.org/api-key/openai/deployments/gpt35-turbo-16k/chat/completions?api-version=2023-03-15-preview "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: