In [1]:
import os
import json

In [2]:
def load_topics_index_mapping(path: str) -> dict: 
    with open(path) as f:
        mapping = json.load(f)
    return mapping

In [3]:
mapping = load_topics_index_mapping('/home/torf/NM-i-AI-2025-Neural-Networks-Enjoyers/emergency-healthcare-rag/data/topics.json')
mapping

{'Abdominal Trauma': 0,
 'Acute Abdomen': 1,
 'Acute Appendicitis': 2,
 'Acute Cholecystitis': 3,
 'Acute Coronary Syndrome': 4,
 'Acute Kidney Injury': 5,
 'Acute Liver Failure': 6,
 'Acute Myocardial Infarction (STEMI_NSTEMI)': 7,
 'Acute Respiratory Distress Syndrome': 8,
 'Anaphylaxis': 9,
 'Aortic Dissection': 10,
 'Aortic Stenosis': 11,
 'Arrhythmias (various)': 12,
 'Aspiration Pneumonia': 13,
 'Asthma Exacerbation': 14,
 'Atrial Fibrillation': 15,
 'Blunt Trauma': 16,
 'Bowel Obstruction': 17,
 'Brain Death': 18,
 'Bronchitis': 19,
 'Burns': 20,
 'COPD Exacerbation': 21,
 'Cardiac Arrest': 22,
 'Cardiac Contusion': 23,
 'Cardiac Tamponade': 24,
 'Cardiomyopathy': 25,
 'Cervical Spine Injury': 26,
 'Chest Pain (non-cardiac)': 27,
 'Compartment Syndrome': 28,
 'Delirium': 29,
 'Diabetic Ketoacidosis': 30,
 'Eclampsia': 31,
 'Ectopic Pregnancy': 32,
 'Embolism': 33,
 'Empyema': 34,
 'Encephalitis': 35,
 'Endocarditis': 36,
 'GI Bleeding': 37,
 'Heart Failure (Acute_Chronic)': 38,


In [4]:
def load_articles(topics_dir: str, topics_json_path: str) -> dict[int, str]:
    mapping = load_topics_index_mapping(topics_json_path)

    articles_by_id: dict[int, str] = {}

    for topic_path in topics_dir.iterdir():
        if topic_path.is_dir():
            topic_name = topic_path.name     
            topic_id = mapping.get(topic_name)
            
            content_parts = []
            md_files = sorted(list(topic_path.glob('*.md')))
            
            for md_file_path in md_files:
                with open(md_file_path, 'r', encoding='utf-8') as f:
                    content_parts.append(f.read())
            
            full_content = "\n\n---\n\n".join(content_parts)
            articles_by_id[topic_id] = full_content
    
    return articles_by_id

In [5]:
from pathlib import Path

DATA_DIR = Path("/home/torf/NM-i-AI-2025-Neural-Networks-Enjoyers/emergency-healthcare-rag/data/")
TOPICS_DIR = DATA_DIR / "topics"
TOPICS_JSON_PATH = DATA_DIR / "topics.json"

all_articles = load_articles(topics_dir=TOPICS_DIR, topics_json_path=TOPICS_JSON_PATH)
print(f"Total topics loaded: {len(all_articles)}")

print(all_articles[0][:1000])

Total topics loaded: 115
---
source: https://www.ncbi.nlm.nih.gov/books/n/statpearls/article-22465/
scraped_date: 2025-07-22 12:47:01 UTC
---

# Abdominal Gunshot Wounds

Jessica Forbes; Bracken Burns.

Author Information and Affiliations

#### Authors

Jessica Forbes1; Bracken Burns2.

#### Affiliations

1 Nova Southeastern College

2 East Tennessee State University (ETSU)

Last Update: July 20, 2023.

## Continuing Education Activity

An abdominal gunshot wound is a multisystemic, traumatic injury that commonly causes high morbidity and mortality. Practitioners should recognize that emergent surgical evaluation is warranted when hemodynamic instability persists and evidence of peritonitis is present. This will lead to improved recognition of potential abnormalities which, in turn, will dictate treatment strategies and improve patient outcomes. This review highlights the role of the interprofessional team in the evaluation and treatment of this emergency situation.

**Objectives:**

 

In [27]:
raw_text = all_articles[0][:1000000]

In [None]:
import re
from typing import List, Dict, Any

def create_clean_chunks(topic_id: int, raw_text: str) -> List[Dict[str, Any]]:
    unwanted_sections = {
        'Authors',
        'Affiliations',
        'Continuing Education Activity',
        'Review Questions',
        'References',
        'Disclosure',
        'Comment on this article.',
        'Access free multiple choice questions on this topic.' 
    }
    
    text = re.sub(r'---\n(.*?)\n---', '', raw_text, flags=re.DOTALL)
    text = re.sub(r'## References.*', '', text, flags=re.DOTALL)
    text = re.sub(r'\[\d+\]', '', text)
    text = re.sub(r'https?://\S+', '', text)
    text = text.replace('\\\'', '\'')
    text = re.sub(r'_(.*?)_', r'\1', text)

    preamble_and_content = re.split(r'(?=## )', text, maxsplit=1)
    
    if len(preamble_and_content) == 2:
        main_content = preamble_and_content[1]
    else:
        main_content = preamble_and_content[0]

    title = raw_text.split('\n')[0]
    text = f"{title}\n\n{main_content}"

    text = text.split('* \n\n  * Click here for a simplified version.')[0]
    text = re.sub(r'\n{3,}', '\n\n', text)
    text = text.strip()

    parts = re.split(r'(?=## )', text)
    chunks = []

    first_part_content = parts[0].replace(title, '').strip()
    if first_part_content:
        chunks.append({
            'topic_id': topic_id,
            'section_title': 'Summary',
            'content': first_part_content
        })

    for part in parts[1:]:
        try:
            title_end_index = part.find('\n')
            section_title = part[3:title_end_index].strip()
            section_content = part[title_end_index:].strip()

            if section_title and section_content and section_title not in unwanted_sections:
                chunks.append({
                    'topic_id': topic_id,
                    'section_title': section_title,
                    'content': section_content
                })
        except IndexError:
            continue
            
    return chunks

chunks = create_clean_chunks(1, raw_text)
chunks

[{'topic_id': 1,
  'section_title': 'Introduction',
  'content': 'Traumatic injuries to the abdomen can result from a wide range of etiologies and can lead to life-threatening injuries, multi-organ system dysfunction, and death. Gunshot wounds in the abdominal region can range from minor wounds to severe traumatic injuries depending on the anatomical structures the bullet penetrates. While the leading cause of blunt abdominal trauma-related deaths in the United States in adults ages 15 to 24 is due to motor vehicle collisions, abdominal gunshot wounds account for up to 90 percent of the mortality associated with penetrating abdominal injuries.\n\nThe most common cause of a penetrating abdominal injury is a stab wound or gunshot wound. In gunshot wounds, due to the high-intensity kinetic energy of the bullet, the pathway is often unpredictable in nature as well as the internal organs that may be affected. The most common organs injured are the small and large bowel at 50% and 40%, respe

In [None]:

import pathlib
import pickle

OUTPUT_FILE = pathlib.Path("clean_chunks.pkl")

print("Loading topic name-to-ID map...")
name_to_id_map = load_topics_index_mapping(TOPICS_JSON_PATH)

master_chunks_list = []
print(f"Scanning topic folders in '{TOPICS_DIR}'...")

for topic_path in TOPICS_DIR.iterdir():
    if topic_path.is_dir():
        topic_name = topic_path.name
        topic_id = name_to_id_map.get(topic_name)

        if topic_id is None:
            print(f"  [Warning] Skipping folder '{topic_name}' as it's not in the topic mapping.")
            continue
        
        content_parts = []
        for md_file_path in sorted(list(topic_path.glob('*.md'))):
            with open(md_file_path, 'r', encoding='utf-8') as f:
                content_parts.append(f.read())
        
        raw_text = "\n\n---\n\n".join(content_parts)

        if raw_text:
            chunks = create_clean_chunks(topic_id, raw_text)
            master_chunks_list.extend(chunks)

print(f"\nTotal clean chunks created from all topics: {len(master_chunks_list)}")

print(f"Saving the final list to '{OUTPUT_FILE}'...")
with open(OUTPUT_FILE, "wb") as f:
    pickle.dump(master_chunks_list, f)
    
print("\nData preparation complete!")
print(f"Your clean data is now ready in '{OUTPUT_FILE}'.")

Loading topic name-to-ID map...
Scanning topic folders in '/home/torf/NM-i-AI-2025-Neural-Networks-Enjoyers/emergency-healthcare-rag/data/topics'...

Total clean chunks created from all topics: 1499
Saving the final list to 'clean_chunks.pkl'...

Data preparation complete!
Your clean data is now ready in 'clean_chunks.pkl'.


In [None]:
import pickle
import pprint

filename = "clean_chunks.pkl"

print(f"Attempting to load and view contents of '{filename}'...")

try:
    with open(filename, "rb") as f:
        loaded_chunks = pickle.load(f)

    print("\nFile loaded successfully!")
    print(f"Total number of chunks in the file: {len(loaded_chunks)}")

    print("\n--- Displaying the first 2 chunks ---")
    pprint.pprint(loaded_chunks[:2])

except FileNotFoundError:
    print(f"Error: The file '{filename}' was not found. Please make sure it's in the correct directory.")
except Exception as e:
    print(f"An error occurred while loading the file: {e}")

Attempting to load and view contents of 'clean_chunks.pkl'...

File loaded successfully!
Total number of chunks in the file: 1499

--- Displaying the first 2 chunks ---
[{'content': 'Undifferentiated patients often present with conditions that may '
             'be due to drug exposure and may require specialized diagnostic '
             'testing. Patients may present due to unintentional poisoning, '
             'attempts at self-harm, or environmental exposures. These '
             'patients will predominantly present to the emergency department '
             'For the scope of this article, the acutely ill patient requiring '
             'screening will be the focus, rather than workplace drug screens '
             'and mandated or routine testing performed in rehabilitation '
             'programs. Screening tests for exposures and drugs of abuse can '
             'vary in availability, accuracy, and utility. Knowledge of the '
             'limitations and clinical applica

In [None]:

import pickle
import time
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

def build_vector_store():
    """
    1. Loads the clean text chunks.
    2. Generates embeddings for each chunk using a sentence-transformer model.
    3. Builds a FAISS index for fast similarity search.
    4. Saves the FAISS index and the corresponding chunks to disk.
    """
    print("Loading clean chunks from disk...")
    try:
        with open("clean_chunks.pkl", "rb") as f:
            clean_chunks = pickle.load(f)
    except FileNotFoundError:
        print("Error: clean_chunks.pkl not found.")
        print("Please ensure you have run the data preparation script first.")
        return

    print(f"Loaded {len(clean_chunks)} clean chunks.")

    print("Loading sentence transformer model...")
    model = SentenceTransformer('BAAI/bge-small-en-v1.5', device='cpu')
    print("Model loaded successfully.")

    texts_to_embed = [
        f"{chunk['section_title']}: {chunk['content']}" for chunk in clean_chunks
    ]

    print(f"\nGenerating embeddings for {len(texts_to_embed)} chunks...")
    start_time = time.time()
    embeddings = model.encode(
        texts_to_embed,
        show_progress_bar=True,
        normalize_embeddings=True
    )
    end_time = time.time()
    print(f"Embeddings generated in {end_time - start_time:.2f} seconds.")

    embedding_dimension = embeddings.shape[1]
    print(f"Embedding dimension: {embedding_dimension}")

    index = faiss.IndexFlatIP(embedding_dimension)

    index.add(embeddings)
    
    print(f"FAISS index built. Total vectors in index: {index.ntotal}")

    faiss.write_index(index, "faiss_index.bin")
    print("FAISS index has been saved to 'faiss_index.bin'.")

    print("\nVector store built successfully!")
    print("You now have 'faiss_index.bin' and 'clean_chunks.pkl' ready for your application.")

  from .autonotebook import tqdm as notebook_tqdm


In [32]:
build_vector_store()

Loading clean chunks from disk...
Loaded 1499 clean chunks.
Loading sentence transformer model...
Model loaded successfully.

Generating embeddings for 1499 chunks...


  return forward_call(*args, **kwargs)
Batches: 100%|██████████| 47/47 [00:20<00:00,  2.25it/s]

Embeddings generated in 20.92 seconds.
Embedding dimension: 384
FAISS index built. Total vectors in index: 1499
FAISS index has been saved to 'faiss_index.bin'.

Vector store built successfully!
You now have 'faiss_index.bin' and 'clean_chunks.pkl' ready for your application.



