# Prompt-Based Field Extraction with Groq LLaMA-3.1

This notebook extracts structured fields from radiology reports using a large language model (LLaMA-3.1-8B-Instant) via Groq API and LangChain.

In [1]:
# ==================== SETUP ====================
import pandas as pd
import nest_asyncio
import asyncio
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
from langchain_core.output_parsers import PydanticOutputParser
from tqdm import tqdm

load_dotenv()
nest_asyncio.apply()

In [2]:
# ==================== LOAD DATA ====================
import pandas as pd

# Read and keep only non-null ReportText rows
df = pd.read_csv("open_ave_data.csv")
df = df[["ReportText"]].dropna().reset_index(drop=True)
# Turn that column into a list of texts
texts = df["ReportText"].tolist()
texts = texts[:30]


In [3]:
# ==================== DEFINE OUTPUT SCHEMA ====================
class FieldsExtraction(BaseModel):
    findings: str = Field(description="Radiologist's technical observations")
    clinicaldata: str = Field(description="Reason for examination")
    ExamName: str = Field(description="Exam type and date")
    impression: str = Field(description="Final diagnosis or summary")

output_parser = PydanticOutputParser(pydantic_object=FieldsExtraction)
format_instructions = output_parser.get_format_instructions()

In [4]:
# ==================== SETUP PROMPT + LLM ====================
llm = ChatGroq(model="llama-3.1-8b-instant")
prompt = ChatPromptTemplate.from_messages([
    ("system", """You are a helpful medical data extraction assistant. 

From the given \"Report Text\", extract the following fields and return ONLY a JSON object, and nothing else. 
Use the format described in {format_instructions}.
Return exactly this structure:

Only return a JSON object with the following fields:

### Fields to be extracted:
- findings
- clinicaldata
- ExamName
- impression

### Example:

Input:
EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/01/2019 08:30 PM. CLINICAL HISTORY: Cough. COMPARISON: None. TECHNIQUE: 2 views. FINDINGS: Lungs/Pleura: No focal opacities evident. No pleural effusion. No pneumothorax. Normal volumes. Mediastinum: Heart and mediastinal contours are unremarkable. Other: None. IMPRESSION: Normal 2-view chest radiography Dictated by: [[PERSONALNAME]] on 06/01/2019 08:42 PM. Electronically signed by: [[PERSONALNAME]] on 06/01/2019 08:43 PM.

Extracted:
findings = FINDINGS: Lungs/Pleura: No focal opacities evident. No pleural effusion. No pneumothorax. Normal volumes. Mediastinum: Heart and mediastinal contours are unremarkable. Other: None.  
clinicaldata = CLINICAL HISTORY: Cough.  
ExamName = EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/01/2019 08:30 PM. TECHNIQUE: 2 views. COMPARISON: None.  
impression = IMPRESSION: Normal 2-view chest radiography    Dictated by: [[PERSONALNAME]] on 06/01/2019 08:42 PM. Electronically signed by: [[PERSONALNAME]] on 06/01/2019 08:43 PM.

### NOTE: Return all fields as flat strings - do not nest them or break into subfields like 'examdate' or 'examname'. Return 'ExamName' as a single string, just as found in the report.
"""),
    ("user", "{input}")
])

chain = prompt | llm | output_parser

In [5]:
# ==================== ASYNC EXECUTION ====================
async def process(index, text):
    try:
        response = await chain.ainvoke({
            "input": text,
            "format_instructions": format_instructions
        })
        print(f"[{index}] →", response)
        return index, response
    except Exception as e:
        print(f"[Error] Index {index}: {e}")
        return index, None

async def run_all(text_list):
    tasks = [process(i, text) for i, text in enumerate(text_list)]
    return await asyncio.gather(*tasks)

results = await run_all(texts)

[12] → findings='FINDINGS: Lungs/Pleura: No focal opacities evident. No pleural effusion. No pneumothorax. Normal volumes. Mediastinum: Heart and mediastinal contours are unremarkable. Other: None.' clinicaldata='CLINICAL HISTORY: CHEST PAIN.' ExamName='EXAM: CHEST RADIOGRAPHY EXAM DATE: 02/02/2020 08:20 PM. COMPARISON: None. TECHNIQUE: 2 views.' impression='IMPRESSION: Normal 2-view chest radiography.'
[25] → findings='The lungs are clear. There is no effusion, consolidation, or pneumothorax. The cardiomediastinal silhouette is within normal limits. No acute osseous abnormalities.' clinicaldata='___M with chest pain // eval for acute process' ExamName='PA and lateral views of the chest.' impression='No acute cardiopulmonary process.'
[15] → findings='The heart, mediastinum, and pulmonary vasculature are normal. No lung consolidation or pleural effusions are present.' clinicaldata='cough, SOB' ExamName='Two View Chest Radiographs EXAM DATE: 08/03/2020 5:12 PM TECHNIQUE: Frontal and lat

In [6]:
# ==================== SAVE OUTPUT ====================
parsed_outputs = [res[1].dict() if res[1] else {} for res in results]
output_df = pd.DataFrame(parsed_outputs)
output_df.to_csv("extracted_results.csv", index=False)
print("✅ Saved extracted results to 'extracted_results.csv'")

✅ Saved extracted results to 'extracted_results.csv'


/var/folders/97/tj5167w92cqbx7z6t1shzr4w0000gn/T/ipykernel_9810/1140077446.py:2: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  parsed_outputs = [res[1].dict() if res[1] else {} for res in results]
