# Correcting Transcription Errors

If we have a list of valid medications, then a foundational model can use it as a reference to correct transcription errors in hard to read prescriptions.

Using Drugs@FDA as an example
https://www.fda.gov/drugs/drug-approvals-and-databases/drugsfda-data-files

In [18]:
import sqlite3
import csv
import json
import re
import numpy as np
import boto3
import rapidfuzz
from typing import Optional
from functools import lru_cache

from mypy_boto3_textract import TextractClient
from mypy_boto3_bedrock_runtime import BedrockRuntimeClient

 ## Fuzzy Matching
First, we need to identify any words in the transcription that are close to valid drug names or active ingredients. We can use fuzzy matching to get likely matches based on metrics like Levenshtein Distance. To do that, create a word list or dictionary of all the possibly words we care about.

In [19]:
def clean_word(word: str) -> str:
    """Clean a single word by removing special characters and converting to uppercase"""
    return re.sub(r'[^A-Za-z]', '', word).upper()

In [20]:
def should_keep_word(word: str) -> bool:
    """Determine if a word should be kept in the final list"""
    # Skip empty strings
    if not word:
        return False

    # Skip short words
    if len(word) < 3:
        return False

    # Skip common conjunctions and articles and any words that don't differentiate
    skip_words = {'AND', 'OR', 'WITH', 'IN', 'THE', 'DAILY'}

    # Skip common chemical terms
    chemical_terms = {'SODIUM', 'HYDROCHLORIDE', 'HCL', 'SULFATE', 'PHOSPHATE',
                      'ACETATE', 'CITRATE', 'COMPLEX', 'RESIN', 'ASPARTATE'}
    if word in skip_words | chemical_terms:
        return False

    return True

In [21]:
def compact_drug_names(filename: str) -> list:
    """
    Read drug names from file and return set of individual cleaned words
    """
    unique_words = set()

    with open(filename, 'r') as file:
        #discard first line header
        file.readline()
        for line in file:
            # Remove any text between **
            line = re.sub(r'\*\*.*?\*\*', '', line)

            # Split on common delimiters
            words = re.split(r'[,;/\(\)]', line)

            for word_group in words:
                # Split into individual words
                individual_words = word_group.split()

                # Clean and filter each word
                for word in individual_words:
                    cleaned_word = clean_word(word)
                    if should_keep_word(cleaned_word):
                        unique_words.add(cleaned_word)

    return sorted(list(unique_words))


In [22]:
sorted_names = compact_drug_names("data/Products.txt")

In [23]:
print(len(sorted_names))

7767


In [24]:
# Write the result to a new file
with open('data/compact_drug_names.txt', 'w') as outfile:
    for name in sorted_names:
        outfile.write(f"{name}\n")

In [25]:
# Test with mix of drug names and common medical text
test_words = [
    "Heprin",  # Should match
    "1x",  # Should not match
    "daily",  # Should not match
    "Demerol",  # Should match
    "take",  # Should not match
    "Amphetm1ne",  # Should match
    "fexafenodine",
    "albeturol",
]
upper_test_words = [w.upper() for w in test_words]

In [26]:
terms = set()
for word in upper_test_words:
    matches = rapidfuzz.process.extract(
        word,
        sorted_names,
        scorer=rapidfuzz.fuzz.WRatio,
        score_cutoff=80
    )
    if matches:
        for match, score, _ in matches:
            terms.add(match)

In [27]:
matches_matrix = rapidfuzz.process.cdist(upper_test_words, sorted_names, scorer=rapidfuzz.fuzz.WRatio, score_cutoff=80)
terms = set(sorted_names[i] for i in np.where(matches_matrix >= 0.8)[0])

## Drug Details
It could be sufficient to provide the list of likely words to the foundational model, but our data set includes valid strengths for each drug. By providing the full details of the likely medications, the foundational model can correct transcription errors in strength as well.

The dictionary includes terms from the drug names and active ingredients. We can use a substring search to get all the records for the likely words. To reduce the results somewhat, filter the substring matches by word boundaries. It is more likely that the transcription error will be a similar length to the correct term.

Since the dataset is relatively small and doesn't change frequently, it's fine to use sqlite stored in S3 and downloaded on initialization.

In [28]:
def clean_strength(text):
    # Remove comments between ** and **
    return re.sub(r'\*\*.*?\*\*', '', text).strip()

In [29]:
consolidated = []

# Read and process the data
with open('data/Products.txt', 'r') as file:
    next(file)  # Skip header
    reader = csv.reader(file, delimiter='\t')

    for row in reader:
        if len(row) >= 8:
            active_ingredient = row[6]
            form = row[2]
            drug_name = row[5]
            # Clean strength by removing comments between **
            strength = clean_strength(row[3])

            consolidated.append((drug_name, active_ingredient, form, strength))


In [30]:
# Create/connect to SQLite database
conn = sqlite3.connect('data/drugs.db')
cursor = conn.cursor()

# Create table with consolidated columns
cursor.execute('''
               CREATE TABLE IF NOT EXISTS drugs
               (
                   drug_name
                   TEXT,
                   active_ingredient
                   TEXT,
                   strength
                   TEXT,
                   form
                   TEXT
               )
               ''')

# Create index for text search on active_ingredient
cursor.execute('CREATE INDEX IF NOT EXISTS idx_active_ingredient ON drugs(active_ingredient COLLATE NOCASE)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_drug_name ON drugs(drug_name COLLATE NOCASE)')


<sqlite3.Cursor at 0x108b98e40>

In [31]:
# Insert consolidated data
for (drug_name, active_ingredient, form, strength) in consolidated:
    cursor.execute('''
                   INSERT INTO drugs (active_ingredient, form, drug_name, strength)
                   VALUES (?, ?, ?, ?)
                   ''', (active_ingredient, form, drug_name, strength))

# Commit changes and close connection
conn.commit()
conn.close()

In [32]:
def search_drugs(search_terms: list[str]):
    conn = sqlite3.connect('data/drugs.db')
    cursor = conn.cursor()

    # Create the WHERE clause dynamically with OR conditions
    where_conditions = []
    params = []
    for term in search_terms:
        where_conditions.append('''
            (drug_name LIKE ? OR active_ingredient LIKE ?)
        ''')
        params.extend([f'%{term}%', f'%{term}%'])

    query = f'''
        SELECT active_ingredient, drug_name, strength
        FROM drugs
        WHERE {' OR '.join(where_conditions)}
    '''

    cursor.execute(query, params)
    results = cursor.fetchall()
    conn.close()
    print("raw result count", len(results))
    pattern = r'\b(' + r'|'.join(re.escape(term) for term in search_terms) + r')\b'
    filtered_results = [
        row for row in results
        if re.search(pattern, row[0], re.IGNORECASE) or
           re.search(pattern, row[1], re.IGNORECASE)
    ]
    print("filtered result count", len(filtered_results))
    return filtered_results


In [33]:
results = search_drugs(list(terms))
print(len(results))

raw result count 102
filtered result count 102
102


## Textract

In [34]:
textract: "TextractClient" = boto3.client('textract')

In [35]:
with open("data/fake prescription.jpg", "rb") as f:
    image = f.read()
result = textract.detect_document_text(Document={"Bytes": image})

In [36]:
blocks = result['Blocks']
raw_text = '\n'.join([block['Text'] for block in blocks if block['BlockType'] == 'LINE'])

In [37]:
raw_text = '''Date 12 may 2025
R
Patient
Mateo Jackson
Address
Prescription:
- Fenderedine 180mg
1x daily
-Flonese 50mcg
/ spray/nostril 2x daily
-Ventolin 90 meg
2 puffs as needed
MOlance
Refil 012345
Permission'''

In [38]:
print(raw_text)

Date 12 may 2025
R
Patient
Mateo Jackson
Address
Prescription:
- Fenderedine 180mg
1x daily
-Flonese 50mcg
/ spray/nostril 2x daily
-Ventolin 90 meg
2 puffs as needed
MOlance
Refil 012345
Permission


In [39]:
raw_text.split()

['Date',
 '12',
 'may',
 '2025',
 'R',
 'Patient',
 'Mateo',
 'Jackson',
 'Address',
 'Prescription:',
 '-',
 'Fenderedine',
 '180mg',
 '1x',
 'daily',
 '-Flonese',
 '50mcg',
 '/',
 'spray/nostril',
 '2x',
 'daily',
 '-Ventolin',
 '90',
 'meg',
 '2',
 'puffs',
 'as',
 'needed',
 'MOlance',
 'Refil',
 '012345',
 'Permission']

In [40]:
matches_matrix = rapidfuzz.process.cdist(raw_text.upper().split(), sorted_names, scorer=rapidfuzz.fuzz.partial_ratio,
                                         score_cutoff=70)
terms = set(sorted_names[i] for i in np.where(matches_matrix > 0)[0])

In [41]:
len(terms)

23

In [42]:
terms

{'ABACAVIR',
 'ABAMETAPIR',
 'ABATACEPT',
 'ABCIXIMAB',
 'ABELCET',
 'ABEMACICLIB',
 'ABILIFY',
 'ABIRATERONE',
 'ABLAVAR',
 'ABLYSINOL',
 'ABRAXANE',
 'ABREVA',
 'ABRILADA',
 'ABSORBASE',
 'ABSTRAL',
 'ACALABRUTINIB',
 'ACANYA',
 'ACCOLATE',
 'ACCRETROPIN',
 'ACCRUFER',
 'ACCUNEB',
 'ACCUPRIL',
 'ACCURETIC'}

In [43]:
terms = set()
for word in raw_text.upper().split():
    matches = rapidfuzz.process.extract(
        word,
        sorted_names,
        scorer=rapidfuzz.fuzz.partial_ratio,
        score_cutoff=70,
        limit=50,
    )
    if matches:
        for match, score, _ in matches:
            terms.add(match)

In [44]:
len(terms)

719

In [45]:
rapidfuzz.process.extract(
    'Fenderedine'.upper(),
    sorted_names,
    scorer=rapidfuzz.fuzz.partial_ratio,
    score_cutoff=70,
    limit=100,
)

[('RED', 100.0, 5766),
 ('VFEND', 88.88888888888889, 7350),
 ('EMADINE', 83.33333333333334, 2201),
 ('EXIDINE', 83.33333333333334, 2487),
 ('FERNDEX', 83.33333333333334, 2566),
 ('DEXEDRINE', 82.35294117647058, 1797),
 ('ENDEP', 80.0, 2244),
 ('IODINE', 80.0, 3367),
 ('LODINE', 80.0, 3825),
 ('NALDEMEDINE', 80.0, 4450),
 ('NEE', 80.0, 4506),
 ('NEO', 80.0, 4516),
 ('PAREDRINE', 80.0, 5081),
 ('BETADINE', 76.92307692307692, 786),
 ('FEDERAL', 76.92307692307692, 2538),
 ('MAFENIDE', 76.92307692307692, 3965),
 ('URIDINE', 76.92307692307692, 7186),
 ('EDEX', 75.0, 2122),
 ('EPHEDRINE', 75.0, 2293),
 ('INFED', 75.0, 3274),
 ('OVINE', 75.0, 4974),
 ('PRED', 75.0, 5458),
 ('PREDNISONE', 75.0, 5465),
 ('RUFEN', 75.0, 5993),
 ('DELAVIRDINE', 73.6842105263158, 1699),
 ('DESERPIDINE', 73.6842105263158, 1760),
 ('ENSIFENTRINE', 73.6842105263158, 2268),
 ('FEXOFENADINE', 73.6842105263158, 2584),
 ('ORPHENADRINE', 73.6842105263158, 4930),
 ('BELDIN', 72.72727272727273, 720),
 ('CODEINE', 72.72727272

In [46]:
rapidfuzz.fuzz.partial_ratio("fenderedine", "fexofenadine")

73.6842105263158

### Reduce results?
That's a lot of results. Maybe they can be reduced by asking a small llm to identify words that could be medications.

In [47]:
bedrock: "BedrockRuntimeClient" = boto3.client("bedrock-runtime")

In [48]:
system_prompt = '''You are a pharmacist looking for drug and active ingredient names on prescription transcriptions. There may be transcription errors.

Please identify all possible drug names or active ingredient names in the text even if you don't recognize them. Use the context of the position of the words and the words around them to determine which words are possible drug names or active ingredient names.

Return ONLY the list of words as a JSON list.
'''

In [49]:
response = bedrock.converse(modelId="us.amazon.nova-micro-v1:0",
                            messages=[
                                {"role": "user", "content": [{"text": raw_text}]},
                                {"role": "assistant", "content": [{"text": "["}]}
                            ],
                            system=[{"text": system_prompt}, {"cachePoint": {"type": "default"}}],
                            inferenceConfig={
                                "temperature": 0.0,
                                "stopSequences": ["]"],
                            }
                            )

In [50]:
json_output = "[" + response["output"]["message"]["content"][0]["text"]

In [51]:
json.loads(json_output)

['Fenderedine', 'Flonese', 'Ventolin', 'MOlance']

In [52]:
terms = set()
for word in json.loads(json_output):
    matches = rapidfuzz.process.extract(
        word.upper(),
        sorted_names,
        scorer=rapidfuzz.fuzz.partial_ratio,
        score_cutoff=70,
        limit=None,
    )
    if matches:
        for match, score, _ in matches:
            terms.add(match)

In [53]:
len(terms)

258

In [54]:
"FEXOFENADINE" in terms

True

258 terms is much more reasonable. Now we have our method!

In [71]:
class DrugNameMatcher:
    def __init__(self, word_list: list[str], threshold: int = 70):
        self.word_list = word_list
        self.threshold = threshold

    def find_matches(self, query: str, limit: Optional[int] = None) -> list[tuple[str, int, int | str]]:
        return rapidfuzz.process.extract(
            query.upper(),
            self.word_list,
            scorer=rapidfuzz.fuzz.partial_ratio,
            limit=limit,
            score_cutoff=self.threshold
        )

    def list_matches(self, words: list[str]) -> set[str]:
        """
        Process multiple words in batch and return unique matches.

        Args:
            words: List of words to process

        Returns:
            Set of unique matched drug names
        """
        return {
            match for word in words
            for match, score, _ in self.find_matches(word.upper())
        }

In [72]:
matcher = DrugNameMatcher(sorted_names, threshold=70)

In [73]:
terms = matcher.list_matches(json.loads(json_output))

In [74]:
len(terms)

258

In [75]:
"FEXOFENADINE" in terms

True

In [60]:
results = search_drugs(list(terms))
print(len(results))

raw result count 10674
filtered result count 6684
6684


In [86]:
terms = ["flonase", "ventolin", "fexofenadine", "losartan", "loratadine", "lansoprazole"]
results = search_drugs(terms)

raw result count 802
filtered result count 736


In [87]:
medications = ["active ingredients | drug name | strength", "---|---|---"]
for active, drug_name, strength in results:
    medications.append(f"{active} | {drug_name} | {strength}")
with open("data/medications.txt", "w") as f:
    f.write("\n".join(medications))