In [None]:
# Change the kernel's working directory to the parent folder.
%cd ..

In [None]:
import pandas as pd

# Load the crawled drug records from JSON into a DataFrame and preview the first rows.
# NOTE(review): this is an absolute, machine-specific Windows path; a configurable
# data directory would make the notebook runnable elsewhere.
df = pd.read_json(r'C:\Users\admin\medical_data_saved\drugs\crawled\raw.json')
print(df.head())

In [None]:
# Store the character length of each 'chemicals' entry and summarise its distribution.
df['chemicals_length'] = df['chemicals'].str.len()
print(df['chemicals_length'].describe())

# Select the rows whose 'chemicals' text is longer than 2000 characters.
long_chemicals = df[df['chemicals_length'] > 2000][['drug_name', 'chemicals', 'chemicals_length']]
# NOTE(review): the value of this expression is neither assigned nor displayed
# (it is not the last expression of the cell), so it currently has no effect.
long_chemicals.value_counts().sum()
print(long_chemicals)

In [None]:
# Display a boolean Series marking rows that repeat an earlier row.
df.duplicated()

In [None]:
# Remove duplicated rows and print the resulting frame's summary.
# NOTE(review): `df` is rebound here, so re-running earlier cells afterwards sees the deduplicated frame.
df = df.drop_duplicates()
df.info()

In [None]:
# Work on an independent copy holding only the two columns used below.
sub_df = df[['drug_name', 'chemicals']].copy()
print(sub_df.head())

In [None]:
import re
import unicodedata
import pandas as pd

def clean_chemicals(text, drug_name=None):
    """Normalise a raw ingredient description into compact lowercase text.

    Steps, in order: NFKC-normalise and lowercase; drop "thành phần" headings;
    drop "mỗi <dosage form>" lead-ins; cut everything from a known trailing
    section phrase to the end of that line; optionally remove the drug's own
    name; remove filler words; replace punctuation other than ``% / . ,`` with
    a space; collapse whitespace.

    Args:
        text: Raw description. Missing values (``pd.isna``) are passed through.
        drug_name: Optional drug name to delete from ``text``. Missing or blank
            names are ignored.

    Returns:
        The cleaned string, or ``pd.NA`` when ``text`` is missing.
    """
    if pd.isna(text):
        return pd.NA

    text = unicodedata.normalize('NFKC', str(text)).lower()

    # Section headings such as "1 thành phần:" or a repeated "thành phần: thành phần".
    text = re.sub(r'1\s*thành phần\s*:?', '', text)
    text = re.sub(r'(thành phần\s*:?\s*){1,2}', '', text)

    # Lead-ins like "mỗi viên thuốc" ("each tablet ...").
    text = re.sub(r'\bmỗi\s*(viên|viên nén|viên nang|viên thuốc|lọ|ống|chai|gói|ml)?\s*(thuốc|sản phẩm)?\b', '', text)

    # Everything from one of these phrases to the end of the line is unrelated
    # to composition ('.' does not cross newlines).
    text = re.sub(r'(là thuốc gì.*|thuốc gì.*|cách dùng.*|thuốc này.*|câu hỏi.*|chỉ định.*|sử dụng.*)', '', text)

    # Only strip the drug name when a real, non-blank name was supplied.
    # A bare truthiness test is not enough: float NaN is truthy and would be
    # stringified to "nan" (corrupting words such as "nano"/"nang"), pd.NA
    # raises on truthiness, and a whitespace-only name would delete every space.
    if drug_name is not None and not pd.isna(drug_name):
        drug_name_norm = unicodedata.normalize('NFKC', str(drug_name)).lower().strip()
        if drug_name_norm:
            text = text.replace(drug_name_norm, '')

    stopwords = [
        'thành phần', 'thuốc', 'gói', 'ống', 'lọ', 'nén', 'nang',
        'bao phim', 'bao chế', 'dưới dạng', 'hàm lượng', 'được dùng', 'được',
        'bao gồm', 'là', 'chứa', 'tá dược.*', 'vừa đủ.*',
        'trong', 'sản phẩm', 'của', 'dạng', 'với'
    ]

    # The two '.*' entries deliberately swallow the rest of the line.
    pattern_stopwords = r'\b(' + '|'.join(stopwords) + r')\b'
    text = re.sub(pattern_stopwords, '', text)

    # Keep word characters, whitespace and % / . , ; everything else becomes a space.
    # (Unit letters such as "mg" or "μ" are word characters already, so they
    # do not need to be listed in the class.)
    text = re.sub(r'[^\w\s%/.,]', ' ', text)

    text = re.sub(r'\s+', ' ', text).strip()

    return text


# Apply the cleaner to both text columns.
# NOTE(review): `drug_name` is never passed, so clean_chemicals' drug-name removal
# branch is not exercised here. The same cleaner is also run on the drug names
# themselves, which strips words such as 'thuốc' or 'gói' from them — confirm this is intended.
sub_df['chemicals'] = sub_df['chemicals'].apply(clean_chemicals)
sub_df['drug_name'] = sub_df['drug_name'].apply(clean_chemicals)


In [None]:
# Recompute the text length on the cleaned 'chemicals' column and summarise it.
sub_df['chemicals_length'] = sub_df['chemicals'].str.len()
print(sub_df['chemicals_length'].describe())

In [None]:
# Rows whose cleaned 'chemicals' text is still longer than 2000 characters.
long_chemicals = sub_df[sub_df['chemicals_length'] > 2000][['drug_name', 'chemicals', 'chemicals_length']]
# NOTE(review): the value of this expression is neither assigned nor displayed, so it has no effect.
long_chemicals.value_counts().sum()
print(long_chemicals)

In [None]:
import re

def extract_chemicals(text, drug_name=None):
    """Split a cleaned ingredient string into ``{'ingredient', 'dose'}`` records.

    An ingredient is a run of (Vietnamese) letters, spaces and hyphens that is
    immediately followed by a number, optionally a ``/number`` fraction, and
    optionally a unit.

    Args:
        text: Cleaned ingredient text. Anything that is not a non-blank string
            yields a single placeholder record.
        drug_name: Optional drug name removed from ``text`` before parsing.

    Returns:
        A list of dicts with keys ``'ingredient'`` (str) and ``'dose'`` (str or
        None). Doses have spaces removed and decimal commas turned into dots.
        When no dose is found, one record is returned holding the first 40
        characters of the first sentence/line and ``dose=None``.
    """
    # The isinstance test comes first: it already covers None/NaN/pd.NA, and
    # unlike ``pd.isna`` it does not raise on list-like values (e.g. when the
    # cell is re-run on an already-extracted column).
    if not isinstance(text, str) or len(text.strip()) == 0:
        return [{'ingredient': '', 'dose': None}]

    text = text.lower()

    if isinstance(drug_name, str) and len(drug_name.strip()) > 0:
        drug_name = drug_name.lower()
        text = re.sub(re.escape(drug_name), '', text)

    # Regex alternation takes the first alternative that matches, so compound
    # units must precede their prefixes ("mg/ml" before "mg", "g/ml" before "g").
    units = r'mg/ml|g/ml|iu/ml|mcg|μg|mg|ml|m\s?ui|iu|g|%'
    dose_pattern = rf'(?:\d+(?:[\.,]\d+)?(?:\s*/\s*\d+(?:[\.,]\d+)?\s*)?(?:\s*(?:{units}))?)'

    # Letter class covers the Vietnamese alphabet, including the toned forms
    # of plain "o" (ò ó ọ ỏ õ).
    letters = (
        r'a-zàáạảãâầấậẩẫăằắặẳẵêềếệểễèéẹẻẽòóọỏõôồốộổỗơờớợởỡ'
        r'ưừứựửữùúụủũìíịỉĩỳýỵỷỹđ'
    )
    matches = re.findall(
        rf'([{letters}\s\-]+?)\s*({dose_pattern})',
        text
    )

    results = []
    for ing, dose in matches:
        ing = re.sub(r'\s+', ' ', ing.strip())
        dose = dose.replace(',', '.').replace(' ', '')
        results.append({'ingredient': ing, 'dose': dose})

    if not results:
        # No dose anywhere: fall back to a short label taken from the start of the text.
        short = text.split('.')[0].split('\n')[0][:40]
        return [{'ingredient': short.strip(), 'dose': None}]

    return results

# Replace each cleaned string with its list of {'ingredient', 'dose'} records.
# NOTE(review): this overwrites the input column, so the cell is not idempotent —
# running it a second time feeds lists (not strings) back into extract_chemicals.
sub_df['chemicals'] = sub_df['chemicals'].apply(extract_chemicals)



In [None]:
# Display the first 50 rows to inspect the extraction result.
sub_df.head(50)

In [None]:
import pandas as pd

# Hand-written sample frame: three image paths, each paired with a list of
# prescription lines stored under 'ocr_text'.
df_pics = pd.DataFrame({
    'path': [
        r"C:\Users\admin\Documents\data\.data\20231213145107_001.pdf.0000.png",
        r"C:\Users\admin\Documents\data\.data\20140119215734_001.pdf.0000.png",
        r"C:\Users\admin\Documents\data\.data\a8e7c9eca5b61ee847a7.jpg.0000.png"
    ],
    'ocr_text': [
        [
            "Ibandronic Acid (Jointmeno) 150mg – Viên uống, 1 viên/tháng, dùng buổi sáng, uống với ~150ml nước, không ăn/uống thêm trong 1h",
            "NextG Cal 500mg – Viên uống, 2 viên/trưa, 30 ngày, tổng liều 60 viên"
        ],
        [
            "Levofloxacin (LEVODHG 500) 500mg – Viên, tổng 14 viên (2 viên/ngày, sáng-tối, 7 ngày)",
            "Diosmin + hesperidin (Venokern 500mg) 450mg + 50mg – Viên, 60 viên (6 viên/ngày đầu chia 3 lần, sau uống 4 viên/ngày chia 2 lần)"
        ],
        [
            "Bình vị nam - Việt Nam – 300 viên (uống 10 viên/ngày, 5 sáng, 5 tối)",
            "Modom'S (Domperidone 10mg) – 60 viên (2 viên/ngày, sáng-tối, 1 viên/lần)",
            "Hacumin (Nano Curcumin, Royal Jelly) – 60 viên (2 viên/ngày, sáng-tối)",
            "Somastop (Sucralfat) – 60 viên (2 viên/ngày, sáng-tối)",
            "Repratt (Pantoprazol 40mg) – 60 viên (2 viên/ngày, sáng-tối)"
        ]
    ]
})

# Show the sample frame as the cell output.
df_pics.head()

In [None]:
from rapidfuzz import fuzz
from plasma.meta.class_registrator import ObjectFactory
from typing import Callable
import pandas as pd
import unicodedata
import re

# Registry of matcher callables keyed by a string name; per the type parameters,
# each matcher takes (line, DataFrame) and returns a Series.
# NOTE(review): ObjectFactory comes from the `plasma` package; its behaviour is not visible in this notebook.
matcher_factory = ObjectFactory[str, Callable[[str, pd.DataFrame], pd.Series]]()

def normalize_text(text):
    """Fold text to a lowercase ASCII key: accents stripped, punctuation removed.

    Args:
        text: Any value; non-strings are stringified. Missing values
            (``None``, ``pd.NA``, float NaN) yield an empty string instead of
            the words "none"/"na"/"nan", which would otherwise act as keywords.

    Returns:
        A string containing only ``a-z``, ``0-9`` and single spaces.
    """
    if text is None or text is pd.NA or (isinstance(text, float) and text != text):
        return ''

    # "đ"/"Đ" (U+0111/U+0110) have no canonical decomposition, so NFD leaves them
    # intact and the ASCII filter below would delete them ("đồng" -> "ong").
    text = str(text).replace('\u0111', 'd').replace('\u0110', 'D')

    text = unicodedata.normalize('NFD', text)
    # Drop combining marks (category Mn), i.e. the separated accents.
    text = ''.join(c for c in text if unicodedata.category(c) != 'Mn')
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return re.sub(r'\s+', ' ', text).strip()

def split_ocr_lines(lines: list[str]):
    """Break OCR lines into shorter segments, cutting after each unit token.

    A cut is made after ``mg``, ``ml`` or the Vietnamese dosage-form words
    "viên", "gói", "vỉ" (accented or unaccented, any case). A unit only counts
    when it is not glued to other letters, so "500mg" splits but "Vitamin" or
    "Việt" do not.

    Args:
        lines: Raw OCR text lines.

    Returns:
        The stripped segments longer than five characters, in input order.
    """
    # Lookarounds reject a neighbouring letter ([^\W\d_] = any Unicode letter)
    # while still allowing a digit directly before the unit ("150mg").
    unit_token = re.compile(
        r'(?<![^\W\d_])(mg|ml|viên|vien|gói|goi|vỉ|vi)(?![^\W\d_])',
        flags=re.IGNORECASE,
    )
    split_lines = []
    for line in lines:
        marked = unit_token.sub(r'\1||', line)
        parts = marked.split('||')
        split_lines.extend([p.strip() for p in parts if len(p.strip()) > 5])
    return split_lines

# Pre-compute an accent-free, lowercase key for every drug name and collect the
# distinct keys used by filter_relevant_lines.
# NOTE(review): the set keeps whatever normalize_text returns, including very short
# or empty keys, which match almost any line by substring — consider filtering them.
sub_df['norm_name'] = sub_df['drug_name'].apply(normalize_text)
drug_keywords = set(sub_df['norm_name'].tolist())

def filter_relevant_lines(lines):
    """Keep the lines whose normalised text contains a known drug keyword.

    Reads the module-level ``drug_keywords`` set. Empty keywords are ignored,
    because the empty string is a substring of every line and would let
    everything through.

    Args:
        lines: OCR text segments.

    Returns:
        The subset of ``lines`` (original, un-normalised strings) that mention
        at least one keyword.
    """
    keywords = [kw for kw in drug_keywords if kw]
    relevant = []
    for line in lines:
        # Normalise once per line rather than once per keyword.
        norm_line = normalize_text(line)
        if any(kw in norm_line for kw in keywords):
            relevant.append(line)
    return relevant

@matcher_factory.register("fuzzy")
def fuzzy_match(line: str, sub_df: pd.DataFrame):
    """Return the row of ``sub_df`` whose drug name best matches ``line``.

    A row qualifies when its normalised, non-empty name occurs inside the
    normalised line and ``fuzz.partial_ratio`` exceeds 92. Among qualifying
    rows the highest score wins, and ties go to the longest name so that a
    more specific name is preferred over a shorter one contained in it. Rows
    that tie on both keep the first one encountered.

    Args:
        line: One OCR text segment.
        sub_df: Frame with at least the columns ``drug_name`` and whatever the
            caller reads from the returned row.

    Returns:
        The matching row (a Series), or None when nothing qualifies.
    """
    norm_line = normalize_text(line)
    best_key, best_row = None, None
    for _, row in sub_df.iterrows():
        name = normalize_text(row['drug_name'])
        # Cheap containment test first; it is required anyway and avoids the
        # fuzzy call for most rows. Empty names never count as a match.
        if not name or name not in norm_line:
            continue
        score = fuzz.partial_ratio(norm_line, name)
        if score <= 92:
            continue
        key = (score, len(name))
        if best_key is None or key > best_key:
            best_key, best_row = key, row
    return best_row

def classify_medicine_lines(lines: list[str], sub_df: pd.DataFrame, method="fuzzy"):
    """Match every line against the drug table using the registered matcher.

    Args:
        lines: OCR text segments to classify.
        sub_df: Drug table handed to the matcher.
        method: Key of the matcher to fetch from ``matcher_factory``.

    Returns:
        A list of ``{"matched_name", "chemicals"}`` dicts, one per line that
        produced a match, or None when no line matched.
    """
    match_line = matcher_factory[method]
    hits = (match_line(segment, sub_df) for segment in lines)
    matched = [
        {
            "matched_name": hit['drug_name'],
            "chemicals": hit['chemicals']
        }
        for hit in hits
        if hit is not None
    ]
    return matched or None

# Pipeline: split each OCR entry into segments, keep the segments that mention a
# known drug keyword, then look up the matching drug row for each kept segment.
df_pics['ocr_lines'] = df_pics['ocr_text'].apply(split_ocr_lines)
df_pics['ocr_filtered'] = df_pics['ocr_lines'].apply(filter_relevant_lines)
df_pics['matched'] = df_pics['ocr_filtered'].apply(
    lambda lines: classify_medicine_lines(lines, sub_df, method="fuzzy")
)

# Per-image report; 'matched' is None when nothing was found, hence the fallback text.
for idx, row in df_pics.iterrows():
    print(f'CASE {idx + 1} | {row.get("path", "unknown")}')
    print("OCR lines:", row['ocr_lines'])
    print("Matched:", [{m['matched_name']: m['chemicals']} for m in row['matched']] if row['matched'] else "No match")

print(df_pics[['ocr_text', 'matched']].head())

In [None]:
import os
import cv2
import pandas as pd
import easyocr
import re

# Load the image index from a pickle and rebuild each image path under `base_path`
# from the file name stored in 'path'.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
# NOTE(review): this rebinds `df_pics`, replacing the frame built in an earlier cell;
# both paths are absolute and machine-specific.
df_pics = pd.read_pickle(r"C:\Users\admin\Documents\data\data.pkl")
base_path = r"C:\Users\admin\Documents\data\.data"
df_pics['abs_path'] = df_pics['path'].apply(lambda p: os.path.join(base_path, os.path.basename(p)))

def preprocess_image(image_path):
    """Load an image and turn it into an inverted binary image for OCR.

    The image is upscaled 2x, converted to grayscale, contrast-equalised with
    CLAHE, lightly blurred, adaptively thresholded and finally inverted.

    Args:
        image_path: Path handed to ``cv2.imread``.

    Returns:
        The processed image array, or None when the file could not be read.
    """
    source = cv2.imread(image_path)
    if source is None:
        return None

    upscaled = cv2.resize(source, None, fx=2.0, fy=2.0, interpolation=cv2.INTER_LINEAR)
    grayscale = cv2.cvtColor(upscaled, cv2.COLOR_BGR2GRAY)
    equalized = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8)).apply(grayscale)
    smoothed = cv2.GaussianBlur(equalized, (3, 3), 0)
    binary = cv2.adaptiveThreshold(
        smoothed, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, blockSize=25, C=10
    )
    return cv2.bitwise_not(binary)

# Shared EasyOCR reader for Vietnamese and English, created once and reused by
# extract_text_from_preprocessed.
reader = easyocr.Reader(['vi', 'en'])

def extract_text_from_preprocessed(img):
    """Run the shared OCR reader on a preprocessed image.

    Args:
        img: Image as produced by ``preprocess_image``, or None.

    Returns:
        An empty list when ``img`` is None, otherwise the result of
        ``reader.readtext(img, detail=0)``.
    """
    return [] if img is None else reader.readtext(img, detail=0)

def fix_common_ocr_errors(text):
    """Lowercase an OCR line and repair a few recurring recognition mistakes.

    Fixes applied: the token "1o" becomes "10"; the token "uong" becomes
    "uống"; ``;`` and ``_`` become spaces; commas become spaces unless they sit
    between two digits; en/em dashes become a plain hyphen; whitespace is
    collapsed.

    Args:
        text: One OCR text line.

    Returns:
        The cleaned, stripped line.
    """
    text = text.lower()
    text = re.sub(r'\b1o\b', '10', text)
    text = re.sub(r'\buong\b', 'uống', text)
    # A comma between digits is a decimal separator ("2,5mg") and must survive;
    # any other comma is treated as a separator like ';' and '_'.
    text = re.sub(r';|_|(?<!\d),|,(?!\d)', ' ', text)
    text = re.sub(r'[-–—]', '-', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def contains_medicine_name(text, keywords):
    """Tell whether any keyword occurs as a substring of the lowercased text.

    Args:
        text: Text to search; it is lowercased before comparison.
        keywords: Iterable of substrings, compared as given (not lowercased).

    Returns:
        True if at least one keyword is found, otherwise False.
    """
    lowered = text.lower()
    return any(keyword in lowered for keyword in keywords)

def extract_medicine_lines(lines, keywords):
    """Select the OCR lines that look like prescription entries.

    Lines shorter than six characters (after stripping) are skipped. Each
    remaining line is cleaned with ``fix_common_ocr_errors`` and kept when it
    contains a keyword, contains a dose/dosage-form token, or starts with a
    list number such as "1.".

    Args:
        lines: Raw OCR text lines.
        keywords: Medicine-name substrings passed to ``contains_medicine_name``.

    Returns:
        The cleaned versions of the selected lines, in input order.
    """
    results = []
    for line in lines:
        line_clean = line.strip()
        if len(line_clean) < 6:
            continue
        line_fixed = fix_common_ocr_errors(line_clean)

        # Allow whitespace between the number and its unit: "500 mg" is as
        # much a dose as "500mg".
        has_unit = re.search(r'\b(\d+\s*(mg|ml|g)|viên|gói|vỉ)\b', line_fixed)
        starts_with_num = re.match(r'^\d+\.', line_fixed)
        has_med_name = contains_medicine_name(line_fixed, keywords)

        if has_med_name or has_unit or starts_with_num:
            results.append(line_fixed)
    return results

# Preprocess every image, then OCR the result; rows whose image could not be read
# carry None in 'preprocessed' and an empty list in 'ocr_text'.
# NOTE(review): the processed image arrays are kept in the DataFrame column — watch memory on large sets.
df_pics['preprocessed'] = df_pics['abs_path'].apply(preprocess_image)
df_pics['ocr_text'] = df_pics['preprocessed'].apply(extract_text_from_preprocessed)

In [None]:
# Preview the stored path and recognised text for the first rows.
print(df_pics[['path', 'ocr_text']].head())