In [None]:
!pip install qdrant-client pypdf2 sentence-transformers pdfplumber PyMuPDF

## Vector DB oluşturma

In [None]:
import os
from glob import glob
import torch
import fitz
import uuid
import time
from tenacity import retry, stop_after_attempt, wait_exponential
from langchain.text_splitter import RecursiveCharacterTextSplitter
from qdrant_client import QdrantClient, models
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

In [None]:
# Substrings that process_pdf matches case-insensitively against each chunk;
# a chunk containing any of them is discarded before indexing.
false_labels = [
    # citations, captions and identifiers
    "references",
    "bibliography",
    "figure",
    "table",
    "source:",
    "doi",
    # back matter and declarations
    "appendix",
    "supplementary material",
    "acknowledgement",
    "conflict of interest",
    "funding",
    "disclosure",
    "author contributions",
    "affiliation",
    "copyright",
    "rights reserved",
    # front matter and document structure
    "licence",
    "abstract",
    "keywords",
    "table of contents",
    "contents",
    "cover page",
    "index",
    "revision history",
    # publishing metadata
    "submission date",
    "accepted",
    "published online",
    "email:",
    "correspondence",
    "journal:",
    "preprint",
    "bioRxiv",
    "medRxiv",
    "publisher",
    "issn",
    "isbn",
    "et al.",
]

def initialize_model_and_client(model_name, qdrant_url, api_key):
    """Load the embedding model and open a connection to Qdrant.

    Args:
        model_name: Model identifier handed to ``SentenceTransformer``.
        qdrant_url: URL of the Qdrant instance.
        api_key: API key forwarded to ``QdrantClient``.

    Returns:
        A ``(model, client)`` tuple.
    """
    print(f"Model yükleniyor: {model_name}")

    # Use the GPU whenever torch reports one, otherwise stay on the CPU.
    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'
    print(f"Kullanılan cihaz: {device}")
    embedder = SentenceTransformer(model_name, device=device)

    print(f"Qdrant client bağlanıyor: {qdrant_url}")
    connection_options = {
        "url": qdrant_url,
        "api_key": api_key,
        "prefer_grpc": True,
    }
    qdrant = QdrantClient(**connection_options)

    return embedder, qdrant

def setup_collection(client, collection_name="llm-agents", vector_size=768):
    """Create an empty collection, dropping any existing one with the same name.

    Args:
        client: Connected ``QdrantClient``.
        collection_name: Name of the collection to (re)create.
        vector_size: Dimensionality of the vectors that will be stored.

    The collection is configured for cosine distance.
    """
    print(f"'{collection_name}' koleksiyonu oluşturuluyor veya sıfırlanıyor...")
    # recreate_collection is deprecated in qdrant-client; perform the same
    # drop-then-create sequence with the supported calls instead.
    if client.collection_exists(collection_name=collection_name):
        client.delete_collection(collection_name=collection_name)
    client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            size=vector_size,
            distance=models.Distance.COSINE
        )
    )
    print(f"Koleksiyon hazır: {collection_name}")

@retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=60))
def process_pdf(pdf_path, splitter):
    """Extract filtered text chunks from every page of one PDF.

    The call is retried (up to 3 attempts, exponential wait) if it raises.
    Errors on an individual page are logged and that page is skipped.

    Args:
        pdf_path: Path of the PDF file to read.
        splitter: Object exposing ``split_text(text)`` that returns a list of strings.

    Returns:
        A list of dicts with keys ``"id"`` (random UUID string), ``"text"`` and
        ``"metadata"`` (source file name, 1-based page number, chunk length).
        Chunks shorter than 50 characters after stripping, or containing any
        ``false_labels`` entry (case-insensitive substring match), are dropped.
    """
    source_name = os.path.basename(pdf_path)
    print(f"PDF işleniyor: {source_name}")
    # Lower-case the filter keywords once per document, not once per chunk.
    lowered_labels = [keyword.lower() for keyword in false_labels]
    chunks = []

    doc = fitz.open(pdf_path)
    try:
        for page_num, page in enumerate(doc):
            try:
                text = page.get_text()
                if not text.strip():
                    continue

                for chunk in splitter.split_text(text):
                    if len(chunk.strip()) < 50:
                        continue
                    # NOTE: plain substring matching, so e.g. "table" also
                    # rejects chunks that merely contain "stable".
                    lowered_chunk = chunk.lower()
                    if any(keyword in lowered_chunk for keyword in lowered_labels):
                        continue

                    chunks.append({
                        "id": str(uuid.uuid4()),
                        "text": chunk,
                        "metadata": {
                            "source": source_name,
                            "page": page_num + 1,
                            "chunk_length": len(chunk)
                        }
                    })
            except Exception as e:
                print(f"Sayfa {page_num+1} işlenirken hata: {str(e)}")
    finally:
        # Release the file handle even when page iteration itself fails,
        # so a retry does not pile up open documents.
        doc.close()

    return chunks

def create_embeddings_and_upload(model, client, chunks, batch_size=64, collection_name="llm-agents"):
    """Embed text chunks batch by batch and upsert them into a Qdrant collection.

    Args:
        model: Encoder whose ``encode`` accepts ``convert_to_tensor`` and
            ``normalize_embeddings`` keyword arguments.
        client: Qdrant client used for ``upsert``.
        chunks: List of dicts with ``"id"``, ``"text"`` and ``"metadata"`` keys.
        batch_size: Number of chunks encoded and uploaded per request.
        collection_name: Collection that receives the points.

    Returns:
        None. A batch that raises is logged and skipped; the start indices of
        all skipped batches are reported once the loop finishes.
    """
    if not chunks:
        print("İşlenecek metin parçası bulunamadı.")
        return

    print(f"Toplam {len(chunks)} parça işlenecek, batch boyutu: {batch_size}")

    failed_batches = []
    for i in tqdm(range(0, len(chunks), batch_size)):
        batch = chunks[i:i+batch_size]
        texts = [item["text"] for item in batch]

        try:
            with torch.no_grad():
                embeddings = model.encode(texts, convert_to_tensor=True, normalize_embeddings=True)
            # .cpu() is a no-op for tensors already on the host, so this works
            # for any device the model runs on rather than only cuda/cpu.
            embeddings = embeddings.cpu().numpy()

            points = [
                models.PointStruct(
                    id=item["id"],
                    vector=emb.tolist(),
                    payload={
                        "text": item["text"],
                        "metadata": item["metadata"]
                    }
                ) for item, emb in zip(batch, embeddings)
            ]

            client.upsert(
                collection_name=collection_name,
                points=points
            )

            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            # Short pause between requests to avoid hammering the server.
            time.sleep(0.5)

        except Exception as e:
            failed_batches.append(i)
            print(f"Batch {i} işlenirken hata: {str(e)}")

    # Make data loss visible instead of leaving it buried in the progress output.
    if failed_batches:
        print(f"Uyarı: {len(failed_batches)} batch yüklenemedi (başlangıç indeksleri: {failed_batches})")

def process_directory(folder_path, model, client, collection_name="llm-agents"):
    """Chunk every ``*.pdf`` directly inside a folder and index the chunks in Qdrant.

    Args:
        folder_path: Directory that is searched (non-recursively) for PDFs.
        model: Embedding model forwarded to ``create_embeddings_and_upload``.
        client: Qdrant client used for the upload and the final statistics.
        collection_name: Target collection.

    Returns:
        None. Prints a warning and returns early when no PDF is found.
    """
    pdf_paths = glob(os.path.join(folder_path, "*.pdf"))
    if len(pdf_paths) == 0:
        print(f"Uyarı: {folder_path} dizininde PDF dosyası bulunamadı.")
        return

    print(f"{len(pdf_paths)} PDF dosyası bulundu.")
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". ", "! ", "? "],
        chunk_overlap=64,
        chunk_size=512,
    )

    # Gather the chunks of all documents first; a failing PDF is logged and skipped.
    collected = []
    for path in tqdm(pdf_paths):
        try:
            extracted = process_pdf(path, text_splitter)
            collected += extracted
            print(f"{path} işlendi, {len(extracted)} parça çıkarıldı.")
        except Exception as err:
            print(f"Hata: {path} işlenirken: {str(err)}")

    print(f"Toplam {len(collected)} metin parçası işlendi.")

    # Larger batches when a GPU is available.
    if torch.cuda.is_available():
        upload_batch = 64
    else:
        upload_batch = 32
    create_embeddings_and_upload(model, client, collected, upload_batch, collection_name)

    # Report what ended up in the collection; failure here is non-fatal.
    try:
        info = client.get_collection(collection_name=collection_name)
        print("\nKoleksiyon istatistikleri:")
        print(f"Toplam nokta sayısı: {info.points_count}")
        print(f"Vektör boyutu: {info.config.params.vectors.size}")
    except Exception as err:
        print(f"Koleksiyon istatistikleri alınamadı: {str(err)}")

In [None]:
# --- Configuration -----------------------------------------------------------
MODEL_NAME = "emilyalsentzer/Bio_ClinicalBERT"
# Connection details are read from the environment so that credentials are
# never saved inside the notebook; both fall back to "" when unset.
QDRANT_URL = os.environ.get("QDRANT_URL", "")
API_KEY = os.environ.get("QDRANT_API_KEY", "")
COLLECTION_NAME = "medical-knowledge-v3"
DOCUMENTS_PATH = "/content/test"

# --- Build the index ---------------------------------------------------------
start_time = time.time()

model, client = initialize_model_and_client(MODEL_NAME, QDRANT_URL, API_KEY)
# Size the collection from the model so the vector dimension always matches.
setup_collection(client, COLLECTION_NAME, model.get_sentence_embedding_dimension())

process_directory(DOCUMENTS_PATH, model, client, COLLECTION_NAME)

elapsed_time = time.time() - start_time
print(f"Toplam çalışma süresi: {elapsed_time:.2f} saniye")

In [None]:
query = """The patient is a 45-year-old male who presented with a fever of 38.5°C, persistent dry cough and shortness of breath for approximately 3 days.
The patient also reported intermittent chest pain and fatigue.
His history includes 2 doses of COVID-19 vaccine, no check-up in the last 6 months.
Smoking history: 20 pack-years. Family history of coronary artery disease in the father."""

# Encode the query with the same normalisation used when the chunks were indexed.
query_embedding = model.encode(query, normalize_embeddings=True).tolist()

# Search the collection this notebook populated (COLLECTION_NAME), not the
# functions' default "llm-agents" name, which the indexing run never wrote to.
hits = client.search(
    collection_name=COLLECTION_NAME,
    query_vector=query_embedding,
    limit=5
)

for hit in hits:
    print(hit.payload["text"])
    print(f"Skor: {hit.score}")
    print("---")

Model yükleniyor: emilyalsentzer/Bio_ClinicalBERT
Kullanılan cihaz: cuda




Qdrant client bağlanıyor: https://eaec0b77-6b32-436f-89e8-3df389a237c7.us-east-1-0.aws.cloud.qdrant.io:6333
ment of Respiratory Medicine at Sayyad Shirazi Hospital, 
Gorgan, Iran. The patient complained of fever, chills, and 
cough with whitish-yellow sputum that started about one 
month ago. She had a history of losing significant weight 
during one month. Her symptoms gradually progressed 
in the last week, and her symptoms did not completely 
heal to outpatient treatment. On admission, the patient 
mentioned weakness, lethargy, and frequent fatigue for a 
month and had one episode of hemoptysis before admis­
Skor: 0.9615371227264404
---
lights COVID-19 as a causality factor for AP in the absence 
of other known or identifiable risk factors.
Case Presentation
A 24-year-old Caucasian male presented with abdominal 
pain, nausea, vomiting, and 2 episodes of loose stools for 1 
day. He had no significant past medical history except for a 
COVID-19 infection diagnosed 2 weeks ago, but his

## Doküman İndirme

In [None]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.3/3.3 MB[0m [31m192.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m96.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [None]:
import os
import time
import requests
from bs4 import BeautifulSoup
from Bio import Entrez
import re
import xml.etree.ElementTree as ET
import PyPDF2

# NCBI credentials are read from the environment instead of being typed into
# the notebook. The e-mail keeps the previous "" fallback.
Entrez.email = os.environ.get("ENTREZ_EMAIL", "")
# A missing or blank key becomes None (Biopython's own default for "no key").
# NOTE(review): an empty string is presumably still sent as `api_key=` - confirm.
Entrez.api_key = os.environ.get("ENTREZ_API_KEY") or None

In [None]:
# PMC query built from adjacent string literals: (symptom-related title/abstract
# phrases) AND (access / species / type filters) NOT (excluded topics).
# Every phrase inside a group except the last must end with " OR "; a missing
# operator silently fuses two phrases into one malformed clause.
search_terms = (
    '('
    '"symptom-disease association"[Title/Abstract] OR '
    '"symptom-based diagnosis"[Title/Abstract] OR '
    '"clinical symptom patterns"[Title/Abstract] OR '
    '"differential diagnosis"[Title/Abstract] OR '
    '"symptom checklist"[Title/Abstract] OR '
    '"infectious disease symptoms"[Title/Abstract] OR '
    '"respiratory symptoms"[Title/Abstract] OR '
    '"neurological symptoms"[Title/Abstract] OR '
    '"gastrointestinal symptoms"[Title/Abstract] OR '
    '"cardiovascular symptoms"[Title/Abstract] OR '
    '"pediatric symptom analysis"[Title/Abstract] OR '
    '"psychiatric symptom profiles"[Title/Abstract] OR '
    '"symptom progression"[Title/Abstract] OR '
    '"early symptoms"[Title/Abstract] OR '
    '"atypical symptoms"[Title/Abstract] OR '
    '"multisystem involvement"[Title/Abstract]'
    ') AND '
    '('
    '"open access"[filter] AND '
    '"humans"[Mesh] AND '
    '"diagnosis"[Publication Type]'
    ') NOT '
    '('
    '"predictive model" OR "risk assessment" OR "artificial intelligence" OR '
    '"computer-assisted" OR "machine learning" OR "algorithm" OR '
    '"deep learning" OR "neural network" OR "AI-driven" OR "automated diagnosis" OR '
    '"computer-based analysis" OR "cannabis" OR "e-cigarette" OR "smoking" OR '
    '"cytokeratin" OR "Galectin-3" OR "tumor differentiation" OR '
    '"mantle cell lymphoma" OR "flow cytometry" OR "oncology biomarkers" OR '
    '"COVID-19 vaccine" OR "vaccination effects" OR "genetic mutation" OR "genome" OR '
    '"microbial carriage" OR "travel medicine" OR "infection prevalence"'
    ')'
)

In [None]:
# Cap used both as the search `retmax` and as the limit on successful downloads in main().
max_results = 1000
# Folder that download_pdf() writes into; created here if it does not exist yet.
output_dir = "/content/test"
os.makedirs(output_dir, exist_ok=True)

def download_pdf(pmc_id, title):
    """Download the PDF of one PMC article into ``output_dir``.

    Args:
        pmc_id: PMC identifier inserted into the article URL.
        title: Article title; its first 50 characters become part of the file name.

    Returns:
        True when a PDF response was received and written to disk, False when
        the server did not return a PDF or any error occurred (errors are
        printed, never raised).
    """
    pdf_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc_id}/pdf/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/90.0.4430.212 Safari/537.36",
        "Accept": "application/pdf"
    }
    try:
        response = requests.get(pdf_url, headers=headers, timeout=10)
        content_type = response.headers.get('Content-Type', '')
        if response.status_code == 200 and 'application/pdf' in content_type:
            # Titles come from XML and may hold path separators, characters
            # that common filesystems reject, or newlines; replace all of them
            # rather than only "/".
            safe_title = re.sub(r'[\\/:*?"<>|\x00-\x1f]', "-", title[:50])
            filename = os.path.join(output_dir, f"{pmc_id}_{safe_title}.pdf")
            with open(filename, 'wb') as f:
                f.write(response.content)
            print(f"Downloaded: {pmc_id} - {title}")
            return True
        else:
            print(f"PDF is not accessible: {pmc_id} - HTTP {response.status_code}")
            return False
    except Exception as e:
        # Best effort: one failed article must not abort the whole run.
        print(f"error: {pmc_id} - {str(e)}")
        return False

def parse_article_title(article_xml):
    """Return the text of the first ``<article-title>`` element in an XML document.

    Args:
        article_xml: XML document as ``str`` or ``bytes``.

    Returns:
        The title with nested markup flattened and every run of whitespace
        (including line breaks) collapsed to a single space. ``"No Title"`` is
        returned when the XML cannot be parsed (the error is printed) or holds
        no ``<article-title>`` element; an element without text yields ``""``.
    """
    try:
        root = ET.fromstring(article_xml)
        title_elem = root.find('.//article-title')
        if title_elem is not None:
            # itertext() also picks up text inside inline tags such as <italic>.
            raw_title = "".join(title_elem.itertext())
            # Pretty-printed XML wraps titles across indented lines; normalise
            # so the title is safe to print and to embed in a file name.
            return " ".join(raw_title.split())
    except Exception as e:
        print(f"error: {str(e)}")
    return "No Title"

def main():
    """Search PMC with ``search_terms`` and download the PDF of each hit.

    For every returned id the article XML is fetched to obtain its title, then
    ``download_pdf`` is called. Failures for a single article are printed and
    skipped; any other error is printed and ends the run without raising.
    """
    try:
        search_handle = Entrez.esearch(
            db="pmc",
            term=search_terms,
            retmax=max_results,
            sort="relevance",
            usehistory="y"
        )
        try:
            search_results = Entrez.read(search_handle)
        finally:
            # Close the handle even when parsing the response fails.
            search_handle.close()

        total_count = search_results.get("Count", "0")
        print(f"Total number of related studies: {total_count}")

        pmc_ids = search_results.get("IdList", [])

        success_count = 0
        for pmc_id in pmc_ids:
            if success_count >= max_results:
                break

            try:
                fetch_handle = Entrez.efetch(db="pmc", id=pmc_id, rettype="full", retmode="xml")
                try:
                    article_xml = fetch_handle.read()
                finally:
                    fetch_handle.close()
            except Exception as e:
                print(f"Could not fetch article: {pmc_id} - {str(e)}")
                continue

            title = parse_article_title(article_xml)
            # parse_article_title returns "" when the title element has no text.
            if not title:
                print(f"Title could not be retrieved: {pmc_id}")
                continue

            if download_pdf(pmc_id, title):
                success_count += 1

    except Exception as e:
        print(f"error: {str(e)}")

# Entry point: start the search-and-download run. In a notebook kernel
# __name__ is "__main__", so this fires as soon as the cell is executed.
if __name__ == "__main__":
    main()

In [None]:
!pip install pmc-downloader

In [None]:
from pmc_downloader import PMCDownloader #My own library

# Example run of the packaged downloader with an education-related query.
# The email / api_key values below are placeholders to be replaced by the user.
downloader = PMCDownloader(
    email="your@email.com",
    api_key="your_api_key",
    search_terms='''
      (
        "student-teacher interaction"[Title/Abstract] OR
        "educational"
      )
      AND ("open access"[filter])
      NOT ("online learning") ''',
    output_dir="/content/data",
    max_results=1000000
)

# Search first, then download; download_count is passed as 10 here.
# NOTE(review): presumably download_count caps how many of the ids are fetched - confirm.
pmc_ids = downloader.search_articles()
downloader.download_articles(pmc_ids, download_count=10)

🔍 Total number of articles found: 548854
⏳ Downloading total 10 articles...
✅ Successfully downloaded: 9887132 - The-role-of-recent-refugees-educational-selectivity-in-their-childrens-educational-decisions-in-Germ
✅ Successfully downloaded: 10203400 - Gendered-intergenerational-educational-mobility-patterns-converge-in-the-cohort-sequence-evidence-fr
✅ Successfully downloaded: 10717804 - The-postgraduate-medical-educational-climate-assessed-by-the-Danish-Residency-Educational-Climate-Te
✅ Successfully downloaded: 11511753 - The-interplay-between-teachers-value-related-educational-goals-and-their-value-related-school-climat
✅ Successfully downloaded: 9445992 - The-influence-of-migrant-childrens-identification-with-the-college-matriculation-policy-on-their-edu
✅ Successfully downloaded: 10555788 - Educational-attainment-health-outcomes-and-mortality-a-within-sibship-Mendelian-randomization-study
✅ Successfully downloaded: 6916805 - Educational
✅ Successfully downloaded: 6929824 - Educati

10