# 17/11/2024

In [1]:
# Import required libraries
import pandas as pd
from py2neo import Graph, Node, Relationship

# Load the dataset from your Excel file
file_path = r"C:\Users\acer\.Neo4jDesktop\projects\project-7388dcf2-cc74-42d3-8076-55f11a6046c1\data_obat_uas.xlsx"
data = pd.read_excel(file_path)

# Connect to Neo4j
# Replace 'username' and 'password' with your actual Neo4j credentials
graph = Graph("bolt://localhost:7687", auth=("neo4j", "Attaqy81"))

# Iterate through each row in the dataset and create nodes and relationships
for index, row in data.iterrows():
    # Create the main drug node
    drug_node = Node("Drug", name=row["name"])
    graph.merge(drug_node, "Drug", "name")
    
    # Create and link each attribute to the drug node
    if pd.notna(row["price"]):
        price_node = Node("Price", amount=row["price"])
        graph.merge(price_node, "Price", "amount")
        graph.merge(Relationship(drug_node, "hasPrice", price_node))

    if pd.notna(row["Dosis"]):
        dosage_node = Node("Dosage", amount=row["Dosis"])
        graph.merge(dosage_node, "Dosage", "amount")
        graph.merge(Relationship(drug_node, "hasDosage", dosage_node))

    if pd.notna(row["Efek Samping"]):
        side_effects_node = Node("SideEffects", effects=row["Efek Samping"])
        graph.merge(side_effects_node, "SideEffects", "effects")
        graph.merge(Relationship(drug_node, "hasSideEffect", side_effects_node))

    if pd.notna(row["Golongan Produk"]):
        group_node = Node("ProductGroup", group=row["Golongan Produk"])
        graph.merge(group_node, "ProductGroup", "group")
        graph.merge(Relationship(drug_node, "belongsToGroup", group_node))

    if pd.notna(row["Indikasi Umum"]):
        indications_node = Node("Indications", indications=row["Indikasi Umum"])
        graph.merge(indications_node, "Indications", "indications")
        graph.merge(Relationship(drug_node, "indicatedFor", indications_node))

    if pd.notna(row["Kemasan"]):
        packaging_node = Node("Packaging", description=row["Kemasan"])
        graph.merge(packaging_node, "Packaging", "description")
        graph.merge(Relationship(drug_node, "packagedAs", packaging_node))

    if pd.notna(row["Komposisi"]):
        ingredients_node = Node("Ingredients", content=row["Komposisi"])
        graph.merge(ingredients_node, "Ingredients", "content")
        graph.merge(Relationship(drug_node, "contains", ingredients_node))

    if pd.notna(row["Manufaktur"]):
        manufacturer_node = Node("Manufacturer", name=row["Manufaktur"])
        graph.merge(manufacturer_node, "Manufacturer", "name")
        graph.merge(Relationship(drug_node, "producedBy", manufacturer_node))

    if pd.notna(row["No. Registrasi"]):
        registration_node = Node("RegistrationNumber", number=row["No. Registrasi"])
        graph.merge(registration_node, "RegistrationNumber", "number")
        graph.merge(Relationship(drug_node, "hasRegistrationNumber", registration_node))

    if pd.notna(row["Perhatian"]):
        precautions_node = Node("Precautions", details=row["Perhatian"])
        graph.merge(precautions_node, "Precautions", "details")
        graph.merge(Relationship(drug_node, "requiresPrecaution", precautions_node))

print("Data successfully converted to Neo4j Knowledge Graph.")


ModuleNotFoundError: No module named 'pandas'

In [4]:
# Query Neo4j to get details of a specific drug
drug_name = "Sildenafil Citrate 50 mg 4 Tablet"  # Replace with a drug name from your dataset
query = f"""
MATCH (d:Drug {{name: '{drug_name}'}})-[r]->(n)
RETURN d, r, n
"""
results = graph.run(query)

# Display results
for record in results:
    print(record)


In [6]:
# Revised ground truth based on the exact data structure in Neo4j
# Verify these values with the output from the Neo4j Browser
ground_truth = [
    ("Sildenafil Citrate 50 mg 4 Tablet", "hasSideEffect", "Sakit kepala"),  # Ensure correct label and language if necessary
    ("Sildenafil Citrate 50 mg 4 Tablet", "hasDosage", "50 mg"),
    ("Sildenafil Citrate 50 mg 4 Tablet", "producedBy", "Generic Manufacturer"),
]

# Run the query to retrieve data from Neo4j
query = """
MATCH (d:Drug {name: 'Sildenafil Citrate 50 mg 4 Tablet'})-[r]->(n)
RETURN d.name AS drug, type(r) AS relation, n.name AS related_entity
"""
results = graph.run(query).data()

# Process the retrieved results to match the ground truth format
retrieved_results = [(record["drug"], record["relation"], record["related_entity"]) for record in results]

# Convert ground truth and retrieved results to sets
ground_truth_set = set(ground_truth)
retrieved_results_set = set(retrieved_results)

# Calculate True Positives, False Positives, and False Negatives
true_positives = ground_truth_set & retrieved_results_set
false_positives = retrieved_results_set - ground_truth_set
false_negatives = ground_truth_set - retrieved_results_set

# Calculate precision, recall, and F1 score
precision = len(true_positives) / (len(true_positives) + len(false_positives)) if len(true_positives) + len(false_positives) > 0 else 0
recall = len(true_positives) / (len(true_positives) + len(false_negatives)) if len(true_positives) + len(false_negatives) > 0 else 0
f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0

# Display results
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("True Positives:", true_positives)
print("False Positives:", false_positives)
print("False Negatives:", false_negatives)


Precision: 0
Recall: 0.0
F1 Score: 0
True Positives: set()
False Positives: set()
False Negatives: {('Sildenafil Citrate 50 mg 4 Tablet', 'hasSideEffect', 'Sakit kepala'), ('Sildenafil Citrate 50 mg 4 Tablet', 'producedBy', 'Generic Manufacturer'), ('Sildenafil Citrate 50 mg 4 Tablet', 'hasDosage', '50 mg')}


# 19/11/2024

In [3]:
!python -m spacy download xx_ent_wiki_sm

Collecting xx-ent-wiki-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/xx_ent_wiki_sm-3.7.0/xx_ent_wiki_sm-3.7.0-py3-none-any.whl (11.1 MB)
     ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
     ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
      --------------------------------------- 0.3/11.1 MB ? eta -:--:--
     -- ------------------------------------- 0.8/11.1 MB 1.8 MB/s eta 0:00:06
     --- ------------------------------------ 1.0/11.1 MB 2.0 MB/s eta 0:00:05
     ----- ---------------------------------- 1.6/11.1 MB 1.7 MB/s eta 0:00:06
     ------ --------------------------------- 1.8/11.1 MB 1.8 MB/s eta 0:00:06
     --------- ------------------------------ 2.6/11.1 MB 2.1 MB/s eta 0:00:04
     ----------- ---------------------------- 3.1/11.1 MB 2.2 MB/s eta 0:00:04
     -------------- ------------------------- 3.9/11.1 MB 2.4 MB/s eta 0:00:03
     ---------------- ----------------------- 4.5/11.1 

ERROR: Could not install packages due to an OSError: [WinError 32] The process cannot access the file because it is being used by another process: 'C:\\Users\\acer\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\xx_ent_wiki_sm\\xx_ent_wiki_sm-3.7.0\\vocab\\strings.json'
Consider using the `--user` option or check the permissions.



In [18]:
import spacy

# Coba muat model
try:
    nlp = spacy.load("xx_ent_wiki_sm")
    print("Model berhasil dimuat!")
except OSError as e:
    print("Error saat memuat model:", e)


ModuleNotFoundError: No module named 'spacy'

In [5]:
!pip install pandas



In [15]:
!pip install spacy
!pip install spacy-transformers



In [13]:
!pip install numpy



In [7]:
# xlsx_to_kg.py

# Import required libraries
from py2neo import Graph, Node, Relationship
import pandas as pd
import spacy

# Load spaCy model for NER (menggunakan model multibahasa)
nlp = spacy.load("xx_ent_wiki_sm")

# Function to extract diseases using NER
def extract_diseases(text):
    doc = nlp(text)
    diseases = [ent.text for ent in doc.ents if ent.label_ in ['DISEASE', 'CONDITION', 'SYMPTOM']]  # Sesuaikan label sesuai model
    return diseases

# Load the dataset from your Excel file
file_path = r"C:\Users\acer\.Neo4jDesktop\projects\project-7388dcf2-cc74-42d3-8076-55f11a6046c1\data_obat_uas.xlsx"
data = pd.read_excel(file_path)

# Connect to Neo4j
# Replace 'username' and 'password' with your actual Neo4j credentials
graph = Graph("bolt://localhost:7687", auth=("neo4j", "Attaqy81"))

# Iterate through each row in the dataset and create nodes and relationships
for index, row in data.iterrows():
    # Create the main drug node with dosage and price as properties
    drug_properties = {
        "name": row["name"]
    }

    if pd.notna(row["price"]):
        drug_properties["price"] = row["price"]

    if pd.notna(row["Dosis"]):
        drug_properties["dosage"] = row["Dosis"]

    drug_node = Node("Drug", **drug_properties)
    graph.merge(drug_node, "Drug", "name")

    # Create and link other attributes as separate nodes
    if pd.notna(row["Efek Samping"]):
        side_effects_node = Node("SideEffects", effects=row["Efek Samping"])
        graph.merge(side_effects_node, "SideEffects", "effects")
        graph.merge(Relationship(drug_node, "hasSideEffect", side_effects_node))

    if pd.notna(row["Golongan Produk"]):
        group_node = Node("ProductGroup", group=row["Golongan Produk"])
        graph.merge(group_node, "ProductGroup", "group")
        graph.merge(Relationship(drug_node, "belongsToGroup", group_node))

    if pd.notna(row["Kemasan"]):
        packaging_node = Node("Packaging", description=row["Kemasan"])
        graph.merge(packaging_node, "Packaging", "description")
        graph.merge(Relationship(drug_node, "packagedAs", packaging_node))

    if pd.notna(row["Komposisi"]):
        ingredients_node = Node("Ingredients", content=row["Komposisi"])
        graph.merge(ingredients_node, "Ingredients", "content")
        graph.merge(Relationship(drug_node, "contains", ingredients_node))

    if pd.notna(row["Manufaktur"]):
        manufacturer_node = Node("Manufacturer", name=row["Manufaktur"])
        graph.merge(manufacturer_node, "Manufacturer", "name")
        graph.merge(Relationship(drug_node, "producedBy", manufacturer_node))

    if pd.notna(row["No. Registrasi"]):
        registration_node = Node("RegistrationNumber", number=row["No. Registrasi"])
        graph.merge(registration_node, "RegistrationNumber", "number")
        graph.merge(Relationship(drug_node, "hasRegistrationNumber", registration_node))

    if pd.notna(row["Perhatian"]):
        precautions_node = Node("Precautions", details=row["Perhatian"])
        graph.merge(precautions_node, "Precautions", "details")
        graph.merge(Relationship(drug_node, "requiresPrecaution", precautions_node))

    # Ekstraksi dan pembuatan node penyakit dari indikasi umum
    if pd.notna(row["Indikasi Umum"]):
        diseases = extract_diseases(row["Indikasi Umum"])
        for disease in diseases:
            disease_node = Node("Disease", name=disease)
            graph.merge(disease_node, "Disease", "name")
            graph.merge(Relationship(drug_node, "treats", disease_node))

print("Data successfully converted to Neo4j Knowledge Graph.")

Data successfully converted to Neo4j Knowledge Graph.


In [None]:
# create_ground_truth.py

import pandas as pd
import spacy

# Load spaCy model for NER (gunakan model multibahasa jika perlu)
try:
    nlp = spacy.load("xx_ent_wiki_sm")
except OSError:
    print("Model 'xx_ent_wiki_sm' tidak ditemukan. Silakan instal model tersebut dengan menjalankan:")
    print("python -m spacy download xx_ent_wiki_sm")
    exit(1)

# Fungsi untuk mengekstrak penyakit menggunakan NER
def extract_diseases(text):
    doc = nlp(text)
    diseases = [ent.text.strip() for ent in doc.ents if ent.label_ in ['DISEASE', 'CONDITION', 'SYMPTOM']]
    # Jika model tidak mengenali label yang diinginkan, gunakan pemisahan berbasis koma
    if not diseases:
        diseases = [d.strip() for d in text.split(',')]
    return diseases

# Load dataset dari file Excel
file_path = r"C:\Users\acer\.Neo4jDesktop\projects\project-7388dcf2-cc74-42d3-8076-55f11a6046c1\data_obat_uas.xlsx"
try:
    data = pd.read_excel(file_path)
except FileNotFoundError:
    print(f"File tidak ditemukan di path: {file_path}")
    exit(1)

# Dictionary untuk menyimpan query dan expected drugs
ground_truth_dict = {}

for index, row in data.iterrows():
    if pd.notna(row["Indikasi Umum"]):
        # Ekstrak penyakit dari Indikasi Umum
        diseases = extract_diseases(row["Indikasi Umum"])
        for disease in diseases:
            query = f"Obat untuk {disease}"
            if query not in ground_truth_dict:
                ground_truth_dict[query] = set()
            ground_truth_dict[query].add(row["name"])

# Konversi dictionary ke DataFrame
ground_truth_df = pd.DataFrame([
    {"query": query, "expected_drugs": ",".join(sorted(drugs))}
    for query, drugs in ground_truth_dict.items()
])

# Simpan ke CSV
output_file = "ground_truth.csv"
ground_truth_df.to_csv(output_file, index=False, encoding='utf-8')
print(f"Ground truth berhasil dibuat dan disimpan sebagai '{output_file}'.")

Ground truth berhasil dibuat dan disimpan sebagai 'ground_truth.csv'.


In [None]:
# evaluation.py

import pandas as pd
from py2neo import Graph

# Atur logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


# Koneksi ke Neo4j
graph = Graph("bolt://localhost:7687", auth=("neo4j", "Attaqy81"))

# Load ground truth dataset
ground_truth = pd.read_csv("ground_truth.csv")  # Format: query, expected_drugs

# Function to perform exact match evaluation
def evaluate_retrieval(search_app_function, ground_truth):
    correct = 0
    total = len(ground_truth)
    
    for index, row in ground_truth.iterrows():
        query = row['query']
        expected = set([drug.strip() for drug in row['expected_drugs'].split(',')])
        retrieved = set(search_app_function(query))
        
        if expected == retrieved:
            correct += 1
    
    accuracy = correct / total
    print(f"Exact Match Accuracy: {accuracy * 100:.2f}%")

# Definisikan fungsi search_app_function sesuai dengan kebutuhan evaluasi
def search_app_function(query):
    if query.startswith("Obat untuk"):
        disease = query.replace("Obat untuk", "").strip()
        query_neo4j = """
        MATCH (d:Drug)-[:treats]->(dis:Disease {name: $disease})
        RETURN d.name AS drug_name
        """
        try:
            result = graph.run(query_neo4j, disease=disease)
            return [record['drug_name'] for record in result]
        except Exception as e:
            print(f"Error saat menjalankan query untuk penyakit '{disease}': {e}")
            return []
    else:
        # Implementasikan logika untuk query lainnya jika diperlukan
        return []


# Jalankan evaluasi
evaluate_retrieval(search_app_function, ground_truth)


Exact Match Accuracy: 0.00%


In [None]:
# Tambahkan dictionary untuk normalisasi penyakit
disease_normalization = {
    "fever": "demam",
    "headache": "sakit kepala",
    # Tambahkan entri lainnya sesuai kebutuhan
}

def extract_diseases(text):
    doc = nlp(text)
    diseases = [ent.text.strip().lower() for ent in doc.ents if ent.label_ in ['DISEASE', 'CONDITION', 'SYMPTOM']]
    # Jika model tidak mengenali label yang diinginkan, gunakan pemisahan berbasis koma
    if not diseases:
        diseases = [d.strip().lower() for d in text.split(',')]
    # Normalisasi nama penyakit
    normalized_diseases = [disease_normalization.get(d, d) for d in diseases]
    return normalized_diseases


In [None]:
import stanza

# Download model Bahasa Indonesia
stanza.download('id')

# Inisialisasi pipeline Stanza untuk NER
nlp = stanza.Pipeline('id', processors='tokenize,ner')


NameError: name 'search_term' is not defined

In [None]:
def extract_diseases(text):
    doc = nlp(text)
    diseases = [ent.text.strip().lower() for ent in doc.entities if ent.type in ['DISEASE', 'SYMPTOM', 'CONDITION']]
    if not diseases:
        # Fallback: gunakan pemisahan berbasis kata kunci
        keywords = ["batuk", "pilek", "demam", "sakit kepala", "pusing", "asma", "diare", "tipes", "maag"]
        diseases = [keyword for keyword in keywords if keyword in text.lower()]
    return diseases


In [None]:
# Tambahkan dictionary untuk normalisasi penyakit
disease_normalization = {
    "demam": "demam",
    "fever": "demam",
    "sakit kepala": "sakit kepala",
    "kepala pusing": "sakit kepala",
    "maag": "maag",
    # Tambahkan sinonim lainnya sesuai kebutuhan
}

def extract_diseases(text):
    doc = nlp(text)
    diseases = [ent.text.strip().lower() for ent in doc.entities if ent.type in ['DISEASE', 'SYMPTOM', 'CONDITION']]
    if not diseases:
        # Fallback: gunakan pemisahan berbasis kata kunci
        keywords = ["batuk", "pilek", "demam", "sakit kepala", "pusing", "asma", "diare", "tipes", "maag"]
        diseases = [keyword for keyword in keywords if keyword in text.lower()]
    # Normalisasi nama penyakit
    normalized_diseases = [disease_normalization.get(d, d) for d in diseases]
    return normalized_diseases


In [8]:
# xlsx_to_kg.py

from py2neo import Graph, Node, Relationship
import pandas as pd
import spacy

# Load spaCy model for NER (gunakan model multibahasa)
nlp = spacy.load("xx_ent_wiki_sm")

# Function to extract diseases using NER
def extract_diseases(text):
    doc = nlp(text)
    diseases = [ent.text for ent in doc.ents if ent.label_ in ['DISEASE', 'CONDITION', 'SYMPTOM']]  # Sesuaikan label sesuai model
    return diseases

# Load the dataset from your Excel file
file_path = r"C:\Users\acer\Documents\KULIAHHH\5. smt 5\Natural Language Processing\kg_durgs\data_obat_uas.xlsx"
data = pd.read_excel(file_path)

# Connect to Neo4j
# Ganti 'neo4j' dan 'Attaqy81' dengan kredensial Neo4j Anda
graph = Graph("bolt://localhost:7687", auth=("neo4j", "Attaqy81"))

# Iterate through each row in the dataset and create nodes and relationships
for index, row in data.iterrows():
    # Create the main drug node with dosage and price as properties
    drug_properties = {
        "name": row["name"]
    }

    if pd.notna(row["price"]):
        drug_properties["price"] = row["price"]

    if pd.notna(row["Dosis"]):
        drug_properties["dosage"] = row["Dosis"]

    drug_node = Node("Drug", **drug_properties)
    graph.merge(drug_node, "Drug", "name")

    # Create and link other attributes as separate nodes
    if pd.notna(row["Efek Samping"]):
        side_effects_node = Node("SideEffects", effects=row["Efek Samping"])
        graph.merge(side_effects_node, "SideEffects", "effects")
        graph.merge(Relationship(drug_node, "hasSideEffect", side_effects_node))

    if pd.notna(row["Golongan Produk"]):
        group_node = Node("ProductGroup", group=row["Golongan Produk"])
        graph.merge(group_node, "ProductGroup", "group")
        graph.merge(Relationship(drug_node, "belongsToGroup", group_node))

    if pd.notna(row["Kemasan"]):
        packaging_node = Node("Packaging", description=row["Kemasan"])
        graph.merge(packaging_node, "Packaging", "description")
        graph.merge(Relationship(drug_node, "packagedAs", packaging_node))

    if pd.notna(row["Komposisi"]):
        ingredients_node = Node("Ingredients", content=row["Komposisi"])
        graph.merge(ingredients_node, "Ingredients", "content")
        graph.merge(Relationship(drug_node, "contains", ingredients_node))

    if pd.notna(row["Manufaktur"]):
        manufacturer_node = Node("Manufacturer", name=row["Manufaktur"])
        graph.merge(manufacturer_node, "Manufacturer", "name")
        graph.merge(Relationship(drug_node, "producedBy", manufacturer_node))

    if pd.notna(row["No. Registrasi"]):
        registration_node = Node("RegistrationNumber", number=row["No. Registrasi"])
        graph.merge(registration_node, "RegistrationNumber", "number")
        graph.merge(Relationship(drug_node, "hasRegistrationNumber", registration_node))

    if pd.notna(row["Perhatian"]):
        precautions_node = Node("Precautions", details=row["Perhatian"])
        graph.merge(precautions_node, "Precautions", "details")
        graph.merge(Relationship(drug_node, "requiresPrecaution", precautions_node))

    # Ekstraksi dan pembuatan node penyakit dari indikasi umum
    if pd.notna(row["Indikasi Umum"]):
        diseases = extract_diseases(row["Indikasi Umum"])
        for disease in diseases:
            disease_node = Node("Disease", name=disease.lower())
            graph.merge(disease_node, "Disease", "name")
            graph.merge(Relationship(drug_node, "treats", disease_node))

    print(f"Processed row {index + 1}")

print("Data successfully converted to Neo4j Knowledge Graph.")


Processed row 1
Processed row 2
Processed row 3
Processed row 4
Processed row 5
Processed row 6
Processed row 7
Processed row 8
Processed row 9
Processed row 10
Processed row 11
Processed row 12
Processed row 13
Processed row 14
Processed row 15
Processed row 16
Processed row 17
Processed row 18
Processed row 19
Processed row 20
Processed row 21
Processed row 22
Processed row 23
Processed row 24
Processed row 25
Processed row 26
Processed row 27
Processed row 28
Processed row 29
Processed row 30
Processed row 31
Processed row 32
Processed row 33
Processed row 34
Processed row 35
Processed row 36
Processed row 37
Processed row 38
Processed row 39
Processed row 40
Processed row 41
Processed row 42
Processed row 43
Processed row 44
Processed row 45
Processed row 46
Processed row 47
Processed row 48
Processed row 49
Processed row 50
Processed row 51
Processed row 52
Processed row 53
Processed row 54
Processed row 55
Processed row 56
Processed row 57
Processed row 58
Processed row 59
Proces

In [9]:
import stanza

# Download model Bahasa Indonesia
stanza.download('id')

# Inisialisasi pipeline Stanza untuk NER
nlp = stanza.Pipeline('id', processors='tokenize,ner')


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

2024-11-19 22:51:47 INFO: Downloaded file to C:\Users\acer\stanza_resources\resources.json
2024-11-19 22:51:47 INFO: Downloading default packages for language: id (Indonesian) ...


Downloading https://huggingface.co/stanfordnlp/stanza-id/resolve/v1.9.0/models/default.zip:   0%|          | 0…

2024-11-19 22:54:34 INFO: Downloaded file to C:\Users\acer\stanza_resources\id\default.zip
2024-11-19 22:54:42 INFO: Finished downloading models and saved to C:\Users\acer\stanza_resources
2024-11-19 22:54:42 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

2024-11-19 22:54:44 INFO: Downloaded file to C:\Users\acer\stanza_resources\resources.json
2024-11-19 22:54:44 INFO: Loading these models for language: id (Indonesian):
| Processor | Package |
-----------------------
| tokenize  | gsd     |
| mwt       | gsd     |
| ner       | default |

2024-11-19 22:54:44 INFO: Using device: cpu
2024-11-19 22:54:44 INFO: Loading: tokenize
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-11-19 22:54:50 INFO: Loading: mwt
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-11-19 22:54:50 INFO: Loading: ner
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-11-19 22:54:50 ERROR: Cannot load model from C:\Users\acer\stanza_resources\id\ner\default.pt


UnsupportedProcessorError: Processor ner is not known for language id.  If you have created your own model, please specify the ner_model_path parameter when creating the pipeline.

In [10]:
def extract_diseases(text):
    doc = nlp(text)
    diseases = [ent.text.strip().lower() for ent in doc.ents if ent.type in ['DISEASE', 'CONDITION', 'SYMPTOM']]
    if not diseases:
        # Fallback: gunakan pemisahan berbasis kata kunci
        keywords = ["batuk", "pilek", "demam", "sakit kepala", "pusing", "asma", "diare", "tipes", "maag", "migraine", "migrain"]
        diseases = [keyword for keyword in keywords if keyword in text.lower()]
    # Normalisasi nama penyakit
    normalized_diseases = [disease_normalization.get(d, d) for d in diseases]
    return normalized_diseases


# 20/11/2024

In [None]:
from py2neo import Graph, Node, Relationship
import pandas as pd
import spacy
import re

# Load spaCy model
nlp = spacy.load("xx_ent_wiki_sm")

# Function to normalize disease names
def normalize_disease_name(disease):
    disease = disease.lower().strip()
    disease = re.sub(r'\bsakit\s+', '', disease)
    disease = re.sub(r'\bpenyakit\s+', '', disease)
    return disease

# Function to extract age groups
def extract_age_group(text):
    text = text.lower()
    if any(word in text for word in ['anak', 'bayi', 'pediatrik']):
        return 'anak-anak'
    elif any(word in text for word in ['dewasa', 'adult']):
        return 'dewasa'
    return 'umum'

# Function to parse dosage
def parse_dosage(dosage_text):
    if pd.isna(dosage_text):
        return None
    
    dosage_text = str(dosage_text).lower()
    match = re.search(r'(\d+(?:\.\d+)?)\s*(mg|ml|g)', dosage_text)
    if match:
        value = float(match.group(1))
        unit = match.group(2)
        return {'value': value, 'unit': unit}
    return None

# Function to extract diseases with context
def extract_diseases(text):
    if pd.isna(text):
        return []
    
    text = str(text).lower()
    doc = nlp(text)
    
    common_diseases = [
        'flu', 'demam', 'pusing', 'sakit kepala', 'batuk', 'pilek',
        'asma', 'diare', 'tipes', 'maag', 'migraine', 'migrain',
        'hipertensi', 'diabetes', 'kolesterol tinggi', 'ginjal batu',
        'anemia', 'hiv', 'hepatitis', 'alergi', 'infeksi saluran kemih',
        # Tambahkan penyakit lain sesuai kebutuhan
    ]

    diseases = []
    
    for disease in common_diseases:
        if disease in text:
            diseases.append({
                'name': normalize_disease_name(disease),
                'age_group': extract_age_group(text)
            })
    
    diseases.extend([{
        'name': normalize_disease_name(ent.text),
        'age_group': extract_age_group(text)
    } for ent in doc.ents if ent.label_ in ['DISEASE', 'CONDITION', 'SYMPTOM']])
    
    return diseases

def parse_price_range(price_str):
    """
    Parse price range string in format 'RpX.XXX - RpY.YYY' to (min_price, max_price)
    
    Args:
        price_str (str): Price string in Indonesian Rupiah format
        
    Returns:
        tuple: (min_price, max_price) in float format
    """
    try:
        # Remove 'Rp' and split by '-'
        prices = price_str.replace('Rp', '').split('-')
        
        # Clean and convert each price
        prices = [float(price.strip().replace('.', '')) for price in prices]
        
        # Return as tuple (min_price, max_price)
        if len(prices) == 2:
            return (prices[0], prices[1])
        else:
            return (prices[0], prices[0])  # If single price, return same value for min and max
            
    except (ValueError, AttributeError):
        return (None, None)  # Return None if parsing fails
    
# Connect to Neo4j
graph = Graph("bolt://localhost:7687", auth=("neo4j", "Attaqy81"))

# Load the dataset
file_path = r"C:\Users\acer\.Neo4jDesktop\projects\project-7388dcf2-cc74-42d3-8076-55f11a6046c1\data_obat_uas.xlsx"
data = pd.read_excel(file_path)

# Create constraints (syntax baru untuk Neo4j)
graph.run("CREATE CONSTRAINT drug_name IF NOT EXISTS FOR (d:Drug) REQUIRE d.name IS UNIQUE")
graph.run("CREATE CONSTRAINT disease_name IF NOT EXISTS FOR (d:Disease) REQUIRE d.name IS UNIQUE")

# Iterate through dataset
for index, row in data.iterrows():
    # Create drug node with properties
    drug_properties = {
        "name": row["name"]
    }
    
    # Add price as property using parse_price_range
    if pd.notna(row["price"]):
        min_price, max_price = parse_price_range(row["price"])
        if min_price is not None and max_price is not None:
            drug_properties["min_price"] = min_price
            drug_properties["max_price"] = max_price
    
    # Parse and add dosage as property
    if pd.notna(row["Dosis"]):
        dosage_info = parse_dosage(row["Dosis"])
        if dosage_info:
            drug_properties["dosage_value"] = dosage_info['value']
            drug_properties["dosage_unit"] = dosage_info['unit']
    
    drug_node = Node("Drug", **drug_properties)
    graph.merge(drug_node, "Drug", "name")
    
    # Create relationships for other attributes
    if pd.notna(row["Golongan Produk"]):
        group_node = Node("ProductGroup", group=row["Golongan Produk"])
        graph.merge(group_node, "ProductGroup", "group")
        graph.merge(Relationship(drug_node, "belongsToGroup", group_node))
    
    if pd.notna(row["Komposisi"]):
        ingredients_node = Node("Ingredients", content=row["Komposisi"])
        graph.merge(ingredients_node, "Ingredients", "content")
        graph.merge(Relationship(drug_node, "contains", ingredients_node))
    
    if pd.notna(row["Manufaktur"]):
        manufacturer_node = Node("Manufacturer", name=row["Manufaktur"])
        graph.merge(manufacturer_node, "Manufacturer", "name")
        graph.merge(Relationship(drug_node, "producedBy", manufacturer_node))
    
    # Extract and create disease nodes with context
    if pd.notna(row["Indikasi Umum"]):
        diseases = extract_diseases(row["Indikasi Umum"])
        for disease_info in diseases:
            disease_node = Node("Disease", 
                              name=disease_info['name'],
                              age_group=disease_info['age_group'])
            graph.merge(disease_node, "Disease", "name")
            graph.merge(Relationship(drug_node, "treats", disease_node))

print("Data successfully converted to Neo4j Knowledge Graph.")

Data successfully converted to Neo4j Knowledge Graph.


In [3]:
def parse_price_range(price_str):
    """
    Parse price range string in format 'RpX.XXX - RpY.YYY' to (min_price, max_price)
    
    Args:
        price_str (str): Price string in Indonesian Rupiah format
        
    Returns:
        tuple: (min_price, max_price) in float format
    """
    try:
        # Remove 'Rp' and split by '-'
        prices = price_str.replace('Rp', '').split('-')
        
        # Clean and convert each price
        prices = [float(price.strip().replace('.', '')) for price in prices]
        
        # Return as tuple (min_price, max_price)
        if len(prices) == 2:
            return (prices[0], prices[1])
        else:
            return (prices[0], prices[0])  # If single price, return same value for min and max
            
    except (ValueError, AttributeError):
        return (None, None)  # Return None if parsing fails

# Example usage:
test_prices = [
    "Rp96.500 - Rp168.500",
    "Rp156.800 - Rp164.900",
    "Rp77.100 - Rp104.000"
]

for price in test_prices:
    min_price, max_price = parse_price_range(price)
    print(f"Original: {price}")
    print(f"Parsed: Min = {min_price:,.2f}, Max = {max_price:,.2f}")
    print()

Original: Rp96.500 - Rp168.500
Parsed: Min = 96,500.00, Max = 168,500.00

Original: Rp156.800 - Rp164.900
Parsed: Min = 156,800.00, Max = 164,900.00

Original: Rp77.100 - Rp104.000
Parsed: Min = 77,100.00, Max = 104,000.00



In [None]:
import pandas as pd
from py2neo import Graph
import json

class KGEvaluator:
    def __init__(self, graph_connection, ground_truth_path):
        """
        Inisialisasi evaluator
        graph_connection: koneksi ke Neo4j
        ground_truth_path: path ke file ground truth (JSON/Excel)
        """
        self.graph = graph_connection
        self.ground_truth = self.load_ground_truth(ground_truth_path)
        
    def load_ground_truth(self, path):
        """
        Load ground truth dari file
        Format ground truth:
        {
            "query1": ["expected_result1", "expected_result2"],
            "query2": ["expected_result1", "expected_result2"],
            ...
        }
        """
        if path.endswith('.json'):
            with open(path, 'r', encoding='utf-8') as f:
                return json.load(f)
        elif path.endswith('.xlsx'):
            df = pd.read_excel(path)
            ground_truth = {}
            for _, row in df.iterrows():
                ground_truth[row['query']] = eval(row['expected_results'])
            return ground_truth
        else:
            raise ValueError("Unsupported file format")

    def execute_query(self, cypher_query):
        """
        Eksekusi query Neo4j dan return set hasil
        """
        results = self.graph.run(cypher_query).data()
        # Extract drug names from results
        drug_names = set()
        for result in results:
            # Sesuaikan dengan struktur return dari query Anda
            if 'drug' in result:
                drug_names.add(result['drug']['name'])
            elif 'd' in result:
                drug_names.add(result['d']['name'])
        return drug_names

    def evaluate_query(self, query, cypher_query):
        """
        Evaluasi single query
        """
        if query not in self.ground_truth:
            raise ValueError(f"Query '{query}' tidak ditemukan dalam ground truth")

        expected_results = set(self.ground_truth[query])
        actual_results = self.execute_query(cypher_query)

        exact_match = expected_results == actual_results
        
        # Hitung intersection dan differences untuk analisis
        correct_results = expected_results.intersection(actual_results)
        missed_results = expected_results - actual_results
        extra_results = actual_results - expected_results

        return {
            'exact_match': exact_match,
            'expected_count': len(expected_results),
            'retrieved_count': len(actual_results),
            'correct_count': len(correct_results),
            'correct_results': list(correct_results),
            'missed_results': list(missed_results),
            'extra_results': list(extra_results)
        }

    def evaluate_all(self, query_mappings):
        """
        Evaluasi semua query
        query_mappings: dictionary dengan format
        {
            "query_text": "corresponding_cypher_query"
        }
        """
        results = {}
        total_exact_matches = 0

        for query_text, cypher_query in query_mappings.items():
            try:
                eval_result = self.evaluate_query(query_text, cypher_query)
                results[query_text] = eval_result
                if eval_result['exact_match']:
                    total_exact_matches += 1
            except Exception as e:
                print(f"Error evaluating query '{query_text}': {str(e)}")
                continue

        # Calculate overall accuracy
        accuracy = total_exact_matches / len(query_mappings) if query_mappings else 0

        return {
            'query_results': results,
            'total_queries': len(query_mappings),
            'total_exact_matches': total_exact_matches,
            'accuracy': accuracy
        }

    def print_evaluation_results(self, results):
        """
        Print hasil evaluasi dalam format yang mudah dibaca
        """
        print("\n=== EVALUATION RESULTS ===")
        print(f"Total Queries: {results['total_queries']}")
        print(f"Exact Matches: {results['total_exact_matches']}")
        print(f"Overall Accuracy: {results['accuracy']:.2%}")
        
        print("\nDetailed Results:")
        for query, result in results['query_results'].items():
            print(f"\nQuery: {query}")
            print(f"Exact Match: {'✓' if result['exact_match'] else '✗'}")
            print(f"Expected Count: {result['expected_count']}")
            print(f"Retrieved Count: {result['retrieved_count']}")
            print(f"Correct Count: {result['correct_count']}")
            
            if result['missed_results']:
                print("Missed Results:", result['missed_results'])
            if result['extra_results']:
                print("Extra Results:", result['extra_results'])

# Contoh penggunaan:
if __name__ == "__main__":
    # Koneksi ke Neo4j
    graph = Graph("bolt://localhost:7687", auth=("neo4j", "password"))
    
    # Path ke file ground truth
    ground_truth_path = "ground_truth.json"
    
    # Inisialisasi evaluator
    evaluator = KGEvaluator(graph, ground_truth_path)
    
    # Contoh query mappings
    query_mappings = {
        "obat untuk batuk": """
        MATCH (d:Drug)-[:treats]->(disease:Disease)
        WHERE disease.name = 'batuk'
        RETURN d
        """,
        "obat flu untuk anak": """
        MATCH (d:Drug)-[:treats]->(disease:Disease)
        WHERE disease.name = 'flu' 
        AND disease.age_group = 'anak-anak'
        RETURN d
        """
    }
    
    # Evaluasi
    results = evaluator.evaluate_all(query_mappings)
    
    # Print hasil
    evaluator.print_evaluation_results(results)

In [1]:
import stanza

stanza.download('id')


  _torch_pytree._register_pytree_node(


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

2024-11-20 13:26:28 INFO: Downloaded file to C:\Users\acer\stanza_resources\resources.json
2024-11-20 13:26:28 INFO: Downloading default packages for language: id (Indonesian) ...
2024-11-20 13:26:29 INFO: File exists: C:\Users\acer\stanza_resources\id\default.zip
2024-11-20 13:26:32 INFO: Finished downloading models and saved to C:\Users\acer\stanza_resources


In [4]:
import spacy
from spacy.training import Example

# Inisialisasi spaCy
nlp = spacy.blank("id")  # Membuat pipeline baru untuk Bahasa Indonesia

# Tambahkan komponen NER
ner = nlp.add_pipe("ner")

# Tambahkan label entitas
ner.add_label("DISEASE")
ner.add_label("CONDITION")
ner.add_label("SYMPTOM")

# Siapkan data
TRAIN_DATA = [
    ("Sakit kepala adalah kondisi umum.", {"entities": [(0, 12, "SYMPTOM")]}),
    # Tambahkan data lain sesuai kebutuhan
]

# Mulai proses pelatihan
optimizer = nlp.begin_training()
for itn in range(10):
    for text, annotations in TRAIN_DATA:
        example = Example.from_dict(nlp.make_doc(text), annotations)
        nlp.update([example], drop=0.5, sgd=optimizer)


  _torch_pytree._register_pytree_node(


In [5]:
pip install spacy-transformers


Note: you may need to restart the kernel to use updated packages.


# RUN INI

In [None]:
# xlsx_to_kg.py

from py2neo import Graph, Node, Relationship
import pandas as pd
import re
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Inisialisasi tokenizer dan model untuk NER
tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")
model = AutoModelForTokenClassification.from_pretrained("indobenchmark/indobert-base-p1")
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Fungsi untuk normalisasi nama penyakit
def normalize_disease_name(disease):
    disease = disease.lower().strip()
    disease = re.sub(r'\bsakit\s+', '', disease)
    disease = re.sub(r'\bpenyakit\s+', '', disease)
    return disease

# Fungsi untuk menyelesaikan entitas (entity resolution)
def resolve_entity(name):
    disease_mapping = {
        "migrain": "migraine",
        "kepala pusing": "sakit kepala",
        "sakit kepala": "sakit kepala",
        # Tambahkan mapping lain sesuai kebutuhan
    }
    return disease_mapping.get(name, name)

# Fungsi untuk mengekstrak kelompok usia
def extract_age_group(text):
    text = text.lower()
    if any(word in text for word in ['anak', 'bayi', 'pediatrik']):
        return 'anak-anak'
    elif any(word in text for word in ['dewasa', 'adult']):
        return 'dewasa'
    return 'umum'

# Fungsi untuk memparsing dosis
def parse_dosage(dosage_text):
    if pd.isna(dosage_text):
        return None
    
    dosage_text = str(dosage_text).lower()
    match = re.search(r'(\d+(?:\.\d+)?)\s*(mg|ml|g)', dosage_text)
    if match:
        value = float(match.group(1))
        unit = match.group(2)
        return {'value': value, 'unit': unit}
    return None

# Fungsi untuk mengekstrak penyakit dengan konteks menggunakan Transformers
# Tambahkan daftar kata kunci penyakit yang lebih luas
common_diseases = [
    'flu', 'demam', 'pusing', 'sakit kepala', 'batuk', 'pilek',
    'asma', 'diare', 'tipes', 'maag', 'migraine', 'migrain',
    'hipertensi', 'diabetes', 'kolesterol tinggi', 'ginjal batu',
    'anemia', 'hiv', 'hepatitis', 'alergi', 'infeksi saluran kemih',
    # Tambahkan penyakit lain sesuai kebutuhan
]


additional_disease_keywords = [
    'hipertensi', 'diabetes', 'kolesterol tinggi', 'ginjal batu',
    'anemia', 'hiv', 'hepatitis', 'alergi', 'infeksi saluran kemih',
    # Tambahkan kata kunci lainnya
]

def extract_diseases_from_uses(text):
    if pd.isna(text):
        return []
    
    text = str(text).lower()
    diseases = []
    
    # Contoh pola yang bisa ditangkap, sesuaikan dengan data Anda
    disease_patterns = [
        r'mengobati ([\w\s]+)', 
        r'untuk ([\w\s]+)', 
        r'digunakan untuk ([\w\s]+)'
    ]
    
    for pattern in disease_patterns:
        matches = re.findall(pattern, text)
        for match in matches:
            disease = resolve_entity(normalize_disease_name(match.strip()))
            diseases.append({
                'name': disease,
                'age_group': extract_age_group(text)
            })
    
    return diseases

def extract_diseases(text):
    if pd.isna(text):
        return []
    
    text = str(text).lower()
    entities = ner_pipeline(text)
    
    diseases = []
    
    # Ekstraksi dari common_diseases
    for disease in common_diseases:
        if disease in text:
            resolved_name = resolve_entity(normalize_disease_name(disease))
            diseases.append({
                'name': resolved_name,
                'age_group': extract_age_group(text)
            })
    
    # Ekstraksi menggunakan NER
    for ent in entities:
        if ent['entity_group'] in ['DISEASE', 'CONDITION', 'SYMPTOM']:
            normalized_name = normalize_disease_name(ent['word'])
            resolved_name = resolve_entity(normalized_name)
            diseases.append({
                'name': resolved_name,
                'age_group': extract_age_group(text)
            })
    
    # Ekstraksi tambahan menggunakan keyword matching
    for keyword in additional_disease_keywords:
        if keyword in text and keyword not in [d['name'] for d in diseases]:
            resolved_name = resolve_entity(normalize_disease_name(keyword))
            diseases.append({
                'name': resolved_name,
                'age_group': extract_age_group(text)
            })
    
    # Ekstraksi menggunakan regex
    regex_diseases = extract_diseases_from_uses(text)
    for disease_info in regex_diseases:
        if disease_info['name'] not in [d['name'] for d in diseases]:
            diseases.append(disease_info)
    
    return diseases

# Fungsi untuk memparsing rentang harga
def parse_price_range(price_str):
    try:
        # Remove 'Rp' and split by '-'
        prices = price_str.replace('Rp', '').split('-')
        
        # Clean and convert each price
        prices = [float(price.strip().replace('.', '')) for price in prices]
        
        # Return as tuple (min_price, max_price)
        if len(prices) == 2:
            return (prices[0], prices[1])
        else:
            return (prices[0], prices[0])  # If single price, return same value for min and max
                
    except (ValueError, AttributeError):
        return (None, None)  # Return None if parsing fails

# Koneksi ke Neo4j
graph = Graph("bolt://localhost:7687", auth=("neo4j", "Attaqy81"))

# Load dataset
file_path = r"C:\Users\acer\.Neo4jDesktop\projects\project-7388dcf2-cc74-42d3-8076-55f11a6046c1\data_obat_uas.xlsx"
data = pd.read_excel(file_path)

# Buat constraints
graph.run("CREATE CONSTRAINT IF NOT EXISTS FOR (d:Drug) REQUIRE d.name IS UNIQUE")
graph.run("CREATE CONSTRAINT IF NOT EXISTS FOR (d:Disease) REQUIRE d.name IS UNIQUE")

# Iterasi melalui dataset
for index, row in data.iterrows():
    # Buat node Drug dengan properti
    drug_properties = {
        "name": row["name"]
    }
    
    # Tambahkan harga sebagai properti
    if pd.notna(row["price"]):
        min_price, max_price = parse_price_range(row["price"])
        if min_price is not None and max_price is not None:
            drug_properties["min_price"] = min_price
            drug_properties["max_price"] = max_price
    
    # Parsing dan tambahkan dosis sebagai properti
    if pd.notna(row["Dosis"]):
        dosage_info = parse_dosage(row["Dosis"])
        if dosage_info:
            drug_properties["dosage_value"] = dosage_info['value']
            drug_properties["dosage_unit"] = dosage_info['unit']
    
    # Tambahkan properti tambahan sesuai Schema.org
    drug_properties["activeIngredient"] = row.get("Komposisi", None)
    drug_properties["dosageForm"] = row.get("Aturan Pakai", None)
    drug_properties["drugClass"] = row.get("Golongan Produk", None)
    drug_properties["manufacturer"] = row.get("Manufaktur", None)
    drug_properties["prescriptionStatus"] = "Prescription" if "obat keras" in str(row.get("Golongan Produk", "")).lower() else "Over-the-counter"
    drug_properties["legalStatus"] = row.get("Golongan Produk", None)
    drug_properties["description"] = row.get("uses", None)
    
    drug_node = Node("Drug", **drug_properties)
    graph.merge(drug_node, "Drug", "name")
    
    # Buat relationships untuk atribut lain
    if pd.notna(row["Golongan Produk"]):
        group_node = Node("ProductGroup", group=row["Golongan Produk"])
        graph.merge(group_node, "ProductGroup", "group")
        graph.merge(Relationship(drug_node, "belongsToGroup", group_node))
    
    if pd.notna(row["Komposisi"]):
        ingredients_node = Node("Ingredients", content=row["Komposisi"])
        graph.merge(ingredients_node, "Ingredients", "content")
        graph.merge(Relationship(drug_node, "contains", ingredients_node))
    
    if pd.notna(row["Manufaktur"]):
        manufacturer_node = Node("Manufacturer", name=row["Manufaktur"])
        graph.merge(manufacturer_node, "Manufacturer", "name")
        graph.merge(Relationship(drug_node, "producedBy", manufacturer_node))
    
    # Ekstrak dan buat node penyakit dengan konteks
    if pd.notna(row["Indikasi Umum"]):
        diseases = extract_diseases(row["Indikasi Umum"])
        for disease_info in diseases:
            disease_node = Node("Disease", 
                                name=disease_info['name'],
                                age_group=disease_info['age_group'])
            graph.merge(disease_node, "Disease", "name")
            graph.merge(Relationship(drug_node, "treats", disease_node))
    
    # Tambahkan hubungan interaksi obat jika ada
    if pd.notna(row["Kontra Indikasi"]):
        interacting_drug_names = row["Kontra Indikasi"].split(',')
        for interacting_drug_name in interacting_drug_names:
            interacting_drug_name = interacting_drug_name.strip()
            interacting_drug_node = Node("Drug", name=interacting_drug_name)
            graph.merge(interacting_drug_node, "Drug", "name")
            graph.merge(Relationship(drug_node, "interactsWith", interacting_drug_node))
    
    # Tambahkan hubungan untuk efek samping
    if pd.notna(row["Efek Samping"]):
        side_effects = row["Efek Samping"].split(',')
        for effect in side_effects:
            effect = effect.strip()
            side_effect_node = Node("SideEffect", effect=effect)
            graph.merge(side_effect_node, "SideEffect", "effect")
            graph.merge(Relationship(drug_node, "hasSideEffect", side_effect_node))
    
    # Tambahkan hubungan untuk perhatian
    if pd.notna(row["Perhatian"]):
        precaution_node = Node("Precaution", details=row["Perhatian"])
        graph.merge(precaution_node, "Precaution", "details")
        graph.merge(Relationship(drug_node, "requiresPrecaution", precaution_node))

print("Data berhasil dikonversi ke Neo4j Knowledge Graph.")

Some weights of BertForTokenClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [1]:
# evaluate_ir.py

import pandas as pd
from py2neo import Graph
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from rapidfuzz import process, fuzz
import re

# Inisialisasi tokenizer dan model untuk NER
tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")
model = AutoModelForTokenClassification.from_pretrained("indobenchmark/indobert-base-p1")
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Fungsi untuk normalisasi nama penyakit
def normalize_disease_name(disease):
    disease = disease.lower().strip()
    disease = re.sub(r'\bsakit\s+', '', disease)
    disease = re.sub(r'\bpenyakit\s+', '', disease)
    return disease

# Fungsi untuk menyelesaikan entitas (entity resolution)
def resolve_entity(name):
    disease_mapping = {
        "migrain": "migraine",
        "kepala pusing": "sakit kepala",
        "sakit kepala": "sakit kepala",
        # Tambahkan mapping lain sesuai kebutuhan
    }
    return disease_mapping.get(name, name)

# Fungsi untuk mengekstrak kelompok usia
def extract_age_group(text):
    text = text.lower()
    if any(word in text for word in ['anak', 'bayi', 'pediatrik']):
        return 'anak-anak'
    elif any(word in text for word in ['dewasa', 'adult']):
        return 'dewasa'
    return 'umum'

# Fungsi untuk mengekstrak penyakit dengan konteks menggunakan Transformers
common_diseases = [
    'flu', 'demam', 'pusing', 'sakit kepala', 'batuk', 'pilek',
    'asma', 'diare', 'tipes', 'maag', 'migraine', 'migrain',
    'hipertensi', 'diabetes', 'kolesterol tinggi', 'ginjal batu',
    'anemia', 'hiv', 'hepatitis', 'alergi', 'infeksi saluran kemih',
    # Tambahkan penyakit lain sesuai kebutuhan
]

additional_disease_keywords = [
    'hipertensi', 'diabetes', 'kolesterol tinggi', 'ginjal batu',
    'anemia', 'hiv', 'hepatitis', 'alergi', 'infeksi saluran kemih',
    'stroke', 'kanker', 'asthma', 'menular', 'tidak stabil',
    # Tambahkan kata kunci lainnya
]

def extract_diseases_from_uses(text):
    if pd.isna(text):
        return []
    
    text = str(text).lower()
    diseases = []
    
    # Contoh pola yang bisa ditangkap, sesuaikan dengan data Anda
    disease_patterns = [
        r'mengobati ([\w\s]+)', 
        r'untuk ([\w\s]+)', 
        r'digunakan untuk ([\w\s]+)'
    ]
    
    for pattern in disease_patterns:
        matches = re.findall(pattern, text)
        for match in matches:
            disease = resolve_entity(normalize_disease_name(match.strip()))
            diseases.append({
                'name': disease,
                'age_group': extract_age_group(text)
            })
    
    return diseases

def extract_diseases(text):
    if pd.isna(text):
        return []
    
    text = str(text).lower()
    entities = ner_pipeline(text)
    
    diseases = []
    
    # Ekstraksi dari common_diseases
    for disease in common_diseases:
        if disease in text:
            resolved_name = resolve_entity(normalize_disease_name(disease))
            diseases.append({
                'name': resolved_name,
                'age_group': extract_age_group(text)
            })
    
    # Ekstraksi menggunakan NER
    for ent in entities:
        if ent['entity_group'] in ['DISEASE', 'CONDITION', 'SYMPTOM']:
            normalized_name = normalize_disease_name(ent['word'])
            resolved_name = resolve_entity(normalized_name)
            diseases.append({
                'name': resolved_name,
                'age_group': extract_age_group(text)
            })
    
    # Ekstraksi tambahan menggunakan keyword matching
    for keyword in additional_disease_keywords:
        if keyword in text and keyword not in [d['name'] for d in diseases]:
            resolved_name = resolve_entity(normalize_disease_name(keyword))
            diseases.append({
                'name': resolved_name,
                'age_group': extract_age_group(text)
            })
    
    # Ekstraksi menggunakan regex
    regex_diseases = extract_diseases_from_uses(text)
    for disease_info in regex_diseases:
        if disease_info['name'] not in [d['name'] for d in diseases]:
            diseases.append(disease_info)
    
    # Logging untuk debugging
    print(f"Ekstraksi penyakit dari teks: {text}")
    print(f"Penyakit yang terdeteksi: {[d['name'] for d in diseases]}")
    
    return diseases

# Koneksi ke Neo4j (hanya baca)
graph = Graph("bolt://localhost:7687", auth=("neo4j", "Attaqy81"), secure=False)

# Fungsi untuk mencari obat berdasarkan penyakit
def search_drugs_by_disease(disease):
    query = """
    MATCH (d:Drug)-[:treats]->(dis:Disease {name: $disease})
    RETURN d.name AS drug_name
    """
    result = graph.run(query, disease=disease)
    return [record['drug_name'] for record in result]

# Fungsi utama untuk mencari berdasarkan query pengguna
def search_query(query):
    diseases = extract_diseases(query)
    if diseases:
        # Pencarian obat berdasarkan penyakit
        drugs = set()
        for disease in diseases:
            matched_drugs = search_drugs_by_disease(disease['name'])
            drugs.update(matched_drugs)
        
        if drugs:
            return list(drugs)
        else:
            return []
    else:
        # Pencarian berdasarkan kata kunci seperti harga, komposisi, efek samping
        if "harga" in query.lower():
            # Ekstrak nama obat dari query
            all_drug_names = get_all_drug_names()
            matched_drug, score, _ = process.extractOne(
                query, all_drug_names, scorer=fuzz.WRatio
            )
            
            if score >= 80:
                return get_drug_price(matched_drug)
            else:
                return []
        elif "komposisi" in query.lower():
            # Ekstrak nama obat dari query
            all_drug_names = get_all_drug_names()
            matched_drug, score, _ = process.extractOne(
                query, all_drug_names, scorer=fuzz.WRatio
            )
            
            if score >= 80:
                return get_drug_composition(matched_drug)
            else:
                return []
        elif "efek samping" in query.lower():
            # Ekstrak nama obat dari query
            all_drug_names = get_all_drug_names()
            matched_drug, score, _ = process.extractOne(
                query, all_drug_names, scorer=fuzz.WRatio
            )
            
            if score >= 80:
                return get_drug_side_effects(matched_drug)
            else:
                return []
        elif "manufaktur" in query.lower():
            # Ekstrak nama obat dari query
            all_drug_names = get_all_drug_names()
            matched_drug, score, _ = process.extractOne(
                query, all_drug_names, scorer=fuzz.WRatio
            )
            
            if score >= 80:
                return get_drug_manufacturer(matched_drug)
            else:
                return []
        elif "perhatian" in query.lower():
            # Ekstrak nama obat dari query
            all_drug_names = get_all_drug_names()
            matched_drug, score, _ = process.extractOne(
                query, all_drug_names, scorer=fuzz.WRatio
            )
            
            if score >= 80:
                return get_drug_precautions(matched_drug)
            else:
                return []
        else:
            return []

# Fungsi untuk mendapatkan semua nama obat untuk fuzzy matching dan auto-complete
def get_all_drug_names():
    query = "MATCH (d:Drug) RETURN d.name AS name"
    result = graph.run(query)
    return [record['name'] for record in result]

# Fungsi untuk mendapatkan atribut spesifik (hanya baca)
def get_drug_price(drug_name):
    query = """
    MATCH (d:Drug {name: $drug_name})
    RETURN d.min_price AS MinPrice, d.max_price AS MaxPrice
    """
    result = graph.run(query, drug_name=drug_name)
    df = pd.DataFrame([record for record in result])
    return df[['MinPrice', 'MaxPrice']].values.tolist()

def get_drug_composition(drug_name):
    query = """
    MATCH (d:Drug {name: $drug_name})-[:contains]->(n:Ingredients)
    RETURN n.content AS Composition
    """
    result = graph.run(query, drug_name=drug_name)
    df = pd.DataFrame([record for record in result])
    return df['Composition'].tolist()

def get_drug_side_effects(drug_name):
    query = """
    MATCH (d:Drug {name: $drug_name})-[:hasSideEffect]->(n:SideEffect)
    RETURN n.effect AS SideEffects
    """
    result = graph.run(query, drug_name=drug_name)
    df = pd.DataFrame([record for record in result])
    return df['SideEffects'].tolist()

def get_drug_manufacturer(drug_name):
    query = """
    MATCH (d:Drug {name: $drug_name})-[:producedBy]->(n:Manufacturer)
    RETURN n.name AS Manufacturer
    """
    result = graph.run(query, drug_name=drug_name)
    df = pd.DataFrame([record for record in result])
    return df['Manufacturer'].tolist()

def get_drug_precautions(drug_name):
    query = """
    MATCH (d:Drug {name: $drug_name})-[:requiresPrecaution]->(n:Precaution)
    RETURN n.details AS Precautions
    """
    result = graph.run(query, drug_name=drug_name)
    df = pd.DataFrame([record for record in result])
    return df['Precautions'].tolist()

def get_drug_description(drug_name):
    query = """
    MATCH (d:Drug {name: $drug_name})
    RETURN d.description AS Description
    """
    result = graph.run(query, drug_name=drug_name)
    df = pd.DataFrame([record for record in result])
    return df['Description'].tolist()

# Fungsi untuk mengumpulkan hasil sistem berdasarkan ground truth
def get_system_results(ground_truth_df):
    system_results = {}
    for index, row in ground_truth_df.iterrows():
        query = row['Query']
        retrieved = search_query(query)
        system_results[query] = retrieved
    return system_results

# Fungsi untuk memuat ground truth
def load_ground_truth(file_path):
    return pd.read_excel(file_path)

# Fungsi untuk menghitung metrik evaluasi
def evaluate_system(ground_truth_df, system_results):
    precision_scores = []
    recall_scores = []
    f1_scores = []
    
    for index, row in ground_truth_df.iterrows():
        query = row['Query']
        expected = set([drug.strip() for drug in row['Expected_Drugs'].split(',')])
        retrieved = set(system_results.get(query, []))
        
        true_positives = len(expected & retrieved)
        precision = true_positives / len(retrieved) if retrieved else 0
        recall = true_positives / len(expected) if expected else 0
        f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) else 0
        
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)
        
        print(f"Query: {query}")
        print(f"Expected: {expected}")
        print(f"Retrieved: {retrieved}")
        print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1-Score: {f1:.2f}\n")
    
    evaluation_metrics = {
        'Precision': precision_scores,
        'Recall': recall_scores,
        'F1-Score': f1_scores
    }
    
    evaluation_df = pd.DataFrame(evaluation_metrics)
    print("Average Precision:", evaluation_df['Precision'].mean())
    print("Average Recall:", evaluation_df['Recall'].mean())
    print("Average F1-Score:", evaluation_df['F1-Score'].mean())
    
    return evaluation_df

# Fungsi untuk menyimpan hasil evaluasi
def save_evaluation_results(evaluation_df, output_path='evaluation_results.xlsx'):
    evaluation_df.to_excel(output_path, index=False)
    print(f"Evaluasi disimpan di {output_path}")

# Main function untuk menjalankan evaluasi
def main():
    # Path ke ground truth
    ground_truth_path = 'ground_truth.xlsx'  # Ganti dengan path yang sesuai
    
    # Memuat ground truth
    ground_truth_df = load_ground_truth(ground_truth_path)
    
    # Mengumpulkan hasil sistem
    system_results = get_system_results(ground_truth_df)
    
    # Melakukan evaluasi
    evaluation_df = evaluate_system(ground_truth_df, system_results)
    
    # Menyimpan hasil evaluasi
    save_evaluation_results(evaluation_df)

if __name__ == "__main__":
    main()


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  return torch.load(checkpoint_file, map_location=map_location)
Some weights of BertForTokenClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


FileNotFoundError: [Errno 2] No such file or directory: 'ground_truth.xlsx'

# SEBELUM INI

In [17]:
def evaluate_exact_match(retrieved_results, ground_truth):
    """
    Compare retrieved results with ground truth.
    
    Parameters:
    - retrieved_results: List of drugs retrieved from Neo4j.
    - ground_truth: List of correct drugs expected to be returned.
    
    Returns:
    - accuracy: Exact match accuracy percentage.
    """
    # Convert results to sets to ignore order
    retrieved_set = set(retrieved_results)
    ground_truth_set = set(ground_truth)
    
    # Calculate how many retrieved results match the ground truth
    matches = retrieved_set.intersection(ground_truth_set)
    accuracy = len(matches) / len(ground_truth_set) if ground_truth_set else 0
    
    print(f"Exact Match Accuracy: {accuracy * 100:.2f}%")
    print(f"Matches: {matches}")
    print(f"Retrieved: {retrieved_set}")
    print(f"Ground Truth: {ground_truth_set}")
    
    return accuracy

def search_drugs_by_disease(disease):
    """
    Search for drugs that treat a given disease in Neo4j.
    """
    query = """
    MATCH (d:Drug)-[:treats]->(dis:Disease {name: $disease})
    RETURN d.name AS drug_name
    """
    result = graph.run(query, disease=disease)
    drugs = [record['drug_name'] for record in result]
    return drugs

def generate_ground_truth():
    query = """
    MATCH (d:Disease)<-[:treats]-(drug:Drug)
    RETURN d.name AS disease, collect(drug.name) AS drugs
    """
    result = graph.run(query)
    ground_truth = {record['disease']: record['drugs'] for record in result}
    return ground_truth

# Example usage
ground_truth = generate_ground_truth()
print(ground_truth)


{'kalangan medis': ['sildenafil citrate 50 mg 4 tablet', 'viagra 50 mg 1 tablet', 'ericfil tablet 100 mg', 'cialis tablet 10 mg', 'ericfil 50 mg odf', 'sildenafil citrate 100 mg 4 tablet', 'cialis 5 mg 14 tablet', 'rozgra 50 mg tablet', 'viagra 100 mg 1 tablet', 'cialis tablet 20 mg', 'topgra 100 mg 1 tablet', 'ciastar yellow 10 mg odf', 'gramax 100 mg tablet', 'levitra tablet 20 mg', 'promel 20 mg 4 tablet', 'rozgra 100 mg tablet', 'viastar blue 100 mg odf', 'viastar blue 50 mg odf', 'bifido 50 mg 1 sachet', 'caliberi 20 mg odf', 'ciastar yellow 20 mg odf', 'microgest 100 mg 15 kapsul', 'microgest 200 mg 15 kapsul', 'duphaston 10 mg 20 tablet', 'profertil 50 mg 10 tablet', 'cygest suppositoria 400mg', 'femaplex 2.5 mg 10 tablet', 'dipthen 50 mg 10 tablet', 'utrogestan 100 mg 15 kapsul', 'betadine vaginal douche 10% 100 ml plus dengan alat', 'lasal expectorant sirup 100 ml', 'salbutamol 2 mg 10 tablet', 'ataroc sirup 60 ml', 'lasal sirup 100 ml', 'salbutamol 4 mg 10 tablet', 'teosal 10

In [20]:
retrieved_results = search_drugs_by_disease('kalangan medis')
print("Retrieved Results:", retrieved_results)


Retrieved Results: ['sildenafil citrate 50 mg 4 tablet', 'viagra 50 mg 1 tablet', 'ericfil tablet 100 mg', 'cialis tablet 10 mg', 'ericfil 50 mg odf', 'sildenafil citrate 100 mg 4 tablet', 'cialis 5 mg 14 tablet', 'rozgra 50 mg tablet', 'viagra 100 mg 1 tablet', 'cialis tablet 20 mg', 'topgra 100 mg 1 tablet', 'ciastar yellow 10 mg odf', 'gramax 100 mg tablet', 'levitra tablet 20 mg', 'promel 20 mg 4 tablet', 'rozgra 100 mg tablet', 'viastar blue 100 mg odf', 'viastar blue 50 mg odf', 'bifido 50 mg 1 sachet', 'caliberi 20 mg odf', 'ciastar yellow 20 mg odf', 'microgest 100 mg 15 kapsul', 'microgest 200 mg 15 kapsul', 'duphaston 10 mg 20 tablet', 'profertil 50 mg 10 tablet', 'cygest suppositoria 400mg', 'femaplex 2.5 mg 10 tablet', 'dipthen 50 mg 10 tablet', 'utrogestan 100 mg 15 kapsul', 'betadine vaginal douche 10% 100 ml plus dengan alat', 'lasal expectorant sirup 100 ml', 'salbutamol 2 mg 10 tablet', 'ataroc sirup 60 ml', 'lasal sirup 100 ml', 'salbutamol 4 mg 10 tablet', 'teosal 10

In [21]:
print("Ground Truth:", ground_truth['kalangan medis'])


Ground Truth: ['sildenafil citrate 50 mg 4 tablet', 'viagra 50 mg 1 tablet', 'ericfil tablet 100 mg', 'cialis tablet 10 mg', 'ericfil 50 mg odf', 'sildenafil citrate 100 mg 4 tablet', 'cialis 5 mg 14 tablet', 'rozgra 50 mg tablet', 'viagra 100 mg 1 tablet', 'cialis tablet 20 mg', 'topgra 100 mg 1 tablet', 'ciastar yellow 10 mg odf', 'gramax 100 mg tablet', 'levitra tablet 20 mg', 'promel 20 mg 4 tablet', 'rozgra 100 mg tablet', 'viastar blue 100 mg odf', 'viastar blue 50 mg odf', 'bifido 50 mg 1 sachet', 'caliberi 20 mg odf', 'ciastar yellow 20 mg odf', 'microgest 100 mg 15 kapsul', 'microgest 200 mg 15 kapsul', 'duphaston 10 mg 20 tablet', 'profertil 50 mg 10 tablet', 'cygest suppositoria 400mg', 'femaplex 2.5 mg 10 tablet', 'dipthen 50 mg 10 tablet', 'utrogestan 100 mg 15 kapsul', 'betadine vaginal douche 10% 100 ml plus dengan alat', 'lasal expectorant sirup 100 ml', 'salbutamol 2 mg 10 tablet', 'ataroc sirup 60 ml', 'lasal sirup 100 ml', 'salbutamol 4 mg 10 tablet', 'teosal 10 tabl

In [22]:
def normalize_names(names):
    return {name.lower().strip() for name in names}

# Normalize both sets
retrieved_set = normalize_names(retrieved_results)
ground_truth_set = normalize_names(ground_truth['kalangan medis'])

# Recalculate accuracy
matches = retrieved_set.intersection(ground_truth_set)
accuracy = len(matches) / len(ground_truth_set) if ground_truth_set else 0
print(f"Exact Match Accuracy: {accuracy * 100:.2f}%")


Exact Match Accuracy: 100.00%


In [23]:
matches = retrieved_set.intersection(ground_truth_set)
accuracy = len(matches) / len(ground_truth_set) if ground_truth_set else 0
print(f"Exact Match Accuracy: {accuracy * 100:.2f}%")
print(f"Matches: {matches}")


Exact Match Accuracy: 100.00%
Matches: {'taxime 200 mg 10 kapsul', 'betason cream 5 gr', 'neuralgin rx 10 kaplet', 'regit 5 mg/5 ml sirup 60 ml', 'yusimox dry sirup 125 mg/5 ml 60 ml', 'l-vit d3 5000 10 tablet', 'cetirizine 10 mg 10 tablet', 'ozen drops 12 ml', 'rhinos sr 10 kapsul', 'alerzin 10 mg 10 tablet', 'rhinofed 10 tablet', 'canesten sd tablet vaginal 500 mg', 'nebacetin salep 5 g', 'dazolin n ovula', 'inbacef 100 mg dry syrup 30 ml', 'alcet sirup 60 ml', 'isprinol 500 mg 4 tablet', 'betason-n cream 5 g', 'vomizole 40 mg 10 tablet', 'sanprima suspensi 60 ml', 'favikal 200 mg 10 tablet', 'cendo mycos eye ointment 3.5 g', 'itraconazole 100 mg 10 kapsul', 'zyloric 100 mg 10 tablet', 'diprosta cream 5 g', 'narfoz tablet 8 mg', 'biatron 500 mg 10 tablet', 'natureline vitamin d3 1000 ui 60 tablet', 'ezol 20 mg 7 tablet', 'transpulmin sirup 100 ml', 'ponstan 500 mg 10 tablet', 'etoricoxib 60 mg 10 tablet', 'clindamycin 150 mg 10 kapsul', 'profat 500 mg/5 ml suspensi 200 ml', 'flutrop 

In [24]:
print("Normalized Retrieved Results:", retrieved_set)
print("Normalized Ground Truth:", ground_truth_set)
print("Matches Found:", matches)


Normalized Retrieved Results: {'taxime 200 mg 10 kapsul', 'betason cream 5 gr', 'neuralgin rx 10 kaplet', 'regit 5 mg/5 ml sirup 60 ml', 'yusimox dry sirup 125 mg/5 ml 60 ml', 'l-vit d3 5000 10 tablet', 'cetirizine 10 mg 10 tablet', 'ozen drops 12 ml', 'rhinos sr 10 kapsul', 'alerzin 10 mg 10 tablet', 'rhinofed 10 tablet', 'canesten sd tablet vaginal 500 mg', 'nebacetin salep 5 g', 'dazolin n ovula', 'inbacef 100 mg dry syrup 30 ml', 'alcet sirup 60 ml', 'isprinol 500 mg 4 tablet', 'betason-n cream 5 g', 'vomizole 40 mg 10 tablet', 'sanprima suspensi 60 ml', 'favikal 200 mg 10 tablet', 'cendo mycos eye ointment 3.5 g', 'itraconazole 100 mg 10 kapsul', 'zyloric 100 mg 10 tablet', 'diprosta cream 5 g', 'narfoz tablet 8 mg', 'biatron 500 mg 10 tablet', 'natureline vitamin d3 1000 ui 60 tablet', 'ezol 20 mg 7 tablet', 'transpulmin sirup 100 ml', 'ponstan 500 mg 10 tablet', 'etoricoxib 60 mg 10 tablet', 'clindamycin 150 mg 10 kapsul', 'profat 500 mg/5 ml suspensi 200 ml', 'flutrop 10 tablet