## <span style="color:#ff5f27">üìù Imports </span>

In [None]:
!pip install -q hopsworks==4.2.10 rdflib sentence-transformers pymupdf

In [10]:
import pandas as pd
import hopsworks

from sentence_transformers import SentenceTransformer
from hsfs.embedding import EmbeddingIndex

from functions.zotero_parser import ZoteroCSVParser
from functions.PDF_extractor import PDFExtractor


## <span style="color:#ff5f27"> Global Config

In [None]:
import os

os.environ["HOPSWORKS_API_KEY"] = ""


## <span style="color:#ff5f27">üß¨ Metadata and Text Extraction </span>

In [12]:
import re
from typing import Optional, Dict, Any

def sanitize_paper_metadata(paper: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """
    Defensive metadata sanitation.
    Returns None if the paper is considered invalid.
    """

    # ---- 1. Mandatory Field Validation ----
    title = paper.get("title", "").strip()
    if not title:
        return None  # Equivalent to RDF version: discard if title is missing

    paper["title"] = title

    # ---- 2. Year Repair (Reusing regex logic from RDF) ----
    year = paper.get("year")
    if year is None:
        # Attempt to recover year from other metadata fields
        for field in ("url", "abstract"):
            text = paper.get(field, "")
            match = re.search(r"(19|20)\d{2}", text)
            if match:
                paper["year"] = int(match.group())
                break

    # ---- 3. Authors Fallback ----
    authors = paper.get("authors", "").strip()
    if not authors or authors.lower() == "nan":
        paper["authors"] = "Unknown"

    # ---- 4. Abstract Normalization ----
    abstract = paper.get("abstract", "").strip()
    if abstract.lower() in {"nan", "none"}:
        paper["abstract"] = ""

    # ---- 5. Attachments Handling ----
    # Preserve original state but ensure the value is a string
    attachments = paper.get("file_attachments")
    paper["file_attachments"] = str(attachments) if attachments is not None else ""

    return paper

In [13]:
# === Cell 3: Parse Zotero CSV ===
parser = ZoteroCSVParser("PCG.csv")
raw_papers = parser.parse()

papers = []
for paper in raw_papers:
    fixed = sanitize_paper_metadata(paper)
    if fixed is not None:
        papers.append(fixed)

print(f"Parsed {len(papers)} papers.")
papers[:2]  

Parsed 15 papers.


[{'paper_id': 'CLGNKPIJ',
  'title': 'Synthesis of Normal Heart Sounds Using Generative Adversarial Networks and Empirical Wavelet Transform',
  'authors': 'Narv√°ez, Pedro; Percybrooks, Winston S.',
  'year': 2020,
  'abstract': 'Currently, there are many works in the literature focused on the analysis of heart sounds, speciÔ¨Åcally on the development of intelligent systems for the classiÔ¨Åcation of normal and abnormal heart sounds. However, the available heart sound databases are not yet large enough to train generalized machine learning models. Therefore, there is interest in the development of algorithms capable of generating heart sounds that could augment current databases. In this article, we propose a model based on generative adversary networks (GANs) to generate normal synthetic heart sounds. Additionally, a denoising algorithm is implemented using the empirical wavelet transform (EWT), allowing a decrease in the number of epochs and the computational cost that the GAN model

In [14]:
# === Cell 4: Extract Full Text from Attachments ===

import re
import urllib.parse
from typing import List, Optional

from config import MIN_FULLTEXT_LEN, CHUNK_SIZE, CHUNK_OVERLAP


metadata_rows = []
fulltext_rows = []


# -------- Abstract extraction (provided implementation) --------

def extract_abstract_from_text(text: str) -> Optional[str]:
    normalized = text.replace("\r\n", "\n").replace("\r", "\n")

    stop_markers = (
        r"keywords|index\s*terms|subject[s]?|introduction|background|materials\s+and\s+methods|"
        r"methods|results|conclusions|references|acknowledg(e)?ments|1\.|i\.|ii\.|iii\."
        r"|Keywords|Introduction|Background|Methods|Results|Conclusion|References"
    )
    start_markers = r"abstract|summary|Abstract|Summary"

    pattern = rf"(?is)\b(?:{start_markers})\b\s*[:\.\-]?\s*(.+?)(?=\n\s*(?:{stop_markers})\b|\n\n\s*[A-Z][A-Za-z ]+\b|\Z)"
    match = re.search(pattern, normalized)
    if match:
        abstract = re.sub(r"\s+", " ", match.group(1).strip())
        if 50 <= len(abstract) <= 5000 and re.search(r"[a-z]", abstract, re.I):
            return abstract

    lines = normalized.split("\n")
    abstract_started = False
    buffer: list[str] = []

    for line in lines:
        line_stripped = line.strip()

        if not abstract_started:
            if re.match(
                r"(?i)^(abstract|summary)\b\s*[:\-\.]?\s*$",
                line_stripped,
            ) or re.match(
                r"(?i)^(abstract|summary)\b\s*[:\-\.]?",
                line_stripped,
            ):
                after = re.sub(
                    r"(?i)^(abstract|summary)\b\s*[:\-\.]?\s*",
                    "",
                    line_stripped,
                )
                if after:
                    buffer.append(after)
                abstract_started = True
            continue
        else:
            if re.match(rf"(?i)^\s*(?:{stop_markers})\b", line_stripped):
                break
            buffer.append(line)

    candidate = re.sub(r"\s+", " ", " ".join(buffer)).strip()
    if 50 <= len(candidate) <= 5000 and re.search(r"[a-z]", candidate, re.I):
        return candidate

    paragraphs = re.split(r"\n\s*\n", normalized)
    for paragraph in paragraphs[:8]:
        p = re.sub(r"\s+", " ", paragraph.strip())
        if (
            120 <= len(p) <= 5000
            and not re.match(
                r"(?i)^(keywords|index\s*terms|introduction|references|acknowledg(e)?ments)",
                p,
            )
            and p.count(".") >= 2
        ):
            return p

    return None


# -------- Content cleaning --------

def clean_text(text: str) -> str:
    if not text:
        return ""

    text = re.sub(r"(\w+)-\s*\n\s*(\w+)", r"\1\2", text)
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    text = re.sub(r"(\.{5,}|\-{5,})", " ", text)
    text = re.sub(r"[\t\f\u00A0]+", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)

    return text.strip()


# -------- Chunking --------

def chunk_text(
    text: str,
    chunk_size: int = CHUNK_SIZE,
    overlap: int = CHUNK_OVERLAP,
) -> List[str]:
    if not text:
        return []

    paragraphs = text.split("\n\n")
    chunks = []
    current = ""

    for para in paragraphs:
        para = para.strip()
        if not para:
            continue

        if len(current) + len(para) < chunk_size:
            current = f"{current}\n\n{para}" if current else para
        else:
            if current:
                chunks.append(current)
                current = current[-overlap:] + "\n\n" + para

            if len(current) > chunk_size:
                for i in range(0, len(current), chunk_size - overlap):
                    chunks.append(current[i : i + chunk_size])
                current = ""

    if current:
        chunks.append(current)

    return chunks


# -------- Safe file reading --------

def safe_read_fulltext(file_path: str) -> str:
    if not file_path:
        return ""

    decoded = urllib.parse.unquote(file_path)
    return PDFExtractor.read_file(decoded) or ""


# -------- Main loop --------

for paper in papers:
    # ---- Full text extraction ----
    raw_full_text = safe_read_fulltext(paper.get("file_attachments", ""))

    # ---- Content processing ----
    full_text = clean_text(raw_full_text)

    # ---- Abstract handling ----
    abstract = paper.get("abstract", "")
    if not abstract and len(full_text) >= MIN_FULLTEXT_LEN:
        abstract = extract_abstract_from_text(full_text) or ""

    # ---- Paper-level features ----
    metadata_rows.append(
        {
            "paper_id": paper["paper_id"],
            "title": paper["title"],
            "abstract": abstract,
            "authors": paper["authors"],
            "year": paper["year"],
            "item_type": paper["item_type"],
            "combined_text": (
                f"Title: {paper['title']}\n"
                f"Abstract: {abstract}"
            ),
        }
    )

    # ---- Chunk-level features ----
    if len(full_text) >= MIN_FULLTEXT_LEN:
        for i, chunk in enumerate(chunk_text(full_text)):
            fulltext_rows.append(
                {
                    "paper_id": paper["paper_id"],
                    "chunk_index": i,
                    "content": chunk,
                    "year": paper["year"],
                }
            )

print(f"Metadata rows: {len(metadata_rows)}")
print(f"Fulltext chunks: {len(fulltext_rows)}")


Metadata rows: 15
Fulltext chunks: 59


## <span style="color:#ff5f27;"> üîÆ Embedding Extraction </span>

In [15]:
# === Cell 5: Generate Embeddings for Metadata and Full Text ===

import pandas as pd
from sentence_transformers import SentenceTransformer
from hsfs import embedding

from config import EMBEDDING_MODEL_NAME


# -------------------------
# 1. Load embedding model
# -------------------------

model = SentenceTransformer(EMBEDDING_MODEL_NAME)
embedding_dim = model.get_sentence_embedding_dimension()

print(f"Loaded embedding model: {EMBEDDING_MODEL_NAME}")
print(f"Embedding dimension: {embedding_dim}")


# -------------------------
# 2. Prepare DataFrames
# -------------------------

df_metadata = pd.DataFrame(metadata_rows)
df_chunks = pd.DataFrame(fulltext_rows)

print(f"Metadata rows: {len(df_metadata)}")
print(f"Chunk rows: {len(df_chunks)}")


# -------------------------
# 3. Generate metadata embeddings
# -------------------------

if not df_metadata.empty:
    embeddings = model.encode(
        df_metadata["combined_text"].tolist(),
        show_progress_bar=True,
        convert_to_numpy=True,
    ).astype("float32")

    df_metadata["embedding"] = list(embeddings)
else:
    df_metadata["embedding"] = []

print("Metadata embeddings generated.")


# -------------------------
# 4. Generate full-text chunk embeddings
# -------------------------

if not df_chunks.empty:
    embeddings = model.encode(
        df_chunks["content"].fillna("").tolist(),
        show_progress_bar=True,
        convert_to_numpy=True,
    ).astype("float32")

    df_chunks["embedding"] = list(embeddings)

else:
    df_chunks["embedding"] = []

print("Chunk embeddings generated.")


# -------------------------
# 5. Add context_id (stable row id)
# -------------------------

df_metadata["context_id"] = range(len(df_metadata))
df_chunks["context_id"] = range(len(df_chunks))


# -------------------------
# 6. Create embedding indexes
# -------------------------

metadata_index = embedding.EmbeddingIndex()
metadata_index.add_embedding(
    "metadata_embedding",
    embedding_dim,
)

chunk_index = embedding.EmbeddingIndex()
chunk_index.add_embedding(
    "chunk_embedding",
    embedding_dim,
)

print("Embedding indexes created.")


# -------------------------
# 7. Final sanity check
# -------------------------

display(df_metadata.head())
display(df_chunks.head())


2026-01-08 01:51:49,183 INFO: Use pytorch device_name: cpu
2026-01-08 01:51:49,186 INFO: Load pretrained SentenceTransformer: all-MiniLM-L6-v2
Loaded embedding model: all-MiniLM-L6-v2
Embedding dimension: 384
Metadata rows: 15
Chunk rows: 59


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Metadata embeddings generated.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Chunk embeddings generated.
Embedding indexes created.


Unnamed: 0,paper_id,title,abstract,authors,year,item_type,combined_text,embedding,context_id
0,CLGNKPIJ,Synthesis of Normal Heart Sounds Using Generat...,"Currently, there are many works in the literat...","Narv√°ez, Pedro; Percybrooks, Winston S.",2020,journalArticle,Title: Synthesis of Normal Heart Sounds Using ...,"[-0.09191086, -0.067204505, 0.036164455, -0.05...",0
1,R7JAHFY6,ECG Generation Based on Denoising Diffusion Pr...,Arrhythmia diseases seriously damage people‚Äôs ...,"Wang, Zhongyu; Ma, Caiyun; Zhao, Minghui; Zhan...",2024,conferencePaper,Title: ECG Generation Based on Denoising Diffu...,"[-0.032254055, -0.06744331, -0.021227641, -0.0...",1
2,WBP9IWFM,"PhysioBank, PhysioToolkit, and PhysioNet: Comp...",Abstract ‚ÄîThe newly inaugurated ...,"Goldberger, Ary L.; Amaral, Luis A. N.; Glass,...",2000,journalArticle,"Title: PhysioBank, PhysioToolkit, and PhysioNe...","[-0.04702052, -0.10492723, -0.058769476, -0.00...",2
3,UPL26UPV,Comparative Analysis of CNN and Transformer Ar...,The automated classification of phonocardiogra...,"Sondermann, Martin; Bisgin, Pinar; Tschorn, Ni...",2025,preprint,Title: Comparative Analysis of CNN and Transfo...,"[-0.047316596, -0.04460356, 0.037667654, -0.01...",3
4,BX8G68KQ,A Comprehensive Overview of Heart Sound Analys...,Cardiovascular diseases (CVDs) are a prevalent...,"Hamza, Motaz Faroq A. Ben; Sjarif, Nilam Nur Amir",2024,journalArticle,Title: A Comprehensive Overview of Heart Sound...,"[-0.021934992, -0.061657984, 0.042390797, -0.0...",4


Unnamed: 0,paper_id,chunk_index,content,year,embedding,context_id
0,CLGNKPIJ,0,"Appl. Sci. 2020, 10, 7003\n7 of 16\n \n \n \nF...",2020,"[-0.037153482, -0.05627889, -0.021900643, -0.0...",0
1,CLGNKPIJ,1,range of 0‚Äì200 Hz; (F) EWT component of the no...,2020,"[-0.11885451, 0.0015717824, 0.016471593, 0.005...",1
2,CLGNKPIJ,2,"with a relatively high noise level, as shown ...",2020,"[-0.14223221, 0.012694337, 0.021791773, -0.028...",2
3,BX8G68KQ,0,"M. F. A. B. Hamza, N. N. Amir Sjarif: Comprehe...",2024,"[0.07923419, -0.030007407, -0.021555917, 0.031...",3
4,BX8G68KQ,1,w of Heart Sound Analysis\nTABLE 4. Summary of...,2024,"[-0.07195769, -0.09754954, 0.014396299, -0.041...",4


## <span style="color:#ff5f27;"> üîÆ Connecting to Hopsworks Feature Store </span>

In [16]:
import hopsworks
from config import HOPSWORKS_API_KEY
# project = hopsworks.login()

project = hopsworks.login(
        # project=HOPSWORKS_PROJECT,
        api_key_value=HOPSWORKS_API_KEY
    )

fs = project.get_feature_store()

2026-01-08 01:51:57,836 INFO: Closing external client and cleaning up certificates.
Connection closed.
2026-01-08 01:51:57,839 INFO: Initializing external client
2026-01-08 01:51:57,840 INFO: Base URL: https://c.app.hopsworks.ai:443
2026-01-08 01:51:59,262 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1286333


## <span style="color:#ff5f27;"> ü™Ñ Feature Group Creation </span>

In [17]:
# === Cell 6.1: Create or Get Metadata Feature Group (Safe Version) ===

from hsfs import embedding

metadata_emb_index = embedding.EmbeddingIndex()
metadata_emb_index.add_embedding(
    "embedding",
    model.get_sentence_embedding_dimension(),
)

metadata_fg = fs.get_or_create_feature_group(
    name="paper_metadata_fg",
    version=1,
    description="Paper-level metadata features (title, abstract, authors, year, item type)",
    primary_key=["paper_id"],
    online_enabled=True,
    embedding_index=metadata_emb_index,
)


metadata_fg.insert(
    df_metadata,
    write_options={"wait_for_job": True},
)

print(f"Inserted {len(df_metadata)} rows into metadata feature group.")


Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1286333/fs/1273958/fg/1908147


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 15/15 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: paper_metadata_fg_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1286333/jobs/named/paper_metadata_fg_1_offline_fg_materialization/executions
2026-01-08 01:52:28,475 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2026-01-08 01:52:34,857 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2026-01-08 01:54:20,039 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2026-01-08 01:54:20,211 INFO: Waiting for log aggregation to finish.
2026-01-08 01:54:28,835 INFO: Execution finished successfully.
Inserted 15 rows into metadata feature group.


In [18]:
# === Cell 6.2: Create or Get Fulltext Chunk Feature Group (Safe Version) ===

chunk_emb_index = embedding.EmbeddingIndex()
chunk_emb_index.add_embedding(
    "embedding",
    model.get_sentence_embedding_dimension(),
)

chunk_fg = fs.get_or_create_feature_group(
    name="paper_chunk_fg",
    version=1,
    description="Chunk-level full text features for RAG",
    primary_key=["paper_id", "chunk_index"],
    online_enabled=True,
    embedding_index=chunk_emb_index,
)


In [19]:
chunk_fg.insert(
    df_chunks,
    write_options={"wait_for_job": True},
)

print(f"Inserted {len(df_chunks)} rows into chunk feature group.")


Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1286333/fs/1273958/fg/1893857


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 59/59 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: paper_chunk_fg_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1286333/jobs/named/paper_chunk_fg_1_offline_fg_materialization/executions
2026-01-08 01:55:06,106 INFO: Waiting for execution to finish. Current state: INITIALIZING. Final status: UNDEFINED
2026-01-08 01:55:09,298 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2026-01-08 01:55:12,491 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2026-01-08 01:56:57,908 INFO: Waiting for log aggregation to finish.
2026-01-08 01:57:06,629 INFO: Execution finished successfully.
Inserted 59 rows into chunk feature group.


## <span style="color:#ff5f27;">ü™Ñ Feature View Creation </span>


In [20]:
# === Cell 8.1: Create Metadata Feature View ===

metadata_fv = fs.get_or_create_feature_view(
    name="paper_metadata_fv",
    version=1,
    description="Paper-level metadata for retrieval and filtering",
    query=metadata_fg.select(
        [
            "paper_id",
            "title",
            "abstract",
            "authors",
            "year",
            "item_type",
            "embedding",
        ]
    ),
)

print("Metadata Feature View ready.")


Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1286333/fs/1273958/fv/paper_metadata_fv/version/1
Metadata Feature View ready.


In [21]:
# === Cell 8.2: Create Chunk Feature View ===

chunk_fv = fs.get_or_create_feature_view(
    name="paper_chunk_fv",
    version=1,
    description="Chunk-level full text for RAG context retrieval",
    query=chunk_fg.select(
        [
            "paper_id",
            "chunk_index",
            "content",
            "year",
            "embedding",
        ]
    ),
)

print("Chunk Feature View ready.")


Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1286333/fs/1273958/fv/paper_chunk_fv/version/1
Chunk Feature View ready.


---