## <span style="color:#ff5f27">📝 Imports </span>

In [None]:
!pip install -q hopsworks==4.2.10 rdflib sentence-transformers pymupdf

In [1]:
import pandas as pd
import hopsworks

from sentence_transformers import SentenceTransformer
from hsfs.embedding import EmbeddingIndex

from functions.zotero_parser import ZoteroCSVParser
from functions.PDF_extractor import PDFExtractor


## <span style="color:#ff5f27"> Global Config </span>

In [None]:
import os
from getpass import getpass

# Keep a key that is already exported in the environment; only when none is
# set (or it is empty) ask for it interactively. This avoids both clobbering a
# valid key with an empty string and saving the secret inside the notebook.
if not os.environ.get("HOPSWORKS_API_KEY"):
    os.environ["HOPSWORKS_API_KEY"] = getpass("Hopsworks API key: ")


## <span style="color:#ff5f27">🧬 Metadata and Text Extraction </span>

In [None]:
from functions.metadata_check import sanitize_paper_metadata

# === Cell 3: Parse Zotero CSV ===
# Parse the Zotero export, run every record through sanitize_paper_metadata,
# and keep only the records for which it did not return None.
parser = ZoteroCSVParser("PCG.csv")
raw_papers = parser.parse()

sanitized_papers = map(sanitize_paper_metadata, raw_papers)
papers = [record for record in sanitized_papers if record is not None]

print(f"Parsed {len(papers)} papers.")
papers[:2]

Parsed 15 papers.


[{'paper_id': 'CLGNKPIJ',
  'title': 'Synthesis of Normal Heart Sounds Using Generative Adversarial Networks and Empirical Wavelet Transform',
  'authors': 'Narv√°ez, Pedro; Percybrooks, Winston S.',
  'year': 2020,
  'abstract': 'Currently, there are many works in the literature focused on the analysis of heart sounds, speciÔ¨Åcally on the development of intelligent systems for the classiÔ¨Åcation of normal and abnormal heart sounds. However, the available heart sound databases are not yet large enough to train generalized machine learning models. Therefore, there is interest in the development of algorithms capable of generating heart sounds that could augment current databases. In this article, we propose a model based on generative adversary networks (GANs) to generate normal synthetic heart sounds. Additionally, a denoising algorithm is implemented using the empirical wavelet transform (EWT), allowing a decrease in the number of epochs and the computational cost that the GAN model

In [3]:
# -------- Safe file reading --------
import urllib.parse

def safe_read_fulltext(file_path: str) -> str:
    """Return the text read from ``file_path``, or ``""`` when nothing is available.

    A falsy ``file_path`` (empty string, ``None``) yields ``""`` without any
    read attempt. Otherwise the path is percent-decoded and passed to
    ``PDFExtractor.read_file``; a falsy result from that call also becomes ``""``.
    """
    if file_path:
        extracted = PDFExtractor.read_file(urllib.parse.unquote(file_path))
        return extracted if extracted else ""
    return ""

In [4]:
from functions.test_and_file_processor import clean_text, extract_abstract_from_text, extract_paragraph_chunks
from config import MIN_FULLTEXT_LEN

# -------- Main loop --------
# One pass over `papers` builds two record lists:
#   * metadata_rows - one dict per paper (title, abstract, authors, ...)
#   * fulltext_rows - one dict per text chunk, only for papers whose cleaned
#     full text has at least MIN_FULLTEXT_LEN characters
metadata_rows, fulltext_rows = [], []

for paper in papers:
    # Read the attachment (if any) and clean the extracted text.
    full_text = clean_text(safe_read_fulltext(paper.get("file_attachments", "")))
    has_fulltext = len(full_text) >= MIN_FULLTEXT_LEN

    # Prefer the abstract from the metadata; fall back to extracting one from
    # the full text when the metadata has none and the text is long enough.
    abstract = paper.get("abstract", "")
    if not abstract and has_fulltext:
        abstract = extract_abstract_from_text(full_text) or ""

    # Paper-level record; "combined_text" is the string embedded later on.
    paper_record = {
        "paper_id": paper["paper_id"],
        "title": paper["title"],
        "abstract": abstract,
        "authors": paper["authors"],
        "year": paper["year"],
        "item_type": paper["item_type"],
        "combined_text": f"Title: {paper['title']}\nAbstract: {abstract}",
    }
    metadata_rows.append(paper_record)

    # Chunk-level records, numbered in the order the chunks are produced.
    if has_fulltext:
        fulltext_rows.extend(
            {
                "paper_id": paper["paper_id"],
                "chunk_index": position,
                "content": chunk_text,
                "year": paper["year"],
            }
            for position, chunk_text in enumerate(extract_paragraph_chunks(full_text))
        )

print(f"Metadata rows: {len(metadata_rows)}")
print(f"Fulltext chunks: {len(fulltext_rows)}")

Metadata rows: 15
Fulltext chunks: 516


## <span style="color:#ff5f27;"> 🔮 Embedding Extraction </span>

In [None]:
# === Cell 5: Generate Embeddings for Metadata and Full Text ===

import pandas as pd
from sentence_transformers import SentenceTransformer
from hsfs import embedding

from config import EMBEDDING_MODEL_NAME


# -------------------------
# 1. Load embedding model
# -------------------------

# Instantiate the sentence-transformer named in config and record the
# dimension it reports for its sentence embeddings.
model = SentenceTransformer(EMBEDDING_MODEL_NAME)
embedding_dim = model.get_sentence_embedding_dimension()

print(
    f"Loaded embedding model: {EMBEDDING_MODEL_NAME}",
    f"Embedding dimension: {embedding_dim}",
    sep="\n",
)


# -------------------------
# 2. Prepare DataFrames
# -------------------------

# Turn the two record lists built by the main loop into DataFrames.
df_metadata = pd.DataFrame(data=metadata_rows)
df_chunks = pd.DataFrame(data=fulltext_rows)

print(
    f"Metadata rows: {len(df_metadata)}",
    f"Chunk rows: {len(df_chunks)}",
    sep="\n",
)


# -------------------------
# 3. Generate metadata embeddings
# -------------------------

# Encode every paper's "combined_text" and store the float32 result in an
# "embedding" column; an empty frame just gets an empty column.
if df_metadata.empty:
    df_metadata["embedding"] = []
else:
    metadata_texts = df_metadata["combined_text"].tolist()
    embeddings = model.encode(
        metadata_texts,
        show_progress_bar=True,
        convert_to_numpy=True,
    ).astype("float32")
    # list(...) splits the returned array along its first axis.
    df_metadata["embedding"] = list(embeddings)

print("Metadata embeddings generated.")


# -------------------------
# 4. Generate full-text chunk embeddings
# -------------------------

# Same procedure for the chunk text; missing "content" values are encoded
# as empty strings.
if df_chunks.empty:
    df_chunks["embedding"] = []
else:
    chunk_texts = df_chunks["content"].fillna("").tolist()
    embeddings = model.encode(
        chunk_texts,
        show_progress_bar=True,
        convert_to_numpy=True,
    ).astype("float32")
    df_chunks["embedding"] = list(embeddings)

print("Chunk embeddings generated.")


# # -------------------------
# # 5. Add context_id (stable row id)
# # -------------------------

# df_metadata["context_id"] = range(len(df_metadata))
# df_chunks["context_id"] = range(len(df_chunks))


# -------------------------
# 6. Create embedding indexes
# -------------------------

# Two separate indexes, each declaring a single embedding whose dimension is
# the one reported by the model.
# NOTE(review): the embedding names registered here differ from the
# "embedding" column added to both DataFrames - confirm these names are
# intended before passing these indexes to a feature group.
metadata_index, chunk_index = embedding.EmbeddingIndex(), embedding.EmbeddingIndex()
for emb_index, emb_name in (
    (metadata_index, "metadata_embedding"),
    (chunk_index, "chunk_embedding"),
):
    emb_index.add_embedding(emb_name, embedding_dim)

print("Embedding indexes created.")


# -------------------------
# 7. Final sanity check
# -------------------------

# Show the first rows of both frames, now including the "embedding" column.
display(df_metadata.head(), df_chunks.head())


## <span style="color:#ff5f27;"> 🔮 Connecting to Hopsworks Feature Store </span>

In [None]:
import hopsworks
from config import HOPSWORKS_API_KEY
# Log in with the API key taken from config (no project is named explicitly),
# then fetch the feature store handle used by the cells below.
project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY)
fs = project.get_feature_store()

## <span style="color:#ff5f27;"> 🪄 Feature Group Creation </span>

In [None]:
# === Cell 6.1: Create or Get Metadata Feature Group (Safe Version) ===

from hsfs import embedding

# Embedding index over the "embedding" column, sized to the dimension the
# model reports.
metadata_emb_index = embedding.EmbeddingIndex()
metadata_emb_dim = model.get_sentence_embedding_dimension()
metadata_emb_index.add_embedding("embedding", metadata_emb_dim)

# Paper-level feature group, keyed by paper_id and enabled for online use.
metadata_fg_spec = {
    "name": "paper_metadata_fg_2",
    "version": 3,
    "description": "New chunk splitting to avoid cut a word",
    "primary_key": ["paper_id"],
    "online_enabled": True,
    "embedding_index": metadata_emb_index,
}
metadata_fg = fs.get_or_create_feature_group(**metadata_fg_spec)

# Write the metadata frame with the wait_for_job option enabled.
metadata_write_options = {"wait_for_job": True}
metadata_fg.insert(df_metadata, write_options=metadata_write_options)

print(f"Inserted {len(df_metadata)} rows into metadata feature group.")


In [None]:
# === Cell 6.2: Create or Get Fulltext Chunk Feature Group (Safe Version) ===

# Embedding index over the "embedding" column, sized to the dimension the
# model reports.
chunk_emb_index = embedding.EmbeddingIndex()
chunk_emb_dim = model.get_sentence_embedding_dimension()
chunk_emb_index.add_embedding("embedding", chunk_emb_dim)

# Chunk-level feature group; a row is identified by (paper_id, chunk_index).
chunk_fg_spec = {
    "name": "paper_chunk_fg_2",
    "version": 3,
    "description": "New chunk splitting to avoid cut a word",
    "primary_key": ["paper_id", "chunk_index"],
    "online_enabled": True,
    "embedding_index": chunk_emb_index,
}
chunk_fg = fs.get_or_create_feature_group(**chunk_fg_spec)


In [None]:
# Write the chunk frame with the wait_for_job option enabled.
chunk_write_options = {"wait_for_job": True}
chunk_fg.insert(df_chunks, write_options=chunk_write_options)

print(f"Inserted {len(df_chunks)} rows into chunk feature group.")


## <span style="color:#ff5f27;">🪄 Feature View Creation </span>


In [None]:
# === Cell 8.1: Create Metadata Feature View ===

# Features exposed by the view; "combined_text" is not among them.
metadata_fv_features = [
    "paper_id",
    "title",
    "abstract",
    "authors",
    "year",
    "item_type",
    "embedding",
]
metadata_fv_query = metadata_fg.select(metadata_fv_features)

metadata_fv = fs.get_or_create_feature_view(
    name="paper_metadata_fv_2",
    version=3,
    description="New chunk splitting to avoid cut a word",
    query=metadata_fv_query,
)

print("Metadata Feature View ready.")


In [None]:
# === Cell 8.2: Create Chunk Feature View ===

# Features exposed by the chunk-level view.
chunk_fv_features = [
    "paper_id",
    "chunk_index",
    "content",
    "year",
    "embedding",
]
chunk_fv_query = chunk_fg.select(chunk_fv_features)

chunk_fv = fs.get_or_create_feature_view(
    name="paper_chunk_fv_2",
    version=3,
    description="New chunk splitting to avoid cut a word",
    query=chunk_fv_query,
)

print("Chunk Feature View ready.")


---