## <span style="color:#ff5f27">📝 Imports </span>

In [None]:
!pip install -q hopsworks rdflib sentence-transformers pymupdf

In [None]:
import pandas as pd
import hopsworks

from sentence_transformers import SentenceTransformer
from hsfs.embedding import EmbeddingIndex

from functions.zotero_parser import ZoteroCSVParser
from functions.PDF_extractor import PDFExtractor


## <span style="color:#ff5f27"> Global Config </span>

In [None]:
import os

# SECURITY: never hard-code an API key in a notebook; anything committed to
# source control must be treated as compromised and rotated.
# The key is taken from the environment when already set; otherwise it is
# requested interactively so it never appears in the file or its outputs.
if not os.environ.get("HOPSWORKS_API_KEY"):
    from getpass import getpass

    os.environ["HOPSWORKS_API_KEY"] = getpass("Hopsworks API key: ")


## <span style="color:#ff5f27">🧬 Metadata and Text Extraction, Embedding Creation </span>

In [None]:
import re
from typing import Optional, Dict, Any

def _is_missing(value: Any) -> bool:
    """Return True for None and float NaN (NaN is the only float != itself)."""
    return value is None or (isinstance(value, float) and value != value)


def _metadata_text(value: Any) -> str:
    """Return *value* as a stripped string; missing values become ""."""
    return "" if _is_missing(value) else str(value).strip()


def sanitize_paper_metadata(paper: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """
    Defensive metadata sanitation.

    The dictionary is cleaned in place and also returned. Field values may be
    strings, None, or float NaN; none of these raise.

    Args:
        paper: Raw metadata record. Keys read here: "title", "year", "url",
            "abstract", "authors", "file_attachments".

    Returns:
        The sanitized record, or None if the paper is considered invalid
        (title missing or blank).

    After a successful call:
        * "title" is a non-empty stripped string.
        * "year" is always present; it is None when no year was supplied and
          none could be recovered from "url" or "abstract".
        * "authors" is "Unknown" when missing, blank or "nan".
        * "abstract" is "" when it was None, NaN, "nan" or "none".
        * "file_attachments" is always a string ("" when missing or NaN).
    """

    # ---- 1. Mandatory Field Validation ----
    title = _metadata_text(paper.get("title"))
    if not title:
        return None  # Equivalent to RDF version: discard if title is missing

    paper["title"] = title

    # ---- 2. Year Repair (Reusing regex logic from RDF) ----
    if not _metadata_text(paper.get("year")):
        # Guarantee the key exists so later paper["year"] lookups cannot fail.
        paper["year"] = None
        # Attempt to recover year from other metadata fields
        for field in ("url", "abstract"):
            text = _metadata_text(paper.get(field))
            # The digit look-arounds stop a "year" being cut out of a longer
            # number such as a DOI suffix or an article id.
            match = re.search(r"(?<!\d)(?:19|20)\d{2}(?!\d)", text)
            if match:
                paper["year"] = int(match.group())
                break

    # ---- 3. Authors Fallback ----
    authors = _metadata_text(paper.get("authors"))
    if not authors or authors.lower() == "nan":
        paper["authors"] = "Unknown"

    # ---- 4. Abstract Normalization ----
    raw_abstract = paper.get("abstract", "")
    abstract = _metadata_text(raw_abstract)
    if abstract.lower() in {"nan", "none"}:
        paper["abstract"] = ""
    elif not isinstance(raw_abstract, str):
        # None / NaN become "", any other non-string value becomes its text.
        paper["abstract"] = abstract

    # ---- 5. Attachments Handling ----
    # Preserve original state but ensure the value is a string; a NaN cell
    # must not turn into the literal path "nan".
    attachments = paper.get("file_attachments")
    paper["file_attachments"] = "" if _is_missing(attachments) else str(attachments)

    return paper

In [None]:
# === Cell 3: Parse Zotero CSV ===
parser = ZoteroCSVParser("PCG.csv")
raw_papers = parser.parse()

# Sanitize every parsed record and keep only those that were not rejected
# (sanitize_paper_metadata returns None for records it considers invalid).
papers = [
    cleaned
    for cleaned in map(sanitize_paper_metadata, raw_papers)
    if cleaned is not None
]

print(f"Parsed {len(papers)} papers.")
# Last expression of the cell: shows the first two records as cell output.
papers[:2]

In [None]:
# === Cell 4: Extract Full Text from Attachments ===

import re
import urllib.parse
from typing import List, Optional

from config import MIN_FULLTEXT_LEN, CHUNK_SIZE, CHUNK_OVERLAP


# Accumulators filled by the main loop of this cell:
# metadata_rows receives one dict per paper, fulltext_rows one dict per
# full-text chunk. Both are reset every time the cell is executed.
metadata_rows = []
fulltext_rows = []


# -------- Abstract extraction (provided implementation) --------

def extract_abstract_from_text(text: str) -> Optional[str]:
    """Heuristically pull an abstract out of a document's plain text.

    Three strategies are tried in order; the first acceptable result wins:

    1. One regex that captures the text following an "abstract"/"summary"
       marker up to a stop marker, a blank line followed by letters, or the
       end of the text.
    2. A line scan that starts collecting at a line beginning with
       "abstract"/"summary" and stops at the first line beginning with a
       stop marker.
    3. The first of the leading eight blank-line separated paragraphs that is
       120-5000 characters long, contains at least two periods and does not
       begin with a section keyword.

    Results of strategies 1 and 2 must be 50-5000 characters long and contain
    at least one ASCII letter. Whitespace runs in the returned text are
    collapsed to single spaces.

    Returns:
        The abstract text, or None when no strategy yields a candidate.
    """
    unified = text.replace("\r\n", "\n").replace("\r", "\n")

    stop_markers = (
        r"keywords|index\s*terms|subject[s]?|introduction|background|materials\s+and\s+methods|"
        r"methods|results|conclusions|references|acknowledg(e)?ments|1\.|i\.|ii\.|iii\."
        r"|Keywords|Introduction|Background|Methods|Results|Conclusion|References"
    )
    start_markers = r"abstract|summary|Abstract|Summary"

    def _acceptable(fragment: str) -> bool:
        # Shared acceptance test for strategies 1 and 2.
        if not 50 <= len(fragment) <= 5000:
            return False
        return re.search(r"[a-z]", fragment, re.I) is not None

    # ---- Strategy 1: single-regex capture ----
    block_pattern = rf"(?is)\b(?:{start_markers})\b\s*[:\.\-]?\s*(.+?)(?=\n\s*(?:{stop_markers})\b|\n\n\s*[A-Z][A-Za-z ]+\b|\Z)"
    hit = re.search(block_pattern, unified)
    if hit is not None:
        squeezed = re.sub(r"\s+", " ", hit.group(1).strip())
        if _acceptable(squeezed):
            return squeezed

    # ---- Strategy 2: line-by-line scan ----
    heading_patterns = (
        r"(?i)^(abstract|summary)\b\s*[:\-\.]?\s*$",
        r"(?i)^(abstract|summary)\b\s*[:\-\.]?",
    )
    collected: list[str] = []
    heading_seen = False

    for raw_line in unified.split("\n"):
        trimmed = raw_line.strip()

        if heading_seen:
            if re.match(rf"(?i)^\s*(?:{stop_markers})\b", trimmed):
                break
            collected.append(raw_line)
        elif any(re.match(pat, trimmed) for pat in heading_patterns):
            # Keep whatever follows the heading on the same line.
            remainder = re.sub(
                r"(?i)^(abstract|summary)\b\s*[:\-\.]?\s*",
                "",
                trimmed,
            )
            if remainder:
                collected.append(remainder)
            heading_seen = True

    joined = re.sub(r"\s+", " ", " ".join(collected)).strip()
    if _acceptable(joined):
        return joined

    # ---- Strategy 3: first plausible leading paragraph ----
    for block in re.split(r"\n\s*\n", unified)[:8]:
        flat = re.sub(r"\s+", " ", block.strip())
        if not 120 <= len(flat) <= 5000:
            continue
        if re.match(
            r"(?i)^(keywords|index\s*terms|introduction|references|acknowledg(e)?ments)",
            flat,
        ):
            continue
        if flat.count(".") < 2:
            continue
        return flat

    return None


# -------- Content cleaning --------

def clean_text(text: str) -> str:
    """Normalize raw extracted text before abstract detection and chunking.

    Steps, in order:
      1. Convert "\\r\\n" and bare "\\r" line endings to "\\n".
      2. Re-join words hyphenated across a line break ("exam-\\nple" ->
         "example").
      3. Replace runs of five or more dots or dashes with a single space.
      4. Replace runs of tabs, form feeds and non-breaking spaces with a
         single space.
      5. Collapse three or more consecutive newlines into one blank line.

    Args:
        text: Raw text; None or "" yields "".

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    if not text:
        return ""

    # Line endings are unified first so the de-hyphenation pattern, which
    # looks for "\n", also repairs words split across bare "\r" line breaks.
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    text = re.sub(r"(\w+)-\s*\n\s*(\w+)", r"\1\2", text)
    text = re.sub(r"(\.{5,}|\-{5,})", " ", text)
    text = re.sub(r"[\t\f\u00A0]+", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)

    return text.strip()


# -------- Chunking --------

def chunk_text(
    text: str,
    chunk_size: int = CHUNK_SIZE,
    overlap: int = CHUNK_OVERLAP,
) -> List[str]:
    """Split *text* into overlapping chunks along paragraph boundaries.

    Paragraphs (separated by a blank line) are accumulated until adding the
    next one would reach ``chunk_size`` characters. The finished chunk is then
    emitted and the next chunk starts with the last ``overlap`` characters of
    the previous one. A buffer longer than ``chunk_size`` is cut into
    fixed-size windows that advance by ``chunk_size - overlap`` characters.

    Args:
        text: Text to split; empty input yields an empty list.
        chunk_size: Target maximum chunk length in characters (must be > 0).
        overlap: Characters shared between consecutive chunks. Values outside
            ``0 .. chunk_size - 1`` are clamped into that range so the window
            step is always positive.

    Returns:
        The chunks in document order.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if not text:
        return []
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # Keep the sliding-window step (chunk_size - overlap) strictly positive.
    overlap = max(0, min(overlap, chunk_size - 1))
    step = chunk_size - overlap

    chunks: List[str] = []
    current = ""

    for para in text.split("\n\n"):
        para = para.strip()
        if not para:
            continue

        if len(current) + len(para) < chunk_size:
            current = f"{current}\n\n{para}" if current else para
            continue

        if current:
            chunks.append(current)
            # current[-0:] would be the WHOLE string, so a zero overlap has
            # to be handled explicitly.
            tail = current[-overlap:] if overlap else ""
            current = f"{tail}\n\n{para}" if tail else para
        else:
            # Nothing buffered yet: the paragraph must still be kept even
            # though it is at least chunk_size long on its own.
            current = para

        if len(current) > chunk_size:
            for start in range(0, len(current), step):
                chunks.append(current[start : start + chunk_size])
            current = ""

    if current:
        chunks.append(current)

    return chunks


# -------- Safe file reading --------

def safe_read_fulltext(file_path: str) -> str:
    """Read the full text of an attachment, never returning None.

    The path is percent-decoded with urllib.parse.unquote before being handed
    to PDFExtractor.read_file. An empty path, or a falsy result from the
    extractor, yields "".
    """
    if file_path:
        return PDFExtractor.read_file(urllib.parse.unquote(file_path)) or ""
    return ""


# -------- Main loop --------

for paper in papers:
    # ---- Full text extraction ----
    raw_full_text = safe_read_fulltext(paper.get("file_attachments", ""))

    # ---- Content processing ----
    full_text = clean_text(raw_full_text)
    # Evaluated once; gates both the abstract fallback and the chunking.
    has_full_text = len(full_text) >= MIN_FULLTEXT_LEN

    # ---- Abstract handling ----
    # "or ''" also covers an explicit None so the combined text below never
    # contains the literal string "None".
    abstract = paper.get("abstract") or ""
    if not abstract and has_full_text:
        abstract = extract_abstract_from_text(full_text) or ""

    # The year is optional metadata: .get() avoids a KeyError for papers
    # whose year is absent and could not be recovered.
    year = paper.get("year")

    # ---- Paper-level features ----
    metadata_rows.append(
        {
            "paper_id": paper["paper_id"],
            "title": paper["title"],
            "abstract": abstract,
            "authors": paper["authors"],
            "year": year,
            "item_type": paper["item_type"],
            "combined_text": (
                f"Title: {paper['title']}\n"
                f"Abstract: {abstract}"
            ),
        }
    )

    # ---- Chunk-level features ----
    if has_full_text:
        for i, chunk in enumerate(chunk_text(full_text)):
            fulltext_rows.append(
                {
                    "paper_id": paper["paper_id"],
                    "chunk_index": i,
                    "content": chunk,
                    "year": year,
                }
            )

print(f"Metadata rows: {len(metadata_rows)}")
print(f"Fulltext chunks: {len(fulltext_rows)}")


In [None]:
# === Cell 5: Generate Embeddings for Metadata and Full Text ===

import pandas as pd
from sentence_transformers import SentenceTransformer
from hsfs import embedding

from config import EMBEDDING_MODEL_NAME


# -------------------------
# 1. Load embedding model
# -------------------------

model = SentenceTransformer(EMBEDDING_MODEL_NAME)
embedding_dim = model.get_sentence_embedding_dimension()

print(f"Loaded embedding model: {EMBEDDING_MODEL_NAME}")
print(f"Embedding dimension: {embedding_dim}")


# -------------------------
# 2. Prepare DataFrames
# -------------------------

df_metadata = pd.DataFrame(metadata_rows)
df_chunks = pd.DataFrame(fulltext_rows)

print(f"Metadata rows: {len(df_metadata)}")
print(f"Chunk rows: {len(df_chunks)}")


def _embed_column(texts: pd.Series) -> pd.Series:
    """Encode every string in *texts* and return one vector (list) per row.

    model.encode(..., convert_to_numpy=True) returns a single 2-D array.
    pandas refuses to assign a 2-D array to one column, so the array is
    turned into a list of per-row lists and aligned on the input index.
    """
    vectors = model.encode(
        texts.tolist(),
        show_progress_bar=True,
        convert_to_numpy=True,
    )
    return pd.Series(vectors.tolist(), index=texts.index, dtype=object)


# -------------------------
# 3. Generate metadata embeddings
# -------------------------

if not df_metadata.empty:
    df_metadata["embedding"] = _embed_column(df_metadata["combined_text"])
else:
    df_metadata["embedding"] = []

print("Metadata embeddings generated.")


# -------------------------
# 4. Generate full-text chunk embeddings
# -------------------------

if not df_chunks.empty:
    df_chunks["embedding"] = _embed_column(df_chunks["content"])
else:
    df_chunks["embedding"] = []

print("Chunk embeddings generated.")


# -------------------------
# 5. Add context_id (row id)
# -------------------------

# NOTE(review): the id is just the row position in this run, so it is only
# stable across runs if the papers are parsed in the same order — confirm
# before relying on it as a persistent key.
df_metadata["context_id"] = range(len(df_metadata))
df_chunks["context_id"] = range(len(df_chunks))


# -------------------------
# 6. Create embedding indexes
# -------------------------

# NOTE(review): these two indexes are registered under the names
# "metadata_embedding" / "chunk_embedding", while the DataFrame column is
# called "embedding"; nothing else in this cell uses them — confirm whether
# they are still needed.
metadata_index = embedding.EmbeddingIndex()
metadata_index.add_embedding(
    "metadata_embedding",
    embedding_dim,
)

chunk_index = embedding.EmbeddingIndex()
chunk_index.add_embedding(
    "chunk_embedding",
    embedding_dim,
)

print("Embedding indexes created.")


# -------------------------
# 7. Final sanity check
# -------------------------

display(df_metadata.head())
display(df_chunks.head())


## <span style="color:#ff5f27;"> 🔮 Connecting to Hopsworks Feature Store </span>

In [None]:
import hopsworks
from config import HOPSWORKS_API_KEY
# project = hopsworks.login()

# Log in with the API key imported from config.
# (A project argument, project=HOPSWORKS_PROJECT, is intentionally not passed.)
project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY)

# Feature-store handle used by the feature group / feature view cells.
fs = project.get_feature_store()

## <span style="color:#ff5f27;"> 🪄 Feature Group Creation </span>

In [None]:
# === Cell 6.1: Create or Get Metadata Feature Group (Safe Version) ===

from hsfs import embedding

# Register the "embedding" column with the model's vector dimension.
metadata_vector_dim = model.get_sentence_embedding_dimension()
metadata_emb_index = embedding.EmbeddingIndex()
metadata_emb_index.add_embedding("embedding", metadata_vector_dim)

# Fetch version 1 of the paper-level feature group, creating it if needed.
# NOTE(review): online_enabled=False is combined with an embedding index here;
# confirm against the Hopsworks docs that this combination is intended.
metadata_fg = fs.get_or_create_feature_group(
    name="paper_metadata_fg",
    version=1,
    description="Paper-level metadata features (title, abstract, authors, year, item type)",
    primary_key=["paper_id"],
    online_enabled=False,
    embedding_index=metadata_emb_index,
)

# Write the metadata rows with the wait_for_job option enabled.
metadata_write_options = {"wait_for_job": True}
metadata_fg.insert(df_metadata, write_options=metadata_write_options)

print(f"Inserted {len(df_metadata)} rows into metadata feature group.")


In [None]:
# === Cell 6.2: Create or Get Fulltext Chunk Feature Group (Safe Version) ===

# Register the "embedding" column with the model's vector dimension.
chunk_vector_dim = model.get_sentence_embedding_dimension()
chunk_emb_index = embedding.EmbeddingIndex()
chunk_emb_index.add_embedding("embedding", chunk_vector_dim)

# Fetch version 1 of the chunk-level feature group, creating it if needed.
# A row is identified by the paper plus the chunk's position within it.
chunk_fg = fs.get_or_create_feature_group(
    name="paper_chunk_fg",
    version=1,
    description="Chunk-level full text features for RAG",
    primary_key=["paper_id", "chunk_index"],
    online_enabled=False,
    embedding_index=chunk_emb_index,
)


In [None]:
# Write the chunk rows with the wait_for_job option enabled.
chunk_write_options = {"wait_for_job": True}
chunk_fg.insert(df_chunks, write_options=chunk_write_options)

print(f"Inserted {len(df_chunks)} rows into chunk feature group.")


## <span style="color:#ff5f27;">🪄 Feature View Creation </span>


In [None]:
# === Cell 8.1: Create Metadata Feature View ===

# Columns of the metadata feature group exposed through the feature view.
metadata_fv_features = [
    "paper_id",
    "title",
    "abstract",
    "authors",
    "year",
    "item_type",
    "embedding",
]

# Fetch version 1 of the paper-level feature view, creating it if needed.
metadata_fv = fs.get_or_create_feature_view(
    name="paper_metadata_fv",
    version=1,
    description="Paper-level metadata for retrieval and filtering",
    query=metadata_fg.select(metadata_fv_features),
)

print("Metadata Feature View ready.")


In [None]:
# === Cell 8.2: Create Chunk Feature View ===

# Columns of the chunk feature group exposed through the feature view.
chunk_fv_features = [
    "paper_id",
    "chunk_index",
    "content",
    "year",
    "embedding",
]

# Fetch version 1 of the chunk-level feature view, creating it if needed.
chunk_fv = fs.get_or_create_feature_view(
    name="paper_chunk_fv",
    version=1,
    description="Chunk-level full text for RAG context retrieval",
    query=chunk_fg.select(chunk_fv_features),
)

print("Chunk Feature View ready.")


---