# 0. Configuration

In [None]:
# *** FROM Establishing an API interface for your program ***
# ***              ELSEVIER DEVELOPER PORTAL              ***

"""An example program that uses the elsapy module"""

import requests
import pandas as pd
import numpy as np
import os
import time
import json
import re
import csv
import regex

from elsapy.elsclient import ElsClient
from elsapy.elsprofile import ElsAuthor, ElsAffil
from elsapy.elsdoc import FullDoc, AbsDoc
from elsapy.elssearch import ElsSearch

from urllib.parse import quote_plus as url_encode
import json, pathlib
from tqdm import tqdm  # Para mostrar barra de progreso
from pathlib import Path
from typing import Tuple, Union, Optional, List

import spacy
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

# TEXT EXTRACTION
import unicodedata
import glob
import xml.etree.ElementTree as ET
import pyarrow as pa
import pyarrow.parquet as pq
from concurrent.futures import ThreadPoolExecutor, as_completed

import spacy

In [None]:
## Load configuration
# The context manager guarantees the file is closed even if json.load raises
# (the previous open()/close() pair leaked the handle on a malformed file).
with open("01_Extract_Information_Data/config.json") as con_file:
    config = json.load(con_file)

## Initialize client
# `config` must provide the keys 'apikey' and 'insttoken'.
client = ElsClient(config['apikey'])
client.inst_token = config['insttoken']

In [None]:
# Base HTTP headers for direct calls to the Elsevier API.
# Credentials are read from `config` (loaded from config.json in the cell
# above) instead of being hardcoded in the notebook.
# NOTE(review): the literal key/token that used to be committed here should be
# treated as leaked and rotated — confirm config.json holds the valid pair.
HEADERS_BASE = {
    "X-ELS-APIKey" : config['apikey'],
    "X-ELS-Insttoken": config['insttoken'],
    "Accept": 'application/json'
}

# 1. TEXT EXTRACTION FROM XML

In [None]:
# ====== CONFIG ======
# Directory that build_sections_parquet() globs for "*.xml" article files.
XML_DIR     = "01_Extract_Information_Data/txt_SD"

In [None]:
# ====== FIRST CLEAN ======
# URLs: scheme-prefixed links or bare "www." hosts. The host part is \S+
# (any run of non-space characters); the earlier `www\.S+` only matched a
# literal run of the letter "S".
URL_RE      = re.compile (r"(https?://\S+|www\.\S+)", re.I)
# Asset/identifier vocabulary that leaks in from figure and link attributes.
IMG_RE      = re.compile (r"\b(svg|jpg|png|gif|xml|altimg|amazonaws|cdn|pii|doi)\b", re.I)
# Long upper-case/digit identifiers (15+ characters). The class now covers the
# full digit range; it previously read `0_9`, i.e. only "0", "_" and "9".
# The underscore is kept so everything matched before is still matched.
LONG_ID     = re.compile (r"\b[A-Z0-9_\-]{15,}\b")
# NOTE(review): the characters in this class look like mis-decoded UTF-8
# (probably ± φ β ∑ √ × ≤ ≥) — confirm against the notebook's real encoding.
MATH_SYM    = re.compile (r"[=¬±œÜŒ≤‚àë‚àö√ó‚â§‚â•^{}]")

# Local (namespace-stripped) tag names treated as paragraphs and as section
# titles by the section extractors below.
PARA_TAGS   = {"p", "para", "simple-para"}
TITLE_TAGS  = {"title", "section-title"}
# Case-insensitive title patterns: INTRO_RE marks where collection starts,
# CONCL_RE marks a conclusion-like title (optionally preceded by a section
# number such as "5" or "5.1").
INTRO_RE    = re.compile (r"\b(introduction|motivation and background|background and introduction)\b", re.I)
CONCL_RE    = re.compile (r"\b(?:\d+(\.\d+)*)?\s*(conclusion|conclusions|concluding remarks|summary|discussion and conclusions)\b", re.I)

# Section titles that are dropped entirely by extract_all_sections_with_filter.
# NOTE(review): "summary" is excluded here but is also accepted as a
# conclusion title by CONCL_RE — confirm which behavior is wanted.
EXCLUDE_PATTERNS = [
    # Each pattern is anchored with ^ and $ so only exact titles match
    # (surrounding whitespace is tolerated).
    r"^\s*acknowledgments?\s*$",
    r"^\s*acknowledgements?\s*$",
    r"^\s*ack\.?\s*$",
    r"^\s*references?\s*$", 
    r"^\s*bibliography\s*$",
    r"^\s*appendix\s*$",
    r"^\s*appendices\s*$",
    r"^\s*supplementary\s+(?:materials?|information|data)\s*$",
    r"^\s*supporting\s+information\s*$",
    r"^\s*author\s+contributions?\s*$",
    r"^\s*conflict\s+of\s+interest\s*$",
    r"^\s*competing\s+interests?\s*$", 
    r"^\s*funding\s*$",
    r"^\s*data\s+availability\s*$",
    r"^\s*declarations?\s*$",
    r"^\s*credit\s+author\s*$",
    r"^\s*glossary\s*$",
    r"^\s*nomenclature\s*$",
    r"^\s*acronyms\s*$",
    r"^\s*proof\s+of\s+(?:theorem|lemma|proposition|corollary)\s*$",
    r"^\s*summary\s*$",
]

# Single case-insensitive alternation of all the title patterns above.
EXCLUDE_RE = re.compile("|".join(EXCLUDE_PATTERNS), re.IGNORECASE)

# Local tag names whose whole subtree is ignored while walking the body
# (the section extractors compare them against the namespace-stripped tag).
SKIP_SUBTREES = {
    "ref_list", "references", "bibliography", "back",
    "figure", "table", "caption", "thead", "tbody", "tfoot",
    "nomenclature", "acronyms", "abbreviations", "mi", "mrow", "mo"
}

# Local tag names of formula wrappers removed by strip_mathml_and_formulas.
# NOTE(review): "cd:display" and "mml:mi" carry a prefix, but they are compared
# against the output of _local(), which contains no prefix for "{ns}tag" style
# names — these two entries probably never match; confirm.
FORMULA_CONTAINERS = {
    "display-formula", "inside-formula", "formula", "display", "ce-display", "cd:display", "mml:mi"
}

# Local tag names of MathML elements removed by strip_mathml_and_formulas.
# "mrow" is listed twice; harmless in a set literal.
MATHML_LOCAL= {
    "math", "mrow", "msub", "msup", "msubsup", "mover", "munder", "munderover",
    "mi", "mo", "mn", "mfenced", "mfrac", "msqrt", "mroot", "mtable", "mtr", "mtd", "mrow"
}

# Structural vocabulary (figure/table references, section names, publisher
# boilerplate) that clean_text_strict blanks out of the body text.
STRUCTURE_PATTERNS = [
    r"figure[s]?", r"fig\.?", r"table[s]?", r"tab\.?",
    r"appendix", r"supplementary", r"caption",
    r"equation", r"eq\.?", r"formula",
    r"algorithm", r"alg\.?", r"pseudocode",
    r"definition", r"theorem", r"lemma", r"corollary",
    r"proof", r"proposition", r"axiom",
    r"acknowledgments?", r"ack\.?",
    r"references?", r"bibliography",
    r"introduction", r"conclusion",
    r"methodology", r"methods?",
    r"results?", r"discussion",
    r"abstract", r"keywords",
    r"background", r"related work",
    r"future work", r"limitations",
    r"conflict of interest",
    r"data availability",
    r"author contributions",
    r"funding", r"grant",
    r"peer review",
    r"received.*accepted",
    r"published.*elsevier",
    r"journal.*elsevier",
    r"¬©\s?\d{4}.+?elsevier.+?$"
]
# The alternation is wrapped in "not inside a word" lookarounds. Without them
# the short patterns matched inside ordinary words and mangled them
# ("configuration" -> "con uration", "stable" -> "s le", "sequence" -> "s uence").
# Lookarounds are used instead of \b because some patterns start or end with a
# non-word character ("fig.", the copyright sign), where \b would not hold.
# re.IGNORECASE is independent of word boundaries, so both can be combined.
STRUCT_RE = re.compile(
    r"(?<!\w)(?:" + "|".join(STRUCTURE_PATTERNS) + r")(?!\w)",
    flags=re.IGNORECASE,
)



UNITS_PATTERNS = [
    r"\b\d+(?:[\.,]\d+)?\s?(hz|khz|mhz|db|mm|nm|ms|fps|gb|mb|km/h)\b",
    r"\b(intel|amd|nvidia|ram|cpu|gpu)"
]
UNITS_RE = re.compile("|".join(UNITS_PATTERNS), flags=re.IGNORECASE)

def _local(tag: str) -> str:
    return tag.rsplit("}", 1)[-1] if "}" in tag else tag

def _ns(tag: str) -> str:
    return tag[1:].split("}")[0] if tag.startswith("{") and "}" in tag else ""

def strip_mathml_and_formulas (root: ET.Element):
    """Remove MathML and formula elements from ``root`` in place.

    An element is removed when its namespace contains "mathml" or when its
    lower-cased local tag is listed in MATHML_LOCAL or FORMULA_CONTAINERS.

    ``Element.remove`` discards the removed element's ``tail`` (the text that
    follows its closing tag), which for an inline formula is the rest of the
    sentence. The tail is therefore re-attached to the previous sibling, or to
    the parent's text when the formula was the first child, so only the
    formula itself disappears.

    Args:
        root: Parsed XML root; modified in place. The root element itself is
            never removed because it has no parent.
    """
    parent_map = {child: parent for parent in root.iter() for child in parent}

    to_remove = []
    for element in root.iter():
        ns = _ns(element.tag).lower()
        loc = _local(element.tag).lower()
        if "mathml" in ns or loc in MATHML_LOCAL or loc in FORMULA_CONTAINERS:
            to_remove.append(element)

    for element in to_remove:
        parent = parent_map.get(element)
        if parent is None:
            continue
        siblings = list(parent)
        try:
            position = siblings.index(element)
        except ValueError:
            # Already detached from this parent; nothing left to do.
            continue
        tail = element.tail
        if tail:
            # A separating space keeps the words on either side of the removed
            # formula from being glued together.
            if position == 0:
                parent.text = (parent.text or "") + " " + tail
            else:
                previous = siblings[position - 1]
                previous.tail = (previous.tail or "") + " " + tail
        parent.remove(element)

GREEK_BLOCKS = (
    (0x0373, 0x03FF),
    (0x1F00, 0x1FFF)
)

def _is_greek_char(ch: str) -> bool:
    cp = ord(ch)
    for a, b in GREEK_BLOCKS:
        if a <= cp <= b:
            return True
        return False
    
def strip_residual_math_chars(text: str) -> str:
    """Blank out math symbols (Unicode category ``Sm``) and Greek letters.

    Every such character becomes a single space; runs of whitespace are then
    collapsed to one space.
    """
    pieces = []
    for ch in text:
        if unicodedata.category(ch) == "Sm" or _is_greek_char(ch):
            pieces.append(" ")
        else:
            pieces.append(ch)
    cleaned = "".join(pieces)
    cleaned = re.sub(r"[ \t]+", " ", cleaned)
    return re.sub(r"\s{2,}", " ", cleaned)

def clean_text_strict (t: str) -> str:
    """Normalize a block of article text and strip non-prose residue.

    Steps, in order: collapse whitespace, NFKC-normalize, replace
    no-break/zero-width spaces, then blank out URLs (URL_RE), asset words
    (IMG_RE), long identifiers (LONG_ID), math symbols (MATH_SYM), structural
    vocabulary (STRUCT_RE) and units/hardware names (UNITS_RE), and finally
    compact the whitespace again.

    Args:
        t: Raw text. Falsy input yields ""; any other value is coerced with
            ``str()`` before the first regex runs.

    Returns:
        The cleaned text with leading/trailing whitespace removed.
    """
    if not t: return ""
    # Coerce before the first substitution; the coercion used to happen only
    # after regex.sub had already been called on the raw value.
    t = str(t)
    t = regex.sub(r"\s+", " ", t)
    t = unicodedata.normalize ("NFKC", t)
    t = t.replace ("\u00A0", " ").replace("\u200B", " ")
    t = URL_RE.sub(" ", t)
    t = IMG_RE.sub(" ", t)
    t = LONG_ID.sub(" ", t)
    # Math symbols become a plain space. The former " [EQUATION] " placeholder
    # was destroyed by STRUCT_RE (its "equation" pattern is case-insensitive)
    # before the placeholder-removal step could run, leaving "[ ]" in the text.
    t = MATH_SYM.sub(" ", t)
    t = STRUCT_RE.sub(" ", t)
    t = UNITS_RE.sub(" ", t)
    # compact spaces, jumps...
    t = re.sub(r"[ \t]+", " ", t)
    t = re.sub(r"\s*\n+\s*", "\n\n", t)
    t = re.sub(r"\s{2,}", " ", t)
    return t.strip()

def find_article_body (root: ET.Element):
    """Return the first element in document order whose local tag is ``body``, else None."""
    for candidate in root.iter():
        if _local(candidate.tag) == "body":
            return candidate
    return None

## 1.2 EXTRACT SECTIONS OF TEXT

In [None]:
# EXTRACT SECTIONS OF TEXT
# Destination parquet written by build_sections_parquet().
# NOTE(review): the "1.3 EXTRACT ALL SECTIONS" cell assigns this same path, so
# whichever build runs last overwrites the file.
OUT_PATH = "01_Extract_Information_Data/All_With_Conclusions_SD.parquet"

def _norm_title(s: str) -> str:
    s = re.sub(r"\s+", " ", (s or "")).strip()
    s = re.sub(r"\.$", "", s)
    return s

def extract_sections_int_conc(root: ET.Element, doc_id: str = "unknown") -> list:
    """
    Collect article sections starting at the first introduction-like title.

    The tree under the first ``body`` element is walked depth-first. Elements
    whose local tag is in SKIP_SUBTREES are ignored together with their
    descendants. Nothing is collected until a title matching INTRO_RE is seen;
    from then on each title closes the section in progress and starts a new
    one, and each paragraph (PARA_TAGS) is appended to the section in progress.
    If no section was produced, all body paragraphs are returned as a single
    untitled section.

    Args:
        root: Parsed XML root of one article.
        doc_id: Identifier copied into every returned section dict.

    Returns:
        A list of dicts with keys ``sec_title``, ``sec_text``, ``sec_order``,
        ``n_chars``, ``n_words`` and ``doc_id``. When no ``body`` element is
        found the result is ``[""]`` (a list holding one empty string), which
        the caller filters out with an ``isinstance(r, dict)`` check.
    """
    body = find_article_body(root)
    if body is None:
        return [""]

    sections = []
    cur_title = None            # title of the section being collected
    cur_pars = []               # paragraphs of the section being collected
    inside = False              # True once an introduction title has been seen
    reading_conclusion = False  # True once a conclusion title has been seen
    # NOTE(review): `finished` is never set to True in this function, so the
    # early-exit checks that read it never trigger.
    finished = False

    def is_introduction(txt: str) -> bool:
        # Whitespace is collapsed before matching so multi-line titles match.
        t = re.sub(r"\s+", " ", (txt or "")).strip()
        return bool(INTRO_RE.search(t))
    
    def is_conclusion(txt: str) -> bool:
        t = re.sub(r"\s+", " ", (txt or "")).strip()
        return bool(CONCL_RE.search(t))

    def flush_section():
        # Append the section in progress to `sections` (only when its cleaned
        # text is non-empty) and reset the accumulators.
        nonlocal cur_title, cur_pars
        if cur_title is None and not cur_pars:
            return
        text = clean_text_strict("\n\n".join([p for p in cur_pars if p]).strip())
        if text:
            sections.append({"sec_title": _norm_title(cur_title or ""), "sec_text": text})
        cur_title, cur_pars = None, []
    
    # Stack of "am I inside a skipped subtree" flags, one entry per tree level.
    skip_stack = [False]

    def walk(element: ET.Element):
        nonlocal inside, finished, cur_title, cur_pars, reading_conclusion
        
        if finished: 
            return
            
        tag_loc = _local(element.tag)
        parent_skip = skip_stack[-1]
        # A node is skipped when any ancestor is skipped or its own tag is listed.
        here_skip = parent_skip or (tag_loc in SKIP_SUBTREES)
        skip_stack.append(here_skip)

        if not here_skip:
            if tag_loc in TITLE_TAGS:
                title = "".join(element.itertext()).strip()
                if title:
                    if is_introduction(title):
                        # Start collecting. Anything accumulated so far is
                        # replaced without being flushed.
                        inside = True
                        cur_title, cur_pars = title, []
                        reading_conclusion = False
                    elif inside:
                        if is_conclusion(title):
                            flush_section()  # Save the previous section
                            cur_title, cur_pars = title, []
                            reading_conclusion = True
                        else:
                            flush_section()  # Save the previous section
                            cur_title = title
                            # If we were already in the conclusions and meet
                            # another title, it is treated as a conclusions
                            # subsection. reading_conclusion is not reset here,
                            # so this applies to every later title as well.
                            if reading_conclusion:
                                cur_pars.append(f"\n{title}:")  # Add the title as a marker

            # Only collect paragraphs here; there is no automatic flush.
            if tag_loc in PARA_TAGS and inside and not finished:
                # Text fragments of the paragraph are joined with blank lines.
                txt = "\n\n".join(element.itertext()).strip()
                if txt:
                    cur_pars.append(txt)
                    # Removed: the automatic flush that cut the conclusions short.
        
        for ch in element:
            if not finished:
                walk(ch)
        skip_stack.pop()

    walk(body)

    # Final flush for the conclusions (it does not happen automatically).
    # The section in progress is saved only when a conclusion title was seen.
    if reading_conclusion and cur_pars:
        flush_section()

    # Fallback when no sections were found: one untitled section holding every
    # paragraph of the body (fragments are concatenated without a separator).
    if not sections:
        all_pars = ["".join(element.itertext()).strip()
                   for element in body.iter()
                   if _local(element.tag) in PARA_TAGS]
        text = clean_text_strict("\n\n".join([p for p in all_pars if p]).strip())
        if text:
            sections = [{"sec_title": "", "sec_text": text}]
    
    # Metadata
    for i, s in enumerate(sections):
        s["sec_order"] = i
        s["n_chars"] = len(s["sec_text"])
        s["n_words"] = len(re.findall(r"\w+", s["sec_text"]))
        s["doc_id"] = doc_id
    
    return sections

def process_file_sections (path_xml: str) -> list:
    """Parse one XML file and return its introduction-to-conclusion sections.

    Args:
        path_xml: Path of the XML file; its base name without extension is
            used as ``doc_id``.

    Returns:
        The list produced by extract_sections_int_conc, or an empty list when
        the file is not well-formed XML. (The parse-error case used to return
        "" although the function otherwise returns a list; both are falsy, so
        callers that test ``if result:`` behave the same.)
    """
    doc_id = os.path.splitext(os.path.basename(path_xml))[0]
    try:
        root = ET.parse(path_xml).getroot()
    except ET.ParseError:
        print (f"‚ùå ParseError in file: {doc_id}")
        return []
    # Formulas are removed before extraction so they do not pollute the text.
    strip_mathml_and_formulas(root)
    return extract_sections_int_conc(root, doc_id)

def build_sections_parquet ():
    """Extract sections from every "*.xml" file in XML_DIR and write them to OUT_PATH.

    Files that raise during processing are reported and skipped. The parquet
    table has the columns doc_id, sec_order, sec_title, sec_text, n_chars and
    n_words, and is compressed with zstd.
    """
    xml_paths = sorted (glob.glob(os.path.join(XML_DIR, "*.xml")))
    collected = []

    for path_xml in tqdm(xml_paths, desc="Extracting XML sections"):
        try:
            result = process_file_sections(path_xml)
            if result:
                collected.extend(result)
        except Exception as e:
            print(f"‚ùå Error processing file {os.path.basename(path_xml)}: {e}")

    # Only dict rows are valid sections (placeholders such as "" are dropped).
    df = pd.DataFrame([row for row in collected if isinstance(row, dict)])

    expected = ["doc_id", "sec_order", "sec_title", "sec_text", "n_words", "n_chars"]
    text_defaults = ("sec_title", "sec_text")
    for c in expected:
        if c not in df.columns:
            df[c] = "" if c in text_defaults else 0
    df = df[expected].reset_index(drop=True)

    def _as_text(column):
        return pa.array(df[column].astype(str).tolist())

    def _as_int32(column):
        return pa.array(df[column].astype("int32").tolist(), type=pa.int32())

    table = pa.Table.from_pydict({
        "doc_id"    : _as_text("doc_id"),
        "sec_order" : _as_int32("sec_order"),
        "sec_title" : _as_text("sec_title"),
        "sec_text"  : _as_text("sec_text"),
        "n_chars"   : _as_int32("n_chars"),
        "n_words"   : _as_int32("n_words")
    })
    pq.write_table(table, OUT_PATH, compression="zstd")
    print (f" ‚úÖ Saved: {OUT_PATH}.  ({len(df)} documents)")

In [None]:
# Run the extraction over XML_DIR and write the parquet file at OUT_PATH.
build_sections_parquet()

## 1.3 EXTRACT ALL SECTIONS

In [None]:
# EXTRACT SECTIONS OF TEXT
# Output parquet for the all-sections extraction defined in the next cell.
OUT_PATH = "01_Extract_Information_Data/All_With_Conclusions_SD.parquet"

In [None]:
def _norm_title(s: str) -> str:
    """Normalize title: remove extra spaces and trailing period"""
    s = re.sub(r"\s+", " ", (s or "")).strip()
    s = re.sub(r"\.$", "", s)
    return s

def extract_all_sections_with_filter(root: ET.Element, doc_id: str = "unknown") -> list:
    """
    Extract every titled section of the article body, dropping excluded titles.

    The tree under the first ``body`` element is walked depth-first. Elements
    whose local tag is in SKIP_SUBTREES are ignored with their descendants.
    Each title (TITLE_TAGS) closes the section in progress and opens a new
    one; each paragraph (PARA_TAGS) is appended to the section in progress.
    The text of the most recent ``label`` element seen before a title is
    stored as that section's label. Sections whose title matches EXCLUDE_RE
    are discarded when they are closed.

    Args:
        root: Parsed XML root of one article.
        doc_id: Identifier copied into every returned section dict.

    Returns:
        A list of dicts with keys ``sec_title``, ``sec_text``, ``label``,
        ``sec_order``, ``n_chars``, ``n_words`` and ``doc_id``. When no
        ``body`` element is found the result is ``[""]`` (a list holding one
        empty string), which the caller filters out.
    """
    body = find_article_body(root)
    if body is None:
        print(f"‚ùå No body found in file: {doc_id}")
        return [""]

    sections    = []
    cur_title   = None      # title of the section being collected
    cur_pars    = []        # paragraphs of the section being collected
    cur_label   = None      # label attached to the section being collected
    pending_label = None    # last label text seen, waiting for its title
    
    def should_exclude_section(title: str) -> bool:
        """Check if section should be excluded based on title"""
        if not title:
            return False
        # The title is lower-cased and whitespace-collapsed before matching; a
        # trailing period is not removed here, so "References." is not excluded.
        normalized_title = re.sub(r"\s+", " ", title.lower().strip())
        return bool(EXCLUDE_RE.search(normalized_title))

    def flush_section():
        """Save current section to results if not excluded"""
        # `pending_label` is declared nonlocal but is not assigned in this helper.
        nonlocal cur_title, cur_pars, cur_label, pending_label
        if cur_title is None and not cur_pars:
            return
        
        # Check if we should exclude this section
        if cur_title and should_exclude_section(cur_title):
            print(f"üîï Excluding section: '{cur_title}'")
            cur_title, cur_pars, cur_label = None, [], None
            return
            
        text = clean_text_strict("\n\n".join([p for p in cur_pars if p]).strip())
        if text or cur_title:  # Keep sections with title even if empty text
            sections.append({
                "sec_title" : _norm_title(cur_title) if cur_title else "",
                "sec_text"  : text,
                "label"     : cur_label if cur_label else ""
            })
        cur_title, cur_pars, cur_label = None, [], None
    
    # Stack of "inside a skipped subtree" flags, one entry per tree level.
    skip_stack = [False]

    def walk(element: ET.Element):
        """Walk through XML tree and collect all sections"""
        nonlocal cur_title, cur_pars, cur_label, pending_label
        
        tag_loc = _local(element.tag)
        parent_skip = skip_stack[-1]
        # A node is skipped when any ancestor is skipped or its own tag is listed.
        here_skip = parent_skip or (tag_loc in SKIP_SUBTREES)
        skip_stack.append(here_skip)

        if not here_skip:

            # Extract label if present 
            # NOTE(review): any non-skipped <label> sets pending_label, not only
            # section labels — confirm list-item labels cannot leak into titles.
            if tag_loc == 'label':
                label_text = "".join(element.itertext()).strip()
                if label_text:
                    pending_label = label_text

            # Handle section titles
            if tag_loc in TITLE_TAGS:
                title = "".join(element.itertext()).strip()
                if title:
                    # Save previous section before starting new one
                    flush_section()

                    cur_title     = title
                    cur_pars      = []
                    cur_label     = pending_label       # Use previous label
                    pending_label = None                # Reset 
            
            # Collect paragraphs for current section
            if tag_loc in PARA_TAGS:
                # Text fragments of the paragraph are joined with blank lines.
                txt = "\n\n".join(element.itertext()).strip()
                if txt:
                    cur_pars.append(txt)
        
        # Process children
        for ch in element:
            walk(ch)
        skip_stack.pop()

    # Extract ALL sections from body
    walk(body)
    
    # Save the last section
    flush_section()

    # Fallback: if no sections found, extract all paragraphs
    # (fragments are concatenated without a separator here).
    if not sections:
        print(f"‚ùå No sections found in file: {doc_id}")
        all_pars = ["".join(element.itertext()).strip()
                    for element in body.iter()
                    if _local(element.tag) in PARA_TAGS]
        text = clean_text_strict("\n\n".join([p for p in all_pars if p]).strip())
        if text:
            sections = [{"sec_title": "", "sec_text": text, "label": ""}]
    
    # Add metadata (maintaining original structure)
    for i, s in enumerate(sections):
        s["sec_order"] = i
        s["n_chars"] = len(s["sec_text"])
        s["n_words"] = len(re.findall(r"\w+", s["sec_text"]))
        s["doc_id"] = doc_id
    
    print(f"‚úÖ {doc_id}: Extracted {len(sections)} sections")
    return sections

def process_file_sections(path_xml: str) -> list:
    """Parse one XML file and return all of its (filtered) sections.

    Args:
        path_xml: Path of the XML file; its base name without extension is
            used as ``doc_id``.

    Returns:
        The list produced by extract_all_sections_with_filter, or an empty
        list when the file is not well-formed XML. (The parse-error case used
        to return "" although the function otherwise returns a list; both are
        falsy, so callers that test ``if result:`` behave the same.)
    """
    doc_id = os.path.splitext(os.path.basename(path_xml))[0]
    try:
        root = ET.parse(path_xml).getroot()
    except ET.ParseError:
        print(f"‚ùå ParseError in file: {doc_id}")
        return []
    # Formulas are removed before extraction so they do not pollute the text.
    strip_mathml_and_formulas(root)
    # Use the comprehensive extraction function
    return extract_all_sections_with_filter(root, doc_id)

def build_sections_parquet():
    """Extract sections from every "*.xml" file in XML_DIR and write them to OUT_PATH.

    Files that raise during processing are reported and skipped. The parquet
    table has the columns doc_id, sec_order, label, sec_title, sec_text,
    n_chars and n_words, and is compressed with zstd.
    """
    xml_paths = sorted(glob.glob(os.path.join(XML_DIR, "*.xml")))
    collected = []

    for path_xml in tqdm(xml_paths, desc="Extracting XML sections"):
        try:
            result = process_file_sections(path_xml)
            if result:
                collected.extend(result)
        except Exception as e:
            print(f"‚ùå Error processing file {os.path.basename(path_xml)}: {e}")

    # Only dict rows are valid sections (placeholders such as "" are dropped).
    df = pd.DataFrame([row for row in collected if isinstance(row, dict)])

    # Guarantee the expected columns exist, then fix their order.
    expected = ["doc_id", "sec_order", "label", "sec_title", "sec_text", "n_words", "n_chars"]
    text_defaults = ("sec_title", "sec_text")
    for c in expected:
        if c not in df.columns:
            df[c] = "" if c in text_defaults else 0
    df = df[expected].reset_index(drop=True)

    def _as_text(column):
        return pa.array(df[column].astype(str).tolist())

    def _as_int32(column):
        return pa.array(df[column].astype("int32").tolist(), type=pa.int32())

    # Explicit Arrow types: strings for text columns, int32 for counters.
    table = pa.Table.from_pydict({
        "doc_id": _as_text("doc_id"),
        "sec_order": _as_int32("sec_order"),
        "label"     : _as_text("label"),
        "sec_title": _as_text("sec_title"),
        "sec_text": _as_text("sec_text"),
        "n_chars": _as_int32("n_chars"),
        "n_words": _as_int32("n_words")
    })

    pq.write_table(table, OUT_PATH, compression="zstd")
    print(f"‚úÖ Saved: {OUT_PATH}. ({len(df)} documents)")

In [None]:
# Run the all-sections extraction over XML_DIR and write the parquet at OUT_PATH.
build_sections_parquet()

In [None]:
# Reload the sections parquet and preview its first 30 rows.
df = pd.read_parquet("01_Extract_Information_Data/All_With_Conclusions_SD.parquet")
display(df.head(30))

## JOURNAL

In [None]:
import os
import xml.etree.ElementTree as ET

def extract_journal_name(xml_file_path: str) -> Optional[str]:
    """Return the journal name stored in an article XML file.

    The name is taken from the first ``publicationName`` element with text:
    the PRISM 2.0 namespaced element is tried first, then any element whose
    local tag is ``publicationName`` regardless of namespace.

    Args:
        xml_file_path: Path of the XML file to read.

    Returns:
        The stripped element text, or None when no such element exists or the
        file cannot be read or parsed (the error is printed, not raised).
    """
    try:
        root = ET.parse(xml_file_path).getroot()

        namespaces = {
            "prism": "http://prismstandard.org/namespaces/basic/2.0/"
        }

        # 1. Preferred: <prism:publicationName> in the PRISM 2.0 namespace.
        journal_elem = root.find(".//prism:publicationName", namespaces)
        if journal_elem is not None and journal_elem.text:
            return journal_elem.text.strip()

        # 2. Any element named publicationName, whatever its namespace. This
        #    also covers the un-namespaced case, so the former third lookup
        #    (root.find(".//publicationName")) could never find anything new.
        for elem in root.iter():
            tag_name = elem.tag.split("}")[-1]  # drop the "{namespace}" prefix
            if tag_name == "publicationName" and elem.text:
                return elem.text.strip()

        return None

    except ET.ParseError as e:
        print(f"‚ö†Ô∏è ParseError en {os.path.basename(xml_file_path)}: {e}")
        return None
    except Exception as e:
        # Deliberately broad: a missing or unreadable file must not abort a
        # batch run over many files.
        print(f"‚ùå Error en {os.path.basename(xml_file_path)}: {e}")
        return None


In [None]:
# One record per numbered article file "0000.xml" .. "3817.xml".
# NOTE(review): 3818 is a hardcoded file count and doc_id is stored as an int
# here, whereas the section extractors use the file stem as a string — confirm
# before joining on doc_id.
journals = [
    {
        'doc_id'        : i,
        'journal_name'  : extract_journal_name(f"01_Extract_Information_Data/txt_SD/{i:04d}.xml"),
    }
    for i in range(3818)
]

In [None]:
# Tabulate the collected records (columns: doc_id, journal_name).
journals_df = pd.DataFrame(journals)

In [None]:
# Persist the journal table as CSV (the DataFrame index is written as the
# first column) and show it.
journals_df.to_csv("01_Extract_Information_Data/Journal.csv")
display(journals_df)


# 2. CONCLUSIONS

In [None]:
# Load both section tables and give each row a composite key
# "<doc_id>-<sec_title>-<n_chars>" so rows can be compared across the tables.
df_not_conclusions = pd.read_parquet("01_Extract_Information_Data/02_Clean_Sections_SD_02.parquet")
df_with_conclusions = pd.read_parquet("01_Extract_Information_Data/All_With_Conclusions_SD.parquet")

for frame in (df_not_conclusions, df_with_conclusions):
    frame['key'] = frame['doc_id'].astype(str) + '-' + frame['sec_title'] + '-' + frame['n_chars'].astype(str)

display(df_with_conclusions)

In [None]:
# Spot check: rows whose doc_id contains "0000", from each table.
has_0000_without = df_not_conclusions['doc_id'].str.contains('0000')
has_0000_with = df_with_conclusions['doc_id'].str.contains('0000')
filtre_01 = df_not_conclusions[has_0000_without]
filtre_02 = df_with_conclusions[has_0000_with]

display(filtre_01)
display(filtre_02)

In [None]:
# Rows present in the "with conclusions" table but absent from the other one.
is_new_row = ~df_with_conclusions['key'].isin(df_not_conclusions['key'])
only_conclusions = (
    df_with_conclusions[is_new_row]
    .reset_index(drop=True)
    .drop('key', axis=1)
)

# NOTE(review): `filtre` is computed but not used in this cell.
title_mentions_conclusion = only_conclusions['sec_title'].str.contains('conclusion', case=False, na=False)
filtre = only_conclusions[~title_mentions_conclusion]
# NOTE(review): to_clipboard() needs a system clipboard and fails on headless
# machines — consider removing it before a Restart & Run All.
only_conclusions.to_clipboard()
display(only_conclusions)

In [None]:
#only_conclusions.to_parquet("01_Extract_Information_Data/03_Only_Conclusions.parquet")

# 3. OPTIONAL PREPROCESS

In [None]:
# Download the NLTK resources: stopwords, WordNet and the Open Multilingual
# WordNet data.
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
# The resource id uses a hyphen ("omw-1.4"); "omw_1.4" is not a known NLTK
# package, so that download failed.
nltk.download('omw-1.4')

# Load spaCy (without parser nor ner) & stopwords
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])       # Tokenizer 
stop_words = set(stopwords.words('english'))

In [None]:
# Define a function to preprocess text BASIC FIRST OPTION
def preprocess_text(text):
    """Lower-case, strip digits/punctuation, lemmatize and drop stopwords.

    Args:
        text: Raw text. Non-string values (None, NaN from a missing parquet
            cell) and empty strings yield "" instead of raising.

    Returns:
        The kept lemmas joined by single spaces. A lemma is kept when it is
        not in ``stop_words``, the token is alphabetic and the lemma is longer
        than two characters.
    """
    if not isinstance(text, str) or not text:
        return ""

    # Normalization
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)

    doc = nlp(text)

    tokens = [
        token.lemma_ for token in doc 
        if token.lemma_ not in stop_words and token.is_alpha and len(token.lemma_) > 2
    ]

    return " ".join(tokens)

## 3.1 Preprocess for titles

In [None]:
# Apply preprocessing to the documents
# Parquet file whose "chunk_title" column is cleaned in the cells below.
DIR_TO_CHANGE = "03_Embeddings/Embeddings_SD_02.parquet"

In [None]:
# Load the chunk table and preprocess every chunk title.
docs = pd.read_parquet(DIR_TO_CHANGE)
titles = docs["chunk_title"]
print(titles.iloc[8])  # sample title (position 8) before cleaning

titles_clean = list(map(preprocess_text, titles))

In [None]:
# Show the cleaned titles (the entire list is rendered).
display(titles_clean)

In [None]:
# Wrap the cleaned titles in a one-column frame named "title".
titles_clean = pd.DataFrame(titles_clean).rename(columns={0: "title"})

print(f"len titles: {len(titles_clean)} & len docs: {len(docs)}")
print(titles_clean)

In [None]:
# Rebuild the chunk records, replacing the title with its cleaned version.
rows = [
    {
        "doc_id"        : docs["doc_id"][i],
        "sec_order"     : docs["sec_order"][i],
        "sec_title"     : titles_clean["title"][i],
        "chunk_id_int"  : docs["chunk_id_int"][i],
        "chunk_id"      : docs["chunk_id"][i],
        "chunk_text"    : docs["chunk_text"][i],
        #"sec_text_clean": docs_clean["text"][i],
        "n_words"       : docs["n_words"][i],
        "n_chars"       : docs["n_chars"][i]
    }
    for i in range(len(docs))
]

In [None]:
# Tabulate the rebuilt records and show them.
doc_title_clean = pd.DataFrame(rows)
display(doc_title_clean)

In [None]:
#doc_title_clean.to_parquet(DIR_TO_CHANGE, index=False)

## 3.2 For text

In [None]:
# Load the section texts and preprocess each of them.
Texts = pd.read_parquet("01_Extract_Information_Data/02_Clean_Sections_SD_02.parquet")["sec_text"]
print(Texts.iloc[8])  # sample text (position 8) before cleaning

Texts_Clean = list(map(preprocess_text, Texts))

In [None]:
# Rebuild the section records, replacing the text with its preprocessed version.
docs = pd.read_parquet("01_Extract_Information_Data/02_Clean_Sections_SD_02.parquet")

rows = [
    {
        "doc_id"        : docs["doc_id"][i],
        "sec_order"     : docs["sec_order"][i],
        #"label"         : docs["label"][i],
        "sec_title"     : docs["sec_title"][i],
        "sec_text"      : Texts_Clean[i],
        "n_words"       : docs["n_words"][i],
        "n_chars"       : docs["n_chars"][i]
    }
    for i in range(len(docs))
]

In [None]:
# Tabulate the preprocessed section records and show them.
Docs_Lema = pd.DataFrame(rows)
display(Docs_Lema)

In [None]:
# Persist the preprocessed sections as parquet.
Docs_Lema.to_parquet("01_Extract_Information_Data/02_Clean_Sections_SD_Lema.parquet")

# 4. CHUNKS

## 4.1 FUNCTIONS

In [None]:
# 1. Splits the text into sentences

import spacy
# Blank English pipeline used only for sentence splitting by split_sentences().
_SPACY = spacy.blank("en")
# Add the "sentencizer" component unless the pipeline already has one.
if "sentencizer" not in _SPACY.pipe_names:
    _SPACY.add_pipe('sentencizer')


In [None]:
def split_sentences (text: str) -> list[str]:
    """Split ``text`` into stripped, non-empty sentences using the _SPACY pipeline.

    ``None``, empty or whitespace-only input yields an empty list.
    """
    cleaned = (text or "").strip()
    if not cleaned:
        return []
    sentences = []
    for span in _SPACY (cleaned).sents:
        sentence = span.text.strip()
        if sentence:
            sentences.append(sentence)
    return sentences
    
def _count_tokens_approx (s: str) -> int:
    return len(re.findall(r"\S+", s))

def _symbol_ratio (s: str) -> float:
    if not s: return 1.0
    total = len(s)
    alnum = sum(ch.isalnum() or ch.isspace() for ch in s)
    return 1.0 - (alnum/total)

In [None]:
def chunky_by_sentences (
        text            : str,
        target_tokens   : int= 400,
        max_tokens      : int= 520,
        min_tokens      : int= 200,
        overlap_sents   : int= 2,
        max_sents_sect  : int | None = None,
        max_chars_chunk: int | None = None,
        ensure_scientific_context : bool = True 
        ) -> list[str]:
    """Group the sentences of ``text`` into overlapping chunks of bounded size.

    Sentences come from split_sentences and sizes are measured with
    _count_tokens_approx (whitespace-separated runs). Sentences are appended
    to the current chunk until it reaches ``target_tokens``; the chunk is then
    emitted and the next one is seeded with its last ``overlap_sents``
    sentences.

    Compared with the previous version, carried-over overlap sentences are
    tracked separately from new ones, which fixes three defects:
    * the last chunk no longer ends with a duplicate of its own final sentences;
    * a chunk made only of already-emitted overlap sentences is never emitted;
    * a sentence that does not fit next to a large overlap is no longer dropped
      (the overlap is dropped instead and the sentence starts the next chunk).

    Args:
        text: Section text to split.
        target_tokens: Size at which the current chunk is emitted.
        max_tokens: A sentence is only appended if the chunk stays within this
            size; a single sentence longer than this becomes its own chunk.
        min_tokens: A chunk at least this large is emitted when the next
            sentence does not fit; a smaller one absorbs that sentence (and may
            then exceed ``max_tokens``). A trailing remainder smaller than this
            is merged into the previous chunk.
        overlap_sents: Number of trailing sentences repeated at the start of
            the next chunk (0 disables the overlap).
        max_sents_sect: If a positive int, only the first N sentences are used.
        max_chars_chunk: If a positive int, emitted chunks longer than this are
            cut at the last space before the limit.
        ensure_scientific_context: If True, drop sentences that have at most
            five words and contain none of a fixed list of keywords.

    Returns:
        The chunk strings in emission order; [] when the text has no sentences.
    """
    sents = split_sentences(text)
    if not sents:
        return []

    if ensure_scientific_context:
        scientific_keywords = ['results', 'method', 'conclusion', 'abstract', 
                              'introduction', 'data', 'analysis', 'study']
        sents = [s for s in sents if any(keyword in s.lower() for keyword in scientific_keywords) or len(s.split()) > 5]

    if isinstance(max_sents_sect, int) and max_sents_sect > 0:
        sents = sents[:max_sents_sect]

    chunks  : list[str] = []
    cur     : list[str] = []   # sentences of the chunk being built
    cur_tok = 0                # approximate token count of `cur`
    fresh   = 0                # trailing sentences of `cur` not yet emitted in any chunk

    def emit_chunk():
        """Append the current chunk (if non-empty) to `chunks` and reset the state."""
        nonlocal cur, cur_tok, fresh
        if cur:
            chunk = " ".join(cur).strip()
            if chunk:
                if isinstance (max_chars_chunk, int) and max_chars_chunk > 0 and len(chunk) > max_chars_chunk:
                    chunk = chunk[:max_chars_chunk].rsplit(" ", 1)[0]
                chunks.append(chunk)
        cur = []
        cur_tok = 0
        fresh = 0

    def emit_with_overlap():
        """Emit the current chunk and seed the next one with its trailing sentences."""
        nonlocal cur, cur_tok
        back = cur[-overlap_sents:] if (overlap_sents > 0 and len(cur) >= overlap_sents) else []
        emit_chunk()
        if back:
            cur = back [:]
            cur_tok = sum(_count_tokens_approx(x) for x in cur)
            # `fresh` stays 0: these sentences were already emitted.

    i = 0
    while i < len(sents):
        sent = sents[i]
        t = _count_tokens_approx(sent)

        # A single sentence larger than max_tokens becomes its own chunk.
        # A pending chunk below min_tokens is kept and continues afterwards.
        if t > max_tokens:
            if cur_tok >= min_tokens and fresh > 0:
                emit_chunk()
            chunks.append(sent.strip())
            i += 1
            continue

        # The sentence fits: append it, and emit once the target is reached.
        if cur_tok + t <= max_tokens:
            cur.append(sent)
            cur_tok += t
            fresh += 1
            if cur_tok >= target_tokens:
                emit_with_overlap()
            i += 1
            continue

        # The sentence does not fit into the current chunk.
        if cur_tok >= min_tokens:
            if fresh == 0:
                # `cur` holds only carried-over overlap that is too large to
                # take this sentence: drop the overlap, keep the sentence.
                cur = [sent]
                cur_tok = t
                fresh = 1
                i += 1
            else:
                # Close the chunk and retry the same sentence against the overlap.
                emit_with_overlap()
            continue

        # The current chunk is still too small to stand alone: absorb the
        # sentence (the chunk may exceed max_tokens) and emit without overlap.
        cur.append(sent)
        cur_tok += t
        fresh += 1
        emit_chunk()
        i += 1

    # Final flush: only when there is text that has not been emitted yet.
    if cur and fresh > 0:
        if chunks and _count_tokens_approx(" ".join(cur)) < min_tokens:
            # Merge the small remainder into the last chunk. Only the new
            # sentences are appended so the overlap is not duplicated.
            last = chunks.pop()
            chunks.append((last + " " + " ".join(cur[-fresh:])).strip())
        else:
            emit_chunk()
    return chunks

## 4.2 APPLICATION

In [None]:
# Parameters 
# Input: sections parquet read by build_chunks_from_sections_vectorized().
SECS_PARQUET     = "01_Extract_Information_Data/All_With_Conclusions_SD.parquet"
# presumably the output path for the generated chunks — its use is outside this view; confirm
CHUNKS_PARQUET   = "01_Extract_Information_Data/All_With_Conclusions_SD_Chunks_RAW.parquet"
# Passed to chunky_by_sentences as target_tokens / max_tokens / min_tokens / overlap_sents.
TARGET           = 320
MAXTOK           = 448
MINTOK           = 192
OVERLAP          = 2
# Quality filters: chunks with fewer words, or a higher _symbol_ratio, are dropped.
MIN_WORDS_CHUNK  = 15
MAX_SYMBOL_RATIO = 0.25

In [None]:
# Preview the sections that will be chunked. The path comes from SECS_PARQUET
# (defined in the parameters cell) instead of repeating the literal, so the
# preview always shows the file the chunking step reads.
df = pd.read_parquet(SECS_PARQUET)
display(df)

In [None]:
def build_chunks_from_sections_vectorized():
    """Split every section in SECS_PARQUET into chunks and save them to CHUNKS_PARQUET.

    Steps:
      1. Load the section table and sort it by doc_id, label and sec_order.
      2. Run ``chunky_by_sentences`` on every section whose ``sec_text`` is not blank.
      3. Drop chunks with fewer than MIN_WORDS_CHUNK words or a symbol ratio above
         MAX_SYMBOL_RATIO; if every chunk of a section is rejected, keep the
         longest one so the section is still represented.
      4. Emit one row per kept chunk, plus one placeholder row (empty
         ``chunk_text``) for every section that yields no chunk.
      5. Sort by doc_id / sec_order / chunk_id_int, assign a global ``chunk_id``
         and write the table with zstd compression.

    Returns:
        None. The table is written to CHUNKS_PARQUET and a summary line is printed.
    """
    # Explicit column list so an empty result still has the expected schema
    # (a column-less frame would make the sort below raise KeyError).
    out_columns = [
        "doc_id", "sec_order", "label", "sec_title",
        "chunk_id_int", "chunk_text", "n_words", "n_chars",
    ]

    def section_meta(row):
        """Return the per-section fields shared by every output row of `row`."""
        return {
            "doc_id": str(row["doc_id"]),
            "sec_order": int(row["sec_order"]),
            "label": str(row["label"]) if pd.notna(row["label"]) else "",
            "sec_title": str(row["sec_title"]) if pd.notna(row["sec_title"]) else "",
        }

    def chunk_row(meta, chunk_idx, chunk_text):
        """Build one output record; an empty chunk_text gives the placeholder row."""
        return {
            **meta,
            "chunk_id_int": chunk_idx,
            "chunk_text": chunk_text,
            "n_words": len(chunk_text.split()),
            "n_chars": len(chunk_text),
        }

    def filter_chunks(chunk_list):
        """Return (original_index, chunk) pairs that pass the quality filters."""
        if not chunk_list:
            return []
        filtered = []
        for idx, ch in enumerate(chunk_list):
            if len(ch.split()) < MIN_WORDS_CHUNK:
                continue
            if _symbol_ratio(ch) > MAX_SYMBOL_RATIO:
                continue
            filtered.append((idx, ch))

        if not filtered and chunk_list:
            # Everything was rejected: fall back to the longest chunk.
            idx = max(range(len(chunk_list)), key=lambda i: len(chunk_list[i]))
            filtered.append((idx, chunk_list[idx]))

        return filtered

    df = pd.read_parquet(SECS_PARQUET)

    # Sort and prepare
    df = df.sort_values(["doc_id", "label", "sec_order"]).reset_index(drop=True)

    # A section is "valid" when its text is non-empty after stripping whitespace.
    df["text_valid"] = df["sec_text"].fillna("").str.strip().str.len() > 0
    valid_mask = df["text_valid"]
    valid_df = df[valid_mask].copy()

    # Chunk every valid section
    tqdm.pandas(desc="Chunking sections")
    valid_df["chunks"] = valid_df["sec_text"].progress_apply(
        lambda x: chunky_by_sentences(
            text=x,
            target_tokens=TARGET,
            max_tokens=MAXTOK,
            min_tokens=MINTOK,
            overlap_sents=OVERLAP,
            ensure_scientific_context=True
        )
    )
    valid_df["filtered_chunks"] = valid_df["chunks"].apply(filter_chunks)

    # Expand to one row per chunk
    rows = []
    for _, row in tqdm(valid_df.iterrows(), total=len(valid_df), desc="Expanding chunks"):
        meta = section_meta(row)
        kept = row["filtered_chunks"]
        if kept:
            rows.extend(chunk_row(meta, j, ch) for j, ch in kept)
        else:
            rows.append(chunk_row(meta, 0, ""))

    # Sections without usable text get a single placeholder row (no chunking needed).
    rows.extend(chunk_row(section_meta(r), 0, "") for _, r in df[~valid_mask].iterrows())

    # Final table
    out = pd.DataFrame(rows, columns=out_columns)
    out = out.sort_values(["doc_id", "sec_order", "chunk_id_int"]).reset_index(drop=True)
    out["chunk_id"] = out.index.astype("int64")

    # Save with PyArrow
    table = pa.Table.from_pandas(out, preserve_index=False)
    pq.write_table(table, CHUNKS_PARQUET, compression="zstd")
    print(f"‚úÖ Saved: {CHUNKS_PARQUET}. ({len(out)} chunks)")

In [None]:
# Run the chunking pipeline: reads SECS_PARQUET and writes CHUNKS_PARQUET.
build_chunks_from_sections_vectorized()

In [None]:
# Load the raw chunk table written by build_chunks_from_sections_vectorized().
doc = pd.read_parquet(CHUNKS_PARQUET)

# 1) Drop chunks whose section label matches an excluded pattern.
#    The filter chain starts from `doc` (the table just loaded). Previously it
#    started from a stale `df` left over from an earlier cell, which is the
#    section table and has no `chunk_text` column.
terms_label   = ['√ò']
pattern_label = '|'.join(terms_label)
df  = doc[~doc['label'].str.contains(pattern_label, case=False, na=False)]

# 2) Drop chunks that belong to non-content sections (matched on the title).
terms_title   = [
    'acknowledg', 'author contribution', 'credit author', 
    'competing interest', 'conflict of interest', 'ethical declaration',
    'data availa', 'code availa', 'supplement', 'appendix',
    'disclaim', 'disclosure', 'consent', 'permission']
pattern_title = '|'.join(terms_title)
df = df[~df['sec_title'].str.contains(pattern_title, case=False, na=False)]

# 3) Drop placeholder rows (empty text), renumber chunk_id and move it to the front.
df = df[df['chunk_text'].str.len() > 0].reset_index(drop=True)
df["chunk_id"] = df.index.astype("int64")
cols_to_mov = ['chunk_id']
order_new_cols = cols_to_mov + [col for col in df.columns if col not in cols_to_mov]
df = df[order_new_cols]

display(df)


In [None]:
# Persist the filtered chunk table.
# NOTE(review): this writes to CHUNKS_PARQUET, the same path the raw chunks were
# read from, so the raw file is overwritten — confirm that is intended.
df.to_parquet(CHUNKS_PARQUET)
display(df)

# 5. ABSTRACTS

In [None]:
# 1. Extract only SD abstracts

df = pd.read_csv ("02_AI_Topics_Models_Data/01_Old_Data/Data_20250912.csv", sep=";")
docs = df[df['SOURCE']=='SD'].reset_index()
docs = docs[['DOI', 'YEAR', 'TITLE', 'COUNTRY', 'AB_AI']]

# Named `abstracts_raw` rather than `abs`, so the builtin abs() is not shadowed
# for the rest of the kernel session.
abstracts_raw = docs["AB_AI"]
# Preprocessed version of every abstract (preprocess_text is defined earlier in the notebook).
AB_LEMMA = [preprocess_text(ab) for ab in abstracts_raw]


In [None]:
# Build one record per SD document: bibliographic fields, the raw abstract
# and its preprocessed counterpart from AB_LEMMA.
# NOTE(review): "doc_id" is stored as a one-element list ([i]), not a scalar —
# confirm this is what downstream readers expect.
copied_cols = (
    ("DOI", "DOI"),
    ("YEAR", "YEAR"),
    ("TITLE", "TITLE"),
    ("COUNTRY", "COUNTRY"),
    ("ABSTRACT", "AB_AI"),
)

rows = []
for i in range(len(docs)):
    record = {"doc_id": [i]}
    record.update((out_key, docs[src_col][i]) for out_key, src_col in copied_cols)
    record["ABSTRACT_LEMA"] = AB_LEMMA[i]
    rows.append(record)

In [None]:
# Turn the per-document records into a table and persist it (index not written).
ABSTRACTS_SD = pd.DataFrame(rows)
ABSTRACTS_SD.to_parquet("01_Extract_Information_Data/02_Abstracts_SD.parquet", index=False)

# 6. Consolidation of ALL WITH CONCLUSIONS

In [None]:
# NORMAL DIR — file locations for the non-lemmatised text variant.
# The "LEMA DIR" cell assigns the very same names; whichever cell ran last wins.

# Section tables
DIR_OLD_SECT = "01_Extract_Information_Data/02_Clean_Sections_SD_02.parquet"
DIR_NEW_SECT = "01_Extract_Information_Data/All_With_Conclusions_SD.parquet"

# Chunk tables
DIR_OLD_CHUNKS = "01_Extract_Information_Data/02_Clean_Chunks_SD_02.parquet"
DIR_NEW_CHUNKS = "01_Extract_Information_Data/All_With_Conclusions_Chunks_SD.parquet"
DIR_CONS_CHUNKS = "01_Extract_Information_Data/All_With_Conclusions_Chunks_SD_02.parquet"
DIR_FOR_NEW_CHUNKS = "01_Extract_Information_Data/All_With_Conclusions_NEW_Chunks_SD.parquet"

# Embedding tables
DIR_EMBEDDINGS = "03_Embeddings/Embeddings_SD_02.parquet"
DIR_FOR_NEW_EMBS = "03_Embeddings/Embeddings_NEW_Chunks.parquet"
DIR_CONS_EMB = "03_Embeddings/ALL_EMBEDDINGS_SD.parquet"
DIR_CONS_EMB_FILT = "03_Embeddings/Embeddings_All_Filtered_SD.parquet"
DIR_CONS_ENB_NaN = "03_Embeddings/Embeddings_All_Filtered_SD_nan.parquet"

In [None]:
# LEMA DIR — file locations for the lemmatised text variant.
# These assignments rebind the same names as the "NORMAL DIR" cell; run only
# the cell for the variant you want to process.

# Section tables
DIR_OLD_SECT = "01_Extract_Information_Data/02_Clean_Sections_SD_Lema.parquet"
DIR_NEW_SECT = "01_Extract_Information_Data/All_With_Conclusions_SD_Lema.parquet"

# Chunk tables
DIR_OLD_CHUNKS = "01_Extract_Information_Data/02_Clean_Chunks_SD_Lema_02.parquet"
DIR_NEW_CHUNKS = "01_Extract_Information_Data/All_With_Conclusions_Chunks_SD_Lema.parquet"
DIR_CONS_CHUNKS = "01_Extract_Information_Data/All_With_Conclusions_Chunks_SD_Lema_02.parquet"
DIR_FOR_NEW_CHUNKS = "01_Extract_Information_Data/All_With_Conclusions_NEW_Chunks_SD_Lema.parquet"

# Embedding tables
DIR_EMBEDDINGS = "03_Embeddings/Embeddings_SD_Lema_02.parquet"
DIR_FOR_NEW_EMBS = "03_Embeddings/Embeddings_NEW_Chunks_Lema.parquet"
DIR_CONS_EMB = "03_Embeddings/ALL_EMBEDDINGS_SD_Lema.parquet"
DIR_CONS_EMB_FILT = "03_Embeddings/Embeddings_All_Filtered_SD_Lema.parquet"
DIR_CONS_ENB_NaN = "03_Embeddings/Embeddings_All_Filtered_SD_nan_Lema.parquet"

In [None]:
# Load the previous chunk table and preview the rows whose doc_id contains "0000".
old_chunks = pd.read_parquet(DIR_OLD_CHUNKS)
old_chunks_is_0000 = old_chunks["doc_id"].str.contains("0000")
filtre = old_chunks[old_chunks_is_0000]
display(filtre)

In [None]:
# Load the previous embedding table and preview the rows whose doc_id contains "0000".
old_emb = pd.read_parquet(DIR_EMBEDDINGS)
old_emb_is_0000 = old_emb["doc_id"].str.contains("0000")
filtre = old_emb[old_emb_is_0000]
display(filtre)

In [None]:
# Consolidate old embeddings: pair entry i of `old_chunks` with entry i of `old_emb`.
# NOTE(review): entries are matched purely by the integer label i, so this
# presumably relies on both frames sharing the same index and row order — verify.
field_sources = (
    # (output key,   source frame, source column)
    ("doc_id",       old_chunks,   "doc_id"),
    ("sec_order",    old_chunks,   "sec_order"),
    ("sec_title",    old_emb,      "chunk_title"),
    ("chunk_id_int", old_chunks,   "chunk_id_int"),
    ("chunk_id",     old_chunks,   "chunk_id"),
    ("chunk_text",   old_chunks,   "chunk_text"),
    ("n_words",      old_chunks,   "n_words"),
    ("n_chars",      old_chunks,   "n_chars"),
    ("chunk_emb",    old_emb,      "chunk_emb"),
)

old_emb_upgrade = []
for i in range(len(old_emb)):
    record = {out_key: frame[src_col][i] for out_key, frame, src_col in field_sources}
    old_emb_upgrade.append(record)

In [None]:
old_emb_upgrade = pd.DataFrame(old_emb_upgrade)

# Composite lookup key: "<doc_id>-<sec_title>-<n_words>-<n_chars>".
# NOTE(review): a missing sec_title presumably yields a NaN key, which can never
# match a key on the new-chunk side — confirm whether such rows matter.
old_key = old_emb_upgrade["doc_id"]
for old_key_part in (
    old_emb_upgrade["sec_title"],
    old_emb_upgrade["n_words"].astype(str),
    old_emb_upgrade["n_chars"].astype(str),
):
    old_key = old_key + '-' + old_key_part
old_emb_upgrade["key"] = old_key

# Preview the rows whose doc_id contains "0000".
old_upgrade_is_0000 = old_emb_upgrade["doc_id"].str.contains("0000")
filtre = old_emb_upgrade[old_upgrade_is_0000]

display(filtre)

In [None]:
# Load the new chunk table and give it the same composite key as the old
# embeddings: "<doc_id>-<sec_title>-<n_words>-<n_chars>".
new_chunks = pd.read_parquet(DIR_NEW_CHUNKS)

new_key = new_chunks["doc_id"]
for new_key_part in (
    new_chunks["sec_title"],
    new_chunks["n_words"].astype(str),
    new_chunks["n_chars"].astype(str),
):
    new_key = new_key + '-' + new_key_part
new_chunks["key"] = new_key

# Preview the rows whose doc_id contains "0000".
new_chunks_is_0000 = new_chunks["doc_id"].str.contains("0000")
filtre = new_chunks[new_chunks_is_0000]
display(filtre)

In [None]:
# Key -> embedding pairs taken from the consolidated old table.
key_emb_pairs = old_emb_upgrade[["key", "chunk_emb"]]

# Collect the colliding keys BEFORE de-duplicating. Checking afterwards (as the
# cell did originally) always reports 0, because no duplicate can survive
# drop_duplicates.
dup = key_emb_pairs[key_emb_pairs.duplicated('key', keep=False)]

# Keep the first embedding seen for every key so the key can serve as a unique index.
emb = key_emb_pairs.drop_duplicates(subset='key')

print(f"Keys duplicadas en df_01: {len(dup)}")
display (emb)

In [None]:
# Sanity check: compare the first key of the old table with the first key of the new one.
old_emb_upgrade['key'].iloc[0] == new_chunks['key'].iloc[0]

In [None]:
# Attach the already-computed embeddings to the new chunks through a key lookup.
# `new_emb` is the same object as `new_chunks` (no copy is made), so the new
# column shows up on both names.
new_emb = new_chunks
emb_by_key = emb.set_index("key")["chunk_emb"]
new_emb['chunk_emb'] = new_emb['key'].map(emb_by_key)

# Preview the rows whose doc_id contains "0000".
# Alternative preview kept by the author (rows still lacking an embedding):
#filtre = new_emb[new_emb['chunk_emb'].isna()]
new_emb_is_0000 = new_emb['doc_id'].str.contains('0000')
filtre = new_emb[new_emb_is_0000]

display(filtre)

In [None]:
# Persist the consolidated chunk table (embedding attached wherever a key matched).
new_emb.to_parquet(DIR_CONS_CHUNKS)

# Export the chunks that still need an embedding. They are selected explicitly
# here instead of writing `filtre`: that scratch variable is reassigned by most
# preview cells and, as the notebook stands, holds only the doc_id "0000" sample.
new_emb[new_emb['chunk_emb'].isna()].to_parquet(DIR_FOR_NEW_CHUNKS)

## 6.2 Embeddings for new texts!!!

In [None]:
# ========================================================================
# ========================================================================
# ==                                                                    ==
# ==    ALL DOCUMENTS HAVE BEEN EMBEDDED IN "PapersWithOpenAI.ipynb"    ==
# ==                                                                    ==
# ========================================================================
# ========================================================================

## 6.3 Embeddings integration

In [None]:
# Reload the consolidated chunks and key them by "<doc_id>-<sec_title>-<chunk_text>".
cons_chunks = pd.read_parquet(DIR_CONS_CHUNKS)
cons_key_prefix = cons_chunks["doc_id"] + '-' + cons_chunks["sec_title"]
cons_chunks['key'] = cons_key_prefix + '-' + cons_chunks["chunk_text"].astype(str)
display(cons_chunks)

In [None]:
# Load the embeddings computed for the new chunks.
new_embs = pd.read_parquet(DIR_FOR_NEW_EMBS)

# Scratch preview of doc_id "0029" (taken before the key column exists; not displayed).
new_embs_is_0029 = new_embs["doc_id"].str.contains('0029')
filtre = new_embs[new_embs_is_0029]

# Same key layout as the consolidated chunks: "<doc_id>-<title>-<chunk_text>".
new_embs_key_prefix = new_embs["doc_id"] + '-' + new_embs["chunk_title"]
new_embs['key'] = new_embs_key_prefix + '-' + new_embs["chunk_text"].astype(str)

# Only the key and the vector are needed for the lookup.
new_embs = new_embs[["key", "chunk_emb"]]
display(new_embs)

In [None]:
# Collect the colliding keys BEFORE de-duplicating. Checking afterwards (as the
# cell did originally) always yields an empty frame, because no duplicate can
# survive drop_duplicates.
dup = new_embs[new_embs.duplicated('key', keep=False)]

# Keep the first embedding seen for every key so the key can serve as a unique index.
new_embs = new_embs.drop_duplicates(subset='key')

print(f"Keys duplicadas en df_01: {len(dup)}")
display (dup)

In [None]:
# For new embeddings
# `cons_embs` is an alias of `cons_chunks` (no copy), so later writes affect both names.
cons_embs = cons_chunks
# Boolean mask of the rows that still have no embedding.
cons_embs_nan = cons_embs['chunk_emb'].isna()
display(cons_embs_nan)

# Turn the table into a key -> embedding Series for use with .map().
# NOTE: this rebinds `new_embs` from a DataFrame to a Series, so the cell
# cannot be re-run without first re-running the cells that build `new_embs`.
new_embs = new_embs.set_index('key')['chunk_emb']
display(new_embs)

In [None]:
# Fill the missing embeddings by looking up each row's key in `new_embs`.
keys_without_emb = cons_embs.loc[cons_embs_nan, 'key']
cons_embs.loc[cons_embs_nan, 'chunk_emb'] = keys_without_emb.map(new_embs)

# The helper key is no longer needed once the lookup is done.
cons_embs = cons_embs.drop(columns=['key'])

# Preview the rows whose doc_id contains "0000".
cons_embs_is_0000 = cons_embs['doc_id'].str.contains('0000')
filtre = cons_embs[cons_embs_is_0000]

display(filtre)

In [None]:
# Persist the consolidated table (old embeddings plus the newly filled ones).
cons_embs.to_parquet(DIR_CONS_EMB)

# 7. FILTERS

In [None]:
# Export the section table to CSV, first in full and then without the
# sections whose label matches an excluded pattern.
detail_cols = ['doc_id','sec_title', 'label', 'sec_text']
detail = pd.read_parquet(DIR_NEW_SECT)
detail = detail[detail_cols]
detail.to_csv("All_With_Conclusions_SD.csv", sep=';', encoding='utf-8')

# Labels excluded from the filtered export (regex alternation, case-insensitive).
terms_label = [
    'theorem', 'remark', 'proposition', 'proof', 'problem', 'lemma', '√ò',
    'hyperlink', 'definition', 'corollary', 'configuration', 'assumption',
    'algorithm',
]
pattern_label = '|'.join(terms_label)
detail_label_excluded = detail['label'].str.contains(pattern_label, case=False, na=False)
filtre_label = detail[~detail_label_excluded]

filtre_label = filtre_label[detail_cols]
filtre_label.to_csv("All_With_Conclusions_SD_Filtre_01.csv", sep=';', encoding='utf-8')

In [None]:
# Preview the sections whose doc_id contains "3423".
detail_is_3423 = detail['doc_id'].str.contains("3423")
det = detail[detail_is_3423]
display(det)

## 7.1 Filter Embeddings

In [None]:
# NOTE: rebinds DIR_CONS_EMB, replacing the value set in the path-configuration
# cells of section 6 for every cell that runs after this one.
DIR_CONS_EMB = "03_Embeddings/Embeddings_OpenAI.parquet"

In [None]:
# Load the embedding table and narrow it down in four successive stages.
cons_embs = pd.read_parquet(DIR_CONS_EMB)
display(cons_embs)

# Stage 1 — drop rows whose label matches an excluded pattern.
terms_label = [
    'theorem', 'remark', 'proposition', 'proof', 'problem', 'lemma', '√ò',
    'hyperlink', 'definition', 'corollary', 'configuration', 'assumption',
    'algorithm',
]
pattern_label = '|'.join(terms_label)
emb_label_excluded = cons_embs['label'].str.contains(pattern_label, case=False, na=False)
filtre_label = cons_embs[~emb_label_excluded]

# Stage 2 — drop rows whose chunk title matches an excluded pattern.
terms_title = [
    'acknowledg', 'acronym', 'abbrevia', 'algorithm', 'author', 'code availa',
    'computational study', 'computational time', 'confidential',
    'conflict', 'consent', 'credit author', 'author credit', 'declaration of',
    'declarations of', 'disclaim', 'disclosure', 'ethics approval',
    'ethical approval', 'ethics state', 'ethics declaration',
]
pattern_title = '|'.join(terms_title)
emb_title_excluded = filtre_label['chunk_title'].str.contains(pattern_title, case=False, na=False)
filtre_title = filtre_label[~emb_title_excluded]

# Stage 3 — keep only rows that have an embedding.
filtre_nan = filtre_title[filtre_title['chunk_emb'].notna()]

# Stage 4 — keep only rows with non-empty text, then renumber the index.
emb_has_text = filtre_nan['chunk_text'].str.len() > 0
filtre_empty = filtre_nan[emb_has_text].reset_index(drop=True)

display(filtre_empty)

In [None]:
# The export below is disabled; the cell only reloads a previously saved file.
# NOTE(review): with the export commented out, the file read here is not
# produced by this run — confirm it is up to date with `filtre_empty`.
#filtre_empty.to_parquet("03_Embeddings/Embeddings_OpenAI_filtered.parquet")

df = pd.read_parquet("03_Embeddings/Embeddings_OpenAI_filtered.parquet")
display(df)

In [None]:
# Preview the rows of the embedding-bearing subset whose doc_id contains "0000".
filtre_nan_is_0000 = filtre_nan['doc_id'].str.contains("0000")
det = filtre_nan[filtre_nan_is_0000]
display(det)

# 8. DATA REVIEW

In [None]:
# Load the long lemmatised embedding table; preview doc_id "0000" and the second chunk text.
df_01 = pd.read_parquet("03_Embeddings/Embeddings_NEW_Chunks_Lema_Long.parquet")
df_01_is_0000 = df_01["doc_id"].str.contains("0000")
f1 = df_01[df_01_is_0000]

display(f1)
df_01_second_text = df_01["chunk_text"].iloc[1]
display(df_01_second_text)

In [None]:
# Load the non-lemmatised chunk table; preview doc_id "0000" and the second chunk text.
df_02 = pd.read_parquet("01_Extract_Information_Data/All_With_Conclusions_Chunks_SD.parquet")
df_02_is_0000 = df_02['doc_id'].str.contains('0000')
f2 = df_02[df_02_is_0000]

display(f2)
df_02_second_text = df_02["chunk_text"].iloc[1]
display(df_02_second_text)

In [None]:
# Replace the label and chunk text of df_01 with the columns from df_02.
# NOTE(review): column assignment between two frames aligns on the index, so
# this presumably assumes df_01 and df_02 hold the same chunks in the same
# order — verify before trusting the result.
df_01["label"] = df_02["label"]
df_01["chunk_text"] = df_02["chunk_text"]
# NOTE(review): "chunk_int_id" differs from the "chunk_id_int" spelling used by
# the chunk tables elsewhere in this notebook — confirm it matches df_01's schema.
df_01 = df_01[["doc_id", "chunk_int_id", "chunk_id", "label", "chunk_title", "chunk_text", "chunk_emb"]]

# Preview doc_id "0000" and the second chunk text after the swap.
f1 = df_01[df_01["doc_id"].str.contains("0000")]
display(f1)
display(df_01["chunk_text"].iloc[1])

In [None]:
# Persist the reassembled embedding table (index not written).
df_01.to_parquet("03_Embeddings/Embeddings_Ver.04.parquet", index=False)