Project: Theory Discourse Analysis

This notebook prototypes early end-to-end pipeline steps for a theory case study:
retrieval, availability checks, and paragraph selection using embeddings.

Outputs:
- Intermediate lists (e.g., unavailable DOIs)
- Final merged CSV containing selected paragraphs + similarity scores

Notes:
- Retrieval steps should rely on lawful access routes (publisher/library access, OA APIs, manual downloads).
- Embedding-based paragraph selection is heuristic and should be validated.
- Requires an OpenAI API key in the `GPT4_KEY` environment variable (a local `.env` file is also loaded via python-dotenv).


In [None]:
# CONFIG
# Central place for every path and tunable used by the cells below.

from pathlib import Path

# Base directory for inputs/outputs (edit to your setup).
# The path is relative, so it is resolved against the kernel's working directory.
DATA_DIR = Path("../data")

# Inputs
# CSV whose first row holds the DOIs (the first cell of that row is skipped when reading).
DOI_CSV = DATA_DIR / "crossref_relevant_dois.csv"
# EBSCO XML export used to find records without any listed format.
EBSCO_XML = DATA_DIR / "ebsco_articles.xml"
# Directory containing the TEI XML article files referenced by INPUT_CSV.
XML_DIR = DATA_DIR / "articles_xml_final"
# CSV with one row per article; must include a 'filename' column.
INPUT_CSV = DATA_DIR / "pre_result.csv"

# Outputs
# Destination of the merged result written by the last cell.
OUT_CSV = DATA_DIR / "final.csv"

# Paragraph selection
# Number of top-ranked paragraphs kept per article.
TOP_N_PARAGRAPHS = 3

# Embeddings
# Model name passed as `engine` to get_embedding for both paragraphs and the query.
EMBED_MODEL = "text-embedding-ada-002"  # keep consistent with your existing pipeline

# How many articles to process (for quick tests)
# None means "process every row of INPUT_CSV".
MAX_ARTICLES = None  # set to an int for debugging, e.g., 5

# Create the data directory if it is missing, then echo the resolved paths
# so the reader can confirm where files are read from and written to.
DATA_DIR.mkdir(parents=True, exist_ok=True)
print("DATA_DIR:", DATA_DIR.resolve())
print("OUT_CSV:", OUT_CSV.resolve())


In [None]:
# Imports for retrieval + helpers

import csv
import os
import xml.etree.ElementTree as ET
import pandas as pd

from fetch_articles_crossref import fetch_articles_crossref
from fetch_articles_ebsco import fetch_articles_ebsco

# NOTE:
# This public repository does not include automated downloading of copyrighted PDFs.
# If you need PDF acquisition, implement it via lawful access routes in a dedicated script
# (publisher/library links, OA sources, institutional access, or manual downloads).


In [None]:
# Read DOI data from file and (optionally) fetch via Crossref

# Defined up front so that any later use of `dois` works even when the file is missing.
dois = []

if DOI_CSV.exists():
    with DOI_CSV.open(newline="") as f:
        # Only the first CSV row is needed; an empty file yields an empty row.
        first_row = next(csv.reader(f), [])
    # The first cell of the row is skipped (presumably a label - confirm against the file).
    # Blank cells are dropped so trailing commas are not counted as DOIs.
    dois = [cell.strip() for cell in first_row[1:] if cell.strip()]
    print("# DOIs:", len(dois))
    # fetch_articles_crossref(dois)  # uncomment if you want to run retrieval
else:
    print("DOI_CSV not found:", DOI_CSV)


In [None]:
# Fetch available articles from EBSCO (interactive / institution-specific workflow)
# The call below is intentionally commented out, so running the notebook
# top to bottom does not trigger it; this cell only prints a notice.

# fetch_articles_ebsco()  # uncomment if you want to run the Selenium workflow
print("EBSCO fetch is disabled by default in this notebook.")


In [None]:
# Helper: retrieve DOIs of unavailable EBSCO articles based on an XML export

def ebsco_get_unavailable_article_dois(filepath: Path):
    """Return the DOIs of records in an EBSCO XML export that list no formats.

    A ``rec`` element directly under the document root counts as unavailable
    when it has no ``header/controlInfo/artinfo/formats`` descendant. For each
    such record the text of its first ``ui`` element with ``type="doi"`` is
    collected.

    Args:
        filepath: Path to the EBSCO XML export.

    Returns:
        List of DOI strings in document order, with surrounding whitespace
        removed. Records without a DOI, or with a blank one, are left out.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        OSError: If the file cannot be read.
    """
    root = ET.parse(str(filepath)).getroot()

    unavailable_article_dois = []
    for article in root.findall("rec"):
        if article.find(".//header/controlInfo/artinfo/formats") is not None:
            continue  # at least one format is listed, so the record is available

        # findtext gives None when there is no DOI element and "" when it is empty.
        # Stripping removes the indentation/newlines of pretty-printed exports and
        # keeps whitespace-only values out of the result.
        doi = (article.findtext(".//ui[@type='doi']") or "").strip()
        if doi:
            unavailable_article_dois.append(doi)
    return unavailable_article_dois


In [None]:
# Report the DOIs that the EBSCO XML overview marks as not downloadable.
# Only the first 20 are echoed; "..." signals that the list was truncated.

if not EBSCO_XML.exists():
    print("EBSCO_XML not found:", EBSCO_XML)
else:
    ebsco_unavailable_article_dois = ebsco_get_unavailable_article_dois(EBSCO_XML)
    unavailable_count = len(ebsco_unavailable_article_dois)
    print("Articles not retrievable via EBSCO UI (per XML):", unavailable_count)
    preview_suffix = "..." if unavailable_count > 20 else ""
    print(ebsco_unavailable_article_dois[:20], preview_suffix)


## Paragraph selection via embeddings

Given TEI XML article files, extract paragraphs, embed them, and select the most relevant ones for a theory query.


In [None]:
# Embedding + similarity helpers

import numpy as np
import openai
from dotenv import load_dotenv
from openai.embeddings_utils import get_embedding, cosine_similarity

import warnings
# Suppresses every warning, from every library, for the rest of the kernel session.
warnings.filterwarnings("ignore")

# Load variables from a .env file (if any) before reading the key from the environment.
load_dotenv()
# os.getenv returns None when GPT4_KEY is unset, which leaves the client without a key.
openai.api_key = os.getenv("GPT4_KEY")

# A missing key only produces a notice here; the embedding calls in later cells
# are where the failure would actually surface.
if not openai.api_key:
    print("WARNING: GPT4_KEY not found in environment. Embeddings will fail until you set it.")


In [None]:
def search_paragraphs(df: pd.DataFrame, query: str, n: int = 5, query_embedding=None):
    """Rank the paragraphs of one article by cosine similarity to a query.

    Args:
        df: Frame with an "embeddings" column holding one embedding per row.
            It is modified in place: a "similarity" column is added or overwritten.
        query: Query text. It is embedded with EMBED_MODEL unless
            ``query_embedding`` is supplied.
        n: Maximum number of rows to return.
        query_embedding: Optional precomputed embedding of ``query``. Passing it
            lets a caller that scores many articles against the same query embed
            the query once instead of once per call.

    Returns:
        A tuple ``(df, results)``: the input frame including the "similarity"
        column, and the (at most) ``n`` most similar rows.
    """
    if query_embedding is None:
        query_embedding = get_embedding(query, engine=EMBED_MODEL)

    df["similarity"] = df["embeddings"].apply(lambda x: cosine_similarity(x, query_embedding))

    # Keep the n best-scoring rows, then restore their original row order so the
    # selected paragraphs appear in the order they occur in the article.
    results = (
        df.sort_values("similarity", ascending=False)
          .head(n)
          .sort_index()
    )
    return df, results


In [None]:
# Load the input CSV (must include a 'filename' column pointing to TEI XML files)
# and build one DataFrame of non-empty paragraphs per article in `dfs`.

if not INPUT_CSV.exists():
    raise FileNotFoundError(f"INPUT_CSV not found: {INPUT_CSV}")
if not XML_DIR.exists():
    raise FileNotFoundError(f"XML_DIR not found: {XML_DIR}")

df_in = pd.read_csv(INPUT_CSV)
print("Rows in input:", len(df_in))

# Fail early with a clear message instead of a bare KeyError inside the loop.
if "filename" not in df_in.columns:
    raise KeyError(f"'filename' column missing in INPUT_CSV: {INPUT_CSV}")

if MAX_ARTICLES is not None:
    df_in = df_in.head(int(MAX_ARTICLES)).copy()
    print("Limiting to MAX_ARTICLES:", len(df_in))

dfs = []
skipped_missing = []      # blank filenames or files that do not exist under XML_DIR
skipped_unparseable = []  # files that exist but are not well-formed XML

# Each distinct file is processed once, so a filename repeated in the input
# does not produce duplicate paragraph frames.
for filename in df_in["filename"].drop_duplicates():
    if pd.isna(filename) or not str(filename).strip():
        skipped_missing.append(filename)
        continue

    xml_path = XML_DIR / str(filename)
    # is_file() also rejects directories, which exists() would let through.
    if not xml_path.is_file():
        skipped_missing.append(filename)
        continue

    try:
        root = ET.parse(str(xml_path)).getroot()
    except ET.ParseError as exc:
        # One broken file should not abort the whole batch; report it and move on.
        skipped_unparseable.append(filename)
        print("Skipping unparseable XML:", xml_path, "-", exc)
        continue

    # Flatten every TEI <p> element to plain text and drop the empty ones.
    paragraphs = [
        "".join(p.itertext()).strip()
        for p in root.findall(".//{http://www.tei-c.org/ns/1.0}p")
    ]
    paragraphs = [text for text in paragraphs if text]

    if paragraphs:
        dfs.append(pd.DataFrame({"filename": filename, "paragraphs": paragraphs}))

print("Articles with paragraphs:", len(dfs))
if skipped_missing:
    print("Skipped (TEI file not found):", len(skipped_missing))
if skipped_unparseable:
    print("Skipped (XML parse error):", len(skipped_unparseable))


In [None]:
# Embed every paragraph of every article.
# Each paragraph triggers its own get_embedding call; the vectors are stored
# in a new "embeddings" column on the article's frame.

for df in dfs:
    paragraph_texts = df["paragraphs"]
    df["embeddings"] = paragraph_texts.apply(
        lambda text: get_embedding(text, engine=EMBED_MODEL)
    )
    article_name = df["filename"].iloc[0]
    print("Embedded:", article_name, "#paragraphs:", len(df))


In [None]:
# Pick the TOP_N_PARAGRAPHS paragraphs of each article that best match the theory query.

query = """The concept of memory decay in scientific psychology describes that
memory traces are stored with an initial strength value and that this strength decays passively over time unless it is reactivated.
Reactivation of memory traces according to the memory decay theory can be done by practice.
Once the activation level for a stored memory trace becomes too low, the memory trace is lost.
The memory decay theory concerns memory loss in healthy individuals.
Changes solely due to aging processes and abnormal changes in memory capacity due to impairments like dementia are not the explanatory focus of this theory."""

# search_paragraphs returns (scored frame, top rows); only the top rows are kept,
# re-indexed from 0 so they can be addressed by rank position afterwards.
selected = [
    search_paragraphs(article_df, query, n=TOP_N_PARAGRAPHS)[1].reset_index(drop=True)
    for article_df in dfs
]

print("Selected paragraph sets:", len(selected))


In [None]:
# Merge selections back into a single CSV (p1–pN + embeddings + cosine similarities),
# where N is TOP_N_PARAGRAPHS.

# Column layout: "filename" followed by five columns per selected paragraph.
# Building it from TOP_N_PARAGRAPHS keeps the output in step with the config value.
out_columns = ["filename"]
for rank in range(1, TOP_N_PARAGRAPHS + 1):
    out_columns += [
        f"p{rank}",
        f"p{rank}_embedding",
        f"p{rank}_cos_similarity",
        f"p{rank}_rating_category",
        f"p{rank}_rating_rationale",
    ]

final_rows = []
skipped_short = 0  # articles with fewer than TOP_N_PARAGRAPHS selected paragraphs

for res in selected:
    if res.empty or len(res) < TOP_N_PARAGRAPHS:
        skipped_short += 1
        continue

    row = {"filename": res["filename"].iloc[0]}
    for rank in range(1, TOP_N_PARAGRAPHS + 1):
        pos = rank - 1  # positional access, independent of the frame's index labels
        row[f"p{rank}"] = res["paragraphs"].iloc[pos]
        row[f"p{rank}_embedding"] = res["embeddings"].iloc[pos]
        row[f"p{rank}_cos_similarity"] = res["similarity"].iloc[pos]
        # Rating columns are left blank here; they are placeholders in the output.
        row[f"p{rank}_rating_category"] = ""
        row[f"p{rank}_rating_rationale"] = ""
    final_rows.append(row)

if skipped_short:
    print("Articles skipped (fewer than TOP_N_PARAGRAPHS paragraphs):", skipped_short)

# Explicit columns keep "filename" present even when nothing was selected, so the
# merge below cannot fail on a column-less frame. One row per filename keeps the
# left merge from multiplying input rows.
df_out = pd.DataFrame(final_rows, columns=out_columns).drop_duplicates(subset="filename")

# Left merge: every input row is kept; articles without a selection get empty p* columns.
df_merged = pd.merge(df_in, df_out, on="filename", how="left", validate="many_to_one")

df_merged.to_csv(OUT_CSV, index=False)
print("Wrote:", OUT_CSV)
print("Rows:", len(df_merged))
