In [None]:
!pip install -q PyMuPDF PyPDF2 sentence-transformers pyvis scikit-learn requests

import os, re, json, random, math, time
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans

from PyPDF2 import PdfReader
import fitz  # PyMuPDF

from google.colab import files
from IPython.display import HTML, IFrame, display

# Directory that receives a second copy of every artifact the later cells write.
DATA_DIR = Path("./data")
# exist_ok/parents make this cell safe to re-run.
DATA_DIR.mkdir(exist_ok=True, parents=True)

print("Setup complete.")

[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m24.1/24.1 MB[0m [31m85.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m232.6/232.6 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m756.0/756.0 kB[0m [31m49.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m1.6/1.6 MB[0m [31m76.6 MB/s[0m eta [36m0:00:00[0m
[?25hSetup complete.


In [None]:
# ============================================================
# 1) Upload PDF -> sections + TF-IDF + clusters
# ============================================================

print("Please upload your article PDF (e.g. EREPAPER2025.pdf)")

# files.upload() returns a mapping keyed by uploaded file name; only the
# first uploaded file is used.
uploaded = files.upload()
if not uploaded:
    raise RuntimeError("No file uploaded.")
pdf_filename = next(iter(uploaded))
print(f"\nFile uploaded successfully: {pdf_filename}")

# ---- Extract text page by page ----
reader = PdfReader(pdf_filename)
raw_pages = [page.extract_text() or "" for page in reader.pages]

# Collapse whitespace runs to single spaces and drop pages with no text
pages = [
    re.sub(r"\s+", " ", stripped)
    for stripped in (raw.strip() for raw in raw_pages)
    if stripped
]
if not pages:
    raise RuntimeError("No text could be extracted from the PDF.")

print(f"Extracted {len(pages)} non-empty pages of text.")

# Full text for later (literature search + summarization)
full_text = " ".join(pages)
print(f"Full text length: {len(full_text)} characters")

# Each non-empty page is treated as one "section" with a synthetic file name
filenames = [f"section_{number:02d}.txt" for number in range(1, len(pages) + 1)]
texts = pages

# ---- Token counts (whitespace-separated words per section) ----
token_counts = [len(section.split()) for section in texts]
df_tokens = pd.DataFrame({"filename": filenames, "token_count": token_counts})
# Written twice: once to the working directory, once under DATA_DIR
for csv_target in ("tokens.csv", DATA_DIR / "tokens.csv"):
    df_tokens.to_csv(csv_target, index=False)

print("Saved tokens.csv")

# ---- TF-IDF ----
vectorizer = TfidfVectorizer(max_features=2000, stop_words="english")
X = vectorizer.fit_transform(texts)
tfidf_terms = vectorizer.get_feature_names_out()
print(f"TF-IDF matrix shape: {X.shape}")

# Per section: up to 15 highest-weighted terms, keeping only non-zero weights
tfidf_array = X.toarray()
tfidf_dict = {}
for fname, row in zip(filenames, tfidf_array):
    ranked = np.argsort(-row)[:15]
    tfidf_dict[fname] = {
        tfidf_terms[col]: float(row[col])
        for col in ranked
        if row[col] > 0
    }

# Written twice: once to the working directory, once under DATA_DIR
for json_target in ("tfidf_terms.json", DATA_DIR / "tfidf_terms.json"):
    with open(json_target, "w", encoding="utf-8") as f:
        json.dump(tfidf_dict, f, indent=2, ensure_ascii=False)

print("Saved tfidf_terms.json")

# ---- Cosine similarity between sections ----
cosine_sim = cosine_similarity(X)
df_cosine = pd.DataFrame(cosine_sim, index=filenames, columns=filenames)
for csv_target in ("cosine.csv", DATA_DIR / "cosine.csv"):
    df_cosine.to_csv(csv_target)

print("Saved cosine.csv")

# ---- KMeans clustering (on sections) ----
n_sections = len(filenames)
# Heuristic: roughly one cluster per five sections, kept within [2, 8] ...
n_clusters = max(2, min(8, n_sections // 5))
# ... but never more clusters than sections: KMeans rejects
# n_clusters > n_samples, which a single-page PDF would otherwise trigger.
n_clusters = min(n_clusters, n_sections)

kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X)

df_clusters = pd.DataFrame({
    "filename": filenames,
    "cluster": clusters
})
df_clusters.to_csv("clusters.csv", index=False)
df_clusters.to_csv(DATA_DIR / "clusters.csv", index=False)

print(f"Saved clusters.csv with {n_clusters} clusters.")

# ---- Nearest neighbors per section ----
# Up to three most similar *other* sections, by cosine similarity.
neighbors = {}
for i, fname in enumerate(filenames):
    sims = cosine_sim[i].copy()
    sims[i] = -1  # push self to the end of the ranking
    # Drop self explicitly: with three or fewer sections the -1 entry would
    # otherwise still land inside the top-3 slice.
    ranked = [j for j in np.argsort(-sims) if j != i]
    neighbors[fname] = [filenames[j] for j in ranked[:3]]
# ---- Short heuristic summary per section ----
def infer_summary(fname, tfidf_dict):
    """Build a one-sentence keyword summary for a section.

    Args:
        fname: Section key to look up in ``tfidf_dict``.
        tfidf_dict: Mapping of section key -> {term: weight}; term order is
            taken as given (insertion order of the inner dict).

    Returns:
        A fixed fallback sentence when the section has no terms, a plain
        comma-joined list when it has fewer than six, otherwise a sentence
        naming the first three terms and the next three as related themes.
    """
    keywords = [*tfidf_dict.get(fname, {})]
    if len(keywords) == 0:
        return "No significant keywords found."
    if len(keywords) < 6:
        return f"Section highlights {', '.join(keywords)}."

    first, second, third, *remaining = keywords
    themes = ", ".join(remaining[:3])
    return (
        f"Section highlights {first}, {second} and {third}, "
        f"with related themes around {themes}."
    )

# One record per section, combining text with the derived metadata.
# df_clusters and df_tokens were built row-for-row from `filenames`, so the
# columns can be walked in lockstep with the section names.
agent_corpus = []
section_rows = zip(filenames, texts, df_clusters["cluster"], df_tokens["token_count"])
for fname, text, cluster_id, n_tokens in section_rows:
    kws = [*tfidf_dict.get(fname, {})][:12]
    record = {
        "id": fname.replace(".txt", ""),
        "filename": fname,
        "cluster": int(cluster_id),
        "token_count": int(n_tokens),
        "keywords": kws,
        "neighbors": neighbors.get(fname, []),
        "summary": infer_summary(fname, tfidf_dict),
        "representation": f"{fname}: {' '.join(kws)}",
        "text": text,
    }
    agent_corpus.append(record)

# Written twice: once to the working directory, once under DATA_DIR
for corpus_target in ("chat_agent_corpus.json", DATA_DIR / "chat_agent_corpus.json"):
    with open(corpus_target, "w", encoding="utf-8") as f:
        json.dump(agent_corpus, f, indent=2, ensure_ascii=False)

print(f"\nGenerated chat_agent_corpus.json with {len(agent_corpus)} sections.")
# Spot-check up to three randomly chosen sections
preview_count = min(3, len(agent_corpus))
for entry in random.sample(agent_corpus, preview_count):
    print(f"\nüîπ {entry['filename']} | Cluster {entry['cluster']} | Tokens {entry['token_count']}")
    print("Keywords:", ", ".join(entry['keywords']))
    print("Summary:", entry['summary'])
    print("Neighbors:", ", ".join(entry['neighbors']))

Please upload your article PDF (e.g. EREPAPER2025.pdf)


Saving EREPAPER2025.pdf to EREPAPER2025.pdf

File uploaded successfully: EREPAPER2025.pdf
Extracted 16 non-empty pages of text.
Full text length: 79775 characters
Saved tokens.csv
TF-IDF matrix shape: (16, 2000)
Saved tfidf_terms.json
Saved cosine.csv
Saved clusters.csv with 3 clusters.

Generated chat_agent_corpus.json with 16 sections.

üîπ section_15.txt | Cluster 2 | Tokens 698
Keywords: aiaa, code, conference, prediction, digital, monitoring, akselos, fa9550, public, repository, used, software
Summary: Section highlights aiaa, code and conference, with related themes around prediction, digital, monitoring.
Neighbors: section_08.txt, section_04.txt, section_13.txt

üîπ section_02.txt | Cluster 1 | Tokens 722
Keywords: twin, digital, state, asset, model, vehicle, physical, structural, decision, abstraction, graphical, data
Summary: Section highlights twin, digital and state, with related themes around asset, model, vehicle.
Neighbors: section_01.txt, section_03.txt, section_05.txt

In [None]:
# ============================================================
# 2) Simple extractive summary of the article
# ============================================================

def split_sentences(text):
    """Split ``text`` into sentences at whitespace following '.', '!' or '?'.

    A regex lookbehind keeps the terminator attached to its sentence and
    avoids an NLTK download. Returns stripped, non-empty pieces in order.
    """
    pieces = (piece.strip() for piece in re.split(r"(?<=[.!?])\s+", text))
    return [piece for piece in pieces if piece]

sentences_all = split_sentences(full_text)
print(f"Found {len(sentences_all)} sentences in the article.")

if len(sentences_all) <= 8:
    top_sents = sentences_all
else:
    # Rank sentences by TF-IDF versus global sentence corpus.
    # norm=None is essential: the vectorizer's default norm="l2" rescales
    # every row to unit length, so all non-empty sentences would score 1.0
    # and the "top 8" would be an arbitrary pick among ties.
    vec = TfidfVectorizer(stop_words="english", max_features=3000, norm=None)
    X_sent = vec.fit_transform(sentences_all)
    # Score each sentence by the L2 norm of its raw TF-IDF weights (rough
    # importance; note this favours longer, term-dense sentences).
    norms = np.linalg.norm(X_sent.toarray(), axis=1)
    # Stable sort so equal scores resolve deterministically in document order
    idx = np.argsort(-norms, kind="stable")[:8]
    idx = sorted(idx)  # keep roughly document order
    top_sents = [sentences_all[i] for i in idx]

print("\n=== Extractive summary (8-ish key sentences) ===\n")
for s in top_sents:
    print("‚Ä¢", s)

Found 553 sentences in the article.

=== Extractive summary (8-ish key sentences) ===

‚Ä¢ In healthcare, digital twins of human- beings promise to advance medical assessment, diagnosis, per- sonalized treatment, and in-silico drug testing [7, 8, 9].
‚Ä¢ We adopt a view of the physical asset and its digital twin as two coupled dynamical systems, evolving over time through their respective state spaces as shown in Figure 1.
‚Ä¢ These two dynamical systems are tightly coupled: The digital twin uses observational data to estimate the state of the physical twin, so that it can, in turn, provide optimal control inputs that steer the physical asset to favorable states, while balancing other factors like maintaining observability over the asset state and minimizing control costs.
‚Ä¢ We can extend the target belief state to include prediction of digital state, quantity of interest, and reward variables up until the prediction horizon, tp, as follows: p(D0;:::;Dtp;Q0;:::;Qtp;R0;:::;Rtp; Utc+1;

Summary recommendation from my end — to structure it best, based on what the Prof. also recommended: 1. Domain, 2. Theoretical frame, 3. Mechanism, 4. Formal model, 5. Evidence base, 6. Technical context, and 7. Check that you fulfilled the Prof.'s recs.

Prof.'s recs: 1. "The most important thing is to teach something."

2. "Break the broader picture into - why are we even doing the model?"

3. "What are the key features that you've extracted?"

4. "Are you able to define the key relationships between the most important variables in the model? - like say free trade vs. climate change."

5. "You should be able to deconstruct: What is the purpose of this article? What is the point?"

Then you reconstruct: Aim → Method → Positive → Normative.

In [None]:
# ============================================================
# 3) Fetch related literature using OpenAlex
# ============================================================

import requests

def fetch_openalex(query, per_page=50):
    """Search OpenAlex works and return simplified paper records.

    Args:
        query: Free-text search string sent as the ``search`` parameter.
        per_page: Number of results requested in the single page fetched.

    Returns:
        A list of dicts with keys ``title``, ``abstract``, ``year``,
        ``citations``, ``authors`` and ``url``. Returns an empty list (after
        printing the error) when the request or JSON decoding fails.
    """
    base_url = "https://api.openalex.org/works"
    params = {
        "search": query,
        "per-page": per_page,
        "mailto": "research@example.com"
    }
    # Only the network call and JSON decoding are treated as best-effort;
    # record parsing below is written to tolerate missing/null fields so one
    # odd record no longer discards the whole result set.
    try:
        resp = requests.get(base_url, params=params, timeout=30)
        resp.raise_for_status()
        results = resp.json().get("results") or []
    except (requests.RequestException, ValueError) as e:
        print("OpenAlex error:", e)
        return []

    out = []
    for r in results:
        # Reconstruct abstract from inverted index ({word: [positions]}) if present
        idx = r.get("abstract_inverted_index")
        if idx:
            pos_word = [
                (p, word)
                for word, positions in idx.items()
                for p in positions
            ]
            pos_word.sort(key=lambda x: x[0])
            abstract = " ".join(w for _, w in pos_word)
        else:
            abstract = ""

        # Skip authorships that lack an author object or a display name
        authors = []
        for a in r.get("authorships") or []:
            name = (a.get("author") or {}).get("display_name")
            if name:
                authors.append(name)

        out.append({
            # `or` rather than a .get default: the key can be present but null
            "title": r.get("title") or "Untitled",
            "abstract": abstract,
            "year": r.get("publication_year"),
            "citations": r.get("cited_by_count") or 0,
            "authors": authors,
            "url": r.get("id"),
        })
    return out

# Option 1: let you type a keyword (blank input falls back to the default)
default_keyword = "digital twin infrastructure asset monitoring"
print(f"Default query = {default_keyword}")
keyword = (
    input("Enter a keyword for related literature (or press Enter to use default): ").strip()
    or default_keyword
)

print(f"\nFetching literature for: {keyword!r}")
external_papers = fetch_openalex(keyword, per_page=100)
print(f"Retrieved {len(external_papers)} records.")

# Written twice: once to the working directory, once under DATA_DIR
for papers_target in ("external_papers.json", DATA_DIR / "external_papers.json"):
    with open(papers_target, "w", encoding="utf-8") as f:
        json.dump(external_papers, f, indent=2, ensure_ascii=False)

if not external_papers:
    print("No related papers found. Try a different keyword.")
else:
    first_hit = external_papers[0]
    print("\nExample paper:")
    print("Title:", first_hit["title"])
    print("Year:", first_hit["year"])
    print("Citations:", first_hit["citations"])

Default query = digital twin infrastructure asset monitoring
Enter a keyword for related literature (or press Enter to use default): uncertainty

Fetching literature for: 'uncertainty'
Retrieved 100 records.

Example paper:
Title: Judgment under Uncertainty: Heuristics and Biases
Year: 1974
Citations: 26637


In [None]:
# ============================================================
# 4) Build interactive literature graph (pyvis)
# ============================================================

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
from pyvis.network import Network

# Load external papers from previous cell
with open("external_papers.json", "r", encoding="utf-8") as f:
    external_papers = json.load(f)

if not external_papers:
    raise RuntimeError("No external papers loaded. Run the previous cell first with a successful query.")

# Stored records may carry explicit nulls, so fields are normalised with `or`
# instead of .get defaults (a default does not apply when the key exists).
titles = ["Your Uploaded Article"] + [p.get("title") or "Untitled" for p in external_papers]

# Texts: your full paper + external abstracts. Papers without an abstract
# fall back to their title; embedding "" for all of them would make those
# papers identical vectors and link them to each other spuriously.
all_texts = [full_text] + [
    p.get("abstract") or p.get("title") or ""
    for p in external_papers
]

meta = [{
    "type": "focal",
    "year": None,
    "citations": 0,
    "authors": ["You"],
    "url": ""
}] + [
    {
        "type": "external",
        "year": p.get("year"),
        "citations": p.get("citations") or 0,
        "authors": [name for name in (p.get("authors") or []) if name],
        "url": p.get("url") or ""
    }
    for p in external_papers
]

print("Encoding texts for graph (this uses CPU, may take a bit)...")
# NOTE(review): sentence-transformers models truncate inputs longer than their
# maximum sequence length, so the focal embedding presumably reflects only the
# opening of full_text - confirm whether an abstract/summary should be used.
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(all_texts, show_progress_bar=True, convert_to_numpy=True)
print("Embeddings shape:", embeddings.shape)

sim_matrix = cosine_similarity(embeddings)

# Build graph: node 0 is the uploaded article, nodes 1.. are external papers
G = nx.Graph()
for i, (title, m) in enumerate(zip(titles, meta)):
    is_focal = m["type"] == "focal"
    year_label = m["year"] if m["year"] is not None else "N/A"
    node_title_html = (
        f"{title}<br>"
        f"Type: {m['type']}<br>"
        f"Year: {year_label}<br>"
        f"Citations: {m['citations']}<br>"
        f"Authors: {', '.join(m['authors'][:5])}<br>"
        f"<a href='{m['url']}' target='_blank'>Open Link</a>"
    )
    G.add_node(
        i,
        label=title[:60] + ("..." if len(title) > 60 else ""),
        title=node_title_html,
        group=0 if is_focal else 1,
        # External nodes scale with log(citations), with a floor of 6;
        # cast to a plain float so the value is a built-in type.
        size=28 if is_focal else float(max(6, np.log1p(m["citations"]) * 2)),
    )

# Add edges above similarity threshold (upper triangle only: undirected graph)
threshold = 0.45
for i in range(len(titles)):
    for j in range(i+1, len(titles)):
        if sim_matrix[i, j] >= threshold:
            G.add_edge(i, j, weight=float(sim_matrix[i, j]))

print(f"Graph: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges.")

# Visualize with pyvis
net = Network(
    height="750px",
    width="100%",
    notebook=True,
    bgcolor="#111111",
    font_color="white",
    heading="Literature Landscape"
)
net.from_nx(G)
net.force_atlas_2based(gravity=-50, central_gravity=0.03, spring_length=120)

html_path = "literature_landscape.html"
net.save_graph(html_path)
print(f"Saved interactive graph to {html_path}")

# Show inline in Colab (no localhost issues)
display(IFrame(html_path, width="100%", height="800"))

Encoding texts for graph (this uses CPU, may take a bit)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Embeddings shape: (101, 384)
Graph: 101 nodes, 625 edges.
Saved interactive graph to literature_landscape.html


In [None]:
# `files` is already imported in the setup cell; the import is repeated here.
from google.colab import files
# Download the graph HTML that the graph-building cell saved under this name.
files.download('literature_landscape.html')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from pyvis.network import Network

# NOTE(review): this rebinds `net` to a new, empty Network; nothing visible in
# this notebook adds nodes to it, saves it, or displays it. Presumably an
# experiment with cdn_resources="in_line" - confirm whether the graph cell
# should adopt that option, or remove this cell.
net = Network(
    height="750px",
    width="100%",
    notebook=True,
    bgcolor="#222222",
    font_color="white",
    heading="Literature Landscape Graph",
    cdn_resources="in_line",
)

In [None]:
# ============================================================
# 5) Simple retrieval over sections (no LLM)
# ============================================================

# Reuse the TF-IDF vectorizer and X from Cell 2

def top_k_sections(query, k=5):
    """Return the ``k`` sections most similar to ``query`` under TF-IDF cosine.

    Relies on the notebook globals ``vectorizer``, ``X``, ``filenames``,
    ``texts`` and ``tfidf_dict`` built in the section-analysis cell.

    Args:
        query: Free-text question or keywords.
        k: Maximum number of sections to return.

    Returns:
        A list of dicts (``filename``, ``score``, ``text``, ``keywords``)
        ordered by descending score; empty when ``query`` is blank.
    """
    if query.strip() == "":
        return []

    query_vec = vectorizer.transform([query])
    scores = cosine_similarity(query_vec, X)[0]
    best = np.argsort(-scores)[:k]
    return [
        {
            "filename": filenames[pos],
            "score": float(scores[pos]),
            "text": texts[pos],
            "keywords": list(tfidf_dict.get(filenames[pos], {}).keys())[:10]
        }
        for pos in best
    ]

q = input("Enter a question about the paper (e.g. 'What is the main policy implication?'): ").strip()
if not q:
    print("No query entered.")
else:
    hits = top_k_sections(q, k=3)
    print(f"\nTop sections for: {q!r}\n")
    divider = "-" * 80
    for hit in hits:
        print(f"=== {hit['filename']} (score={hit['score']:.3f}) ===")
        print("Keywords:", ", ".join(hit["keywords"]))
        print()
        # Show only the first 1500 characters of the section text
        print(hit["text"][:1500], "...\n")
        print(divider)

Enter a question about the paper (e.g. 'What is the main policy implication?'): What is the main invention?

Top sections for: 'What is the main invention?'

=== section_02.txt (score=0.035) ===
Keywords: twin, digital, state, asset, model, vehicle, physical, structural, decision, abstraction

state variable 2 state variable 1physical asset state digital twin state observational data control inputs 4) Control inputs informed by the updated digital twin steer the physical asset to a favorable state 2 ) The digital twin assimilates observational data and updates its state to mimic the physical twin 1 ) The state of the physical asset evolves over time 3 ) The updated digital twin enables in-depth analysis of the asset and prediction of future state evolution Figure 1: Conceptual model of a physical asset and its digital twin, evolving over time through their respective state spaces. These two dynamical systems are tightly coupled: The digital twin uses observational data to estimate the 

In [None]:
# --- PART 1: ANALYTICAL SUMMARIZER ---

!pip install -q transformers sentencepiece PyPDF2

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from PyPDF2 import PdfReader
import numpy as np

# 1) Load your PDF and get full_text
# This rebinds `reader`, `pages` and `full_text` from the upload cell; here
# the page text is kept raw and pages are joined with blank lines.
pdf_path = "EREPAPER2025.pdf"   # CHANGE FILE NAME ACCORDINGLY!

reader = PdfReader(pdf_path)
pages = [(page.extract_text() or "") for page in reader.pages]
full_text = "\n\n".join(pages)
print(f"Loaded PDF with {len(pages)} pages and {len(full_text.split())} words.")

# 2) Load a reasonably small model (CPU-friendly)
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# device=-1 selects the CPU
summarizer = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    device=-1,
)

def split_text(text, max_chars=6000):
    """Cut ``text`` into consecutive pieces of at most ``max_chars`` characters.

    Newlines are replaced by spaces first. Splitting is purely by character
    count, so pieces may start or end mid-word.
    """
    flat = text.replace("\n", " ")
    starts = range(0, len(flat), max_chars)
    return [flat[begin:begin + max_chars] for begin in starts]

# 3) First-pass chunk summaries
# The recorded run warned that a prompt built from a 6000-character chunk was
# 1514 tokens against the model's 512-token maximum. Sizes below are chosen so
# that instructions + excerpt (pass 1) and instructions + all notes (pass 2)
# should each fit in that window; truncation=True is a safety net for text
# that tokenizes less compactly than estimated.
CHUNK_CHARS = 1200   # roughly 300 tokens at the ratio seen in that warning
MAX_CHUNKS = 6       # cap for speed; covers the first ~7200 characters
NOTE_TOKENS = 60     # MAX_CHUNKS * NOTE_TOKENS must leave room for final_prompt

chunks = split_text(full_text, max_chars=CHUNK_CHARS)[:MAX_CHUNKS]
print("Number of chunks:", len(chunks))

chunk_notes = []
template = """
You are an economics teaching assistant.
Based on the article excerpt below, write *notes* (not full prose) answering:

1. Aim ‚Äì What question does the article address and why is it relevant?
2. Methods ‚Äì What methodological approach does it use (descriptive, theory, simulation, econometrics, or mixed)?
3. Positive conclusions ‚Äì What factual / analytical findings does it report?
4. Normative conclusions ‚Äì What recommendations or 'should' claims does it make, if any?

Write bullet-point notes (5‚Äì10 bullets total), keep key details, but stay non-technical.

Article excerpt:
{chunk}
"""

for i, ch in enumerate(chunks, start=1):
    prompt = template.format(chunk=ch)
    out = summarizer(prompt, max_new_tokens=NOTE_TOKENS, truncation=True)[0]["generated_text"]
    print(f"\n==== Chunk {i} notes ====\n")
    print(out)
    chunk_notes.append(out)

combined_notes = "\n\n".join(chunk_notes)

# 4) Second-pass: turn notes into a clean 3-5 min script
final_prompt = """
You are an Nobel Laureate Oxford Economics Professor preparing a 3‚Äì5 minute oral presentation
for an undergraduate class in environmental & resource economics.

Using ONLY the notes below, write a structured summary with four headings:

1. Aim
2. Methods
3. Positive conclusions
4. Normative conclusions / policy relevance

Guidelines:
- 400‚Äì600 words total.
- Clear, non-technical language, but keep important details and numbers.
- Speak as if you are explaining aloud to the class.

NOTES:
{notes}
"""

article_summary = summarizer(final_prompt.format(notes=combined_notes),
                             max_new_tokens=420,
                             truncation=True)[0]["generated_text"]

print("\n\n===== FINAL 3‚Äì5 MINUTE SUMMARY =====\n")
print(article_summary)

Loaded PDF with 16 pages and 12382 words.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cpu
Token indices sequence length is longer than the specified maximum sequence length for this model (1514 > 512). Running this sequence through the model will result in indexing errors


Number of chunks: 3

==== Chunk 1 notes ====

A Probabilistic Graphical Model Foundation for Enabling Predictive Digital Twins at Scale Michael G. Kapteyn1, Jacob V.R. Pretorius2, and Karen E. Willcox3 1Department of Aeronautics and Astronautics, Massachusetts Institute of Technology, Cambridge, MA 02139, United States 2The Jessara Group, Austin, TX 78704, United States 3Oden Institute for Computational Engineering and Sciences, University of Texas at Austin, Austin, TX 78712, United States Abstract A unifying mathematical formulation is needed to move from one-o digital twins built through custom implementations to robust digital twin implementations at scale. This work proposes a probabilistic graphical model as a formal math- ematical representation of a digital twin and its associated physical asset. We create an abstraction of the asset-twin system as a set of coupled dynamical systems, evolving over time through their respective state-spaces and interacting via observed data and 

**Argument Analysis: For or Against?**

In [None]:
!pip install -q sentence-transformers transformers

from sentence_transformers import SentenceTransformer, util
from transformers import pipeline
import torch

# === 1. Built-in reference article (can be replaced silently later) ===
# Note: this rebinds `article_summary`, replacing any summary generated by the
# summarizer cell with the fixed text below.
article_summary = """
This paper develops a probabilistic graphical model foundation for digital twins,
allowing predictive control, optimization, and resilience in coupled physical‚Äìdigital systems.
It integrates control theory and uncertainty quantification to scale digital twin adoption
in critical infrastructure and healthcare.
"""

# === 2. Small autonomous mock literature set ===
papers = [
    {"title": "Predictive Maintenance via Bayesian Control",
     "abstract": "This study supports probabilistic models for asset optimization using digital twins."},
    {"title": "Digital Twins in Infrastructure Resilience",
     "abstract": "This paper criticizes the over-dependence on probabilistic control methods and advocates hybrid deterministic-stochastic modeling."},
    {"title": "Smart City Twins for Energy Optimization",
     "abstract": "Extends digital twin methodology to smart city management, aligning with the focal model‚Äôs optimization framework."},
    {"title": "Skepticism toward Fully Autonomous Digital Twins",
     "abstract": "Argues against full automation in twin feedback systems, emphasizing human oversight instead."}
]

# === 3. Compute semantic similarity to ensure relevance ===
model = SentenceTransformer("all-MiniLM-L6-v2")
summary_vec = model.encode(article_summary, convert_to_tensor=True)
paper_vecs = model.encode([p["abstract"] for p in papers], convert_to_tensor=True)
# cos_sim returns a (1, n_papers) matrix; take row 0 rather than squeeze() so
# a single-paper list still yields a list instead of a bare float.
similarities = util.cos_sim(summary_vec, paper_vecs)[0].tolist()

# Keep only relevant papers (similarity > 0.3)
relevance_threshold = 0.3
relevant = [(p, s) for p, s in zip(papers, similarities) if s > relevance_threshold]

# === 4. Classify stance (support vs contradict) ===
# NOTE(review): the classifier sees only each paper's title and abstract; the
# focal article text is not part of its input, so the label reflects the
# paper's own wording rather than a comparison - confirm this is intended.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

for paper, sim in relevant:
    text = paper["title"] + ". " + paper["abstract"]
    res = classifier(
        text,
        candidate_labels=["supports", "contradicts"],
        hypothesis_template="This paper {} the arguments made in the focal article."
    )
    # Labels come back sorted by score; index 0 is the winning label
    paper["stance"] = res["labels"][0]
    paper["confidence"] = round(res["scores"][0], 3)

# === 5. Display concise summary ===
for heading, stance in (("SUPPORTING", "supports"), ("CONTRADICTING", "contradicts")):
    print(f"\n=== {heading} PAPERS ===")
    for p, s in relevant:
        if p["stance"] == stance:
            print(f"‚Ä¢ {p['title']}  ({p['confidence']})")

print("\nAuto-analysis complete.")

Device set to use cpu



=== SUPPORTING PAPERS ===
‚Ä¢ Predictive Maintenance via Bayesian Control  (0.994)
‚Ä¢ Smart City Twins for Energy Optimization  (0.988)
‚Ä¢ Skepticism toward Fully Autonomous Digital Twins  (0.562)

=== CONTRADICTING PAPERS ===
‚Ä¢ Digital Twins in Infrastructure Resilience  (0.807)

Auto-analysis complete.
