<a href="https://colab.research.google.com/github/awkerns/awkerns.github.io/blob/main/Search_Term_Scraping_from_Crossref.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import modules/packages

In [None]:
# Imports for the Crossref phrase-count script: HTTP client, tabular data, request pacing, plotting
import requests
import pandas as pd
import time
import matplotlib.pyplot as plt

Query the Crossref REST API (api.crossref.org) for the yearly publication count of each search phrase

In [None]:


# === Define Search Phrases ===
# Single source of truth: each group name maps to (phrases, line colors),
# paired by position. The flat `phrases` list is derived from this mapping
# below, so the set of phrases that is queried cannot drift away from the
# set of phrases that is plotted.
color_groups = {
    # Group 1: Core spatial point process language (Blue)
    "Core Point Process Terms": (
        [
            "point process",
            "spatial point process",
            "spatial point pattern",
            "Poisson point process",
            "inhomogeneous Poisson point process",
            "Cox process",
            "Log-Gaussian Cox process",
            "marked point process",
            "spatio-temporal point process",
        ],
        ["#1f77b4", "#4f9bd8", "#72b4e6", "#95c8f0", "#a8d4f5",
         "#bbdfff", "#cce8ff", "#ddeeff", "#eef7ff"],  # Blues
    ),
    # Group 2: Intensity / interaction concepts (Green)
    "Intensity & Interaction": (
        [
            "intensity function",
            "conditional intensity",
            "Papangelou conditional intensity",
            "pair correlation function",
            "Ripley’s K function",
            "spatial interaction model",
        ],
        ["#2ca02c", "#5ab55a", "#80c480", "#a0d6a0", "#c0e8c0", "#e0f0e0"],  # Greens
    ),
    # Group 3: PGF / PGFL – underutilized target (Red)
    "PGF / PGFL Terms": (
        [
            "probability generating function",
            "probability generating functional",
            "point process generating functional",
            "probability generating functional of a point process",
            "PGFL",
        ],
        ["#d62728", "#e55a5b", "#f08c8d", "#f8bcbc", "#fcd9d9"],  # Reds
    ),
}

# Phrases and colors are consumed pairwise; a length mismatch would silently
# drop phrases when they are zipped together, so fail loudly here instead.
for _group_name, (_group_phrases, _group_colors) in color_groups.items():
    if len(_group_phrases) != len(_group_colors):
        raise ValueError(
            f"Group {_group_name!r} has {len(_group_phrases)} phrases "
            f"but {len(_group_colors)} colors"
        )

# Flat list of every phrase to query, in group order.
phrases = [
    phrase
    for group_phrases, _ in color_groups.values()
    for phrase in group_phrases
]

# Publication years to query: NUM_YEARS consecutive years from START_YEAR.
START_YEAR = 1996
NUM_YEARS = 30
years = list(range(START_YEAR, START_YEAR + NUM_YEARS))

# =========================

# === Query Crossref: one request per (phrase, year) ===
CROSSREF_URL = "https://api.crossref.org/works"
REQUEST_TIMEOUT = 15   # seconds allowed per HTTP request
REQUEST_PAUSE = 0.8    # seconds to wait between consecutive queries
MAX_ATTEMPTS = 3       # tries per (phrase, year) before giving up


def _fetch_count(session, phrase, year):
    """Return the ``total-results`` value Crossref reports for one query.

    The phrase is sent double-quoted as ``query.bibliographic`` and the
    publication date is restricted to *year* via the ``filter`` parameter.
    ``rows`` is 0 because only the total is read, not the records.

    Args:
        session: ``requests.Session`` used to send the request.
        phrase: Search phrase (without surrounding quotes).
        year: Publication year to restrict the query to.

    Returns:
        The ``total-results`` field of the response's ``message`` object,
        or 0 when that field is absent.

    Raises:
        requests.RequestException, ValueError: if every attempt fails at the
            HTTP or JSON-decoding level (the last error is re-raised).
        KeyError: if the decoded response has no ``message`` key.
    """
    # NOTE(review): Crossref's API etiquette reportedly recommends adding a
    # contact address (``mailto``) to requests - confirm and add if wanted.
    params = {
        "query.bibliographic": f'"{phrase}"',
        "filter": f"from-pub-date:{year}-01-01,until-pub-date:{year}-12-31",
        "rows": 0,
    }
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            response = session.get(CROSSREF_URL, params=params, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            payload = response.json()
        except (requests.RequestException, ValueError):
            if attempt == MAX_ATTEMPTS:
                raise
            # Back off a little longer after each failed attempt.
            time.sleep(REQUEST_PAUSE * 2 ** attempt)
        else:
            return payload["message"].get("total-results", 0)


rows = []
failed = []  # (phrase, year) pairs whose count could not be retrieved

print(f"Searching Crossref for {len(phrases)} phrases across {len(years)} years...\n")

# One session for the whole run so the HTTP connection is reused.
with requests.Session() as session:
    for phrase in phrases:
        print(f"Processing phrase: \"{phrase}\"")
        for y in years:
            try:
                count = _fetch_count(session, phrase, y)
            except Exception as e:
                # Best-effort boundary: keep the run going, record 0 for this
                # cell, and remember the failure so it is reported at the end.
                print(f"  Error for {y}: {e}")
                count = 0
                failed.append((phrase, y))

            rows.append({"Year": y, "Phrase": phrase, "Count": count})
            print(f"  {y}: {count}")
            time.sleep(REQUEST_PAUSE)

if failed:
    # A 0 caused by a failed request is indistinguishable from a genuine 0 in
    # the table, so list the affected cells explicitly.
    print(f"\nWARNING: {len(failed)} request(s) failed and were recorded as 0:")
    for failed_phrase, failed_year in failed:
        print(f"  \"{failed_phrase}\" / {failed_year}")

# Create DataFrame and pivot: one row per year, one column per phrase
df = pd.DataFrame(rows)
df_pivot = df.pivot(index="Year", columns="Phrase", values="Count").fillna(0).astype(int)

print("\n=== Final Results Table ===")
print(df_pivot)


Plot the yearly counts, colored by phrase group, and save the table to CSV

In [None]:
# === Plot with Grouped Colors + End-of-Line Labels ===

# Save data first so the scraped counts are on disk even if plotting fails.
df_pivot.to_csv("crossref_phrase_counts_grouped.csv")
print("\nData saved to 'crossref_phrase_counts_grouped.csv'")

fig, ax = plt.subplots(figsize=(16, 9))

# Labels are anchored at the last year present in the table.
last_year = df_pivot.index[-1]

# Plot each individual phrase in its group's shade
for group_name, (group_phrases, colors) in color_groups.items():
    for phrase, color in zip(group_phrases, colors):
        if phrase not in df_pivot.columns:
            continue

        ax.plot(df_pivot.index, df_pivot[phrase],
                marker="o", color=color, linewidth=2, markersize=4)

        # Phrase label just to the right of the line's final point, vertically
        # centered on that point (no overlap avoidance is performed).
        last_value = df_pivot.loc[last_year, phrase]
        ax.text(last_year + 0.6, last_value, phrase,
                fontsize=9, color=color, fontweight='medium',
                verticalalignment='center')

# Custom group legend only: one thick swatch per group, in the group's first
# (darkest) color, taken from color_groups so legend and lines stay in sync.
handles = [
    plt.Line2D([0], [0], color=colors[0], lw=5, label=group_name)
    for group_name, (_, colors) in color_groups.items()
]

ax.set_xlabel("Year", fontsize=12)
ax.set_ylabel("Number of Publications (Crossref)", fontsize=12)
# Year range in the title follows `years` rather than being hard-coded.
ax.set_title(
    f"Crossref Publication Trends by Phrase Group ({years[0]}–{years[-1]})\n"
    "Individual phrases labeled at end of lines",
    fontsize=14, pad=20)
ax.legend(handles=handles, loc='upper left', fontsize=12, frameon=True, fancybox=True, shadow=True)
ax.grid(True, alpha=0.3)
# Extra room on the right for the end-of-line phrase labels.
ax.set_xlim(years[0] - 1, years[-1] + 6)
plt.tight_layout()
plt.show()