In [26]:
import csv
import os
import time
from typing import List
import random


from semanticscholar import SemanticScholar


# ------------------------------------------------
# STEP 1: Gather main papers (by keywords + venue)
# ------------------------------------------------

def run_step1_initial(
    keywords: List[str], 
    limit: int = 20, 
    output_csv: str = "step1_papers.csv"
):
    """
    Step 1: collect seed papers and their first-level references.

    Every keyword is searched twice, once restricted to "JournalArticle"
    and once to "Conference". For each hit that has a paperId, the paper's
    references are requested and the referenced paperIds are joined
    with "; ".

    Args:
        keywords: Search queries, one search per keyword and publication type.
        limit: Value passed as `limit` to every search call.
        output_csv: Path of the CSV to write (columns: paperId, references,
            keyword, venueType).

    Returns:
        The list of row dicts that was written to 'output_csv'.
    """
    client = SemanticScholar()

    columns = ["paperId", "references", "keyword", "venueType"]
    collected = []

    for kw in keywords:
        for pub_type in ("JournalArticle", "Conference"):
            print(f"[STEP1] Searching for '{kw}' in {pub_type}, limit={limit}...")
            hits = client.search_paper(
                query=kw,
                limit=limit,
                publication_types=[pub_type],
                fields=["paperId"],
            )
            print(f"  Found {hits.total} total. We'll process {len(hits.items)} items...")

            for hit in hits.items:
                paper_id = hit.paperId
                if not paper_id:
                    # Nothing to key the row on; skip the hit entirely.
                    continue
                print(f"  -> Paper {paper_id}, retrieving ...")

                cited_ids = []
                try:
                    response = client.get_paper_references(paper_id=paper_id)
                    for entry in response.items or ():
                        cited = entry.paper
                        if cited and cited.paperId:
                            cited_ids.append(cited.paperId)
                except Exception as err:
                    # Best effort: the paper is still recorded, with whatever
                    # references were gathered before the failure.
                    print(f"    [STEP1] Error retrieving references for {paper_id}: {err}")

                record = dict(zip(columns, (paper_id, "; ".join(cited_ids), kw, pub_type)))
                collected.append(record)

                # Random pause between papers.
                time.sleep(random.uniform(0.2, 0.8))

    with open(output_csv, "w", newline="", encoding="utf-8") as handle:
        out = csv.DictWriter(handle, fieldnames=columns)
        out.writeheader()
        out.writerows(collected)

    print(f"[STEP1] Wrote {len(collected)} rows to {os.path.abspath(output_csv)}")
    return collected


# --------------------------------------------
# STEP 2: Gather second-level references
# --------------------------------------------

def run_step2_expanded(
    input_csv: str = "step1_papers.csv",
    output_csv: str = "step2_papers.csv",
    delay_range=None
):
    """
    Step 2: gather second-level references.

      - Read 'input_csv' (the CSV written by step 1).
      - Collect every paperId listed in its 'references' column
        (";"-separated), de-duplicated in first-seen order so the request
        order and the output row order are the same on every run.
      - For each such paperId, fetch that paper's own references
        (i.e. second-level references) with one get_paper_references call.
      - Write one row per reference paper to 'output_csv' with columns
        paperId, references, keyword, venueType. keyword and venueType hold
        the placeholder string "null"; they will be filled in later.

    Args:
        input_csv: Path of the step-1 CSV to read.
        output_csv: Path of the CSV to write.
        delay_range: Optional (min_seconds, max_seconds) pair. When given,
            sleep a random duration in that range after every request.
            The default (None) sends the requests back to back.

    Returns:
        The list of row dicts that was written to 'output_csv'.
    """
    sch = SemanticScholar()

    ref_ids_list = []
    with open(input_csv, "r", newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            # DictReader fills fields missing from a short row with None,
            # so normalise to "" before splitting.
            refs_str = row.get("references") or ""
            ref_ids_list.extend(x.strip() for x in refs_str.split(";") if x.strip())

    # dict keys keep insertion order; a set of strings does not iterate in a
    # stable order from one interpreter run to the next.
    ref_ids = list(dict.fromkeys(ref_ids_list))

    print(f"[STEP2] Total references (with duplicates): {len(ref_ids_list)}")
    print(f"[STEP2] Found {len(ref_ids)} unique reference IDs from {input_csv}.")

    rows = []
    for i, rid in enumerate(ref_ids, 1):
        print(f"[{i}] -> Gathering references for {rid} ")
        references_list = []
        try:
            ref_result = sch.get_paper_references(paper_id=rid)
            for ref_obj in ref_result.items or ():
                if ref_obj.paper and ref_obj.paper.paperId:
                    references_list.append(ref_obj.paper.paperId)
        except Exception as e:
            # Best effort: a failed lookup still produces a row, with
            # whatever references were gathered before the failure.
            print(f"[STEP2] Error retrieving references for {rid}: {e}")

        rows.append({
            "paperId": rid,
            "references": "; ".join(references_list),
            "keyword": "null",
            "venueType": "null"
        })

        if delay_range:
            time.sleep(random.uniform(*delay_range))

    fieldnames = ["paperId", "references", "keyword", "venueType"]
    with open(output_csv, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)

    print(f"[STEP2] Wrote {len(rows)} rows to {os.path.abspath(output_csv)}")
    return rows

In [16]:
# Step 1: search every seed keyword and record each hit's references.
SEED_KEYWORDS = [
    "data management",
    "indexing",
    "data modeling",
    "big data",
    "data processing",
    "data storage",
    "data querying",
    "artificial intelligence",
    "machine learning",
    "ethics",
    "semantic data",
    "data warehouse",
    "process mining",
    "decision support",
]
step1_rows = run_step1_initial(keywords=SEED_KEYWORDS, limit=20, output_csv="step1_papers.csv")


[STEP1] Searching for 'data management' in JournalArticle, limit=20...
  Found 9179358 total. We'll process 20 items...
  -> Paper e936f248b2c0489316ed1521656af2564c3502c3, retrieving ...
  -> Paper 8dd0c1e955c66092ff951941a151336211e6e171, retrieving ...
  -> Paper bab5e35001757719d0f8338f94dde2860dae784a, retrieving ...
  -> Paper 04eaa03dc670afa88c9f3c83fc8da08ef4d31cdd, retrieving ...
  -> Paper c07802ed8a25998e9bd44ee1ddbcc63b7eb34060, retrieving ...
  -> Paper 6548106035c7208ad498730627874a482734b9ac, retrieving ...
  -> Paper 82e1e8b222aeaca19d45375b31fdc825d1a821b8, retrieving ...
  -> Paper 28b5df48dd23ffc7e7d64fc43e2a420e05ab88f8, retrieving ...
  -> Paper c96fc88631f2b8e2fe192027a8a237445635328c, retrieving ...
  -> Paper 91bda0785eaf642515eefc9ff2ecd7ddbacaccae, retrieving ...
  -> Paper d008893e01fa7f6c5fb01dadf3f97ee96835c303, retrieving ...
  -> Paper b7034546bee38ba13d3b312fce893a22e33ce4dd, retrieving ...
  -> Paper 0df5a4f9cc8a244715fe9968732497d2ac2a7cd1, retrieving 

In [47]:
# Step 2: look up the references of every paper referenced in the step-1 CSV.
step2_rows = run_step2_expanded(input_csv="step1_papers.csv", output_csv="step2_papers.csv")


[STEP2] Total references (with duplicates): 21958
[STEP2] Found 18014 unique reference IDs from step1_papers.csv.
[1] -> Gathering references for aaddbc150828556386afbd21acead6d47ed63ad9 
[2] -> Gathering references for 1b62d32d291332697f755312690eb5ac867da533 
[3] -> Gathering references for 442d50f13dbfc1d550a30f80fbe0d9ec38df6ac3 
[4] -> Gathering references for 922a21245a0c52645c9f784c881f6beff3f58690 
[5] -> Gathering references for 33bcf3348fb08d549eba745f899fef4f8b5f1dc8 
[6] -> Gathering references for 356cddd204074262d32495583209671bd91498e0 
[7] -> Gathering references for 915d0fbe3243eab27781bc5e492877b81fc0ce33 
[8] -> Gathering references for 049f4c438ce9eefa622ae5ba5fb7e34443b86133 
[9] -> Gathering references for cddf40e579a596d0110b260313adf43470617c4c 
[10] -> Gathering references for ebad46b35d9ceaac70956abe9be7b92215a25544 
[11] -> Gathering references for bd3e894559bbcbd114ab08108767b6efefbeb25e 
[12] -> Gathering references for 86171b495d1a0cb1ac21705b837e5b87e39b8

In [33]:
import pandas as pd

# Merge the step-1 rows that have at least one reference with all step-2 rows
# into a single CSV.
#
# 'references' is read as str explicitly: if every cell in that column is
# empty, read_csv would otherwise infer float64 (all NaN) and the .str
# accessor below would raise AttributeError.
df1 = pd.read_csv("step1_papers.csv", dtype={"references": str})
df1 = df1[df1["references"].fillna("").str.strip().ne("")]
df2 = pd.read_csv("step2_papers.csv", dtype={"references": str})
# NOTE(review): read_csv's default NA handling parses the literal "null"
# placeholders in step 2's keyword/venueType columns as NaN, so they are
# written to papers_combined.csv as empty fields. Pass keep_default_na=False
# if the literal text must survive - confirm what downstream code expects.
combined_df = pd.concat([df1, df2], ignore_index=True)
combined_df.to_csv("papers_combined.csv", index=False)