In [1]:
# Prerequisite: download dblp.xml.gz from https://dblp.uni-trier.de/xml/
# and decompress it before running this notebook.

from lxml import etree
from tqdm import tqdm
import pandas as pd
import html
import re
from html.entities import name2codepoint

# DBLP record tags handed to the parser's tag filter; only these record types are read.
TYPE = ("inproceedings", "article")

In [2]:
# Relative path to the uncompressed DBLP dump (dblp.xml.gz after unzipping).
dblp = "dblp/dblp.xml"

# Path of the CSV file that receives the matching papers.
output_file = "output.csv"

In [3]:
# PARAMETERS (optional restriction to target venues and publication years)

# True keeps only papers from the target venues; False disables the TARGET_VENUES filter.
IS_TARGET_VENUE = False
# Conference identifiers only; target journals are recognised by record key inside valid_venue.
TARGET_VENUES = {
     # identifiers as they appear in the <booktitle> field of the XML (conference papers)
    "SP", "USENIX Security Symposium", "CCS", "NDSS", "SOUPS", "EuroS&P", "ICCPS", "IROS", "ICRA"
 }

# Oldest publication year to keep (inclusive).
START_YEAR = 0

In [4]:
# ---- KEYWORDS AND DESIRED CATEGORIES -----
# Category names that must ALL be matched by a title for the paper to be kept
# (checked in desired_categories).
DESIRED_CATEGORIES = {
    "ROBOT", "PRIVACY"
}

# Maps each category name to the keywords that assign it. Keywords are compared
# case-insensitively as substrings of the title (see keyword_match), so a short
# keyword such as "robot" also hits longer words that contain it.
CATEGORIES = {
    "ROBOT": {
        "sensor", "robot", "robotic"
    },
    "PRIVACY": {
        "privacy"
    }
    # Fuller keyword lists, currently disabled; uncomment a block to enable it.
    # "ROBOT": {
    #     "robot", "robotics", "mobile robot", "robot mobility", "autonomous vehicle", "automated vehicle", "automated",
    #     "autonomous", "self-driving", "cobot", "manipulator", "unmanned", "agrobot",
    #     "delivery robot", "vacuum", "drone", "telepresence robot", "humanoid robot", "rover",
    #     "inspection robot", "service robot", "home robot", "domestic robot", "assistive robot",
    #     "companion robot", "food service robot", "surveillance robot", "robot navigation", "multirotor",
    #     "quadrotor", "quadcopter",
    # },
    # "VISUAL": {
    #     "camera", "rgb camera", "vision", "visual", "thermal imaging", "infrared camera", "infrared imaging",
    #     "depth", "lidar", "structured light", "3d sensing", "stereo vision"
    # },
    # "AUDIO": {
    #     "microphone", "acoustic", "ultrasonic", "ultrasound",
    #     "passive listening", "active audio sensing", "doppler sensing", "sound localization"
    # },
    # "RADAR": {
    #     "radar", "radio frequency", "mmwave", "millimeter wave", "wifi sensing",
    #     "fmcw", "rf sensing", "wireless sensing",
    # },
    # "OTHER_SENSORS": {
    #     "accelerometer", "gyroscope", "ambient light", "light sensor", "temperature sensor",
    #     "sensor fusion", "multi-modal sensing", "multimodal sensing", "light-based sensor"
    # },
    # "PRIVACY_AND_SECURITY": {
    #     "privacy", "security", "sensor privacy", "data privacy", "user privacy", "privacy-preserving",
    #     "privacy preserving", "privacy enhancing", "safety", "resilience", "attack",
    #     "privacy-aware", "privacy control", "data leakage", "privacy risk", "sensor leakage",
    #     "user privacy", "data collection", "context-aware privacy", "surveillance", "anonymization",
    #     "obfuscation", "masking", "cloaking", "blurring", "spoofing", "jamming", "sensor blocking",
    #     "invisibility cloak", "retroreflective material", "adversarial", "privacy paradox", "trusted execution", "authentication", "fidelity",
    #     "selective sensing", "privacy-aware sensing", "cybersecurity", "cyber attack", "cyberattack", "breach", "intrusion", "exploit",
    #     "penetration testing", "pentest", "side-channel", "side channel", "replay attack", "eavesdropping", "data exfiltration", "phishing",
    #     "secure", "digital signature", "blockchain", "access control", "key management", "identity management", "identification", "zero-knowledge proof",
    #     "ZKP"
    # },
}

# Short keywords that must match as whole words: keyword_match applies \b word
# boundaries to anything listed here instead of a plain substring test.
# NOTE: while every entry is commented out this literal is an empty dict, not a
# set; the membership tests and iteration used in this notebook work on both.
RISKY_KEYWORDS = {
    # "ir", "rf", "imu", "ros", "uav", "ugv", "auv", "uav", "rov", "usv"
}

# Registered as its own category so check_title scans titles for these keywords too.
CATEGORIES["RISKY_KEYWORDS"] = RISKY_KEYWORDS

In [5]:
# --- HELPER FUNCTIONS (NO CHANGES NEEDED) ---

def is_journal(entry):
    """Return True when the record's tag is "article", False otherwise."""
    record_tag = entry.tag
    return record_tag == "article"

def extract_tag(entry, tag):
    """Return the stripped text of the first `tag` child of `entry`, or None.

    The text of nested inline elements (for example <i>, <sub>, <sup> inside a
    title) is included. Reading only `.text` would stop at the first child
    element and silently truncate such values.

    Returns None when the child is missing or holds no non-whitespace text.
    """
    node = entry.find(tag)
    if node is None:
        return None
    # itertext() walks the node's own text plus the text and tails of its descendants.
    text = "".join(node.itertext()).strip()
    return text or None

def valid_year(entry):
    """Return the record's year string when it is an integer >= START_YEAR, else None.

    None is also returned when the record has no year or the year text is not
    an integer.
    """
    raw_year = extract_tag(entry, "year")
    if raw_year is None:
        return None
    try:
        numeric_year = int(raw_year)
    except ValueError:
        return None
    return raw_year if numeric_year >= START_YEAR else None

def keyword_match(title, keyword):
    """Report whether `keyword` occurs in `title`, ignoring case.

    Keywords listed in RISKY_KEYWORDS must appear as whole words; every other
    keyword matches as a plain substring.
    """
    lowered_title = title.lower()
    keyword = keyword.lower()
    if keyword not in RISKY_KEYWORDS:
        return keyword in lowered_title
    whole_word = rf'\b{re.escape(keyword)}\b'
    return re.search(whole_word, lowered_title, re.IGNORECASE) is not None

# Title fragments that mark non-paper artifacts (posters, demos, extended abstracts).
# The comparison in check_title is case-sensitive.
bad_artifacts = ["Poster:", "extended abstract", "Demo:", "demo ", "poster "]

def check_title(entry):
    """Return (title, categories) for a usable title, or None.

    None is returned when the record has no title or the title contains one of
    the `bad_artifacts` fragments. `categories` lists, in CATEGORIES order, the
    name of every category with at least one keyword found in the title. Each
    name appears at most once, however many of its keywords match, so the
    joined category column never repeats a name.
    """
    title = extract_tag(entry, "title")
    if not title:
        return None
    if any(item in title for item in bad_artifacts):
        return None
    categories = [
        category_name
        for category_name, keyword_set in CATEGORIES.items()
        # any() stops at the first hit, so a category is recorded once only.
        if any(keyword_match(title, keyword) for keyword in keyword_set)
    ]
    return (title, categories)

def desired_categories(categories):
    """Return True only when `categories` is non-empty and holds every DESIRED_CATEGORIES name."""
    if not categories:
        return False
    return all(wanted in categories for wanted in DESIRED_CATEGORIES)

In [6]:
# --- VENUE HELPER FUNCTION (may need customization) ---

def valid_venue(entry):
    """Return the venue label for `entry`, or None when it has no acceptable venue.

    With IS_TARGET_VENUE enabled:
      * journal articles are accepted only if their key contains "popets"
        (labelled "PETS"); customize this identifier for other target journals;
      * other records are accepted if their booktitle contains a TARGET_VENUES
        name as a whole word ("SOUPS" is matched case-insensitively as a
        substring instead).
    With IS_TARGET_VENUE disabled:
      * journal articles yield the path component after "journals/" in their key;
      * other records yield their booktitle.
    """
    if is_journal(entry):
        key = entry.get("key")
        if IS_TARGET_VENUE:
            # customize to identifier in target JOURNAL key
            return "PETS" if "popets" in key else None
        key_match = re.search(r"^journals/([^/]+)/", key)
        return key_match.group(1) if key_match else None

    booktitle = extract_tag(entry, "booktitle")
    if not IS_TARGET_VENUE:
        return booktitle or None
    if not booktitle:
        return None

    for venue in TARGET_VENUES:
        if venue == "SOUPS":
            if "soups" in booktitle.lower():
                return "SOUPS"
        elif re.search(rf'\b{re.escape(venue)}\b', booktitle):
            return venue
    return None

In [7]:
# --- MAIN HELPER FUNCTIONS (NO CHANGES NEEDED) ---

def extract_entry(entry):
    """Build the output record for `entry`, or return None if it is filtered out.

    A record is produced only when the entry has a valid year and venue, a
    usable title, and that title covers all desired categories. Relative DBLP
    urls are prefixed with "https://dblp.org/", and "N/A" stands in when no
    author text is present.
    """
    year = valid_year(entry)
    venue = valid_venue(entry)
    if not (year and venue):
        return None

    title_and_categories = check_title(entry)
    if not title_and_categories:
        return None
    title, categories = title_and_categories
    if not desired_categories(categories):
        return None

    ee = extract_tag(entry, "ee")
    url = extract_tag(entry, "url")
    if url and not url.startswith("http"):
        url = "https://dblp.org/" + url

    author_names = [node.text.strip() for node in entry.findall("author") if node.text]
    authors = author_names or ["N/A"]

    record = {
        "title": title,
        "category": ";".join(categories),
        "authors": "; ".join(authors),
        "year": year,
        "venue": venue,
        "ee": ee,
        "url": url,
    }
    return record

# --- PARSER ---
def parse_dblp(xml_file, skip=0, max=None):
    """Stream the DBLP XML dump and collect every record accepted by extract_entry.

    Parameters
    ----------
    xml_file : str
        Path to the uncompressed dblp.xml file; it is opened in binary mode.
    skip : int
        Number of leading entries to pass over without evaluating them.
    max : int or None
        Stop once this many entries have been read (skipped entries count
        towards it). None or 0 means no limit.

    Returns
    -------
    list of dict
        One record per matching paper, as built by extract_entry. An empty
        list is returned if the parser cannot be created.

    Entries whose extraction raises are reported, counted and passed over, so
    one bad record does not abort the whole run.
    """
    results = []
    total = 0    # entries read so far, including those passed over via `skip`
    skipped = 0  # entries whose extraction raised an exception

    with open(xml_file, 'rb') as file:  # open file as binary
        try:
            context = etree.iterparse(
                file,
                events=('end',),
                tag=TYPE,
                load_dtd=False,
                no_network=True,
                resolve_entities=False,
                recover=True,
                huge_tree=True
            )
        except Exception as e:
            print(f"Could not initialize parser: {e}")
            return []

        print("Parsing DBLP... (this may take several minutes)")
        for _, element in tqdm(context, desc="Entries processed", unit="entry"):
            # Check the limit before counting so exactly `max` entries are read.
            if max and total >= max:
                break
            total += 1

            try:
                # Entries 1..skip are passed over, but still fall through to
                # the cleanup below so skipping does not grow the tree.
                if total > skip:
                    record = extract_entry(element)
                    if record:
                        results.append(record)
            except Exception as e:
                skipped += 1
                print(f"Skipping entry {total} due to error: {e}")
                try:
                    print(etree.tostring(element, pretty_print=True, encoding="unicode"))
                except Exception as sub_e:
                    print(f"Could not print element {total}: {sub_e}")
            finally:
                # Free memory: drop this element's content and every earlier
                # sibling that is still attached to the parent.
                try:
                    element.clear()
                    while element.getprevious() is not None:
                        del element.getparent()[0]
                except Exception as cleanup_e:
                    print(f"Cleanup error at entry {total}: {cleanup_e}")

    print(f"\nTotal entries parsed: {total}")
    print(f"Total matching papers: {len(results)}")
    print(f"Total skipped entries: {skipped}")
    return results

In [8]:
# --- MAIN DBLP SCRAPER ----
if __name__ == "__main__":
    # Parse the whole dump with the default skip/max arguments.
    papers = parse_dblp(dblp)

    # One row per matching paper; the columns come from the record dict keys.
    df = pd.DataFrame(papers)
    # Row count, i.e. the number of matching papers written below.
    print(len(df))
    df.to_csv(output_file, index=False)
    print("Saved to " + output_file)

Parsing DBLP... (this may take several minutes)


Entries processed: 0entry [00:00, ?entry/s]

Entries processed: 7691562entry [01:46, 72329.76entry/s]


Total entries parsed: 7691562
Total matching papers: 870
Total skipped entries: 0
870
Saved to output.csv



