# Data Ingestion

In [4]:
DATA_PATH = "../data/"

## arXiv API

In [4]:
!pip install arxiv -q


[notice] A new release of pip available: 22.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [34]:
arxiv_categories = {
    "cs.LG": "Machine Learning",
    "cs.CV": "Computer Vision and Pattern Recognition",
    "cs.CL": "Computation and Language",
    "cs.AI": "Artificial Intelligence",
    "stat.ML": "Statistics",
    "eess.IV": "Electrical Engineering and Systems Science",
    "cs.RO": "Robotics",
    "cs.NE": "Neural and Evolutionary Computing",
    #"cs.HC": "Human-Computer Interaction",
    #"cs.DS": "Data Structures and Algorithms",
    #"cs.CR": "Cryptography and Security",
    #"astro-ph": "Astrophysics",
    #"cond-mat": "Condensed Matter",
    #"hep-ph": "High Energy Physics",
    #"hep-th": "High Energy Physics",
    #"quant-ph": "Quantum Physics",
    #"math-ph": "Mathematical Physics",
    #"gr-qc": "General Relativity and Quantum Cosmology",
}

trending_keywords = {
    "model_architecture": [
        "transformer",
        "BERT",
        "GPT",
        "LLaMA",
        "vision transformer",
        "ViT",
        "GNN",
        "CNN",
        "RNN",
        "autoencoder",
        "variational autoencoder",
        "attention mechanism",
        "GAN"
    ],
    "techniques": [
        "self-supervised learning",
        "unsupervised learning",
        "reinforcement learning",
        "contrastive learning",
        "few-shot learning",
        "zero-shot learning",
        "meta-learning",
        "transfer learning",
        "foundation model",
        "multimodal learning"
    ],
    "applications": [
        "computer vision",
        "image classification",
        "object detection",
        "segmentation",
        "NLP",
        "speech recognition",
        "ASR",
        "robotics",
        "autonomous systems",
        "bioinformatics",
        "recommendation system"
    ],
    "content_generation": [
        "text generation",
        "image generation",
        "diffusion model",
        "LLM",
        "retrieval augmented generation"
    ]
}

all_keywords = sum(trending_keywords.values(), [])

len(arxiv_categories), len(all_keywords)

(8, 39)

In [None]:
import arxiv
from arxiv import UnexpectedEmptyPageError
from tqdm.notebook import tqdm
from datetime import datetime, timedelta

client = arxiv.Client(
        page_size=2000,
        delay_seconds=10.0,
        num_retries=5
    )

articles = []
num_keywords = 2
years=7
now = datetime.now()
start_date = (now - timedelta(days=365*years)).strftime("%Y%m%d%H%M")
end_date = now.strftime("%Y%m%d%H%M")
categories = arxiv_categories.items()

for cat, name in tqdm(categories, total=len(categories), desc="Processing categories"):
    for keyword in tqdm(all_keywords, total=len(all_keywords), desc="Processing keywords", leave=False, position=1):
        query = f"cat:{cat} AND ti:\"{keyword}\" AND submittedDate:[{start_date} TO {end_date}]"
        search = arxiv.Search(query=query, max_results=15000, sort_by=arxiv.SortCriterion.SubmittedDate, sort_order=arxiv.SortOrder.Descending)
        try:
            for result in client.results(search):
                articles.append({
                    "title": result.title,
                    "categories": result.categories,
                    "keyword": keyword,
                    "abstract": result.summary,
                    "authors": [author.name for author in result.authors],
                    "url": result.entry_id,
                    "published": result.published,
                    "year": result.published.year
                })
        except UnexpectedEmptyPageError:
            continue

Processing categories:   0%|          | 0/8 [00:00<?, ?it/s]

Processing keywords:   0%|          | 0/39 [00:00<?, ?it/s]

Processing keywords:   0%|          | 0/39 [00:00<?, ?it/s]

Processing keywords:   0%|          | 0/39 [00:00<?, ?it/s]

Processing keywords:   0%|          | 0/39 [00:00<?, ?it/s]

Processing keywords:   0%|          | 0/39 [00:00<?, ?it/s]

Processing keywords:   0%|          | 0/39 [00:00<?, ?it/s]

Processing keywords:   0%|          | 0/39 [00:00<?, ?it/s]

Processing keywords:   0%|          | 0/39 [00:00<?, ?it/s]

In [55]:
len(articles)

30645

## OpenAlex API

In [58]:
!pip install pyalex python-dotenv -q


[notice] A new release of pip available: 22.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [56]:
from pyalex import config
import os
from dotenv import load_dotenv

load_dotenv()

config.email = os.getenv("EMAIL")
config.max_retries = 5
config.retry_backoff_factor = 0.5
config.retry_http_codes = [429, 500, 503]

In [57]:
from pyalex import Topics, Works

articles_oa = []

for category, name in tqdm(arxiv_categories.items()):

    topics = Topics().search(name).get()

    for t in topics:
        topic_id = t["id"]

        if t["works_count"] > 0:
            works_generator = Works().filter(
                topics={"id": [topic_id.split("/")[-1]]},
                has_abstract=True
            ).paginate(per_page=200)

            for i, works_page in enumerate(works_generator):
                for w in works_page[:1000]:
                    if "abstract_inverted_index" in w and w["abstract_inverted_index"]:
                        inv_index = w["abstract_inverted_index"]
                        max_pos = max(pos for positions in inv_index.values() for pos in positions)
                        abstract_words = [None] * (max_pos + 1)

                        for word, positions in inv_index.items():
                            for pos in positions:
                                abstract_words[pos] = word

                        abstract_text = " ".join(w for w in abstract_words if w)
                        articles_oa.append({
                            "title": w["display_name"],
                            "categories": category,
                            "abstract": abstract_text,
                            "authors": [a["author"]["display_name"] for a in  w["authorships"]],
                            "url": w["id"],
                            "published": w.get("publication_date"),
                            "year": w.get("publication_year")
                        })

                    else:
                        print("Abstract: N/A")

                if i == 5:
                    break

                    

  0%|          | 0/8 [00:00<?, ?it/s]

In [58]:
len(articles_oa)

108485

## Lists to parquet file

In [59]:
import pandas as pd

df = pd.DataFrame(data=articles)
df["categories"] = df["categories"].apply(lambda x: ", ".join(x))
df.head()

Unnamed: 0,title,categories,keyword,abstract,authors,url,published,year
0,Structure-Attribute Transformations with Marko...,cs.LG,transformer,Graph domain adaptation has gained significant...,"[Zhen Liu, Yongtao Zhang, Shaobo Ren, Yuxin You]",http://arxiv.org/abs/2509.21059v1,2025-09-25 12:09:53+00:00,2025
1,MAIFormer: Multi-Agent Inverted Transformer fo...,cs.LG,transformer,Flight trajectory prediction for multiple airc...,"[Seokbin Yoon, Keumjin Lee]",http://arxiv.org/abs/2509.21004v1,2025-09-25 10:59:29+00:00,2025
2,Why Attention Fails: The Degeneration of Trans...,cs.LG,transformer,Transformer-based architectures achieved high ...,"[Zida Liang, Jiayi Zhu, Weiqiang Sun]",http://arxiv.org/abs/2509.20942v1,2025-09-25 09:25:51+00:00,2025
3,FHRFormer: A Self-supervised Transformer Appro...,"cs.LG, cs.AI, cs.CE, cs.CV",transformer,Approximately 10\% of newborns require assista...,"[Kjersti Engan, Neel Kanwal, Anita Yeconia, La...",http://arxiv.org/abs/2509.20852v1,2025-09-25 07:40:21+00:00,2025
4,T2I-Diff: fMRI Signal Generation via Time-Freq...,cs.LG,transformer,Functional Magnetic Resonance Imaging (fMRI) i...,"[Hwa Hui Tew, Junn Yong Loo, Yee-Fan Tan, Xiny...",http://arxiv.org/abs/2509.20822v1,2025-09-25 07:08:19+00:00,2025


In [60]:
df_oa = pd.DataFrame(data=articles_oa)
df_oa.head()

Unnamed: 0,title,categories,abstract,authors,url,published,year
0,UCSF Chimera—A visualization system for explor...,cs.LG,"The design, implementation, and capabilities o...","[Eric F. Pettersen, Thomas D. Goddard, Conrad ...",https://openalex.org/W2132629607,2004-07-01,2004
1,AutoDock Vina: Improving the speed and accurac...,cs.LG,"AutoDock Vina, a new program for molecular doc...","[Oleg Trott, Arthur J. Olson]",https://openalex.org/W2134967712,2009-06-04,2009
2,Gaussian basis sets for use in correlated mole...,cs.LG,"In the past, basis sets for use in correlated ...",[Thom H. Dunning],https://openalex.org/W2069006374,1989-01-15,1989
3,The M06 suite of density functionals for main ...,cs.LG,We present two new hybrid meta exchange- corre...,"[Yan Zhao, Donald G. Truhlar]",https://openalex.org/W2150697053,2007-07-12,2007
4,<i>VESTA 3</i>for three-dimensional visualizat...,cs.LG,VESTA is a three-dimensional visualization sys...,"[Koichi Momma, Fujio Izumi]",https://openalex.org/W2028056984,2011-10-28,2011


In [61]:
idx_duplicated = df[df[["title", "abstract", "url"]].duplicated()].index
df.drop(idx_duplicated, axis=0, inplace=True)

idx_duplicated = df_oa[df_oa[["title", "abstract", "url"]].duplicated()].index
df_oa.drop(idx_duplicated, axis=0, inplace=True)

In [62]:
df.shape, df_oa.shape

((23294, 8), (92525, 7))

In [63]:
df_full = pd.concat([df, df_oa])

df_full["year"] = df_full["year"].astype(str)
df_full["published"] = df_full["published"].astype(str)
df_full["authors"] = df_full["authors"].apply(lambda x: ", ".join(x))

short_abstracts = df_full[df_full.abstract.apply(lambda x: len(x)) < 200].index
df_full = df_full.drop(short_abstracts, axis=0)

nan_titles = df_full[df_full.title.isna()].index
df_full = df_full.drop(nan_titles, axis=0)

df_full.shape

(113252, 8)

In [None]:
df_full.to_parquet(DATA_PATH+"articles.parquet", engine="pyarrow", index=False)

In [None]:
import pandas as pd

df_full = pd.read_parquet(DATA_PATH+"articles.parquet")

In [66]:
import re
import unicodedata

def clean_whitespace(text: str) -> str:
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def remove_latex(text: str) -> str:
    text = re.sub(r"\$.*?\$", " ", text) 
    text = re.sub(r"\\[a-zA-Z]+", " ", text)
    text = re.sub(r"\{.*?\}", " ", text)
    return text

def normalize_unicode(text: str) -> str:
    return unicodedata.normalize("NFKC", text)

def remove_urls(text: str) -> str:
    return re.sub(r"http\S+|www\S+", "", text)

def remove_special_chars(text: str) -> str:
    return re.sub(r"[^a-zA-Z0-9\s.,;:!?()\-']", "", text)

def remove_emails(text: str) -> str:
    return re.sub(r"\S+@\S+", "", text)

def clean_text(text: str) -> str:
    text = normalize_unicode(text)
    text = remove_urls(text)
    text = remove_emails(text)
    text = remove_latex(text)
    text = remove_special_chars(text)
    text = clean_whitespace(text)
    return text


In [67]:
import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer

def chunk_by_tokens(text, tokenizer, chunk_size=512, overlap=50):
    tokens = tokenizer.encode(text, add_special_tokens=False)
    
    chunks = []
    for i in range(0, len(tokens), chunk_size - overlap):
        chunk = tokens[i:i+chunk_size]
        chunk_text = tokenizer.decode(chunk)
        chunks.append(chunk_text)
    
    return chunks

tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-small-en")

articles_chunks = []
lenght = 500
overlap = 100

for n, row in tqdm(df_full.iterrows(), total=len(df_full), desc="Processing"):
    """number_chunks = np.ceil(len(df_full.iloc[0].abstract) / lenght).astype('int')
    for i in range(number_chunks):
        index = i*lenght 
        abstract = row["abstract"][index - overlap*(i > 0):]
        
        if len(abstract) >= lenght+overlap:
            chunk = abstract[:lenght+(2*overlap)]
        else:
            chunk = abstract"""
    text = "Title: "+ row["title"] + " - Abstract:" + row["abstract"]
    text = clean_text(text)
    chunks = chunk_by_tokens(text=text, tokenizer=tokenizer, chunk_size=512, overlap=50)
    for i, chunk in enumerate(chunks):
        articles_chunks.append({"id": n,
                            "title": row.title,
                            "categories": row.categories,
                            "abstract_chunk": chunk,
                            "id_chunk":i,
                            "authors": row.authors,
                            "url": row.url,
                            "published": row.published,
                            "year": row.year})
        
        #if len(row["abstract"]) < (index+lenght+overlap):
            #break

Processing:   3%|▎         | 3782/113252 [00:03<01:40, 1086.21it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (577 > 512). Running this sequence through the model will result in indexing errors
Processing: 100%|██████████| 113252/113252 [01:44<00:00, 1080.86it/s]


In [68]:
df_final = pd.DataFrame(data=articles_chunks)
df_final.head()

Unnamed: 0,id,title,categories,abstract_chunk,id_chunk,authors,url,published,year
0,0,Structure-Attribute Transformations with Marko...,cs.LG,title : structure - attribute transformations ...,0,"Zhen Liu, Yongtao Zhang, Shaobo Ren, Yuxin You",http://arxiv.org/abs/2509.21059v1,2025-09-25 12:09:53+00:00,2025
1,1,MAIFormer: Multi-Agent Inverted Transformer fo...,cs.LG,title : maiformer : multi - agent inverted tra...,0,"Seokbin Yoon, Keumjin Lee",http://arxiv.org/abs/2509.21004v1,2025-09-25 10:59:29+00:00,2025
2,2,Why Attention Fails: The Degeneration of Trans...,cs.LG,title : why attention fails : the degeneration...,0,"Zida Liang, Jiayi Zhu, Weiqiang Sun",http://arxiv.org/abs/2509.20942v1,2025-09-25 09:25:51+00:00,2025
3,3,FHRFormer: A Self-supervised Transformer Appro...,"cs.LG, cs.AI, cs.CE, cs.CV",title : fhrformer : a self - supervised transf...,0,"Kjersti Engan, Neel Kanwal, Anita Yeconia, Lad...",http://arxiv.org/abs/2509.20852v1,2025-09-25 07:40:21+00:00,2025
4,4,T2I-Diff: fMRI Signal Generation via Time-Freq...,cs.LG,title : t2i - diff : fmri signal generation vi...,0,"Hwa Hui Tew, Junn Yong Loo, Yee-Fan Tan, Xinyu...",http://arxiv.org/abs/2509.20822v1,2025-09-25 07:08:19+00:00,2025


In [69]:
df_final.shape

(121881, 9)

In [None]:
df_final.to_parquet(DATA_PATH+"articles_chunks.parquet", engine="pyarrow", index=False)

In [2]:
!jupyter nbconvert --to script data_ingestion.ipynb

[NbConvertApp] Converting notebook data_ingestion.ipynb to script
[NbConvertApp] Writing 10113 bytes to data_ingestion.py
