In [1]:
import arxiv
import os
import csv
import zlib
import pandas as pd
import calendar

StatementMeta(, , , Waiting, , Waiting)

In [2]:
# Pipeline parameters: placeholder defaults for this notebook.
# NOTE(review): presumably overridden by the orchestrating pipeline at run time - confirm.
query_id = ""  # stamped onto every paper record as 'query_id' and embedded in the output table name
query = ""  # arXiv search query; its ASCII letters are also embedded in the output table name
paper_num = 0  # passed as max_results to the arXiv search
date_from = ""  # optional start date, 'YYYY-MM-DD'; an empty string leaves the lower bound open
date_to = ""  # optional end date, 'YYYY-MM-DD'; an empty string leaves the upper bound open

StatementMeta(, 669cd7cd-0251-4696-a1f4-95f98e0976de, 6, Finished, Available, Finished)

In [5]:
from pyspark.sql.types import StructType, StructField, LongType, StringType, TimestampType

# Spark schema for the flattened paper records produced by ArxivResearchHelper.
# Field order matters: it mirrors the key order of the paper dictionaries
# (search fields, then Semantic Scholar enrichment, then query_id).
schema = StructType(
    [
        StructField('hash_id', LongType(), True),  # CRC32 of the paper's entry_id
        StructField('title', StringType(), True),  # Title of the paper
        StructField('authors', StringType(), True),  # Author names joined with ", "
        StructField('published', TimestampType(), True),  # Publication timestamp reported by arXiv
        StructField('summary', StringType(), True),  # Abstract or summary of the paper
        StructField('pdf_url', StringType(), True),  # URL to download the paper's PDF
        StructField('entry_id', StringType(), True),  # arXiv entry ID of the paper
        StructField('recommended', LongType(), True),  # 1 for papers added as recommendations, 0 for original search results
        StructField('referenceCount', LongType(), True),  # Number of references in the paper
        StructField('citationCount', LongType(), True),  # Number of times the paper has been cited
        StructField('references', StringType(), True),  # Reference titles, each followed by '|'
        StructField('citations', StringType(), True),  # Citing-paper titles, each followed by '|'
        StructField('s2FieldsOfStudy', StringType(), True),  # Field-of-study categories, each followed by '|'
        StructField('tldr', StringType(), True),  # Text of the Semantic Scholar TLDR summary
        StructField('query_id', StringType(), True)  # Identifier for the query that generated the result
    ]
)

StatementMeta(, 669cd7cd-0251-4696-a1f4-95f98e0976de, 9, Finished, Available, Finished)

In [6]:
import os
import calendar
import arxiv
import zlib
import csv
import json
import requests
from datetime import datetime
from time import sleep
import re
import pandas as pd  # Ensure pandas is imported

class ArxivResearchHelper:
    """
    Search arXiv and enrich the results with Semantic Scholar metadata.

    Every paper is represented as a plain dictionary with the keys
    hash_id, title, authors, published, summary, pdf_url, entry_id and
    recommended; get_citation_data adds the keys listed in
    ENRICHMENT_DEFAULTS.
    """

    # Seconds to wait for a Semantic Scholar response before requests raises a timeout.
    REQUEST_TIMEOUT = 30

    # Number of papers sent to Semantic Scholar per batch request.
    BATCH_SIZE = 100

    # Keys written by get_citation_data and the value used when Semantic Scholar
    # has no data for a paper. Insertion order is the order the keys are added.
    ENRICHMENT_DEFAULTS = {
        "referenceCount": 0,
        "citationCount": 0,
        "references": "",
        "citations": "",
        "s2FieldsOfStudy": "",
        "tldr": "",
    }

    def __init__(self, download_dir="downloads", page_size=10, delay_seconds=3.0, num_retries=3):
        """
        Create the download directory (if needed) and the arXiv client.

        Parameters:
        - download_dir (str): Directory that download_pdf writes into.
        - page_size (int): Page size passed to arxiv.Client.
        - delay_seconds (float): Delay between requests passed to arxiv.Client.
        - num_retries (int): Retry count passed to arxiv.Client.
        """
        self.download_dir = download_dir
        # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
        os.makedirs(self.download_dir, exist_ok=True)

        # Initialize the arxiv.Client with custom settings
        self.client = arxiv.Client(
            page_size=page_size,
            delay_seconds=delay_seconds,
            num_retries=num_retries
        )

    def format_paper_id(self, entry_id):
        """
        Format the paper ID to match Semantic Scholar's expected format.

        Takes the last '/'-separated segment of the entry ID, strips a trailing
        version suffix such as 'v2', and prefixes the result with 'ARXIV:'.

        Parameters:
        - entry_id (str): The arXiv entry ID.

        Returns:
        - str: Formatted paper ID.
        """
        arxiv_id = re.sub(r"v\d+$", "", entry_id.split("/")[-1])
        return f"ARXIV:{arxiv_id}"

    @staticmethod
    def _build_query(query, date_from=None, date_to=None):
        """Return query, wrapped with a submittedDate range when a date bound is given."""
        if not (date_from or date_to):
            return query

        if date_from:
            try:
                lower = datetime.strptime(date_from, "%Y-%m-%d").strftime("%Y%m%d")
            except ValueError as exc:
                raise ValueError("Invalid date_from format. Use 'YYYY-MM-DD'.") from exc
        else:
            lower = "*"

        if date_to:
            try:
                parsed_to = datetime.strptime(date_to, "%Y-%m-%d")
            except ValueError as exc:
                raise ValueError("Invalid date_to format. Use 'YYYY-MM-DD'.") from exc
            # The upper bound is extended to the last day of date_to's month.
            last_day = calendar.monthrange(parsed_to.year, parsed_to.month)[1]
            upper = parsed_to.replace(day=last_day).strftime("%Y%m%d")
        else:
            upper = "*"

        return f"({query}) AND submittedDate:[{lower} TO {upper}]"

    def search_papers(self, query, max_results=50, date_from=None, date_to=None):
        """
        Search for papers on arXiv with an optional date range.

        Errors raised while fetching are printed and the results collected so
        far are returned (best effort).

        Parameters:
        - query (str): The search query.
        - max_results (int or None): Maximum number of results to return;
          None applies no limit in this method.
        - date_from (str): Start date in 'YYYY-MM-DD' format.
        - date_to (str): End date in 'YYYY-MM-DD' format. The range is extended
          to the last day of that month.

        Returns:
        - List of dictionaries containing paper details.

        Raises:
        - ValueError: If date_from or date_to is not in 'YYYY-MM-DD' format.
        """
        query = self._build_query(query, date_from, date_to)

        # Create the search object
        search = arxiv.Search(
            query=query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.SubmittedDate  # Sort by submission date
        )

        results = []
        try:
            for result in self.client.results(search):
                results.append({
                    "hash_id": zlib.crc32(bytes(result.entry_id, 'utf-8')),
                    "title": result.title,
                    "authors": ", ".join([author.name for author in result.authors]),
                    "published": result.published,
                    "summary": result.summary,
                    "pdf_url": result.pdf_url,
                    "entry_id": result.entry_id,
                    "recommended": 0  # Flag as original search result
                })

                # Guard against None: comparing len() with None raises TypeError,
                # which the handler below would swallow after the first result.
                if max_results is not None and len(results) >= max_results:
                    break  # Stop if we reach the max results limit
        except Exception as e:
            print(f"Error while fetching results: {e}")

        return results

    @staticmethod
    def _join_values(items, key):
        """Concatenate item[key] for every item that has a value, each followed by '|'."""
        return "".join(
            f"{item[key]}|" for item in (items or []) if item and item.get(key)
        )

    def _fetch_citation_batch(self, paper_ids):
        """POST one id batch to Semantic Scholar; return the JSON list, or None on a non-200 reply."""
        response = requests.post(
            'https://api.semanticscholar.org/graph/v1/paper/batch',
            params={'fields': 'referenceCount,citationCount,tldr,s2FieldsOfStudy,citations,references'},
            json={"ids": paper_ids},
            timeout=self.REQUEST_TIMEOUT
        )
        if response.status_code != 200:
            print("Error fetching citation data:", response.text)
            return None
        return response.json()

    def _apply_citation_record(self, paper, record):
        """Copy the fields of one Semantic Scholar record onto a paper dictionary."""
        paper['referenceCount'] = record.get("referenceCount") or 0
        paper['citationCount'] = record.get("citationCount") or 0
        paper["references"] = self._join_values(record.get("references"), "title")
        paper["citations"] = self._join_values(record.get("citations"), "title")
        paper['s2FieldsOfStudy'] = self._join_values(record.get("s2FieldsOfStudy"), "category")
        tldr = record.get("tldr", "")
        paper['tldr'] = tldr['text'] if tldr else tldr

    def get_citation_data(self, papers):
        """
        Enrich papers with citation data from Semantic Scholar.

        The paper dictionaries are updated in place. Every returned paper has
        all keys of ENRICHMENT_DEFAULTS; papers Semantic Scholar has no record
        for (or whose entry_id is not an arxiv.org entry) keep the defaults.
        Papers of a batch whose request returns a non-200 status are left out
        of the result.

        Parameters:
        - papers (list of dict): List of paper dictionaries.

        Returns:
        - List of dictionaries containing enriched paper details.

        Raises:
        - requests.RequestException: If a request fails at the network level
          or exceeds REQUEST_TIMEOUT.
        """
        enriched = []

        for start in range(0, len(papers), self.BATCH_SIZE):
            chunk = papers[start:start + self.BATCH_SIZE]

            # Keep each requested paper next to its id so that the response,
            # which is ordered like the id list, is matched to the right paper
            # even when some papers of the chunk are not sent.
            lookups = [
                (paper, self.format_paper_id(paper["entry_id"]))
                for paper in chunk
                if "arxiv.org" in paper["entry_id"]
            ]

            records = []
            if lookups:
                records = self._fetch_citation_batch([paper_id for _, paper_id in lookups])
                # Delay to avoid hitting rate limits
                sleep(1)
            if records is None:
                continue

            for paper in chunk:
                for key, default in self.ENRICHMENT_DEFAULTS.items():
                    paper.setdefault(key, default)
            for (paper, _), record in zip(lookups, records):
                if record:
                    self._apply_citation_record(paper, record)
            enriched.extend(chunk)

        return enriched

    def download_pdf(self, entry_id):
        """
        Download the PDF of a paper.

        Parameters:
        - entry_id (str): The arXiv entry ID, either the short form
          (e.g. '2408.12345v1') or the full 'arxiv.org/abs/...' URL.

        Returns:
        - str or None: File path of the downloaded PDF or None if failed.
        """
        try:
            # id_list expects short ids; entry_id values stored on papers are full URLs.
            short_id = entry_id.split("arxiv.org/abs/")[-1]
            paper = next(self.client.results(arxiv.Search(id_list=[short_id])), None)
            if paper is None:
                print(f"Error while downloading PDF: no arXiv entry found for {entry_id}")
                return None

            pdf_url = paper.pdf_url
            title = paper.title.replace(" ", "_").replace("/", "_")
            pdf_filename = os.path.join(self.download_dir, f"{title}.pdf")

            if os.path.exists(pdf_filename):
                print(f"PDF already exists: {pdf_filename}")
                return pdf_filename

            print(f"Downloading PDF: {pdf_url}")
            paper.download_pdf(dirpath=self.download_dir, filename=f"{title}.pdf")
            return pdf_filename
        except Exception as e:
            print(f"Error while downloading PDF: {e}")
            return None

    def get_recommended_papers(self, papers, num_recommendations=5):
        """
        Fetch recommended papers using Semantic Scholar and align them with arXiv entries.

        Each recommendation is looked up on arXiv by title; only recommendations
        found there and not already seen are returned, flagged with
        recommended = 1. Failures for a single paper are printed and skipped.

        Parameters:
        - papers (list of dict): List of paper dictionaries.
        - num_recommendations (int): Number of recommendations per paper.

        Returns:
        - List of dictionaries containing recommended paper details.
        """
        recommended_papers = []
        seen_entry_ids = {paper['entry_id'] for paper in papers}

        for paper in papers:
            paper_id = self.format_paper_id(paper["entry_id"])
            try:
                # Get recommended papers from Semantic Scholar
                response = requests.get(
                    f'https://api.semanticscholar.org/recommendations/v1/papers/forpaper/{paper_id}',
                    params={'limit': num_recommendations},
                    timeout=self.REQUEST_TIMEOUT
                )

                if response.status_code != 200:
                    print(f"Error fetching recommendations for {paper_id}: {response.status_code} {response.text}")
                    continue

                for rec in response.json().get('recommendedPapers', []):
                    rec_title = rec.get("title")
                    if not rec_title:
                        continue

                    # Search for the recommended paper on arXiv
                    arxiv_results = self.search_papers(query=f'ti:"{rec_title}"', max_results=1)
                    if not arxiv_results:
                        continue

                    arxiv_paper = arxiv_results[0]
                    if arxiv_paper['entry_id'] not in seen_entry_ids:
                        arxiv_paper['recommended'] = 1  # Flag as recommended paper
                        recommended_papers.append(arxiv_paper)
                        seen_entry_ids.add(arxiv_paper['entry_id'])
            except Exception as e:
                print(f"Exception fetching recommendations for {paper_id}: {e}")

        return recommended_papers

    def save_papers_to_csv(self, papers, filename='papers.csv'):
        """
        Save the list of paper dictionaries to a CSV file.

        The header is the union of all keys in first-seen order, so papers that
        carry extra keys do not make the writer fail; missing values are
        written as empty cells.

        Parameters:
        - papers (list of dict): List of paper dictionaries.
        - filename (str): Filename for the CSV file.
        """
        if not papers:
            print("No papers to save.")
            return

        # dict.fromkeys de-duplicates while preserving first-seen order.
        fieldnames = list(dict.fromkeys(key for paper in papers for key in paper))
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            dict_writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            dict_writer.writeheader()
            dict_writer.writerows(papers)

    def search_papers_aug(self, query, max_results=50, date_from=None, date_to=None):
        """
        Search for papers and enrich them with Semantic Scholar citation data.

        Recommended papers are currently NOT added: the get_recommended_papers
        step is disabled here.

        Parameters:
        - query (str): The search query.
        - max_results (int): Maximum number of search results to return.
        - date_from (str): Start date in 'YYYY-MM-DD' format.
        - date_to (str): End date in 'YYYY-MM-DD' format.

        Returns:
        - List of dictionaries containing paper details with citation data.
        """
        papers = self.search_papers(query, max_results, date_from, date_to)
        return self.get_citation_data(papers)


StatementMeta(, 669cd7cd-0251-4696-a1f4-95f98e0976de, 10, Finished, Available, Finished)

In [7]:
# Run the search + citation enrichment with the notebook parameters, then tag
# every returned record with the id of the query that produced it.
helper = ArxivResearchHelper()
papers = helper.search_papers_aug(
    query,
    max_results=paper_num,
    date_from=date_from,
    date_to=date_to,
)
for paper in papers:
    paper.update({'query_id': query_id})





StatementMeta(, 669cd7cd-0251-4696-a1f4-95f98e0976de, 11, Finished, Available, Finished)

{'hash_id': 918220639, 'title': 'Data is missing again -- Reconstruction of power generation data using $k$-Nearest Neighbors and spectral graph theory', 'authors': 'Amandine Pierrot, Pierre Pinson', 'published': datetime.datetime(2024, 8, 30, 23, 58, 28, tzinfo=datetime.timezone.utc), 'summary': 'The risk of missing data and subsequent incomplete data records at wind farms\nincreases with the number of turbines and sensors. We propose here an\nimputation method that blends data-driven concepts with expert knowledge, by\nusing the geometry of the wind farm in order to provide better estimates when\nperforming Nearest Neighbor imputation. Our method relies on learning Laplacian\neigenmaps out of the graph of the wind farm through spectral graph theory.\nThese learned representations can be based on the wind farm layout only, or\nadditionally account for information provided by collected data. The related\nweighted graph is allowed to change with time and can be tracked in an online\nfas

In [8]:

def _papers_to_spark_df(source):
    """
    Convert paper records into a Spark DataFrame that follows `schema`.

    The pandas columns are selected and ordered by the schema's field names
    before the conversion. Without this, pandas orders columns by the first
    appearance of each key, so records with differing key sets or key order
    would be bound to the wrong schema fields. Keys not in the schema are
    dropped; schema fields missing from every record become empty columns.

    Parameters:
    - source: Records accepted by pd.DataFrame (e.g. a list of paper dicts).

    Returns:
    - Spark DataFrame created with the notebook-level `spark` session and `schema`.
    """
    df = pd.DataFrame(source, columns=schema.fieldNames())
    return spark.createDataFrame(df, schema=schema)


def saveAsTable(source, name):
    """
    Write the records as a new Delta table.

    Uses the writer's default save mode (no explicit mode is set).

    Parameters:
    - source: Paper records (see _papers_to_spark_df).
    - name (str): Target table name.
    """
    _papers_to_spark_df(source).write.format("delta").saveAsTable(name)


def appendTable(source, name):
    """
    Append the records to a Delta table.

    Parameters:
    - source: Paper records (see _papers_to_spark_df).
    - name (str): Target table name.
    """
    _papers_to_spark_df(source).write.mode("append").format("delta").saveAsTable(name)

StatementMeta(, 669cd7cd-0251-4696-a1f4-95f98e0976de, 12, Finished, Available, Finished)

In [9]:
import time
import re
# Whole-second Unix timestamp as a string (not referenced by the visible cells).
t = str(time.time()).partition(".")[0]

# Keep only the ASCII letters of the query so it can be embedded in the table name.
non_letters = re.compile(r'[^a-zA-Z]')
query_name = non_letters.sub('', query)

# Target table: "bronze._<query_id>_<letters of the query>".
filename = f"bronze._{query_id}_{query_name}"
appendTable(papers, filename)

# Short pause after the write, before the next cell runs.
time.sleep(3)

StatementMeta(, 669cd7cd-0251-4696-a1f4-95f98e0976de, 13, Finished, Available, Finished)

In [11]:
# Run the "arxiv_tag" notebook on the table written above.
# NOTE(review): the second argument is presumably the timeout in seconds - confirm against the mssparkutils docs.
mssparkutils.notebook.run(
    "arxiv_tag",
    10000,
    {"filename": filename},
)

StatementMeta(, 669cd7cd-0251-4696-a1f4-95f98e0976de, 15, Finished, Available, Finished)

''