In [2]:
import requests
from bs4 import BeautifulSoup
from typing import List
from arxplore.datamodel import Feed, Config


def parse_feed(namespace: str, config: Config) -> List[Feed]:
    url = f"https://arxiv.org/list/{namespace}/new"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    feeds = []
    for entry in soup.find_all("dl"):
        title = entry.find("div", {"class": "list-title"}).text
        abstract = entry.find("div", {"class": "abstract"}).text
        authors = entry.find("div", {"class": "list-authors"}).text
        url = entry.find("a", {"title": "Abstract"}).get("href")
        feed = Feed(title=title, abstract=abstract, authors=authors, url=url)
        feeds.append(feed)
    return feeds



In [3]:
namespace = "cs.AI"
url = f"https://arxiv.org/list/{namespace}/new"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

In [6]:

# Check if the response is successful
if response.status_code == 200:
    print("Successfully retrieved the page")
else:
    print(f"Failed to retrieve the page, status code: {response.status_code}")



Successfully retrieved the page


In [36]:
paper_blocks = soup.find_all('dt')

In [37]:
paper = paper_blocks[0]
paper

<dt><a name="item1">[1]</a>   <span class="list-identifier"><a href="/abs/2402.09413" title="Abstract">arXiv:2402.09413</a> [<a href="/pdf/2402.09413" title="Download PDF">pdf</a>, <a href="/ps/2402.09413" title="Download PostScript">ps</a>, <a href="/format/2402.09413" title="Other formats">other</a>]</span></dt>

In [40]:
for block in paper_blocks:
    # Find the <dd> tag that immediately follows each <dt> tag
    metadata = block.find_next_sibling('dd')
    title = metadata.find('div', class_='list-title').text.replace('Title:', '').strip()
    authors = [a.text for a in metadata.find('div', class_='list-authors').find_all('a')]
    abstract = metadata.find('p').text.strip()
    # Extract the PDF link from the <dt> block
    pdf_link_suffix = block.find('a', title='Download PDF')['href']
    pdf_url = f'https://arxiv.org{pdf_link_suffix}'
    print(f'Title: {title}\nAuthors: {", ".join(authors)}\nAbstract: {abstract}\nPDF URL: {pdf_url}\n{"-"*40}')
    break

Title: Mathematical Explanations
Authors: Joseph Y. Halpern
Abstract: A definition of what counts as an explanation of mathematical statement, and
when one explanation is better than another, is given. Since all mathematical
facts must be true in all causal models, and hence known by an agent,
mathematical facts cannot be part of an explanation (under the standard notion
of explanation). This problem is solved using impossible possible worlds.
PDF URL: https://arxiv.org/pdf/2402.09413
----------------------------------------


In [31]:
for paper in papers:
    title = paper.find('div', class_='list-title').text.replace('Title:', '').strip()
    authors = [a.text for a in paper.find('div', class_='list-authors').find_all('a')]
    abstract = paper.find('p').text.strip()
    # Navigate to the previous sibling to find the PDF link
    list_identifier = paper.find_previous_sibling('dt').find('span', class_='list-identifier')
    pdf_link_suffix = list_identifier.find('a', title='Download PDF')['href']
    pdf_url = f'https://arxiv.org{pdf_link_suffix}'
    print(f'Title: {title}\nAuthors: {", ".join(authors)}\nAbstract: {abstract}\nURL: {url}\n{"-"*40}')
    break



AttributeError: 'NoneType' object has no attribute 'find'

In [17]:
from scholarly import scholarly

# Replace 'Joseph Y. Halpern' with the name of the author you are interested in
author_name = 'Binfeng Xu'

try:
    # Search for the author and take the first result
    search_query = scholarly.search_author(author_name)
    author = next(search_query)
    
    # Fill in more detailed information about the author
    scholarly.fill(author, sections=['basics', 'indices', 'counts', 'publications'])
    
    print(f"Name: {author['name']}")
    print(f"Affiliation: {author.get('affiliation')}")
    print(f"Interests: {author.get('interests', [])}")
    print(f"Cited by: {author['citedby']}")
    print(f"h-index: {author['hindex']}")
    print(f"i10-index: {author['i10index']}")
    print(f"Number of publications: {len(author['publications'])}")
except StopIteration:
    print("Author not found.")
except Exception as e:
    print(f"An error occurred: {e}")

Name: Binfeng Xu
Affiliation: New York University
Interests: ['Machine Learning']
Cited by: 36
h-index: 2
i10-index: 2
Number of publications: 3


In [43]:
author.get("citedby")

36

In [1]:
from arxplore.parsers import parse_arxiv

In [2]:
parse_arxiv("cs.AI", tests = 3)

Found 156 papers in the cs.AI section of arXiv.

Parsing information for Sujay Nagesh Koujalgi from Google Scholar...
Parsing information for Jonathan Dodge from Google Scholar...
Parsing information for Lance Ying from Google Scholar...
Parsing information for Tan Zhi-Xuan from Google Scholar...
Parsing information for Lionel Wong from Google Scholar...
Parsing information for Vikash Mansinghka from Google Scholar...
Parsing information for Joshua Tenenbaum from Google Scholar...
Parsing information for Yiwen Sun from Google Scholar...
Parsing information for Xianyin Zhang from Google Scholar...
Parsing information for Shiyu Huang from Google Scholar...
Parsing information for Shaowei Cai from Google Scholar...
Parsing information for Bing-Zhen Zhang from Google Scholar...
Parsing information for Ke Wei from Google Scholar...


[Feed(section='cs.AI', pdf_url='https://arxiv.org/pdf/2402.10290', title='Experiments with Encoding Structured Data for Neural Networks', authors=[Author(name='Sujay Nagesh Koujalgi', affiliation='', interests=[], citation=0, h_index=0, n_publications=0), Author(name='Jonathan Dodge', affiliation='Assistant Professor, Penn State University', interests=['Explainable AI', 'Human-Computer Interaction', 'Graphics'], citation=1120, h_index=12, n_publications=33)], f_author=Author(name='Sujay Nagesh Koujalgi', affiliation='', interests=[], citation=0, h_index=0, n_publications=0), abstract="The project's aim is to create an AI agent capable of selecting good actions\nin a game-playing domain called Battlespace. Sequential domains like\nBattlespace are important testbeds for planning problems, as such, the\nDepartment of Defense uses such domains for wargaming exercises. The agents we\ndeveloped combine Monte Carlo Tree Search (MCTS) and Deep Q-Network (DQN)\ntechniques in an effort to naviga

In [1]:
from arxplorer.db import init_db
from arxplorer.parsers import _parse_scholar
init_db()

Database already exists!


In [2]:
_parse_scholar("Binfeng Xu")

Author(name='Binfeng Xu', affiliation='New York University', interests='Machine Learning', citation=36, h_index=2, n_publications=3)

In [3]:
_parse_scholar("Binfeng Xu")

Author(name='Binfeng Xu', affiliation='New York University', interests='Machine Learning', citation=36, h_index=2, n_publications=3)

In [3]:
# Defining useful signals for ranking

from arxplorer.datamodel import Feed


class FeatureExtractor:
    def __init__(self, feeds: list[Feed]):
        self.feed = feeds

    @property
    def first_author_citation(self) -> int:
        return self.feed.authors[0].citation

    @property
    def avg_authors_citation(self) -> float:
        return sum([a.citation for a in self.feed.authors]) / len(self.feed.authors)

    @property
    def variance_authors_citation(self) -> float:
        avg = self.avg_authors_citation
        return sum([(a.citation - avg) ** 2 for a in self.feed.authors]) / len(self.feed.authors)

    @property
    def first_author_h_index(self) -> int:
        return self.feed.authors[0].h_index

    @property
    def avg_authors_h_index(self) -> float:
        return sum([a.h_index for a in self.feed.authors]) / len(self.feed.authors)

    @property
    def variance_authors_h_index(self) -> float:
        avg = self.avg_authors_h_index
        return sum([(a.h_index - avg) ** 2 for a in self.feed.authors]) / len(self.feed.authors)
    

fe = FeatureExtractor([])
fe.avg_authors_citation

AttributeError: 'list' object has no attribute 'authors'

In [14]:
from sentence_transformers import SentenceTransformer

ins = "I like papers with innovative ideas instead of replication of existing methods on subfields. World modeling, planning and automation interest me most while others are also welcome."
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(ins)
embedding2 = model.encode("Robotics")
embedding3 = model.encode("Machine Learning")

Batches: 100%|██████████| 1/1 [00:00<00:00, 497.01it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 323.98it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 608.05it/s]


In [17]:
from arxplorer.utils import *
from arxplorer.utils import embedding_L2_similarity

embedding_L2_similarity(embeddings, embedding2)


1.1791422797013742

In [20]:
# Rankings for each feature (the index represents the paper, values represent rankings)
F1_rankings = ['B', 'A', 'C']  # F1 ranks A as the highest, followed by B, then C
F2_rankings = ['B', 'C', 'A']  # F2 ranks B as the highest, followed by C, then A

# Weights for each feature (reflecting their importance)
weights = {'F1': 2, 'F2': 1}

# List of papers
papers = ['A', 'B', 'C']


def calculate_weighted_copeland_scores(rankings, weights, papers):
    scores = {paper: 0 for paper in papers}  # Initialize scores for each paper
    
    # Function to get rank of a paper in a feature
    def get_rank(paper, feature_rankings):
        return feature_rankings.index(paper)
    
    # Perform pairwise comparisons
    for i in range(len(papers)):
        for j in range(i + 1, len(papers)):
            paper1, paper2 = papers[i], papers[j]
            for feature, ranking in rankings.items():
                rank1, rank2 = get_rank(paper1, ranking), get_rank(paper2, ranking)
                weight = weights[feature]
                if rank1 < rank2:  # paper1 is ranked higher than paper2
                    scores[paper1] += weight
                    scores[paper2] -= weight
                elif rank1 > rank2:  # paper2 is ranked higher than paper1
                    scores[paper1] -= weight
                    scores[paper2] += weight
                # Ties are ignored in this implementation

    sorted_papers = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    return [paper for paper, _ in sorted_papers]

# Combine rankings into a single dictionary for easier processing
rankings = {'F1': F1_rankings, 'F2': F2_rankings}

calculate_weighted_copeland_scores(rankings, weights, papers)


['B', 'A', 'C']