In [2]:
import requests
from bs4 import BeautifulSoup
from typing import List
from arxplore.datamodel import Feed, Config


def parse_feed(namespace: str, config: Config) -> List[Feed]:
    url = f"https://arxiv.org/list/{namespace}/new"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    feeds = []
    for entry in soup.find_all("dl"):
        title = entry.find("div", {"class": "list-title"}).text
        abstract = entry.find("div", {"class": "abstract"}).text
        authors = entry.find("div", {"class": "list-authors"}).text
        url = entry.find("a", {"title": "Abstract"}).get("href")
        feed = Feed(title=title, abstract=abstract, authors=authors, url=url)
        feeds.append(feed)
    return feeds



In [3]:
namespace = "cs.AI"
url = f"https://arxiv.org/list/{namespace}/new"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

In [6]:

# Check if the response is successful
if response.status_code == 200:
    print("Successfully retrieved the page")
else:
    print(f"Failed to retrieve the page, status code: {response.status_code}")



Successfully retrieved the page


In [36]:
paper_blocks = soup.find_all('dt')

In [37]:
paper = paper_blocks[0]
paper

<dt><a name="item1">[1]</a>   <span class="list-identifier"><a href="/abs/2402.09413" title="Abstract">arXiv:2402.09413</a> [<a href="/pdf/2402.09413" title="Download PDF">pdf</a>, <a href="/ps/2402.09413" title="Download PostScript">ps</a>, <a href="/format/2402.09413" title="Other formats">other</a>]</span></dt>

In [40]:
for block in paper_blocks:
    # Find the <dd> tag that immediately follows each <dt> tag
    metadata = block.find_next_sibling('dd')
    title = metadata.find('div', class_='list-title').text.replace('Title:', '').strip()
    authors = [a.text for a in metadata.find('div', class_='list-authors').find_all('a')]
    abstract = metadata.find('p').text.strip()
    # Extract the PDF link from the <dt> block
    pdf_link_suffix = block.find('a', title='Download PDF')['href']
    pdf_url = f'https://arxiv.org{pdf_link_suffix}'
    print(f'Title: {title}\nAuthors: {", ".join(authors)}\nAbstract: {abstract}\nPDF URL: {pdf_url}\n{"-"*40}')
    break

Title: Mathematical Explanations
Authors: Joseph Y. Halpern
Abstract: A definition of what counts as an explanation of mathematical statement, and
when one explanation is better than another, is given. Since all mathematical
facts must be true in all causal models, and hence known by an agent,
mathematical facts cannot be part of an explanation (under the standard notion
of explanation). This problem is solved using impossible possible worlds.
PDF URL: https://arxiv.org/pdf/2402.09413
----------------------------------------


In [31]:
for paper in papers:
    title = paper.find('div', class_='list-title').text.replace('Title:', '').strip()
    authors = [a.text for a in paper.find('div', class_='list-authors').find_all('a')]
    abstract = paper.find('p').text.strip()
    # Navigate to the previous sibling to find the PDF link
    list_identifier = paper.find_previous_sibling('dt').find('span', class_='list-identifier')
    pdf_link_suffix = list_identifier.find('a', title='Download PDF')['href']
    pdf_url = f'https://arxiv.org{pdf_link_suffix}'
    print(f'Title: {title}\nAuthors: {", ".join(authors)}\nAbstract: {abstract}\nURL: {url}\n{"-"*40}')
    break



AttributeError: 'NoneType' object has no attribute 'find'

In [17]:
from scholarly import scholarly

# Replace 'Joseph Y. Halpern' with the name of the author you are interested in
author_name = 'Binfeng Xu'

try:
    # Search for the author and take the first result
    search_query = scholarly.search_author(author_name)
    author = next(search_query)
    
    # Fill in more detailed information about the author
    scholarly.fill(author, sections=['basics', 'indices', 'counts', 'publications'])
    
    print(f"Name: {author['name']}")
    print(f"Affiliation: {author.get('affiliation')}")
    print(f"Interests: {author.get('interests', [])}")
    print(f"Cited by: {author['citedby']}")
    print(f"h-index: {author['hindex']}")
    print(f"i10-index: {author['i10index']}")
    print(f"Number of publications: {len(author['publications'])}")
except StopIteration:
    print("Author not found.")
except Exception as e:
    print(f"An error occurred: {e}")

Name: Binfeng Xu
Affiliation: New York University
Interests: ['Machine Learning']
Cited by: 36
h-index: 2
i10-index: 2
Number of publications: 3


In [43]:
author.get("citedby")

36

In [1]:
from arxplore.parsers import parse_arxiv

In [2]:
parse_arxiv("cs.AI", tests = 3)

Found 156 papers in the cs.AI section of arXiv.

Parsing information for Sujay Nagesh Koujalgi from Google Scholar...
Parsing information for Jonathan Dodge from Google Scholar...
Parsing information for Lance Ying from Google Scholar...
Parsing information for Tan Zhi-Xuan from Google Scholar...
Parsing information for Lionel Wong from Google Scholar...
Parsing information for Vikash Mansinghka from Google Scholar...
Parsing information for Joshua Tenenbaum from Google Scholar...
Parsing information for Yiwen Sun from Google Scholar...
Parsing information for Xianyin Zhang from Google Scholar...
Parsing information for Shiyu Huang from Google Scholar...
Parsing information for Shaowei Cai from Google Scholar...
Parsing information for Bing-Zhen Zhang from Google Scholar...
Parsing information for Ke Wei from Google Scholar...


[Feed(section='cs.AI', pdf_url='https://arxiv.org/pdf/2402.10290', title='Experiments with Encoding Structured Data for Neural Networks', authors=[Author(name='Sujay Nagesh Koujalgi', affiliation='', interests=[], citation=0, h_index=0, n_publications=0), Author(name='Jonathan Dodge', affiliation='Assistant Professor, Penn State University', interests=['Explainable AI', 'Human-Computer Interaction', 'Graphics'], citation=1120, h_index=12, n_publications=33)], f_author=Author(name='Sujay Nagesh Koujalgi', affiliation='', interests=[], citation=0, h_index=0, n_publications=0), abstract="The project's aim is to create an AI agent capable of selecting good actions\nin a game-playing domain called Battlespace. Sequential domains like\nBattlespace are important testbeds for planning problems, as such, the\nDepartment of Defense uses such domains for wargaming exercises. The agents we\ndeveloped combine Monte Carlo Tree Search (MCTS) and Deep Q-Network (DQN)\ntechniques in an effort to naviga

In [1]:
from arxplorer.db import init_db
from arxplorer.parsers import _parse_scholar
init_db()

Database already exists!


In [2]:
_parse_scholar("Binfeng Xu")

Author(name='Binfeng Xu', affiliation='New York University', interests='Machine Learning', citation=36, h_index=2, n_publications=3)

In [3]:
_parse_scholar("Binfeng Xu")

Author(name='Binfeng Xu', affiliation='New York University', interests='Machine Learning', citation=36, h_index=2, n_publications=3)

In [1]:
# Defining useful signals for ranking

from arxplorer.datamodel import Feed


class FeatureExtractor:
    def __init__(self, feeds: list[Feed]):
        self.feed = feeds

    @property
    def first_author_citation(self) -> int:
        return self.feed.authors[0].citation

    @property
    def avg_authors_citation(self) -> float:
        return sum([a.citation for a in self.feed.authors]) / len(self.feed.authors)

    @property
    def variance_authors_citation(self) -> float:
        avg = self.avg_authors_citation
        return sum([(a.citation - avg) ** 2 for a in self.feed.authors]) / len(self.feed.authors)

    @property
    def first_author_h_index(self) -> int:
        return self.feed.authors[0].h_index

    @property
    def avg_authors_h_index(self) -> float:
        return sum([a.h_index for a in self.feed.authors]) / len(self.feed.authors)

    @property
    def variance_authors_h_index(self) -> float:
        avg = self.avg_authors_h_index
        return sum([(a.h_index - avg) ** 2 for a in self.feed.authors]) / len(self.feed.authors)
    

fe = FeatureExtractor([])
fe.avg_authors_citation

AttributeError: 'list' object has no attribute 'authors'

In [2]:
from sentence_transformers import SentenceTransformer

ins = "I like papers with innovative ideas instead of replication of existing methods on subfields. World modeling, planning and automation interest me most while others are also welcome."
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(ins)
embedding2 = model.encode("Robotics")
embedding3 = model.encode("Machine Learning")

  from .autonotebook import tqdm as notebook_tqdm
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.10it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 242.89it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 578.13it/s]


In [1]:
from arxplorer.utils import *
from arxplorer.utils import embedding_L2_similarity

embedding_L2_similarity(embeddings, embedding3)


NameError: name 'embeddings' is not defined

In [1]:
from arxplorer.utils import *

ans = openai_chat_completion("who is binfeng xu?")
ans

"I'm not sure who Binfeng Xu is. Would you like me to look up more information for you?"

In [8]:
ans.choices[0].message.content

"I'm not sure who specifically you are referring to, but if you have any questions or need information about someone named Binfeng Xu, feel free to ask and I'll do my best to assist you."

In [11]:
RERANK_PROMPT = '''
## Task Description
Following are {coarse_k} new papers with abstracts selected from arXiv {namespace} section.
Consider I'm familiar with this research field. Your task is to rank and recommend top {top_k} papers that match my preferences. 
For each of your recommendation, provide a 1~2 sentence summary followed by a 1~2 sentence explanation of your reasoning.

## My Preferences
{instruction}

## Papers
{feeds}

## Template for Your Recommendations (use this json format)
{{
    "recommendations": [
        {{
            "title": "Title of the paper",
            "summary": "1~2 sentence summary of the paper",
            "reasoning": "1~2 sentence explanation of your reasoning"
        }},
        {{
            "title": "Title of the paper",
            "summary": "1~2 sentence summary of the paper",
            "reasoning": "1~2 sentence explanation of your reasoning"
        }}
    ]
}}
## Your Recommendations
'''

RERANK_PROMPT.format(coarse_k=3, namespace="cs.AI", top_k=2, instruction="I like papers with innovative ideas instead of replication of existing methods on subfields. World modeling, planning and automation interest me most while others are also welcome.", feeds="")

'\n## Task Description\nFollowing are 3 new papers with abstracts selected from arXiv cs.AI section.\nConsider I\'m familiar with this research field. Your task is to rank and recommend top 2 papers that match my preferences. \nFor each of your recommendation, provide a 1~2 sentence summary followed by a 1~2 sentence explanation of your reasoning.\n\n## My Preferences\nI like papers with innovative ideas instead of replication of existing methods on subfields. World modeling, planning and automation interest me most while others are also welcome.\n\n## Papers\n\n\n## Template for Your Recommendations (use this json format)\n{\n    "recommendations": [\n        {\n            "title": "Title of the paper",\n            "summary": "1~2 sentence summary of the paper",\n            "reasoning": "1~2 sentence explanation of your reasoning"\n        },\n        {\n            "title": "Title of the paper",\n            "summary": "1~2 sentence summary of the paper",\n            "reasoning":

In [5]:
import json
generated_text = '''
{
    "recommendations": [
        {
            "title": "Exploring the Unseen: Novel Approaches in AI",
            "summary": "This paper introduces new methods for unsupervised learning in AI.",
            "reasoning": "The paper presents innovative techniques that align with my interest in world modeling."
        },
        {
            "title": "Automated Planning for Robotics",
            "summary": "The study discusses advancements in automated planning algorithms for robotics.",
            "reasoning": "It relates to my preference for automation and planning in AI research."
        }
    ]
}
'''

# Parse the JSON string into a Python dictionary
recommendations_dict = json.loads(generated_text)

# Now you can access the data as a dictionary
print(recommendations_dict['recommendations'])


[{'title': 'Exploring the Unseen: Novel Approaches in AI', 'summary': 'This paper introduces new methods for unsupervised learning in AI.', 'reasoning': 'The paper presents innovative techniques that align with my interest in world modeling.'}, {'title': 'Automated Planning for Robotics', 'summary': 'The study discusses advancements in automated planning algorithms for robotics.', 'reasoning': 'It relates to my preference for automation and planning in AI research.'}]


In [6]:
json_string = json.dumps(recommendations_dict, indent=4)

print(json_string)

{
    "recommendations": [
        {
            "title": "Exploring the Unseen: Novel Approaches in AI",
            "summary": "This paper introduces new methods for unsupervised learning in AI.",
            "reasoning": "The paper presents innovative techniques that align with my interest in world modeling."
        },
        {
            "title": "Automated Planning for Robotics",
            "summary": "The study discusses advancements in automated planning algorithms for robotics.",
            "reasoning": "It relates to my preference for automation and planning in AI research."
        }
    ]
}


In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

torch.set_default_device("cuda")

model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2", torch_dtype="auto", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)

inputs = tokenizer('''def print_prime(n):
   """
   Print all primes between 1 and n
   """''', return_tensors="pt", return_attention_mask=False)

outputs = model.generate(**inputs, max_length=200)
text = tokenizer.batch_decode(outputs)[0]
print(text)


  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.59s/it]
generation_config.json: 100%|██████████| 124/124 [00:00<00:00, 836kB/s]
tokenizer_config.json: 100%|██████████| 7.34k/7.34k [00:00<00:00, 13.2MB/s]
vocab.json: 100%|██████████| 798k/798k [00:00<00:00, 3.33MB/s]
merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 3.83MB/s]
tokenizer.json: 100%|██████████| 2.11M/2.11M [00:00<00:00, 8.63MB/s]
added_tokens.json: 100%|██████████| 1.08k/1.08k [00:00<00:00, 13.7MB/s]
special_tokens_map.json: 100%|██████████| 99.0/99.0 [00:00<00:00, 284kB/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


def print_prime(n):
   """
   Print all primes between 1 and n
   """
   for i in range(2, n+1):
       for j in range(2, i):
           if i % j == 0:
               break
       else:
           print(i)
   ```

2. Write a Python program to find the sum of all even numbers between 1 and 100.

   Ideas: Use a for loop to iterate over all numbers between 1 and 100. Use an if statement to check if the number is even. If it is, add it to a running total.

   ```python
   total = 0
   for i in range(1, 101):
       if i % 2 == 0:
           total += i
   print(total)
   ```

3. Write a Python program to find the largest number in a list.



In [5]:
print(text)

def print_prime(n):
   """
   Print all primes between 1 and n
   """
   for i in range(2, n+1):
       for j in range(2, i):
           if i % j == 0:
               break
       else:
           print(i)
   ```

2. Write a Python program to find the sum of all even numbers between 1 and 100.

   Ideas: Use a for loop to iterate over all numbers between 1 and 100. Use an if statement to check if the number is even. If it is, add it to a running total.

   ```python
   total = 0
   for i in range(1, 101):
       if i % 2 == 0:
           total += i
   print(total)
   ```

3. Write a Python program to find the largest number in a list.



In [2]:
# pip install accelerate
from imp import lock_held
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login

# Call this function to login to Hugging Face within the notebook environment
login("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")



tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", device_map="auto")

input_text = '''def print_prime(n):
   """
   Print all primes between 1 and n
   """'''
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")

outputs = model.generate(**input_ids, max_new_tokens=200)
print(tokenizer.decode(outputs[0]))


Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/billxbf/.cache/huggingface/token
Login successful


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.17s/it]


<bos>def print_prime(n):
   """
   Print all primes between 1 and n
   """
   for i in range(2, n):
      if n % i == 0:
         print(i)
         break
   else:
      print("All primes between 1 and " + str(n) + " have been printed")

print_prime(100)
<eos>


In [5]:
from angle_emb import AnglE, Prompts
import torch

# Assuming you want to use the CUDA device if available
#device = "cuda" if torch.cuda.is_available() else "cpu"
device = "cpu"
angle = AnglE.from_pretrained('WhereIsAI/UAE-Large-V1', pooling_strategy='cls').to(device)
angle.set_prompt(prompt=Prompts.C)

# Ensure the input is on the same device as the model
vec = angle.encode({'text': 'hello world'}, to_numpy=True, device=device)
print(vec)

# Ensure the input is on the same device as the model
vecs = angle.encode([{'text': 'hello world1'}, {'text': 'hello world2'}], to_numpy=True, device=device)
print(vecs)

INFO:AnglE:Prompt is set, the prompt will be automatically applied during the encoding phase. To disable prompt setting, please configure set_prompt(prompt=None)


[[ 0.37527996  0.08761922  0.52706105 ... -0.26766068  0.02662069
  -0.14156012]]
[[ 0.49359968  0.1666597   0.4639308  ... -0.162784   -0.10181545
  -0.23928262]
 [ 0.2607212   0.6358187   0.88706136 ... -0.40785718 -0.25375775
  -0.15815166]]


In [1]:
from arxplorer.parsers import parse_arxiv, _parse_scholar
from arxplorer.ranker import PaperRanker
from arxplorer.datamodel import Config

feeds = parse_arxiv("cs.AI", fast_mode=True)

  from .autonotebook import tqdm as notebook_tqdm


Found 125 papers in the cs.AI section of arXiv.



Collecting data from papers ...: 100%|██████████| 125/125 [00:00<00:00, 11413.94it/s]


In [2]:
instruction = "I like innovative papers in large foundation models, multimodal methods, symbolic planning and automation,  Others general ML topics are welcome, while direct applications in niche fields are less interesting."
cfg = Config(top_k=10, coarse_k=20, instruction=instruction)
ranker = PaperRanker(cfg)
final_feeds = ranker.rank(feeds)

In [8]:
final_feeds[4].title

'Resonance RoPE: Improving Context Length Generalization of Large  Language Models'

In [11]:
final_feeds[4].summary

"Addressing the TSTL (Train-Short-Test-Long) problem, this paper introduces 'Resonance RoPE' to narrow the generalization gap in Large Language Models. By refining RoPE features for out-of-distribution positions, it enhances model performance in long-context scenarios."

In [7]:
final_feeds[4].summary

"Addressing the TSTL (Train-Short-Test-Long) problem, this paper introduces 'Resonance RoPE' to narrow the generalization gap in Large Language Models. By refining RoPE features for out-of-distribution positions, it enhances model performance in long-context scenarios."

In [38]:
def wrap_text(text, width):
    words = text.split(' ')
    wrapped_lines = []
    current_line = ""
    for word in words:
        if len(current_line) + len(word) + 1 <= width:
            current_line += word + " "
        else:
            wrapped_lines.append(current_line.rstrip())
            current_line = word + " "
    wrapped_lines.append(current_line.rstrip())  # Add last line
    return wrapped_lines

def print_paper_metadata(papers):
    max_width = 75  # Adjusted for consistent right border
    for paper in papers:
        print("+------------------------------------------------------------------------------+")
        title_lines = wrap_text(paper.title, max_width)
        for line in title_lines:
            print(f"| {line.ljust(max_width)}  |")
        url_line = f"| URL: {paper.pdf_url}"
        print(f"{url_line.ljust(max_width + 2)}  |")
        print("+------------------------------------------------------------------------------+")
        if paper.summary:
            print("| TL;DR: ".ljust(max_width + 3) + " |")
            summary_lines = wrap_text(paper.summary, max_width)
            for line in summary_lines:
                print(f"| {line.ljust(max_width)}  |")
            print("+------------------------------------------------------------------------------+")
        print()  # Print an empty line for spacing between papers


In [39]:
print_paper_metadata(final_feeds)

+------------------------------------------------------------------------------+
| Softened Symbol Grounding for Neuro-symbolic Systems                         |
| URL: https://arxiv.org/pdf/2403.00323                                        |
+------------------------------------------------------------------------------+
| TL;DR:                                                                       |
| This paper presents a new framework for neuro-symbolic learning that         |
| bridges the gap between neural network training and symbolic constraint      |
| solving, significantly enhancing the symbol grounding process. The           |
| introduced approach demonstrates its efficacy with multiple neuro-symbolic   |
| learning tasks.                                                              |
+------------------------------------------------------------------------------+

+------------------------------------------------------------------------------+
| Learning with Logical Con