In [326]:
import os
import re
import json
import uuid
import requests
from typing import List, Dict, Any
import bibtexparser
from bibtexparser.bwriter import BibTexWriter
from bibtexparser.bibdatabase import BibDatabase
from openai import OpenAI

class SemanticScholarAPI:
    API_BASE_URL = "https://api.semanticscholar.org/graph/v1"
    DEFAULT_PAPER_RESPONSE_FIELDS = "paperId,title,abstract,year,citationStyles,citationCount,influentialCitationCount,authors,venue,publicationVenue,journal,url"

    def __init__(self, api_key: str):
        self.api_headers = {"x-api-key": api_key}

    def _send_api_request(self, endpoint: str, method: str = "GET", params: Dict[str, Any] = None, data: Dict[str, Any] = None) -> Dict[str, Any]:
        url = f"{self.API_BASE_URL}/{endpoint}"
        response = requests.request(method, url, headers=self.api_headers, params=params, json=data)
        response.raise_for_status()
        return response.json()

    def get_paper_details(self, paper_id: str) -> Dict[str, Any]:
        return self._send_api_request(f"paper/{paper_id}")

    def get_paper_details_batch(self, paper_ids: List[str], fields: str = DEFAULT_PAPER_RESPONSE_FIELDS) -> List[Dict[str, Any]]:
        return self._send_api_request("paper/batch", method="POST", data={"ids": paper_ids, "fields": fields})

    def search_for_papers(self, query: str, limit: int = 10) -> Dict[str, Any]:
        return self._send_api_request("paper/search", params={"query": query, "limit": limit, "fields": self.DEFAULT_PAPER_RESPONSE_FIELDS})

    def get_paper_references(self, paper_id: str, limit: int = 10) -> Dict[str, Any]:
        return self._send_api_request(f"paper/{paper_id}/references", params={"fields": "paperId,title,year,authors", "limit": limit})

    def get_paper_references_batch(self, paper_ids: List[str], fields: str = "paperId,title,year,authors", limit: int = 10) -> Dict[str, Any]:
        return self._send_api_request("paper/batch/references", method="POST", data={"ids": paper_ids, "fields": fields, "limit": limit})

    def get_paper_citations(self, paper_id: str, limit: int = 10) -> Dict[str, Any]:
        return self._send_api_request(f"paper/{paper_id}/citations", params={"fields": "paperId,title,year,authors", "limit": limit})

    def get_paper_citations_batch(self, paper_ids: List[str], fields: str = "paperId,title,year,authors", limit: int = 10) -> Dict[str, Any]:
        return self._send_api_request("paper/batch/citations", method="POST", data={"ids": paper_ids, "fields": fields, "limit": limit})

class BibTexHandler:
    @staticmethod
    def entries_to_bibtex_string(entries: List[Dict[str, Any]]) -> str:
        db = BibDatabase()
        db.entries = entries
        writer = BibTexWriter()
        return writer.write(db)

    @staticmethod
    def bibtex_string_to_entries(bibtex_content: str) -> List[Dict[str, Any]]:
        parser = bibtexparser.bparser.BibTexParser(common_strings=True)
        bib_database = bibtexparser.loads(bibtex_content, parser)
        return bib_database.entries

class LaTeXHandler:
    @staticmethod
    def add_uids_to_latex_citations(latex_text: str) -> str:
        def replace_citation(match):
            command, optional, citations = match.groups()
            optional = optional or ''
            if not citations:
                return f"\\{command}{optional}{{}}"
            replaced_citations = [f"{citation.strip()}_uid={str(uuid.uuid4())[:4]}" for citation in citations.split(',')]
            return f"\\{command}{optional}{{{', '.join(replaced_citations)}}}"

        citation_pattern = r'\\(cite|citet|citep)(\[.*?\])?{(.*?)}'
        return re.sub(citation_pattern, replace_citation, latex_text)

    @staticmethod
    def get_citation_keys_from_latex(latex_text: str) -> List[str]:
        citation_pattern = r'\\(?:cite|citet|citep)(?:\[.*?\])?{(.*?)}'
        matches = re.findall(citation_pattern, latex_text)
        return [citation.strip() for match in matches for citation in match.split(',')]


class GPTEnhancer:
    def __init__(self):
        self.gpt_client = OpenAI()

    def _get_gpt_response(self, system_instruction: str, user_prompt: str, max_tokens: int = 2000) -> str:
        response = self.gpt_client.chat.completions.create(
            model="gpt-4o-mini",  # Replace with the appropriate model
            messages=[
                {"role": "system", "content": system_instruction},
                {"role": "user", "content": user_prompt}
            ],
            max_tokens=max_tokens,
        )
        return response.choices[0].message.content

    def add_references_to_paragraph(self, paragraph: str, bibliography: str) -> str:
        system_instruction = "You are an AI assistant that enhances academic writing."
        user_prompt = f"""
        Given the following LaTeX paragraph and bibliography, add appropriate references from the bibliography.
        You can either add references to existing \\cite, \\citep, or \\citet commands, or add new ones wherever needed.
        Attention: Make sure not to remove any existing citations. 
        Only print the output. 

        ```latex
        (tex with added references)
        ```

        Paragraph:
        {paragraph}

        Bibliography:
        {bibliography}
        """

        enhanced_text = self._get_gpt_response(system_instruction, user_prompt)
        latex_match = re.search(r'```latex\s*([\s\S]*?)\s*```', enhanced_text)
        return latex_match.group(1) if latex_match else None

    def evaluate_reference_quality(self, paragraph_with_refs: str, bibliography_entries: List[Dict[str, Any]]) -> Dict[str, Any]:
        citation_keys = LaTeXHandler.get_citation_keys_from_latex(paragraph_with_refs)
        cited_bibliography_entries = [entry for entry in bibliography_entries for ref in set(citation_keys) if entry['ID'] == ref]
        references_bibtex = BibTexHandler.entries_to_bibtex_string(cited_bibliography_entries)
        paragraph_with_uids = LaTeXHandler.add_uids_to_latex_citations(paragraph_with_refs)

        system_instruction = "You are an AI assistant that enhances academic writing."
        user_prompt = f"""
        Given the following LaTeX and bibliography, add appropriate explanations for each unique key,
        and explain if it is a good citation or not. Give a score of 0-100 in terms of being a good citation
        Try to be critical and judicious, and use a low score if a citation (determined by uid) is or isn't relevant to the sentence and paragraph it appears in.

        Output must be in a JSON format, as an array of items:

        ```json
        [{{
            "citation_id": "(the BibTeX ID)",
            "uid": "(uid)",
            "score": "(score)",
            "explanation": "(Explaining if this is a good citation, and if yes, why)"
        }}]
        ```

        Paragraph: 
        {paragraph_with_uids}

        Bibliography:
        {references_bibtex}
        """

        analysis_text = self._get_gpt_response(system_instruction, user_prompt, max_tokens=2000)
        json_match = re.search(r'```json\s*([\s\S]*?)\s*```', analysis_text)
        explanations = json.loads(json_match.group(1)) if json_match else []
        return {
            'paragraph': paragraph_with_uids, 
            'explanations': explanations,
        }


semantic_scholar_api = SemanticScholarAPI(os.getenv('S2_API_KEY'))
gpt_reference_enhancer = GPTEnhancer()

with open('sample2.tex', 'r') as f:
    input_paragraph = f.read()

with open('sample.bib', 'r') as f:
    input_bibliography = f.read()

print('\nInput paragraph:')
print(input_paragraph)

bibliography_entries = BibTexHandler.bibtex_string_to_entries(input_bibliography)

paragraph_with_references = gpt_reference_enhancer.add_references_to_paragraph(input_paragraph, input_bibliography)
print('\nEnhanced paragraph:')
print(paragraph_with_references)

citation_quality_analysis = gpt_reference_enhancer.evaluate_reference_quality(paragraph_with_references, bibliography_entries)
print('\nReference analysis:')
print(json.dumps(citation_quality_analysis, indent=2))


Enhanced paragraph:
\chapter{Introduction}

Deep neural networks have revolutionized the field of artificial intelligence, achieving unprecedented performance in a wide range of tasks, from image recognition to natural language processing \cite{zhang2019root, krizhevsky2012imagenet}. Despite their remarkable success, these models often remain enigmatic, functioning as ``black boxes'' that transform inputs into outputs through a complex series of non-linear operations. This opacity poses significant challenges for researchers and practitioners, as it hinders our ability to fully understand and optimize these powerful systems \cite{castelvecchi2016can}.

At the heart of the deep learning paradigm lies the concept of signal propagation—the journey of information as it flows through the layers of a neural network during both forward and backward passes. Understanding this process is crucial for several reasons. It provides insights into how neural networks process and transform informatio

In [324]:
s2_api = SemanticScholarAPI(os.getenv('S2_API_KEY'))
response = s2_api.search_papers_batch(['NLP', 'astronomy'], limit=1)
print(json.dumps(response, indent=2))


HTTPError: 405 Client Error: Method Not Allowed for url: https://api.semanticscholar.org/graph/v1/paper/search/batch

In [337]:
import anthropic
from openai import OpenAI

class GPTEnhancer:
    def __init__(self, api_type: str = "openai"):
        self.api_type = api_type
        if api_type == "openai":
            self.client = OpenAI()
        elif api_type == "claude":
            self.client = anthropic.Anthropic()
        else:
            raise ValueError("Invalid API type. Choose 'openai' or 'claude'.")

    def _get_ai_response(self, system_instruction: str, user_prompt: str, max_tokens: int = 2000) -> str:
        if self.api_type == "openai":
            messages = [
                {"role": "system", "content": system_instruction},
                {"role": "user", "content": user_prompt}
            ]
            response = self.client.chat.completions.create(
                model="gpt-4o-mini",  # Replace with the appropriate model
                messages=messages,
                max_tokens=max_tokens,
            )
            return response.choices[0].message.content
        elif self.api_type == "claude":
            response = self.client.messages.create(
                model="claude-3-haiku-20240307",
                max_tokens=max_tokens,
                system=system_instruction,
                messages=[
                    {"role": "user", "content": user_prompt}
                ]
            )
            return response.content[0].text

    def add_references_to_paragraph(self, paragraph: str, bibliography: str) -> str:
        system_instruction = "You are an AI assistant that enhances academic writing."
        user_prompt = f"""
        Given the following LaTeX paragraph and bibliography, add appropriate references from the bibliography.
        You can either add references to existing \\cite, \\citep, or \\citet commands, or add new ones wherever needed.
        Attention: Make sure not to remove any existing citations. 
        Only print the output, following the following format exactly:
        
        ```latex
        (tex with added references)
        ```
        Paragraph:
        {paragraph}
        Bibliography:
        {bibliography}
        """
        enhanced_text = self._get_ai_response(system_instruction, user_prompt)
        latex_match = re.search(r'```latex\s*([\s\S]*?)\s*```', enhanced_text)
        return latex_match.group(1) if latex_match else None

    def evaluate_reference_quality(self, paragraph_with_refs: str, bibliography_entries: List[Dict[str, Any]]) -> Dict[str, Any]:
        citation_keys = LaTeXHandler.get_citation_keys_from_latex(paragraph_with_refs)
        cited_bibliography_entries = [entry for entry in bibliography_entries for ref in set(citation_keys) if entry['ID'] == ref]
        references_bibtex = BibTexHandler.entries_to_bibtex_string(cited_bibliography_entries)
        paragraph_with_uids = LaTeXHandler.add_uids_to_latex_citations(paragraph_with_refs)
        system_instruction = "You are an AI assistant that enhances academic writing."
        user_prompt = f"""
        Given the following LaTeX and bibliography, add appropriate explanations for each unique key,
        and explain if it is a good citation or not. Give a score of 0-100 in terms of being a good citation
        Try to be critical and judicious, and use a low score if a citation (determined by uid) is or isn't relevant to the sentence and paragraph it appears in.
        Output must be in a json format. Print onnly the JSON output, following the format below exactly:
        
        ```json
        [{{
            "citation_id": "BibTeX ID",
            "uid": "uid",
            "score": score,
            "explanation": "Explaining if this is a good citation, and if yes, why"
        }}]
        ```
        Paragraph: 
        {paragraph_with_uids}
        Bibliography:
        {references_bibtex}
        """
        analysis_text = self._get_ai_response(system_instruction, user_prompt, max_tokens=2000)
        json_match = re.search(r'```json\s*([\s\S]*?)\s*```', analysis_text)
        explanations = json.loads(json_match.group(1)) if json_match else analysis_text
        return {
            'paragraph': paragraph_with_uids, 
            'explanations': explanations,
        }

# Usage example:
enhancer = GPTEnhancer(api_type="claude")  # or "claude"

print('\nInput paragraph:')
print(input_paragraph)

# paragraph_with_references = enhancer.add_references_to_paragraph(input_paragraph, input_bibliography)
print('\nEnhanced paragraph:')
print(paragraph_with_references)

citation_quality_analysis = enhancer.evaluate_reference_quality(paragraph_with_references, bibliography_entries)
print('\nReference analysis:')
print(citation_quality_analysis['paragraph'])
print(json.dumps(citation_quality_analysis['explanations'], indent=2))


Input paragraph:
\chapter{Introduction}

% \subsection{The Rise and Challenges of Deep Neural Networks}
Deep neural networks have revolutionized the field of artificial intelligence, achieving unprecedented performance in a wide range of tasks, from image recognition to natural language processing \cite{zhang2019root}. Despite their remarkable success, these models often remain enigmatic, functioning as ``black boxes'' that transform inputs into outputs through a complex series of non-linear operations. This opacity poses significant challenges for researchers and practitioners, as it hinders our ability to fully understand and optimize these powerful systems.

At the heart of the deep learning paradigm lies the concept of signal propagation—the journey of information as it flows through the layers of a neural network during both forward and backward passes. Understanding this process is crucial for several reasons. It provides insights into how neural networks process and transform i

In [343]:
analysis_text = citation_quality_analysis['explanations']
analysis_text
json_match = re.search(r'```json\s*([\s\S]*?)\s*```', analysis_text)
json_match.group(0)