# Autonomous Literature Review (AutoLit) Pipeline

In [22]:
# Load test data: the title and abstract of the paper the pipeline is run against.
# Both strings are interpolated into the keyword-generation prompt and later
# concatenated into the query text for similarity ranking.
title = "Feasibility of Artificial Intelligence Driven Analysis in the Context of Nepalese Legal System"
abstract = "We proposed an innovative solution through an Artificial Intelligence driven legal analysis customized to the utility of the Nepalese legal context. Using advanced machine learning (ML) models and Retrieval-Augmented Generation (RAG) techniques, the research provides legal insights, streamlines judicial processes, and enhances accessibility to legal information. The legal documents were processed to convert into JSON format, and then to convert into vector data. GPT-4o was used for query expansion and response generation, whereas text embeddings were generated through text-embedding-ada-002. Key features include efficient document retrieval and query expansion for enhanced search precision. The model performs well across different query types, achieving an 𝐹1 score of 0.797 for rule-recall, 0.857 for rhetorical understanding, and 0.875 for interpretation-based queries. This work marks a significant step towards integrating AI into the legal domain of Nepal."

In [23]:
# Keyword Generation
import os
from fastapi import HTTPException
import requests
from dotenv import load_dotenv
from typing import Dict, List
import hdbscan
from sentence_transformers import SentenceTransformer, util
import json
from collections import defaultdict
from sklearn.preprocessing import StandardScaler

load_dotenv()

def _parse_keywords(raw_text: str) -> list[str]:
    """Split a model reply into clean keywords, one per non-empty line.

    Surrounding whitespace and a leading list marker ("- ", "* ", "• ",
    "1. ", "2) ") are removed from each line; blank lines are dropped.
    """
    import re  # local import: only this helper needs it

    # A marker must be followed by whitespace, so keywords such as
    # "3D printing" or "GPT-4o" are left untouched.
    list_marker = re.compile(r"^(?:[-*•]+|\d+[.)])\s+")
    cleaned = (list_marker.sub("", line.strip()).strip() for line in raw_text.splitlines())
    return [keyword for keyword in cleaned if keyword]


def keyword_gen(title: str, abstract: str) -> list[str]:
    """Ask the text-generation endpoint for keywords describing a paper.

    The endpoint URL is read from the ``GEMINI_API_URL`` environment variable.

    Args:
        title: Title of the paper.
        abstract: Abstract of the paper.

    Returns:
        The keywords from the model's reply, one list item per non-empty
        line, with whitespace and list markers stripped.

    Raises:
        ValueError: If ``GEMINI_API_URL`` is unset or empty.
        HTTPException: (status 500) if the endpoint answers with a non-OK
            status or with a body that lacks the expected
            ``candidates[0].content.parts[0].text`` field.
        requests.exceptions.RequestException: On connection errors or timeout.
    """
    GEMINI_API_URL = os.environ.get("GEMINI_API_URL") or ""
    if not GEMINI_API_URL:
        # Fail with an actionable message instead of an opaque schema error
        # from requests when the environment is not configured.
        raise ValueError("GEMINI_API_URL is not set; cannot request keywords")

    # The prompt text (including its indentation) is kept exactly as sent before.
    prompt = f"""
                    Extract 5–10 relevant and concise keywords from the following
                    research paper title and abstract. Each keyword should be 1–3
                    words long. Return only the keywords as a plain list, one per
                    line. Don't say anything else.

                    Title: {title}

                    Abstract: {abstract}
                """

    response = requests.post(
        GEMINI_API_URL,
        json={"contents": [{"parts": [{"text": prompt}]}]},
        timeout=30,  # never let a stalled connection hang the notebook
    )

    if not response.ok:
        # Print the raw body: an error page is not guaranteed to be JSON, and
        # decoding it here could mask the real HTTP failure.
        print(response.text)
        raise HTTPException(status_code=500)

    try:
        keywords_str: str = response.json()["candidates"][0]["content"]["parts"][0]["text"]
    except (ValueError, KeyError, IndexError, TypeError) as exc:
        raise HTTPException(
            status_code=500,
            detail="Unexpected response format from the keyword endpoint",
        ) from exc

    return _parse_keywords(keywords_str)

# Generate keywords for the test paper; each one is later used as an arXiv
# search query in the paper-retrieval cell.
keywords = keyword_gen(title, abstract)
print(keywords)

python-dotenv could not parse statement starting at line 3


['AI legal analysis', 'Nepalese legal system', 'Machine learning', 'RAG', 'Document retrieval', 'Query expansion', 'GPT-4o', 'Text embeddings']


In [24]:
# Retrieve Papers

from typing import List
import requests
import xmltodict
import time

# Endpoint queried by fetch_arxiv_data.
ARXIV_URI = "https://export.arxiv.org/api/query"

# Headers sent with every arXiv request.
# TODO: the User-Agent still carries template placeholder values; replace them
# with the real application name and contact address.
HEADERS = {
    "User-Agent": "YourAppName/1.0 (Contact: your-email@example.com)"
}

'''
Sample query	Error Explanation
http://export.arxiv.org/api/query?start=not_an_int	start must be an integer
http://export.arxiv.org/api/query?start=-1	start must be >= 0
http://export.arxiv.org/api/query?max_results=not_an_int	max_results must be an integer
http://export.arxiv.org/api/query?max_results=-1	max_results must be >= 0
http://export.arxiv.org/api/query?id_list=1234.1234	malformed id - see arxiv identifier explanation
http://export.arxiv.org/api/query?id_list=cond—mat/0709123	malformed id - see arxiv identifier explanation

'''
def validate_arxiv_query_params(start: int, max_results: int):
    """Check the paging parameters before they are sent to arXiv.

    Args:
        start: Index of the first result; must be an integer >= 0.
        max_results: Number of results requested; must be an integer > 0.

    Raises:
        ValueError: If either argument is not an integer (booleans are
            rejected too) or is outside its allowed range.
    """
    # bool is a subclass of int, so True/False would otherwise pass the
    # isinstance check and end up in the query string as a non-numeric value.
    if isinstance(start, bool) or not isinstance(start, int) or start < 0:
        raise ValueError("start must be a non-negative integer")
    if isinstance(max_results, bool) or not isinstance(max_results, int) or max_results <= 0:
        raise ValueError("max_results must be a positive integer")

def _as_dict_list(value) -> list:
    """Normalize a parsed XML child to a list.

    The parser yields a dict for a single child and a list for several;
    anything else (missing element, None) becomes an empty list.
    """
    if isinstance(value, dict):
        return [value]
    if isinstance(value, list):
        return value
    return []


def _parse_arxiv_entry(entry: dict) -> dict:
    """Convert one parsed feed entry into the flat paper dict used downstream."""
    authors = [author.get('name', '') for author in _as_dict_list(entry.get('author'))]
    categories = [cat.get('@term', '') for cat in _as_dict_list(entry.get('category'))]

    # NOTE(review): links are keyed by their rel attribute, so when an entry
    # carries several links with the same rel only the last one is kept.
    links = {
        link.get('@rel', ''): link.get('@href', '')
        for link in _as_dict_list(entry.get('link'))
    }

    return {
        'id': entry.get('id', ''),
        # `or ''` guards against elements that are present but empty (None).
        'title': (entry.get('title') or '').strip(),
        'summary': (entry.get('summary') or '').strip(),
        'published': entry.get('published', ''),
        'updated': entry.get('updated', ''),
        'authors': authors,
        'categories': categories,
        'links': links
    }


def fetch_arxiv_data(query: str, start: int = 0, max_results: int = 10, retries: int = 3, delay: float = 3.0) -> List[dict]:
    """Query arXiv and return the matching papers as plain dicts.

    Args:
        query: Value sent as the ``search_query`` parameter.
        start: Index of the first result to return.
        max_results: Maximum number of results to request.
        retries: Number of attempts made when the HTTP request itself fails;
            values below 1 are treated as a single attempt.
        delay: Seconds to sleep after a successful request (rate limiting).

    Returns:
        A list of dicts with the keys ``id``, ``title``, ``summary``,
        ``published``, ``updated``, ``authors``, ``categories`` and ``links``.
        Empty when the feed contains no entries.

    Raises:
        ValueError: If ``start`` or ``max_results`` fail validation.
        Exception: If every request attempt fails, or if the response is
            empty or cannot be processed.
    """
    validate_arxiv_query_params(start, max_results)

    params = {
        'search_query': query,
        'start': start,
        'max_results': max_results
    }

    # Always make at least one attempt so the function cannot fall through
    # the loop and return None.
    attempts = max(1, retries)

    for attempt in range(attempts):
        try:
            response = requests.get(ARXIV_URI, params=params, headers=HEADERS, timeout=10)
            response.raise_for_status()

            if not response.text.strip():
                raise Exception("Empty response from arXiv")

            parsed_response = xmltodict.parse(response.text)
            feed = parsed_response.get('feed') or {}
            papers = [_parse_arxiv_entry(entry) for entry in _as_dict_list(feed.get('entry'))]

            # Rate limit after every successful request, including ones that
            # matched nothing, because callers issue queries back to back.
            if delay > 0:
                time.sleep(delay)
            return papers

        except requests.exceptions.RequestException as e:
            if attempt < attempts - 1:
                time.sleep(2 ** attempt)  # exponential backoff: 1s, 2s, 4s, ...
                continue
            raise Exception(f"Request to arXiv failed after {attempts} attempts: {str(e)}") from e
        except Exception as e:
            raise Exception(f"An error occurred while processing the response: {str(e)}") from e

    return []  # not reached: the loop always returns or raises

def retrieve_papers(keyword, max_results=20):
    """Fetch the first page of arXiv results for a single keyword.

    The keyword is passed through unchanged as the search query; at most
    ``max_results`` papers are requested, starting from offset 0.
    """
    first_page = fetch_arxiv_data(
        query=keyword,
        start=0,
        max_results=max_results,
    )
    return first_page

# Query arXiv once per generated keyword and pool every hit.
papers_unfiltered = []
for keyword in keywords:
    papers_unfiltered.extend(retrieve_papers(keyword=keyword))

# The same paper can match several keywords: keep only its first occurrence.
papers = []
seen_ids = set()
for paper in papers_unfiltered:
    paper_id = paper['id']
    if paper_id in seen_ids:
        continue
    seen_ids.add(paper_id)
    papers.append(paper)

In [28]:
# Retrieve Relevant Papers
from typing import List, Dict
from sentence_transformers import SentenceTransformer, util

# Sentence-embedding model, created once at module level and reused by
# retrieve_relevant_papers for both the query and the candidate papers.
model = SentenceTransformer('all-MiniLM-L6-v2')

def retrieve_relevant_papers(
    title: str,
    abstract: str,
    related_papers: List[Dict],
    min_papers: int = 8,
    thresholds: tuple[float, ...] = (0.75, 0.7, 0.65, 0.6),
) -> List[Dict]:
    '''
    Select the candidate papers that are semantically similar to a given paper.

    The title and abstract are embedded as one query text and compared (cosine
    similarity) with the "title summary" text of every candidate. Similarity
    cut-offs are tried from strictest to loosest; the first cut-off that keeps
    at least `min_papers` candidates wins, otherwise the loosest cut-off is used.

    Args:
        title: Title of the reference paper.
        abstract: Abstract of the reference paper.
        related_papers: Candidate papers; the 'title' and 'summary' keys are read.
        min_papers: Number of matches that makes a cut-off acceptable.
        thresholds: Similarity cut-offs to try.

    Returns:
        The matching candidates in their original order (not ranked by score).
        May hold fewer than `min_papers` items, or be empty, when even the
        loosest cut-off matches few or no candidates.
    '''
    if not related_papers:
        return []

    # Try the strictest cut-off first, regardless of the order supplied.
    cutoffs = sorted(thresholds, reverse=True)
    if not cutoffs:
        return []

    # Embed the reference paper as a single query text.
    query = f"{title.strip()} {abstract.strip()}"
    query_embedding = model.encode(query, convert_to_tensor=True)

    # Embed every candidate; `or ''` tolerates title/summary stored as None.
    paper_contents = []
    for paper in related_papers:
        paper_title = (paper.get('title') or '').strip()
        paper_abstract = (paper.get('summary') or '').strip()
        paper_contents.append(f"{paper_title} {paper_abstract}")

    paper_embeddings = model.encode(paper_contents, convert_to_tensor=True)

    # Similarity of the query against each candidate.
    cosine_scores = util.cos_sim(query_embedding, paper_embeddings)[0]

    relevant_indices = []
    for threshold in cutoffs:
        relevant_indices = (cosine_scores >= threshold).nonzero(as_tuple=True)[0].tolist()
        if len(relevant_indices) >= min_papers:
            break
    # With no break, relevant_indices holds the matches for the loosest cut-off.

    return [related_papers[i] for i in relevant_indices]

# Narrow the de-duplicated arXiv results down to the papers similar to the
# test paper, then report how many were kept.
top_papers = retrieve_relevant_papers(title, abstract, papers)
print("papers count", len(top_papers))

papers count 12
