<a href="https://colab.research.google.com/github/anuraglahon16/ResearchPaperSimplify/blob/main/bookcreate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install arxiv openai markdown2 weasyprint

import logging
import os
import time
from concurrent.futures import ThreadPoolExecutor

import arxiv
import markdown2
import openai
from tqdm import tqdm
from weasyprint import HTML

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# OpenAI API key. Prefer exporting OPENAI_API_KEY so the secret is not saved
# inside the notebook; the empty-string fallback keeps the "key is missing"
# check further down working when the variable is not set.
api_key = os.environ.get("OPENAI_API_KEY", "")

def extract_papers(topic, date_range, max_results=30):
    """Search arXiv for papers about `topic` submitted within `date_range`.

    Args:
        topic: Phrase to search for; it is wrapped in double quotes in the query.
        date_range: Two dates joined by a colon ("START:END"). Both parts are
            inserted verbatim into a `submittedDate:[START TO END]` filter.
        max_results: Maximum number of results requested from arXiv.

    Returns:
        A list of dicts with the keys 'title', 'authors' (list of author
        names), 'summary' and 'url' (the PDF link). If the search fails
        part-way, the papers collected so far are returned and the error is
        logged rather than raised.

    Raises:
        ValueError: If `date_range` does not split into exactly two parts
            on ":".
    """
    start_date, end_date = date_range.split(":")
    # NOTE(review): the arXiv API manual documents submittedDate bounds in the
    # form YYYYMMDDTTTT; confirm that dashed dates are filtered as intended.
    search_query = f'"{topic}" AND submittedDate:[{start_date} TO {end_date}]'

    logger.info(f"Search query: {search_query}")

    papers = []
    try:
        search = arxiv.Search(
            query=search_query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.SubmittedDate
        )

        # Search.results() is deprecated in the arxiv package; fetching through
        # a Client is the supported replacement.
        client = arxiv.Client()
        for result in client.results(search):
            papers.append({
                'title': result.title,
                'authors': [author.name for author in result.authors],
                'summary': result.summary,
                'url': result.pdf_url
            })

        logger.info(f"Extracted {len(papers)} papers.")
    except Exception as e:
        # Best effort: report the failure and hand back whatever was collected.
        logger.error(f"Error occurred while extracting papers: {str(e)}")

    return papers

def concise_summary_openai(paper, api_key, max_retries=3, retry_delay=5):
    """Ask the chat model for a shorter version of a paper's summary.

    Uses the legacy `openai.ChatCompletion` interface and sets the
    module-level `openai.api_key` as a side effect.

    Args:
        paper: Dict with at least 'title' and 'summary'. On success it is
            updated in place with a 'concise_summary' entry.
        api_key: OpenAI API key.
        max_retries: Number of attempts before giving up.
        retry_delay: Seconds to sleep between failed attempts.

    Returns:
        The same `paper` dict, always. When every attempt fails (or
        `max_retries` is zero or negative) it is returned without a
        'concise_summary' key.
    """
    openai.api_key = api_key

    prompt = "".join([
        "Concise the following research paper summary:\n\n",
        f"Paper Title: {paper['title']}\n\n",
        f"Paper Summary: {paper['summary']}\n\n",
        "Concise Summary:",
    ])

    for attempt in range(max_retries):
        try:
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that concises research paper summaries."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=100,
                n=1,
                stop=None,
                temperature=0.7,
            )

            paper['concise_summary'] = response.choices[0].message['content'].strip()
            return paper
        except Exception as e:
            logger.warning(f"Attempt {attempt + 1} failed for '{paper['title']}': {str(e)}")
            if attempt < max_retries - 1:
                logger.info(f"Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)

    # Reached when all attempts failed, and also when max_retries <= 0 so that
    # callers never receive None in place of the paper.
    logger.error(f"Max retries reached for '{paper['title']}'. Skipping concise summary generation.")
    return paper

def categorize_papers_openai(papers, book_structure, api_key, max_retries=3, retry_delay=5):
    """Ask the chat model to assign each paper to one of the book sections.

    Uses the legacy `openai.ChatCompletion` interface and sets the
    module-level `openai.api_key` as a side effect.

    Args:
        papers: Iterable of paper dicts with at least 'title'. Each dict is
            updated in place: a missing 'concise_summary' is set to '' and a
            'category' entry is added.
        book_structure: Iterable of section dicts with 'title' and
            'description', and optionally 'examples' (an iterable of strings).
        api_key: OpenAI API key.
        max_retries: Number of attempts per paper before giving up.
        retry_delay: Seconds to sleep between failed attempts.

    Returns:
        A list containing every input paper, in input order. 'category' holds
        the model's stripped reply, or 'Uncategorized' when no attempt
        succeeded (including when `max_retries` is zero or negative).
    """
    openai.api_key = api_key

    categorized_papers = []

    for paper in papers:
        # The prompt needs a summary field even if summarization was skipped.
        paper.setdefault('concise_summary', '')

        prompt = "You are an expert in categorizing research papers on artificial intelligence into different sections of a book.\n\n"
        prompt += "Categorize the following research paper into one of the book sections based on its title and concise summary:\n\n"
        prompt += f"Paper Title: {paper['title']}\n\n"
        prompt += f"Concise Paper Summary: {paper['concise_summary']}\n\n"
        prompt += "Book Sections and Examples:\n"
        for section in book_structure:
            prompt += f"- {section['title']}: {section['description']}\n"
            if 'examples' in section:
                prompt += "Examples:\n"
                for example in section['examples']:
                    prompt += f"  - {example}\n"
                prompt += "\n"
        prompt += "Category:"

        logger.debug(f"Prompt for '{paper['title']}': {prompt}")

        for attempt in range(max_retries):
            try:
                response = openai.ChatCompletion.create(
                    model="gpt-3.5-turbo",
                    messages=[
                        {"role": "system", "content": "You are a helpful assistant that categorizes research papers."},
                        {"role": "user", "content": prompt}
                    ],
                    max_tokens=200,
                    n=1,
                    stop=None,
                    temperature=0.7,
                )

                category = response.choices[0].message['content'].strip()
                logger.debug(f"Model response for '{paper['title']}': {category}")
                paper['category'] = category
                break
            except Exception as e:
                logger.warning(f"Attempt {attempt + 1} failed for '{paper['title']}': {str(e)}")
                if attempt < max_retries - 1:
                    logger.info(f"Retrying in {retry_delay} seconds...")
                    time.sleep(retry_delay)
        else:
            # No attempt succeeded. This branch also runs when max_retries <= 0,
            # so the paper is kept instead of silently vanishing from the result.
            logger.error(f"Max retries reached for '{paper['title']}'. Skipping categorization.")
            paper['category'] = 'Uncategorized'

        categorized_papers.append(paper)

    return categorized_papers

def _match_section_title(category, section_titles):
    """Map a free-form category label onto one of `section_titles`, or return None."""
    if not isinstance(category, str):
        return None

    label = category.strip()
    # Drop a leading "Category:" that merely echoes the prompt's last line.
    if label.lower().startswith('category:'):
        label = label[len('category:'):]
    # Remove surrounding quotes, markdown emphasis and punctuation.
    label = label.strip(' \t\r\n"\'`*.:-').casefold()
    if not label:
        return None

    for title in section_titles:
        if str(title).casefold() == label:
            return title

    # Accept a longer reply only when it names exactly one section; anything
    # ambiguous is left unmatched.
    mentioned = [title for title in section_titles if title and str(title).casefold() in label]
    return mentioned[0] if len(mentioned) == 1 else None


def structure_content(categorized_papers, book_structure):
    """Group categorized papers under the sections of the book.

    A paper's 'category' is first looked up verbatim. If that fails, a
    tolerant match is tried (ignoring case, surrounding punctuation/quotes, a
    leading "Category:" and surrounding words) so that small formatting
    differences in a model reply do not push the paper into 'Uncategorized'.

    Args:
        categorized_papers: Iterable of paper dicts with 'title' and 'category'.
        book_structure: Iterable of section dicts with 'title' and 'description'.

    Returns:
        A dict mapping each section title to
        {'description': ..., 'papers': [...]}, in `book_structure` order. An
        'Uncategorized' entry is appended only if some paper matches no section.
    """
    structured_content = {
        section['title']: {'description': section['description'], 'papers': []}
        for section in book_structure
    }
    # Snapshot of the real section titles; 'Uncategorized' may be added later
    # and must not take part in tolerant matching.
    section_titles = list(structured_content)

    for paper in categorized_papers:
        category = paper['category']
        if category not in structured_content:
            category = _match_section_title(category, section_titles) or category

        if category in structured_content:
            structured_content[category]['papers'].append(paper)
        else:
            logger.warning(f"Paper '{paper['title']}' was not categorized into any section.")
            if 'Uncategorized' not in structured_content:
                structured_content['Uncategorized'] = {'description': 'Papers that could not be categorized', 'papers': []}
            structured_content['Uncategorized']['papers'].append(paper)

    return structured_content

def create_playlists(structured_content):
    """Build a title-sorted reading list for every section.

    Args:
        structured_content: Dict mapping section titles to dicts that hold a
            'papers' list of paper dicts (each with a 'title').

    Returns:
        A dict mapping each section title to a new list of that section's
        papers ordered by 'title'. The input lists are not modified.
    """
    return {
        name: sorted(details['papers'], key=lambda entry: entry['title'])
        for name, details in structured_content.items()
    }

def generate_book_content(structured_content, playlists):
    """Render the book as a single Markdown string.

    Each section becomes a level-1 heading followed by its description, then
    one level-2 heading plus the full 'summary' text per paper in that
    section's playlist, and finally a horizontal rule.

    Args:
        structured_content: Dict mapping section titles to dicts with a
            'description'.
        playlists: Dict mapping the same section titles to ordered lists of
            paper dicts (each with 'title' and 'summary').

    Returns:
        The Markdown text for the whole book ('' when there are no sections).
    """
    pieces = []

    for heading, details in structured_content.items():
        pieces.append(f"# {heading}\n\n")
        pieces.append(f"{details['description']}\n\n")

        for entry in playlists[heading]:
            pieces.append(f"## {entry['title']}\n\n")
            pieces.append(f"{entry['summary']}\n\n")

        pieces.append("---\n\n")

    return "".join(pieces)

def save_book_content(book_content, output_format, output_file):
    """Write the book to disk as Markdown, HTML or PDF.

    Args:
        book_content: The book as Markdown text.
        output_format: One of 'markdown', 'html' or 'pdf'. Any other value is
            logged as an error and nothing is written.
        output_file: Destination path.

    Returns:
        None. Failures while converting or writing are logged, not raised.
    """
    try:
        if output_format == 'markdown':
            # Explicit UTF-8: the text may contain non-ASCII characters that the
            # platform's default encoding cannot represent.
            with open(output_file, 'w', encoding='utf-8') as file:
                file.write(book_content)
        elif output_format == 'html':
            html_content = markdown2.markdown(book_content)
            with open(output_file, 'w', encoding='utf-8') as file:
                file.write(html_content)
        elif output_format == 'pdf':
            html_content = markdown2.markdown(book_content)
            HTML(string=html_content).write_pdf(output_file)
        else:
            logger.error(f"Unsupported output format: {output_format}")
            return

        logger.info(f"Book content saved as {output_file} in {output_format} format.")
    except Exception as e:
        logger.error(f"Error occurred while saving book content: {str(e)}")

def process_papers(papers, book_structure, api_key):
    """Summarize every paper concurrently, then categorize the results.

    Args:
        papers: Iterable of paper dicts to process.
        book_structure: Section definitions passed on to the categorizer.
        api_key: OpenAI API key used for both steps.

    Returns:
        The list produced by `categorize_papers_openai` for the summarized
        papers, which are handed over in the same order as `papers`.
    """
    with ThreadPoolExecutor() as pool:
        # Submit everything up front, then collect in submission order so the
        # output order matches the input order.
        jobs = [pool.submit(concise_summary_openai, item, api_key) for item in papers]
        summarized = []
        with tqdm(total=len(jobs), desc="Generating Concise Summaries") as bar:
            for job in jobs:
                summarized.append(job.result())
                bar.update(1)

    return categorize_papers_openai(summarized, book_structure, api_key)

# Example usage: build a small "book" of arXiv papers on one topic.
topic = "artificial intelligence"
# Submission window as "START:END"; split on ":" by extract_papers.
date_range = "2010-01-01:2024-04-09"

# Sections of the book; the titles are the categories offered to the model.
book_structure = [
    {'title': 'Introduction', 'description': 'Overview of artificial intelligence and its applications'},
    {'title': 'Machine Learning', 'description': 'Techniques and algorithms for machine learning'},
    {'title': 'Deep Learning', 'description': 'Neural networks and deep learning architectures'},
    {'title': 'Natural Language Processing', 'description': 'Processing and understanding human language'},
    {'title': 'Computer Vision', 'description': 'Analyzing and interpreting visual information'}
]

if not api_key:
    logger.error("OpenAI API key is missing. Please set the 'api_key' variable.")
    # `exit()` is the interactive helper installed by the `site` module and is
    # not guaranteed to exist or to accept a status code; raising SystemExit
    # stops the run with status 1 everywhere.
    raise SystemExit(1)

# Pipeline: fetch -> summarize and categorize -> group by section -> order -> render.
extracted_papers = extract_papers(topic, date_range)
categorized_papers = process_papers(extracted_papers, book_structure, api_key)
structured_book = structure_content(categorized_papers, book_structure)
playlists = create_playlists(structured_book)
book_content = generate_book_content(structured_book, playlists)

output_format = 'pdf'
output_file = 'book_content.pdf'
save_book_content(book_content, output_format, output_file)



  for result in search_results.results():
Generating Concise Summaries: 100%|██████████| 30/30 [00:09<00:00,  3.27it/s]


In [None]:
!pip install openai==0.28

