In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import json
from pprint import pprint
import textwrap
import yaml

In [2]:
def load_prompts(yaml_file='system_prompts.yaml'):
    """Load system prompts from a YAML file.

    Args:
        yaml_file: Path of the YAML file to read. Defaults to
            'system_prompts.yaml', resolved against the working directory.

    Returns:
        The object produced by ``yaml.safe_load`` for the file's content.
    """
    # Decode explicitly as UTF-8 (as open_data_file does) so the prompt text
    # does not depend on the platform's default locale encoding.
    with open(yaml_file, 'r', encoding='utf-8') as file:
        return yaml.safe_load(file)


def open_data_file(data_file):
    """Read a UTF-8 encoded JSON file and return the parsed content.

    Args:
        data_file: Path of the JSON file to read.

    Returns:
        The object decoded by ``json.load``.
    """
    with open(data_file, mode='r', encoding='utf-8') as handle:
        return json.load(handle)


def initialize_model(model_name):
    """Load the causal language model and its tokenizer for ``model_name``.

    Args:
        model_name: Identifier handed to both ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # Both dtype and device placement are left to the library ("auto").
    load_options = {"torch_dtype": "auto", "device_map": "auto"}

    causal_lm = AutoModelForCausalLM.from_pretrained(model_name, **load_options)
    text_tokenizer = AutoTokenizer.from_pretrained(model_name)

    return causal_lm, text_tokenizer


def format_papers_for_topic(papers):
    """Render a topic's papers as one text block for the user message.

    Args:
        papers: Iterable of dicts that each provide 'id', 'title' and
            'abstract' keys.

    Returns:
        A string starting with a "Papers to analyze:" header followed by one
        entry per paper, ordered by ascending 'id' and separated by an
        80-dash rule. The input sequence itself is not reordered.
    """
    separator = "-" * 80

    entries = [
        f"Paper ID: {entry['id']}\n"
        f"Title: {entry['title']}\n"
        f"Abstract: {entry['abstract']}\n"
        f"{separator}\n\n"
        for entry in sorted(papers, key=lambda entry: entry['id'])
    ]

    return "\nPapers to analyze:\n\n" + "".join(entries)


def create_conversation_messages(system_prompt, data):
    """Build the chat messages for every topic in the loaded data.

    Args:
        system_prompt: Text used as the 'system' message for each topic.
        data: Mapping of topic name to that topic's list of papers.

    Returns:
        A dict mapping each topic to a two-element message list: the system
        prompt, then a 'user' message holding the formatted papers.
    """
    return {
        topic_name: [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": format_papers_for_topic(topic_papers)},
        ]
        for topic_name, topic_papers in data.items()
    }


def get_topic_stats(topic, papers):
    """Print the topic name and how many papers it contains.

    Args:
        topic: Topic label to display.
        papers: Sized collection of the topic's papers.
    """
    paper_count = len(papers)

    for line in (
        f"Processing topic: {topic}",
        f"Number of papers in topic: {paper_count}",
    ):
        print(line)


def get_number_of_tokens(tokenizer, system_prompt, papers, text):
    """Print token counts for the prompt, the papers text and the output.

    Args:
        tokenizer: Object exposing ``tokenize(str)`` that returns a sized
            sequence of tokens.
        system_prompt: System prompt text.
        papers: Formatted papers text (a string, not the list of papers).
        text: Generated output text; its whitespace-separated word count is
            reported as well.
    """
    prompt_count = len(tokenizer.tokenize(system_prompt))
    papers_count = len(tokenizer.tokenize(papers))
    output_count = len(tokenizer.tokenize(text))
    word_count = len(text.split())

    report = (
        ("Number of tokens in system prompt", prompt_count),
        ("Number of tokens in papers", papers_count),
        ("Number of tokens in output", output_count),
        ("Total number of words in output", word_count),
        ("Total number of tokens", prompt_count + papers_count + output_count),
    )
    for label, value in report:
        print(f"{label}: {value}")


def generate_response(model, tokenizer, messages, max_new_tokens=1000,
                      temperature=0.7, top_p=0.95, do_sample=True):
    """Generate a response from the model based on the provided messages.

    Args:
        model: Model exposing ``device`` and ``generate(**kwargs)``.
        tokenizer: Tokenizer exposing ``apply_chat_template``, ``__call__``
            and ``batch_decode``.
        messages: Chat messages (dicts with 'role' and 'content') to render
            through the tokenizer's chat template.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature forwarded to ``generate``.
        top_p: Nucleus-sampling threshold forwarded to ``generate``.
        do_sample: Whether ``generate`` samples instead of decoding greedily.

    Returns:
        The decoded text of the newly generated tokens only (the prompt is
        stripped and special tokens are skipped).
    """
    prompt_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    model_inputs = tokenizer([prompt_text], return_tensors="pt").to(model.device)

    output_sequences = model.generate(
        **model_inputs,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        do_sample=do_sample,
        top_p=top_p
    )

    # generate() returns prompt + continuation; drop the prompt prefix so only
    # the newly produced tokens are decoded.
    new_token_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, output_sequences)
    ]

    return tokenizer.batch_decode(new_token_ids, skip_special_tokens=True)[0]


def pretty_print_model_response(topic, papers, response, width=100, new_response=False):
    """Print a summary wrapped to ``width`` columns, followed by its references.

    Args:
        topic: Topic label; shown upper-cased in the banner.
        papers: Iterable of dicts with 'id' and 'title', listed as references
            in the order given.
        response: Model output; split on blank lines and each piece is
            re-wrapped independently.
        width: Column width for the banner rules and the wrapped text.
        new_response: When True the banner reads "UPDATED SUMMARY FOR TOPIC".
    """
    rule = "=" * width
    banner_prefix = "UPDATED SUMMARY FOR TOPIC: " if new_response else "TOPIC: "

    print("\n" + rule)
    print(banner_prefix + topic.upper())
    print(rule + "\n")

    for paragraph in response.split('\n\n'):
        print(textwrap.fill(paragraph.strip(), width=width) + "\n")

    print("References:")
    for entry in papers:
        print(f"[{entry['id']}] {entry['title']}")

    print(rule + "\n")


def rewrite_request(model, tokenizer, system_prompt, summary):
    """Ask the model to shorten a summary that is over the 250-word limit.

    Prints a notice, then sends a single 'user' message that contains the
    shortening instruction, the original system prompt and the summary.

    Args:
        model: Model passed through to ``generate_response``.
        tokenizer: Tokenizer passed through to ``generate_response``.
        system_prompt: Original instructions, embedded in the request text.
        summary: The over-long summary to condense.

    Returns:
        The model's revised summary text.
    """
    print("The summary exceeds the word limit of 250 words.")

    # Local is deliberately not called `rewrite_request`, so it does not
    # shadow this function's own name.
    request_text = f"""
    The summary provided exceeds the word limit of 250 words. Please revise the summary to be shorter and more concise and adhere to the 250-word maximum limit.
    
    {system_prompt}
    ------
    Summary:
    {summary}
    """

    return generate_response(
        model,
        tokenizer,
        [{"role": "user", "content": request_text}],
    )


def get_summary(data_file, system_prompt, model, tokenizer, response_only=False, print_response=True, show_papers=False):
    """Summarize every topic in a JSON data file, shortening over-long results.

    For each topic a summary is generated; if it has more than 250
    whitespace-separated words, the model is asked once to rewrite it.

    Args:
        data_file: Path of the JSON file mapping topic -> list of papers.
        system_prompt: Instructions used as the system message.
        model: Model passed to ``generate_response``.
        tokenizer: Tokenizer passed to ``generate_response`` and used for the
            token statistics.
        response_only: When False (default) topic stats, the summary and
            token counts are printed. When True only the summary is printed,
            and only if ``print_response`` is True.
        print_response: Only consulted when ``response_only`` is True; set to
            False to suppress the summary printout.
        show_papers: When ``response_only`` is False, also print the
            formatted papers text.

    Returns:
        The (possibly rewritten) summary of the last topic processed, or
        None when the data file contains no topics.
    """
    # Must stay in sync with the limit quoted in rewrite_request's prompt.
    word_limit = 250

    data = open_data_file(data_file)
    topic_messages = create_conversation_messages(system_prompt, data)

    # Bound up front so an empty data file returns None instead of raising
    # UnboundLocalError at the final return.
    response = None

    for topic, messages in topic_messages.items():
        papers = data[topic]
        verbose = not response_only
        show_output = verbose or print_response
        # Formatted once and reused for the optional dump and both token reports.
        formatted_papers = format_papers_for_topic(papers) if verbose else None

        response = generate_response(model, tokenizer, messages)

        if verbose:
            get_topic_stats(topic, papers)
            if show_papers:
                print(formatted_papers)
        if show_output:
            pretty_print_model_response(topic, papers, response)
        if verbose:
            get_number_of_tokens(tokenizer, system_prompt, formatted_papers, response)

        # The rewrite is applied the same way in every mode. The loop no
        # longer returns early, so all topics are processed even when
        # printing is disabled.
        if len(response.split()) > word_limit:
            response = rewrite_request(model, tokenizer, system_prompt, response)
            if show_output:
                pretty_print_model_response(topic, papers, response, new_response=True)
            if verbose:
                get_number_of_tokens(tokenizer, system_prompt, formatted_papers, response)

    return response

In [None]:
# Inline copies of the five prompt variants, from the loosest (~300 words) to the
# tightest (single paragraph). Later cells rebind `system_prompt` to a single
# string taken from the YAML prompts loaded by load_prompts().
system_prompt = ["""
Analyze the following research papers' titles and abstracts to create a comprehensive summary. Follow these specific guidelines:

1. Input Format:
For each paper, you will receive:
- ID: [Paper ID number]
- Title: [Paper Title]
- Abstract: [Paper Abstract]

2. Summary Requirements:
- Generate a cohesive summary of approximately 300 words.
- Only use information explicitly stated in the provided abstracts.
- Use numeric citations in square brackets [1], [2], etc., corresponding to the paper IDs.
- Do not say "Paper [ID]", use only the [ID].
- Focus on key findings, methodologies, and connections between papers.
- Highlight common themes and potential contradictions.
- Maintain academic tone and language.

3. Citation Rules:
- Every claim must be supported by at least one citation using [n] format.
- Use only the provided papers as sources.
- Multiple citations should be listed in ascending order, separated by commas: [1,3,4].

4. Structure:
- Begin with a brief overview of the research area.
- Group related findings and themes.
- Present methodological approaches.
- Discuss key conclusions.
- Identify potential gaps or areas of consensus.

Synthesize the information while maintaining academic integrity and avoiding information not present in the provided abstracts.
""",
"""
Analyze the following research papers' titles and abstracts to create a comprehensive summary. Follow these specific guidelines:

1. Input Format:
For each paper, you will receive:
- ID: [Paper ID number]
- Title: [Paper Title]
- Abstract: [Paper Abstract]

2. Summary Requirements:
- Generate a cohesive summary of up to 300 words (not exceeding this limit).
- Only use information explicitly stated in the provided abstracts.
- Use numeric citations in square brackets [1], [2], etc., corresponding to the paper IDs.
- Do not say "Paper [ID]", use only the [ID].
- Focus on key findings, methodologies, and connections between papers.
- Highlight common themes and potential contradictions.
- Maintain academic tone and language.

3. Citation Rules:
- Every claim must be supported by at least one citation using [n] format.
- Use only the provided papers as sources.
- Multiple citations should be listed in ascending order, separated by commas: [1,3,4].

Synthesize the information while maintaining academic integrity and avoiding information not present in the provided abstracts.
""",
"""
Analyze the following research papers' titles and abstracts to create a comprehensive summary. Follow these specific guidelines:

1. Input Format:
For each paper, you will receive:
- ID: [Paper ID number]
- Title: [Paper Title]
- Abstract: [Paper Abstract]

2. Summary Requirements:
- Generate a cohesive summary of 200-250 words STRICTLY. Do not exceed 250 words under any circumstances.
- Structure the summary in 2-3 concise paragraphs.
- Only use information explicitly stated in the provided abstracts.
- Use numeric citations in square brackets [1], [2], etc., corresponding to the paper IDs.
- Do not say "Paper [ID]", use only the [ID].
- Focus on key findings, methodologies, and connections between papers.
- Highlight common themes and potential contradictions.
- Maintain academic tone and language.

3. Citation Rules:
- Every claim must be supported by at least one citation using [n] format.
- Use only the provided papers as sources.
- Multiple citations should be listed in ascending order, separated by commas: [1,3,4].

4. Length Enforcement:
- Before submitting the summary, count the words.
- If the word count exceeds 250, revise by removing less critical details while maintaining key findings.

Synthesize the information while maintaining academic integrity and avoiding information not present in the provided abstracts.
""",
"""
Analyze the following research papers' titles and abstracts to create a concise summary. Follow these specific guidelines:

1. Input Format:
For each paper, you will receive:
- ID: [Paper ID number]
- Title: [Paper Title]
- Abstract: [Paper Abstract]

2. Summary Structure and Length:
- Write EXACTLY two paragraphs:
  * First paragraph: A focused introduction and methodology overview
  * Second paragraph: Key findings and conclusions
- Keep the summary BRIEF and CONCISE
- Prioritize only the most significant findings and connections
- Eliminate any redundant or secondary information
- Use short, clear sentences

3. Content Requirements:
- Only use information explicitly stated in the provided abstracts
- Use numeric citations in square brackets [1], [2], etc., corresponding to the paper IDs
- Do not say "Paper [ID]", use only the [ID]
- Focus on essential findings and critical connections between papers
- Highlight only the most important themes and contradictions
- Maintain academic tone and language

4. Citation Rules:
- Every claim must be supported by at least one citation using [n] format
- Use only the provided papers as sources
- Multiple citations should be listed in ascending order, separated by commas: [1,3,4]

5. Length Control:
- Keep sentences concise and focused
- Avoid elaboration on minor points
- Use precise language instead of lengthy descriptions
- Eliminate redundant citations when multiple sources support the same point

Synthesize the information while maintaining academic integrity and avoiding information not present in the provided abstracts.
""",
"""
Analyze the following research papers' titles and abstracts to create a focused single-paragraph summary. Follow these specific guidelines:

1. Input Format:
For each paper, you will receive:
- ID: [Paper ID number]
- Title: [Paper Title]
- Abstract: [Paper Abstract]

2. Summary Structure:
- Create ONE focused paragraph that:
  * Begins with a brief context or introduction (1-2 sentences)
  * Presents key methodologies and findings in a logical flow
  * Ends with the most significant conclusions
- Use clear topic sentences and transitions
- Maintain a single coherent narrative thread
- Keep the summary TIGHT and FOCUSED

3. Content Requirements:
- Only use information explicitly stated in the provided abstracts
- Use numeric citations in square brackets [1], [2], etc., corresponding to the paper IDs
- Do not say "Paper [ID]", use only the [ID]
- Present only the most essential findings and connections
- Highlight critical themes and contradictions
- Maintain academic tone and language

4. Length Control Strategies:
- Write approximately 8-10 substantive sentences
- Use precise, economical language
- Avoid redundant information
- Combine related findings from multiple papers in single sentences
- Minimize descriptive phrases
- Use active voice

5. Citation Rules:
- Every claim must be supported by at least one citation using [n] format
- Use only the provided papers as sources
- Multiple citations should be listed in ascending order, separated by commas: [1,3,4]

Synthesize the information while maintaining academic integrity and avoiding information not present in the provided abstracts.
"""]

In [3]:
# Checkpoint to load; the commented-out names are alternative Qwen2.5
# checkpoints that can be swapped in by moving the comment marker.
# model_name = "Qwen/Qwen2.5-14B-Instruct"
model_name = "Qwen/Qwen2.5-14B-Instruct-1M"
# model_name = "Qwen/Qwen2.5-32B-Instruct"
# Load the model and tokenizer once for all later cells, and read the prompt
# variants from the default YAML file (system_prompts.yaml).
model, tokenizer = initialize_model(model_name)
prompts = load_prompts()

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [None]:
# Summarize data.1.json with the 'comprehensive_300' prompt. Default flags give
# the full report: topic stats, wrapped summary, references and token counts.
data_file = "data.1.json"
system_prompt = prompts['prompts']['comprehensive_300']['content']
summary = get_summary(data_file, system_prompt, model, tokenizer)

Processing topic: artificial intelligence cyber security
Number of papers in topic: 7

TOPIC: ARTIFICIAL INTELLIGENCE CYBER SECURITY

The integration of artificial intelligence (AI) and machine learning (ML) into smart cities and
internet of things (IoT) systems has emerged as a critical area of research, addressing challenges
ranging from energy efficiency to enhanced security [1]. Smart cities leverage AI, ML, and deep
reinforcement learning (DRL) to optimize policies and enhance services across various domains,
including transportation, healthcare, and energy distribution [1]. These technologies aim to manage
urban growth, reduce resource consumption, and improve citizens' quality of life.

Intrusion detection systems (IDS) have gained prominence in securing IoT and industrial cyber-
physical systems (CPS) against increasingly sophisticated cyber threats. Research has explored the
application of deep learning models, such as convolutional neural networks (CNNs) and recurrent
neural 

In [None]:
# Same 'comprehensive_300' prompt, applied to data.2.json (full report).
data_file = "data.2.json"
system_prompt = prompts['prompts']['comprehensive_300']['content']
summary = get_summary(data_file, system_prompt, model, tokenizer)


Processing topic: data visualization
Number of papers in topic: 6

TOPIC: DATA VISUALIZATION

The field of scientific data visualization encompasses tools that assist researchers in interpreting
complex data generated by various experimental techniques, from crystallography to genomics. These
tools aim to enhance understanding through effective representation and analysis of intricate
datasets [1,2,3,4,5,6].

One prominent tool for crystallographic studies is VESTA, which has been upgraded to VESTA 3. This
version introduces advanced features such as external morphology drawing, superimposition of
structural models and volumetric data, calculation of electron and nuclear densities, and enhanced
performance in rendering isosurfaces and slice calculations [1]. Another tool, OVITO, specializes in
processing and visualizing atomistic simulation data from molecular dynamics and Monte Carlo
simulations, offering unique analysis and animation capabilities through an easy-to-use interface
[2].

In [None]:
# Summarize data.1.json with the 'strict_300' prompt (full report).
# NOTE(review): unlike the other run cells, the return value is not stored in
# `summary` here - confirm whether that is intentional.
data_file = "data.1.json"
system_prompt = prompts['prompts']['strict_300']['content']
get_summary(data_file, system_prompt, model, tokenizer)

Processing topic: artificial intelligence cyber security
Number of papers in topic: 7

TOPIC: ARTIFICIAL INTELLIGENCE CYBER SECURITY

The integration of artificial intelligence (AI) and machine learning (ML) technologies is pivotal in
addressing the multifaceted challenges faced by smart cities and Internet of Things (IoT) networks
[1]. These technologies are crucial for optimizing policy design, enhancing cyber-security,
improving energy efficiency, and ensuring effective healthcare services within smart city frameworks
[1]. Specifically, AI and ML enable the development of robust intrusion detection systems (IDS)
capable of safeguarding industrial cyber-physical systems (CPS) from sophisticated cyber threats
[2].

Federated learning approaches, like DeepFed, have emerged as promising solutions for training IDS
models across multiple CPS instances without compromising data privacy [2]. By leveraging
convolutional neural networks (CNNs) and gated recurrent units (GRUs), DeepFed constru

In [None]:
# Summarize data.2.json with the 'strict_300' prompt (full report).
data_file = "data.2.json"
system_prompt = prompts['prompts']['strict_300']['content']
summary = get_summary(data_file, system_prompt, model, tokenizer)


Processing topic: data visualization
Number of papers in topic: 6



TOPIC: DATA VISUALIZATION

The field of scientific data visualization encompasses a wide range of applications, from
crystallography and atomic simulations to genomic and single-cell data analysis. VESTA 3 [1] offers
advanced three-dimensional visualization capabilities for crystallographic studies, allowing users
to draw crystal morphologies, overlay multiple structural models, and calculate electron and nuclear
densities. This tool is particularly useful for researchers requiring detailed visual insights into
complex molecular structures.

In contrast, OVITO [2] focuses on post-processing atomistic simulation data, providing unique
analysis and editing functionalities through an easy-to-use graphical interface. Its flexibility,
supported by Python scripting and plugin capabilities, makes it suitable for a broader spectrum of
atomic-scale simulations, enhancing its versatility compared to VESTA.

For X-ray diffraction data, the HKL package [3] provides comprehensive tools for data re

In [None]:
# 'strict_300' on data.1.json with response_only=True: only the wrapped summary
# and its references are printed (no topic stats or token counts).
data_file = "data.1.json"
system_prompt = prompts['prompts']['strict_300']['content']
summary = get_summary(data_file, system_prompt, model, tokenizer, response_only=True)


TOPIC: ARTIFICIAL INTELLIGENCE CYBER SECURITY

The application of artificial intelligence (AI) and machine learning (ML) in smart cities aims to
enhance urban efficiency, energy consumption, environmental sustainability, and citizen welfare
through advanced information and communication technology (ICT) integration [1]. Key areas
benefiting from AI include intelligent transportation systems, energy grid optimization, and
healthcare services, all of which rely on predictive analytics and autonomous decision-making
capabilities.

In the realm of cybersecurity, particularly for industrial cyber-physical systems (CPSs) and IoT
networks, federated deep learning models have shown promise in mitigating attacks without
compromising privacy. For instance, DeepFed integrates convolutional neural networks (CNNs) and
gated recurrent units (GRUs) to detect cyber threats in industrial CPSs, utilizing a privacy-
preserving federated learning framework [2]. Similarly, another approach leverages multi

In [None]:
# 'strict_300' on data.2.json with response_only=True: only the wrapped summary
# and its references are printed (no topic stats or token counts).
data_file = "data.2.json"
system_prompt = prompts['prompts']['strict_300']['content']
summary = get_summary(data_file, system_prompt, model, tokenizer, response_only=True)


TOPIC: DATA VISUALIZATION

Recent advancements in scientific visualization have introduced powerful tools capable of handling
diverse data types across various domains, enhancing analysis and interpretation. VESTA 3 [1] stands
out for its comprehensive three-dimensional visualization capabilities, enabling the depiction of
crystal structures, volumetric data, and morphologies alongside advanced density calculations and
enhanced performance improvements. This tool facilitates detailed analysis and presentation of
complex structural data, making it particularly useful for crystallographic studies.

OVITO, an open-source 3D visualization software [2], caters specifically to the needs of atomistic
simulation data, integrating unique analysis, editing, and animation functionalities. Its
extensibility through a plugin interface and control via Python scripts offer flexibility, aligning
well with the computational demands of molecular dynamics and Monte Carlo simulations.

In contrast, the f

In [None]:
# Summarize data.2.json with the 'strict_250' prompt (full report).
data_file = "data.2.json"
system_prompt = prompts['prompts']['strict_250']['content']
summary = get_summary(data_file, system_prompt, model, tokenizer)

Processing topic: data visualization
Number of papers in topic: 6

TOPIC: DATA VISUALIZATION

Visualization and analysis of complex scientific data across various domains have seen significant
advancements through specialized tools. VESTA [1], a three-dimensional visualization system,
enhances crystallographic studies with features like morphology drawing, superimposition of
structural models, and advanced density calculations, facilitating comprehensive analysis of crystal
structures. Similarly, OVITO [2] provides a versatile platform for post-processing atomistic
simulation data, incorporating unique analysis and animation capabilities accessible via a user-
friendly interface. Both tools emphasize graphical representation and ease of use, catering to
diverse scientific inquiries.

In contrast, deepTools2 [4] focuses on deep-sequencing data, offering a robust set of tools for
quality control, normalization, and integrative analysis within a Galaxy-based framework. This web
server is 

In [None]:
# Summarize data.2.json with the 'two_paragraph' prompt (full report).
data_file = "data.2.json"
system_prompt = prompts['prompts']['two_paragraph']['content']
summary = get_summary(data_file, system_prompt, model, tokenizer)

Processing topic: data visualization
Number of papers in topic: 6

TOPIC: DATA VISUALIZATION

The first paragraph focuses on advanced visualization and analysis tools designed for handling
complex scientific data. VESTA 3 introduces a comprehensive suite of features for three-dimensional
visualization of crystal structures, electron density calculations, and volumetric data, enabling
detailed crystallographic studies [1]. Meanwhile, OVITO offers a versatile platform for post-
processing atomistic simulation data, integrating unique analysis and animation capabilities within
an accessible interface [2]. These tools are complemented by deepTools2, which enhances the analysis
of deep-sequencing data through a robust set of tools for quality control, normalization, and
visualization, all accessible via a web server or Galaxy framework [4]. Additionally, the
Integrative Genomics Viewer (IGV) addresses the challenges of visualizing large, diverse genomic
datasets, supporting both array-based

In [4]:
# Summarize data.2.json with the 'single_paragraph' prompt (full report).
data_file = "data.2.json"
system_prompt = prompts['prompts']['single_paragraph']['content']
summary = get_summary(data_file, system_prompt, model, tokenizer)

Processing topic: data visualization
Number of papers in topic: 6

TOPIC: DATA VISUALIZATION

Visualization and analysis of complex data across various scientific domains have seen significant
advancements with the development of specialized tools. VESTA 3 offers comprehensive three-
dimensional visualization capabilities for crystallographic studies, enabling the integration of
structural models, volumetric data, and crystal faces, alongside enhanced performance in rendering
isosurfaces [1]. In parallel, OVITO provides a robust platform for post-processing atomistic
simulation data, incorporating unique analysis and animation functionalities, while remaining
accessible and extensible through scripting and plugin interfaces [2]. For X-ray diffraction data,
the HKL package introduces advanced statistical and visualization tools that assist in monitoring
the progress of data collection and reduction, ensuring accurate error estimates and facilitating
the decision-making process for struc