In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from typing import Literal

class ResearchTextSimplifier:
    def __init__(self, model_name: str = "google/pegasus-xsum"):
        """
        Pure neural research text simplifier using a summarization model.

        Args:
            model_name: Pretrained model identifier (default: PEGASUS-XSum)
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(self.device)

        # Default text generation settings
        self.generation_config = {
            "max_length": 256,
            "min_length": 50,
            "num_beams": 5,
            "length_penalty": 1.5,
            "early_stopping": True,
            "temperature": 0.7,
            "top_k": 50,
            "top_p": 0.9,
            "no_repeat_ngram_size": 2
        }

    def simplify(
        self,
        text: str,
        audience: Literal["researchers", "students", "public"] = "students"
    ) -> str:
        """
        Simplifies research text for a specific audience.

        Args:
            text: Input research text.
            audience: Target audience level - "researchers", "students", or "public".

        Returns:
            Simplified version of the input text.
        """
        prompt = self._create_audience_prompt(text, audience)

        inputs = self.tokenizer(
            prompt,
            max_length=1024,
            truncation=True,
            return_tensors="pt"
        ).to(self.device)

        # Adjust generation parameters based on audience type
        config = self.generation_config.copy()
        if audience == "researchers":
            config.update({"length_penalty": 1.2, "temperature": 0.6})
        elif audience == "students":
            config.update({"length_penalty": 1.5, "temperature": 0.7})
        elif audience == "public":
            config.update({"length_penalty": 2.0, "temperature": 0.9})

        outputs = self.model.generate(
            inputs["input_ids"],
            **config
        )

        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

    def _create_audience_prompt(self, text: str, audience: str) -> str:
        """Creates audience-specific prompts for better text simplification."""
        prompts = {
            "researchers": (
                "Rephrase the following academic text for fellow researchers while maintaining clarity, precision, and technical terms: {text}"
            ),
            "students": (
                "Explain the following research text in simple terms for undergraduate students, making complex ideas easier to understand: {text}"
            ),
            "public": (
                "Summarize the following scientific information in plain language for the general public, avoiding jargon and making it engaging: {text}"
            )
        }
        return prompts[audience].format(text=text)

# Example usage
if __name__ == "__main__":
    simplifier = ResearchTextSimplifier()

    research_paragraph = """
    The improved photocatalytic efficiency of the heterostructured nanomaterials was attributed to the strong interfacial interactions between the components, which promoted rapid charge transfer and minimized electron-hole recombination, as confirmed by electrochemical impedance spectroscopy and time-resolved fluorescence analysis.
    """

    print("=== Original Research Text ===")
    print(research_paragraph.strip())

    print("\n=== Simplified for Researchers ===")
    print(simplifier.simplify(research_paragraph, "researchers"))

    print("\n=== Simplified for Students ===")
    print(simplifier.simplify(research_paragraph, "students"))

    print("\n=== Simplified for General Public ===")
    print(simplifier.simplify(research_paragraph, "public"))


tokenizer_config.json:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.52M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/259 [00:00<?, ?B/s]

=== Original Research Text ===
The improved photocatalytic efficiency of the heterostructured nanomaterials was attributed to the strong interfacial interactions between the components, which promoted rapid charge transfer and minimized electron-hole recombination, as confirmed by electrochemical impedance spectroscopy and time-resolved fluorescence analysis.

=== Simplified for Researchers ===




Researchers at the University of California, Los Angeles (UCLA), have developed a new type of heterostructured nanomaterials that can be used as photocatalysts for lithium-ion batteries and as catalysts for redox vesicles in redox cells, among other applications.

=== Simplified for Students ===




Researchers at the University of California, Los Angeles (UCLA), have developed a new type of heterostructured nanomaterials that exhibit improved photocatalytic efficiency and a reduced charge transfer rate compared with conventional materials. The research was funded by the National Science Foundation (NSF), the US Department of Defense (DoD), and the California Institute of Technology.

=== Simplified for General Public ===




Researchers at the University of California, Los Angeles (UCLA), have developed a new type of photocatalyst that can be used to produce lithium-ion batteries, as well as to manufacture a variety of other energy-efficient materials and devices, such as fuel cells and sensors.


In [None]:
# Mount Google Drive (run this first in Colab)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install torchvision

Collecting torchvision
  Downloading torchvision-0.21.0-cp311-cp311-manylinux1_x86_64.whl.metadata (6.1 kB)
Collecting torch==2.6.0 (from torchvision)
  Downloading torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl.metadata (28 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.6.0->torchvision)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.6.0->torchvision)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch==2.6.0->torchvision)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch==2.6.0->torchvision)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch==2.6.0->torchvision)
  Dow

In [None]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py311-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m 

In [None]:
# Mounting Drive


from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [1]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from heapq import nlargest
from transformers import pipeline

# 🔹 Ensure NLTK resources are downloaded
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# 🔹 Extractive Simplification (NLTK)
def extractive_simplify(text, num_sentences=3):
    sentences = sent_tokenize(text)
    word_frequencies = {}

    for word in word_tokenize(text):
        if word.isalnum():
            word_frequencies[word] = word_frequencies.get(word, 0) + 1

    max_freq = max(word_frequencies.values(), default=1)
    for word in word_frequencies:
        word_frequencies[word] /= max_freq

    sentence_scores = {}
    for sent in sentences:
        for word in word_tokenize(sent):
            if word in word_frequencies:
                sentence_scores[sent] = sentence_scores.get(sent, 0) + word_frequencies[word]

    simplified_sentences = nlargest(num_sentences, sentence_scores, key=sentence_scores.get)
    return " ".join(simplified_sentences)

# 🔹 Abstractive Simplification (T5 Transformer) - Now called "Simplifier"
simplifier = pipeline("summarization", model="t5-small", device=0)  # Uses GPU if available

def abstractive_simplify(text, max_length=150):
    simplified_text = simplifier(text, max_length=max_length, min_length=50, do_sample=False)
    return simplified_text[0]['summary_text']

# 🔹 Example Research Paragraph
research_paragraph = """
Climate change has been a growing concern due to the rise in global temperatures, melting ice caps, and extreme weather events.
Scientific studies indicate that human activities, particularly fossil fuel burning and deforestation, contribute significantly to greenhouse gas emissions.
These emissions trap heat in the atmosphere, leading to severe consequences for ecosystems and human populations.
Researchers are exploring renewable energy sources, afforestation, and carbon capture technologies as potential solutions to mitigate the effects of climate change.
"""

# Run Simplification
print("🔹 Extractive Simplification:")
print(extractive_simplify(research_paragraph))

print("\n🔹 Abstractive Simplification:")
print(abstractive_simplify(research_paragraph))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Device set to use cuda:0


🔹 Extractive Simplification:


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [2]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from heapq import nlargest
from transformers import pipeline

# 🔹 Ensure NLTK resources are downloaded
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
# Download the 'punkt_tab' data package
nltk.download('punkt_tab')

# 🔹 Extractive Simplification (NLTK)
def extractive_simplify(text, num_sentences=3):
    sentences = sent_tokenize(text)
    word_frequencies = {}

    for word in word_tokenize(text):
        if word.isalnum():
            word_frequencies[word] = word_frequencies.get(word, 0) + 1

    max_freq = max(word_frequencies.values(), default=1)
    for word in word_frequencies:
        word_frequencies[word] /= max_freq

    sentence_scores = {}
    for sent in sentences:
        for word in word_tokenize(sent):
            if word in word_frequencies:
                sentence_scores[sent] = sentence_scores.get(sent, 0) + word_frequencies[word]

    simplified_sentences = nlargest(num_sentences, sentence_scores, key=sentence_scores.get)
    return " ".join(simplified_sentences)

# 🔹 Abstractive Simplification (T5 Transformer) - Now called "Simplifier"
simplifier = pipeline("summarization", model="t5-small", device=0)  # Uses GPU if available

def abstractive_simplify(text, max_length=150):
    simplified_text = simplifier(text, max_length=max_length, min_length=50, do_sample=False)
    return simplified_text[0]['summary_text']

# 🔹 Example Research Paragraph
research_paragraph = """
Climate change has been a growing concern due to the rise in global temperatures, melting ice caps, and extreme weather events.
Scientific studies indicate that human activities, particularly fossil fuel burning and deforestation, contribute significantly to greenhouse gas emissions.
These emissions trap heat in the atmosphere, leading to severe consequences for ecosystems and human populations.
Researchers are exploring renewable energy sources, afforestation, and carbon capture technologies as potential solutions to mitigate the effects of climate change.
"""

# Run Simplification
print("🔹 Extractive Simplification:")
print(extractive_simplify(research_paragraph))

print("\n🔹 Abstractive Simplification:")
print(abstractive_simplify(research_paragraph))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
Device set to use cuda:0
Your max_length is set to 150, but your input_length is only 100. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)


🔹 Extractive Simplification:

Climate change has been a growing concern due to the rise in global temperatures, melting ice caps, and extreme weather events. Researchers are exploring renewable energy sources, afforestation, and carbon capture technologies as potential solutions to mitigate the effects of climate change. These emissions trap heat in the atmosphere, leading to severe consequences for ecosystems and human populations.

🔹 Abstractive Simplification:
climate change has been a growing concern due to the rise in global temperatures, melting ice caps, and extreme weather events . human activities, particularly fossil fuel burning, contribute significantly to greenhouse gas emissions . these emissions trap heat in the atmosphere, leading to severe consequences for ecosystems and human populations .


In [3]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.23.3-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.8.0 (from gradio)
  Downloading gradio_client-1.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.3-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 

In [4]:
import nltk
import gradio as gr
from nltk.tokenize import sent_tokenize, word_tokenize
from heapq import nlargest
from transformers import pipeline
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt')

summarizer = pipeline("summarization", model="t5-small")
jargon_simplifier = pipeline("text2text-generation", model="facebook/bart-large-cnn")

def clean_redundancy(text):
    sentences = sent_tokenize(text)
    cleaned_sentences = []
    seen = []

    for sentence in sentences:
        cleaned_sentence = re.sub(r'\s+', ' ', sentence.strip()).lower()
        cleaned_sentence = re.sub(r'[^a-zA-Z0-9 ]', '', cleaned_sentence)
        if not any(cosine_similarity_tfidf(cleaned_sentence, s) > 0.8 for s in seen):
            seen.append(cleaned_sentence)
            cleaned_sentences.append(sentence)

    return " ".join(cleaned_sentences)

def cosine_similarity_tfidf(sent1, sent2):
    vectorizer = TfidfVectorizer().fit_transform([sent1, sent2])
    vectors = vectorizer.toarray()
    return cosine_similarity([vectors[0]], [vectors[1]])[0][0]

def simplify_jargon(text):
    jargon_dict = {
        "synergy": "cooperation",
        "utilize": "use",
        "leverage": "take advantage of",
        "paradigm": "model",
        "robust": "strong",
        "iterate": "repeat",
        "streamline": "simplify",
        "facilitate": "help",
        "implement": "carry out",
        "core competency": "main strength"
    }

    for jargon, simple in jargon_dict.items():
        text = re.sub(rf"\b{jargon}\b", simple, text, flags=re.IGNORECASE)

    return text

def extractive_summary(text, num_sentences=3):
    sentences = sent_tokenize(text)
    word_freq = {}

    for word in word_tokenize(text):
        if word.isalnum():
            word_freq[word] = word_freq.get(word, 0) + 1

    max_freq = max(word_freq.values(), default=1)
    for word in word_freq:
        word_freq[word] /= max_freq

    sentence_scores = {}
    for sentence in sentences:
        for word in word_tokenize(sentence):
            if word in word_freq:
                sentence_scores[sentence] = sentence_scores.get(sentence, 0) + word_freq[word]

    return " ".join(nlargest(num_sentences, sentence_scores, key=sentence_scores.get))

def abstractive_summary(text, max_length=150):
    return summarizer(text, max_length=max_length, min_length=50, do_sample=False)[0]['summary_text']

def simplify_text(text):
    simplified_text = simplify_jargon(text)
    return extractive_summary(simplified_text), abstractive_summary(simplified_text)

app = gr.Interface(
    fn=simplify_text,
    inputs=gr.Textbox(lines=10, placeholder="Paste your research paper text here..."),
    outputs=[
        gr.Textbox(label="Extractive Simplification", lines=10),
        gr.Textbox(label="Abstractive Simplification", lines=10)
    ],
    title="📜 Research Paper Simplifier",
    description="Paste your text to simplify jargon and get both extractive (important sentences) and abstractive (AI-generated) simplifications instantly!"
)

app.launch()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Device set to use cuda:0


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://7e86b3c8f5ee4e222e.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [5]:
import nltk
import gradio as gr
import re
from heapq import nlargest
from nltk.tokenize import sent_tokenize, word_tokenize
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt', quiet=True)

summarizer = pipeline("summarization", model="t5-small")
jargon_simplifier = pipeline("text2text-generation", model="facebook/bart-large-cnn")

def remove_redundant_sentences(text):
    sentences = sent_tokenize(text)
    cleaned_sentences = []
    seen = []

    for sentence in sentences:
        cleaned = re.sub(r'\s+', ' ', sentence.strip()).lower()
        cleaned = re.sub(r'[^a-zA-Z0-9 ]', '', cleaned)
        if not any(cosine_similarity_tfidf(cleaned, s) > 0.8 for s in seen):
            seen.append(cleaned)
            cleaned_sentences.append(sentence)

    return " ".join(cleaned_sentences)

def cosine_similarity_tfidf(sent1, sent2):
    vectorizer = TfidfVectorizer().fit_transform([sent1, sent2])
    vectors = vectorizer.toarray()
    return cosine_similarity([vectors[0]], [vectors[1]])[0][0]

def simplify_jargon(text):
    jargon_dict = {
        "synergy": "cooperation",
        "utilize": "use",
        "leverage": "take advantage of",
        "paradigm": "model",
        "robust": "strong",
        "iterate": "repeat",
        "streamline": "simplify",
        "facilitate": "help",
        "implement": "carry out",
        "core competency": "main strength"
    }

    for jargon, simple in jargon_dict.items():
        text = re.sub(rf"\b{jargon}s?\b", simple, text, flags=re.IGNORECASE)

    return text

def extractive_summary(text, num_sentences=3):
    sentences = sent_tokenize(text)
    word_freq = {}

    for word in word_tokenize(text):
        if word.isalnum():
            word_freq[word] = word_freq.get(word, 0) + 1

    max_freq = max(word_freq.values(), default=1)
    for word in word_freq:
        word_freq[word] /= max_freq

    sentence_scores = {}
    for sentence in sentences:
        for word in word_tokenize(sentence):
            if word in word_freq:
                sentence_scores[sentence] = sentence_scores.get(sentence, 0) + word_freq[word]

    return " ".join(nlargest(num_sentences, sentence_scores, key=sentence_scores.get))

def abstractive_summary(text, max_length=150):
    try:
        text = text[:1000]
        result = summarizer(text, max_length=max_length, min_length=50, do_sample=False)
        return result[0]['summary_text']
    except Exception as e:
        return f"Error during summarization: {e}"

def process_text(text, num_sentences=3):
    cleaned_text = remove_redundant_sentences(text)
    simplified_text = simplify_jargon(cleaned_text)
    return extractive_summary(simplified_text, num_sentences), abstractive_summary(simplified_text)

with gr.Blocks(theme=gr.themes.Soft()) as app:
    gr.Markdown("# 📜 Research Paper Simplifier")
    gr.Markdown(
        "Simplify complex academic language with AI. "
        "Get a quick overview using **extractive** and **abstractive** summaries."
    )

    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(
                label="Paste your research paper content",
                placeholder="Enter or paste your text here...",
                lines=15
            )
            num_sentences = gr.Slider(
                minimum=1, maximum=10, value=3, step=1,
                label="Number of Extractive Sentences"
            )
            submit_btn = gr.Button("🔍 Simplify Text")
            clear_btn = gr.Button("🧹 Clear")

        with gr.Column():
            extractive_output = gr.Textbox(
                label="🧠 Extractive Simplification", lines=10, interactive=False
            )
            abstractive_output = gr.Textbox(
                label="🎯 Abstractive Simplification", lines=10, interactive=False
            )

    submit_btn.click(
        fn=process_text,
        inputs=[input_text, num_sentences],
        outputs=[extractive_output, abstractive_output]
    )

    clear_btn.click(
        fn=lambda: ("", "", ""),
        inputs=[],
        outputs=[input_text, extractive_output, abstractive_output]
    )

    gr.Examples(
        examples=[
            ["In order to leverage synergy between departments, we must implement a robust framework that facilitates seamless data integration across all paradigms of the enterprise."],
            ["The study iterated multiple methods to streamline the diagnostic process. Researchers utilized high-end equipment to demonstrate core competencies."]
        ],
        inputs=[input_text]
    )

app.launch()


Device set to use cuda:0
Device set to use cuda:0


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://224fc0c9ef84b3f1e2.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [6]:
pip install pymupdf


Collecting pymupdf
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m63.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.25.5


In [9]:
import nltk
import gradio as gr
import re
import fitz  # PyMuPDF
from heapq import nlargest
from nltk.tokenize import sent_tokenize, word_tokenize
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt', quiet=True)

# AI pipelines
summarizer = pipeline("summarization", model="t5-small")
jargon_simplifier = pipeline("text2text-generation", model="facebook/bart-large-cnn")

# Extract abstract only
def extract_abstract_from_pdf(pdf_file):
    text = ""
    with fitz.open(pdf_file.name) as doc:
        for page in doc:
            text += page.get_text()

    lower_text = text.lower()
    abstract_start = lower_text.find("abstract")
    if abstract_start == -1:
        return "❌ Abstract section not found in the PDF."

    stop_keywords = ["introduction", "1. introduction", "background", "keywords"]
    end_idx = len(text)
    for keyword in stop_keywords:
        idx = lower_text.find(keyword, abstract_start + 10)
        if idx != -1:
            end_idx = min(end_idx, idx)

    abstract_text = text[abstract_start:end_idx].strip()
    return abstract_text

# Redundancy removal
def remove_redundant_sentences(text):
    sentences = sent_tokenize(text)
    cleaned_sentences = []
    seen = []

    for sentence in sentences:
        cleaned = re.sub(r'\s+', ' ', sentence.strip()).lower()
        cleaned = re.sub(r'[^a-zA-Z0-9 ]', '', cleaned)
        if not any(cosine_similarity_tfidf(cleaned, s) > 0.8 for s in seen):
            seen.append(cleaned)
            cleaned_sentences.append(sentence)

    return " ".join(cleaned_sentences)

def cosine_similarity_tfidf(sent1, sent2):
    vectorizer = TfidfVectorizer().fit_transform([sent1, sent2])
    vectors = vectorizer.toarray()
    return cosine_similarity([vectors[0]], [vectors[1]])[0][0]

# Simplify basic jargon
def simplify_jargon(text):
    jargon_dict = {
        "synergy": "cooperation",
        "utilize": "use",
        "leverage": "take advantage of",
        "paradigm": "model",
        "robust": "strong",
        "iterate": "repeat",
        "streamline": "simplify",
        "facilitate": "help",
        "implement": "carry out",
        "core competency": "main strength"
    }

    for jargon, simple in jargon_dict.items():
        text = re.sub(rf"\b{jargon}s?\b", simple, text, flags=re.IGNORECASE)

    return text

# Extractive summary
def extractive_summary(text, num_sentences=3):
    sentences = sent_tokenize(text)
    word_freq = {}

    for word in word_tokenize(text):
        if word.isalnum():
            word_freq[word] = word_freq.get(word, 0) + 1

    max_freq = max(word_freq.values(), default=1)
    for word in word_freq:
        word_freq[word] /= max_freq

    sentence_scores = {}
    for sentence in sentences:
        for word in word_tokenize(sentence):
            if word in word_freq:
                sentence_scores[sentence] = sentence_scores.get(sentence, 0) + word_freq[word]

    return " ".join(nlargest(num_sentences, sentence_scores, key=sentence_scores.get))

# Abstractive summary
def abstractive_summary(text, max_length=150):
    try:
        text = text[:1000]
        result = summarizer(text, max_length=max_length, min_length=50, do_sample=False)
        return result[0]['summary_text']
    except Exception as e:
        return f"Error during abstractive summarization: {e}"

# Process full logic
def process_pdf(pdf_file, num_sentences):
    text = extract_abstract_from_pdf(pdf_file)
    if text.startswith("❌"):
        return text, ""

    cleaned = remove_redundant_sentences(text)
    simplified = simplify_jargon(cleaned)

    # Extractive summary
    extractive = extractive_summary(simplified, num_sentences)

    # 👇 AI Simplification of Extractive summary
    simplified_extractive = jargon_simplifier(extractive, max_length=100, min_length=30, do_sample=False)[0]['generated_text']

    # Abstractive summary
    abstractive = abstractive_summary(simplified)

    return simplified_extractive, abstractive

# Gradio UI
with gr.Blocks(theme=gr.themes.Soft()) as app:
    gr.Markdown("# 📜 AI-Powered Abstract Simplifier")
    gr.Markdown("Upload a research paper PDF. We'll extract the **abstract**, clean it, and simplify it both extractively and abstractively.")

    with gr.Row():
        with gr.Column():
            pdf_input = gr.File(label="📎 Upload PDF", file_types=[".pdf"])
            num_sentences = gr.Slider(1, 10, value=3, step=1, label="🧠 Extractive Summary Length")
            run_btn = gr.Button("✨ Simplify Abstract")

        with gr.Column():
            extractive_output = gr.Textbox(label="🧠 AI-Simplified Extractive Summary", lines=10)
            abstractive_output = gr.Textbox(label="🎯 Abstractive Summary", lines=10)

    run_btn.click(fn=process_pdf, inputs=[pdf_input, num_sentences], outputs=[extractive_output, abstractive_output])

app.launch()


Device set to use cuda:0
Device set to use cuda:0


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://7f3355fd80ca2126d9.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [12]:
!pip install gradio transformers nltk scikit-learn pymupdf




In [14]:
import nltk
import gradio as gr
import re
import fitz  # PyMuPDF
from heapq import nlargest
from nltk.tokenize import sent_tokenize, word_tokenize
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt', quiet=True)

# AI Pipelines
summarizer = pipeline("summarization", model="t5-small")
jargon_simplifier = pipeline("text2text-generation", model="facebook/bart-large-cnn")

# Extract abstract from PDF
def extract_abstract_from_pdf(pdf_file):
    text = ""
    with fitz.open(pdf_file.name) as doc:
        for page in doc:
            text += page.get_text()

    lower_text = text.lower()
    abstract_start = lower_text.find("abstract")
    if abstract_start == -1:
        return "❌ Abstract section not found in the PDF."

    stop_keywords = ["introduction", "1. introduction", "background", "keywords"]
    end_idx = len(text)
    for keyword in stop_keywords:
        idx = lower_text.find(keyword, abstract_start + 10)
        if idx != -1:
            end_idx = min(end_idx, idx)

    abstract_text = text[abstract_start:end_idx].strip()
    return abstract_text

# Redundancy removal
def remove_redundant_sentences(text):
    sentences = sent_tokenize(text)
    cleaned_sentences = []
    seen = []

    for sentence in sentences:
        cleaned = re.sub(r'\s+', ' ', sentence.strip()).lower()
        cleaned = re.sub(r'[^a-zA-Z0-9 ]', '', cleaned)
        if not any(cosine_similarity_tfidf(cleaned, s) > 0.8 for s in seen):
            seen.append(cleaned)
            cleaned_sentences.append(sentence)

    return " ".join(cleaned_sentences)

def cosine_similarity_tfidf(sent1, sent2):
    vectorizer = TfidfVectorizer().fit_transform([sent1, sent2])
    vectors = vectorizer.toarray()
    return cosine_similarity([vectors[0]], [vectors[1]])[0][0]

# Simplify jargon
def simplify_jargon(text):
    jargon_dict = {
        "synergy": "cooperation",
        "utilize": "use",
        "leverage": "take advantage of",
        "paradigm": "model",
        "robust": "strong",
        "iterate": "repeat",
        "streamline": "simplify",
        "facilitate": "help",
        "implement": "carry out",
        "core competency": "main strength"
    }

    for jargon, simple in jargon_dict.items():
        text = re.sub(rf"\b{jargon}s?\b", simple, text, flags=re.IGNORECASE)

    return text

# Extractive summary
def extractive_summary(text, num_sentences=3):
    sentences = sent_tokenize(text)
    word_freq = {}

    for word in word_tokenize(text):
        if word.isalnum():
            word_freq[word] = word_freq.get(word, 0) + 1

    max_freq = max(word_freq.values(), default=1)
    for word in word_freq:
        word_freq[word] /= max_freq

    sentence_scores = {}
    for sentence in sentences:
        for word in word_tokenize(sentence):
            if word in word_freq:
                sentence_scores[sentence] = sentence_scores.get(sentence, 0) + word_freq[word]

    return " ".join(nlargest(num_sentences, sentence_scores, key=sentence_scores.get))

# Abstractive summary
def abstractive_summary(text, max_length=150):
    try:
        text = text[:1000]
        result = summarizer(text, max_length=max_length, min_length=50, do_sample=False)
        return result[0]['summary_text']
    except Exception as e:
        return f"Error during abstractive summarization: {e}"

# Process function
def simplify_text_or_pdf(pdf_file, manual_text, num_sentences):
    if pdf_file is not None:
        text = extract_abstract_from_pdf(pdf_file)
        if text.startswith("❌"):
            return text, ""
    elif manual_text.strip():
        text = manual_text
    else:
        return "❌ Please upload a PDF or paste some text!", ""

    cleaned = remove_redundant_sentences(text)
    simplified = simplify_jargon(cleaned)

    # Extractive → AI simplified
    extractive = extractive_summary(simplified, num_sentences)
    simplified_extractive = jargon_simplifier(extractive, max_length=100, min_length=30, do_sample=False)[0]['generated_text']

    abstractive = abstractive_summary(simplified)
    return simplified_extractive, abstractive

# Gradio UI
with gr.Blocks(theme=gr.themes.Soft()) as app:
    gr.Markdown("## 📚 Research Abstract Simplifier — PDF or Manual Mode")
    gr.Markdown("Upload a **PDF** or paste **raw abstract text** below. This tool will simplify the content using AI (both extractive and abstractive).")

    with gr.Row():
        pdf_input = gr.File(label="📎 Upload PDF", file_types=[".pdf"])
        text_input = gr.Textbox(lines=8, placeholder="Or paste your abstract text here...", label="✍️ Manual Text Input")

    num_sentences = gr.Slider(1, 10, value=3, step=1, label="🧠 Extractive Summary Length")
    run_button = gr.Button("✨ Simplify Now")

    with gr.Row():
        extractive_output = gr.Textbox(label="🧠 AI-Simplified Extractive Summary", lines=10)
        abstractive_output = gr.Textbox(label="🎯 Abstractive Summary", lines=10)

    run_button.click(fn=simplify_text_or_pdf, inputs=[pdf_input, text_input, num_sentences], outputs=[extractive_output, abstractive_output])

app.launch()


Device set to use cuda:0
Device set to use cuda:0


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://b88ba8fea9013247a8.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [19]:
!pip install PyPDF2 gradio transformers nltk scikit-learn


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [21]:
import nltk
import gradio as gr
import PyPDF2
import re
from heapq import nlargest
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize, word_tokenize

nltk.download('punkt')

# Load models
summarizer = pipeline("summarization", model="t5-small")
jargon_simplifier = pipeline("text2text-generation", model="facebook/bart-large-cnn")

# Extract abstract text from PDF
def extract_abstract_from_pdf(pdf_file):
    with open(pdf_file.name, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        text = " ".join([page.extract_text() for page in reader.pages if page.extract_text()])

    # Flexible regex for Abstract section
    abstract_match = re.search(r'(?i)abstract[\s:\n]*(.*?)(?=\n\s*[1I]\.?\s*Introduction|\n\s*[A-Z][a-z]{2,})', text, re.DOTALL)

    if abstract_match:
        return abstract_match.group(1).strip()
    return "Abstract not found."

# Remove redundant similar sentences
def clean_redundancy(text):
    sentences = sent_tokenize(text)
    cleaned_sentences = []
    seen = []

    for sentence in sentences:
        cleaned_sentence = re.sub(r'\s+', ' ', sentence.strip()).lower()
        cleaned_sentence = re.sub(r'[^a-zA-Z0-9 ]', '', cleaned_sentence)
        if not any(cosine_similarity_tfidf(cleaned_sentence, s) > 0.8 for s in seen):
            seen.append(cleaned_sentence)
            cleaned_sentences.append(sentence)

    return " ".join(cleaned_sentences)

# Cosine similarity helper
def cosine_similarity_tfidf(sent1, sent2):
    vectorizer = TfidfVectorizer().fit_transform([sent1, sent2])
    vectors = vectorizer.toarray()
    return cosine_similarity([vectors[0]], [vectors[1]])[0][0]

# Simplify jargon
def simplify_jargon(text):
    jargon_dict = {
        "synergy": "cooperation",
        "utilize": "use",
        "leverage": "take advantage of",
        "paradigm": "model",
        "robust": "strong",
        "iterate": "repeat",
        "streamline": "simplify",
        "facilitate": "help",
        "implement": "carry out",
        "core competency": "main strength"
    }

    for jargon, simple in jargon_dict.items():
        text = re.sub(rf"\b{jargon}\b", simple, text, flags=re.IGNORECASE)

    return text

# Extractive summary
def extractive_summary(text, num_sentences=3):
    sentences = sent_tokenize(text)
    word_freq = {}

    for word in word_tokenize(text):
        if word.isalnum():
            word_freq[word] = word_freq.get(word, 0) + 1

    max_freq = max(word_freq.values(), default=1)
    for word in word_freq:
        word_freq[word] /= max_freq

    sentence_scores = {}
    for sentence in sentences:
        for word in word_tokenize(sentence):
            if word in word_freq:
                sentence_scores[sentence] = sentence_scores.get(sentence, 0) + word_freq[word]

    return " ".join(nlargest(num_sentences, sentence_scores, key=sentence_scores.get))

# Abstractive summary
def abstractive_summary(text, max_length=150):
    return summarizer(text, max_length=max_length, min_length=50, do_sample=False)[0]['summary_text']

# Core text simplifier
def simplify_text(text):
    simplified = simplify_jargon(text)
    cleaned = clean_redundancy(simplified)
    return extractive_summary(cleaned), abstractive_summary(cleaned)

# For uploaded PDF
def process_pdf(pdf):
    abstract_text = extract_abstract_from_pdf(pdf)
    if abstract_text == "Abstract not found.":
        return abstract_text, ""
    return simplify_text(abstract_text)

# Gradio UI
with gr.Blocks(title="Research Paper Simplifier") as app:
    gr.Markdown("## 📄 Research Paper Simplifier - Paste Text or Upload PDF")

    with gr.Tab("📋 Paste Text"):
        with gr.Row():
            input_text = gr.Textbox(label="Paste your text", lines=12, placeholder="Paste your abstract or content here...")
        with gr.Row():
            extractive_output = gr.Textbox(label="🧠 Extractive Simplification", lines=6)
            abstractive_output = gr.Textbox(label="✨ Abstractive Simplification", lines=6)
        run_button = gr.Button("Simplify Text")
        run_button.click(fn=simplify_text, inputs=input_text, outputs=[extractive_output, abstractive_output])

    with gr.Tab("📁 Upload PDF"):
        with gr.Row():
            pdf_input = gr.File(label="Upload PDF file", file_types=[".pdf"])
        with gr.Row():
            pdf_extractive = gr.Textbox(label="🧠 Extractive Simplification", lines=6)
            pdf_abstractive = gr.Textbox(label="✨ Abstractive Simplification", lines=6)
        pdf_button = gr.Button("Simplify Abstract")
        pdf_button.click(fn=process_pdf, inputs=pdf_input, outputs=[pdf_extractive, pdf_abstractive])

app.launch()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Device set to use cuda:0
Device set to use cuda:0


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://1ed5e11b604b022432.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


