In [None]:
!pip install torchvision

In [None]:
!pip install evaluate

In [None]:
!pip install gradio

In [None]:
pip install pymupdf


In [None]:
!pip install gradio transformers nltk scikit-learn pymupdf


In [None]:
!pip install PyPDF2 gradio transformers nltk scikit-learn


In [None]:
import nltk
import gradio as gr
import PyPDF2
import re
from heapq import nlargest
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')


In [None]:
# Load models
# Both pipelines are created once at module level so later cells can reuse
# them; `summarizer` is the pipeline called by `abstractive_summary`.
summarizer = pipeline("summarization", model="t5-small")
# NOTE(review): `jargon_simplifier` is not referenced by any other cell visible
# in this notebook (`simplify_jargon` is purely dictionary/regex based).
# Confirm whether it is still needed before paying for the model download.
jargon_simplifier = pipeline("text2text-generation", model="facebook/bart-large-cnn")

In [None]:
# Extract abstract text from PDF
def extract_abstract_from_pdf(pdf_file):
    """Return the abstract section of a PDF, or ``"Abstract not found."``.

    Args:
        pdf_file: Either a filesystem path (``str``) or an object exposing the
            path through a ``.name`` attribute (such as an upload wrapper).
            ``None`` (nothing uploaded) is treated as "no abstract".

    Returns:
        The stripped text between the "Abstract" heading and the next
        heading-like line, or the sentinel string ``"Abstract not found."``
        when no non-empty abstract can be located.
    """
    if pdf_file is None:
        return "Abstract not found."

    # Accept both a plain path and a file-like wrapper carrying the path.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

    with open(pdf_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        # Extract each page only once; pages without extractable text are skipped.
        page_texts = (page.extract_text() for page in reader.pages)
        text = " ".join(page_text for page_text in page_texts if page_text)

    # Case-insensitivity is scoped to the literal words "abstract" and
    # "introduction" only. A global (?i) flag would also apply to
    # [A-Z][a-z]{2,}, turning "line starting with a capitalised word" into
    # "line starting with any word" and cutting the abstract at its first
    # line break.
    abstract_match = re.search(
        r'(?i:abstract)[\s:]*(.*?)'
        r'(?=\n\s*[1I]\.?\s*(?i:Introduction)|\n\s*[A-Z][a-z]{2,})',
        text,
        re.DOTALL,
    )

    if abstract_match:
        abstract = abstract_match.group(1).strip()
        # An empty capture is not a usable abstract; fall through to the sentinel.
        if abstract:
            return abstract
    return "Abstract not found."

In [None]:
# Remove redundant similar sentences
def clean_redundancy(text):
    """Drop sentences that closely repeat an earlier sentence.

    Each sentence is normalised (whitespace collapsed, lower-cased, everything
    except ASCII letters, digits and spaces removed) and compared against the
    normalised form of every sentence kept so far. A sentence is discarded when
    its TF-IDF cosine similarity to any kept sentence exceeds 0.8.

    Args:
        text: The text to de-duplicate.

    Returns:
        The surviving sentences, in their original form and order, joined by
        single spaces.
    """
    kept_originals = []
    kept_normalized = []

    for original in sent_tokenize(text):
        collapsed = re.sub(r'\s+', ' ', original.strip())
        normalized = re.sub(r'[^a-zA-Z0-9 ]', '', collapsed.lower())

        for earlier in kept_normalized:
            if cosine_similarity_tfidf(normalized, earlier) > 0.8:
                # Near-duplicate of something already kept: skip it.
                break
        else:
            kept_normalized.append(normalized)
            kept_originals.append(original)

    return " ".join(kept_originals)

In [None]:
# Cosine similarity helper
def cosine_similarity_tfidf(sent1, sent2):
    """Return the TF-IDF cosine similarity of two sentences.

    A fresh ``TfidfVectorizer`` is fitted on just the two sentences, so the
    score reflects their term overlap relative to each other only.

    Args:
        sent1: First sentence.
        sent2: Second sentence.

    Returns:
        A float similarity score. ``0.0`` is returned when either sentence is
        blank or when the pair yields no vocabulary at all, since there is
        nothing to compare.
    """
    # A blank sentence shares no terms with anything; skip the vectoriser.
    if not sent1.strip() or not sent2.strip():
        return 0.0

    try:
        tfidf = TfidfVectorizer().fit_transform([sent1, sent2])
    except ValueError:
        # Raised when the vectoriser finds no tokens in either sentence and so
        # cannot build a vocabulary (e.g. only one-character words). Treat
        # such pairs as dissimilar rather than crashing the caller.
        return 0.0

    # Row slices keep the inputs 2-D, as cosine_similarity requires.
    return float(cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0])

In [None]:
# Simplify jargon
def simplify_jargon(text):
    """Replace common business/academic jargon with plainer wording.

    Matching is case-insensitive and limited to whole words or phrases. The
    replacement follows the casing of the matched text: an all-caps match
    yields an all-caps replacement, a capitalised match yields a capitalised
    replacement, and anything else yields the lower-case replacement. This
    keeps sentence-initial words capitalised ("Utilize" -> "Use").

    Args:
        text: The text to simplify.

    Returns:
        The text with every known jargon term replaced.
    """
    jargon_dict = {
        "synergy": "cooperation",
        "utilize": "use",
        "leverage": "take advantage of",
        "paradigm": "model",
        "robust": "strong",
        "iterate": "repeat",
        "streamline": "simplify",
        "facilitate": "help",
        "implement": "carry out",
        "core competency": "main strength"
    }

    # Longest terms first so a multi-word phrase wins over any shorter overlap.
    terms = sorted(jargon_dict, key=len, reverse=True)
    replacements = [jargon_dict[term] for term in terms]

    # One capturing group per term: the index of the group that matched
    # identifies the replacement without re-normalising the matched text.
    pattern = re.compile(
        r"\b(?:" + "|".join("(" + re.escape(term) + ")" for term in terms) + r")\b",
        flags=re.IGNORECASE,
    )

    def _replace(match):
        found = match.group(0)
        simple = replacements[match.lastindex - 1]
        if found.isupper():
            return simple.upper()
        if found[0].isupper():
            return simple[0].upper() + simple[1:]
        return simple

    return pattern.sub(_replace, text)

In [None]:
# Extractive summary
def extractive_summary(text, num_sentences=3):
    """Build an extractive summary from the highest-scoring sentences.

    Every alphanumeric token is weighted by its frequency in ``text``
    (normalised by the most frequent token). A sentence's score is the sum of
    the weights of its tokens. The ``num_sentences`` best sentences are
    selected and returned in the order they appear in ``text`` so the summary
    reads coherently.

    Args:
        text: The text to summarise.
        num_sentences: Maximum number of sentences to keep.

    Returns:
        The selected sentences joined by single spaces, or ``""`` when
        ``text`` is blank or ``num_sentences`` is not positive.
    """
    if not text or not text.strip() or num_sentences <= 0:
        return ""

    sentences = sent_tokenize(text)

    word_freq = {}
    for word in word_tokenize(text):
        if word.isalnum():
            word_freq[word] = word_freq.get(word, 0) + 1

    max_freq = max(word_freq.values(), default=1)
    word_freq = {word: count / max_freq for word, count in word_freq.items()}

    sentence_scores = {}
    first_position = {}
    for position, sentence in enumerate(sentences):
        # Score each distinct sentence once; a verbatim repeat must not
        # accumulate a doubled score under the same key.
        if sentence in first_position:
            continue
        first_position[sentence] = position

        weights = [word_freq[word] for word in word_tokenize(sentence) if word in word_freq]
        # Sentences without any weighted token are not candidates.
        if weights:
            sentence_scores[sentence] = sum(weights)

    best = nlargest(num_sentences, sentence_scores, key=sentence_scores.get)
    # Restore document order among the selected sentences.
    best.sort(key=first_position.get)
    return " ".join(best)

In [None]:
# Abstractive summary
def abstractive_summary(text, max_length=150):
    """Generate an abstractive summary with the module-level ``summarizer``.

    Args:
        text: The text to summarise.
        max_length: Upper bound on the generated summary length, passed to the
            pipeline as ``max_length``.

    Returns:
        The generated summary text, or ``""`` when ``text`` is blank (there is
        nothing to summarise, so the model is not called).
    """
    if not text or not text.strip():
        return ""

    # Keep the lower bound from exceeding a caller-supplied small max_length.
    min_length = min(50, max_length)

    result = summarizer(
        text,
        max_length=max_length,
        min_length=min_length,
        do_sample=False,
        # Clip inputs longer than the model's maximum input size instead of
        # feeding an over-long sequence to the model.
        truncation=True,
    )
    return result[0]['summary_text']

# Core text simplifier
def simplify_text(text):
    """Simplify ``text`` and summarise it in two ways.

    The text first has its jargon replaced (``simplify_jargon``), then
    near-duplicate sentences are removed (``clean_redundancy``); the result is
    summarised extractively and abstractively.

    Args:
        text: The text to simplify.

    Returns:
        A ``(extractive_summary, abstractive_summary)`` tuple of strings.
        Blank input yields ``("", "")`` without running any model.
    """
    # Nothing to simplify: avoid asking the generative model to summarise an
    # empty prompt.
    if not text or not text.strip():
        return "", ""

    simplified = simplify_jargon(text)
    cleaned = clean_redundancy(simplified)
    return extractive_summary(cleaned), abstractive_summary(cleaned)

# For uploaded PDF
def process_pdf(pdf):
    """Extract the abstract from an uploaded PDF and simplify it.

    Args:
        pdf: The uploaded file as handed over by the UI, or ``None`` when the
            button is clicked before a file has been uploaded.

    Returns:
        A ``(extractive_summary, abstractive_summary)`` tuple. When no file is
        given or no abstract can be extracted, the tuple is
        ``("Abstract not found.", "")``.
    """
    # No upload yet: report it the same way as a missing abstract instead of
    # failing while reading the file.
    if pdf is None:
        return "Abstract not found.", ""

    abstract_text = extract_abstract_from_pdf(pdf)
    if not abstract_text or abstract_text == "Abstract not found.":
        return "Abstract not found.", ""
    return simplify_text(abstract_text)

In [None]:
# Gradio UI
# Two-tab interface: one tab for pasted text, one for an uploaded PDF. Both
# tabs show the extractive and abstractive results side by side.
# The emoji in the labels below replace mis-encoded byte sequences that
# rendered as garbage characters in the UI.
with gr.Blocks(title="Research Paper Simplifier") as app:
    gr.Markdown("## 📄 Research Paper Simplifier - Paste Text or Upload PDF")

    with gr.Tab("📋 Paste Text"):
        with gr.Row():
            input_text = gr.Textbox(label="Paste your text", lines=12, placeholder="Paste your abstract or content here...")
        with gr.Row():
            extractive_output = gr.Textbox(label="🧠 Extractive Simplification", lines=6)
            abstractive_output = gr.Textbox(label="✨ Abstractive Simplification", lines=6)
        run_button = gr.Button("Simplify Text")
        # simplify_text returns (extractive, abstractive), matching the outputs order.
        run_button.click(fn=simplify_text, inputs=input_text, outputs=[extractive_output, abstractive_output])

    with gr.Tab("📁 Upload PDF"):
        with gr.Row():
            pdf_input = gr.File(label="Upload PDF file", file_types=[".pdf"])
        with gr.Row():
            pdf_extractive = gr.Textbox(label="🧠 Extractive Simplification", lines=6)
            pdf_abstractive = gr.Textbox(label="✨ Abstractive Simplification", lines=6)
        pdf_button = gr.Button("Simplify Abstract")
        # process_pdf returns (extractive, abstractive), matching the outputs order.
        pdf_button.click(fn=process_pdf, inputs=pdf_input, outputs=[pdf_extractive, pdf_abstractive])

app.launch()
