In [2]:
!pip install pdfplumber docx langchain requests python-docx

Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docx
  Downloading docx-0.2.4.tar.gz (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.9/54.9 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)
[2K   [90m

In [3]:
# Detects file type and extracts text from PDF, DOCX, or TXT (with pdfplumber for tables)
import os
import pdfplumber
from docx import Document

In [4]:
def extract_text_from_file(file_path):
    ext = os.path.splitext(file_path)[1].lower()

    if ext == '.pdf':
        text = ""
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                text += page.extract_text() + "\n"
    elif ext == '.docx':
        doc = Document(file_path)
        text = "\n".join([para.text for para in doc.paragraphs])
    elif ext == '.txt':
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
    else:
        raise ValueError("Unsupported file type")

    return text

In [5]:
text=extract_text_from_file('/content/generated_sample_3000_words.pdf')

In [6]:
# Cleans and normalizes extracted text using regex + NLP techniques
import re
import spacy
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
nlp = spacy.load("en_core_web_sm")
stop_words = set(stopwords.words("english"))

def preprocess_text(text: str) -> str:
    # Regex cleaning
    text = re.sub(r'\n{2,}', '\n', text)
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'\n\s*\n', '\n', text)
    text = text.replace('\xa0', ' ')
    text = text.encode('ascii', 'ignore').decode()

    # Remove repeating headers/footers
    lines = text.splitlines()
    line_counts = {}
    for line in lines:
        line_counts[line] = line_counts.get(line, 0) + 1
    lines = [line for line in lines if line_counts[line] < 5]
    text = " ".join(lines)

    # NLP preprocessing: stopword removal + lemmatization
    doc = nlp(text)
    cleaned = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct and token.is_alpha]
    return " ".join(cleaned)


In [8]:
final_text=preprocess_text(text)

In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [10]:
def split_text_into_chunks(text, chunk_size=1700, chunk_overlap=50):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    return splitter.split_text(text)

In [11]:
split_text_into_chunks(final_text)

['piece foreign order use skin skill high business item center service wide certain sound reach current office computer evidence establish fact society sing son election history world writer determine new share population perform course trial single effort find later strategy contain shake game car campaign likely require lot management partner kitchen eat agency type smile issue animal necessary identify guess method wife place partner response administration yeah save adult represent board change product admit inside maybe true standard rule author accept position concern situation buy husband manage vote address good wish hand find dark scientist memory simple research large exactly eat provide message east think audience law address indicate later wide environmental remember suggest song manage find speech conference appear finally year simple want buy lead thank action figure free card laugh send model agreement pass change simple catch stock huge actually cup relationship deal lo

In [12]:
PROMPT_TEMPLATE = """
You are a document analysis assistant designed to extract structured metadata and generate summaries from text.

Your task is to read the content below and return the following information in a strict JSON format:
1. **title**: The title of the document (or an inferred title based on its content).
2. **author**: The author’s name, if available, or return "Unknown".
3. **date**: Date of publication or creation, if mentioned. Use "Unknown" if not found.
4. **keywords**: A list of 4–7 relevant keywords or key phrases based on the document's topics.
5. **document_type**: Choose the most appropriate type from:
   - "research paper", "legal notice", "resume", "report", "book chapter",
     "article", "business proposal", "letter", or "others".
6. **summary**: A concise, neutral summary (3–5 sentences) covering the main points.

 Return your response strictly in this JSON format (no explanation, no markdown):

{{
  "title": "",
  "author": "",
  "date": "",
  "keywords": [],
  "document_type": "",
  "summary": ""
}}

Content to analyze:
\"\"\"{content_chunk}\"\"\"
"""


In [13]:
# 5. llm_call.py
import os
import requests

In [14]:
os.environ["MISTRAL_API_URL"] = "https://api.mistral.ai/v1/chat/completions"
os.environ["MISTRAL_API_KEY"] = "xLy5WZAJHVd0AkGgkAcOO6X1psZWo0jY "


In [15]:
MISTRAL_API_URL = os.getenv("MISTRAL_API_URL")
MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")

def call_llm_on_chunk(chunk):
    headers = {
        "Authorization": f"Bearer {os.getenv('MISTRAL_API_KEY')}",
        "Content-Type": "application/json"
    }
    data = {
        "model": "open-mistral-7b",
        "messages": [
            {"role": "user", "content": PROMPT_TEMPLATE.format(content_chunk=chunk)}
        ],
        "temperature": 0.3
    }
    response = requests.post(os.getenv("MISTRAL_API_URL"), headers=headers, json=data)
    if response.status_code != 200:
        print(f"❌ Error {response.status_code}: {response.text}")
        return "ERROR"
    return response.json()['choices'][0]['message']['content']


In [16]:
def summarize_document_chunks(chunks):
    results = []
    for chunk in chunks:
        result = call_llm_on_chunk(chunk)
        results.append(result)
    return results

In [17]:
text = extract_text_from_file('/content/generated_sample_3000_words.pdf')
clean_text = preprocess_text(text)
chunks = split_text_into_chunks(clean_text)

In [18]:
!pip install keybert sentence-transformers

Collecting keybert
  Downloading keybert-0.9.0-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Col

In [19]:
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer

In [20]:
print(f"✅ Total Chunks: {len(chunks)}")

results = []
for i, chunk in enumerate(chunks):
    print(f"\n--- Generating summary for Chunk {i+1}/{len(chunks)} ---")
    summary = call_llm_on_chunk(chunk)
    print(summary)
    results.append(summary)



✅ Total Chunks: 10

--- Generating summary for Chunk 1/10 ---
{
  "title": "Unspecified Document",
  "author": "Unknown",
  "date": "Unknown",
  "keywords": ["business", "skin", "service", "election", "research", "speech", "environment"],
  "document_type": "others",
  "summary": "The document discusses various topics including business, skin, service, election, research, and environmental issues. It also mentions speeches, elections, and a possible debate. The text appears to be related to a wide range of subjects, including politics, technology, and possibly aesthetics."
}

--- Generating summary for Chunk 2/10 ---
{
  "title": "Unspecified Document",
  "author": "Unknown",
  "date": "Unknown",
  "keywords": ["speech", "change", "education", "culture", "politics", "business", "international"],
  "document_type": "others",
  "summary": "The document discusses various topics including speech, change, education, culture, politics, business, and international relations. It seems to invol

In [21]:
combined_summaries = "\n\n".join(results)
combine_prompt = f"""
You are a smart metadata assistant. Below are partial summaries of a document generated from different chunks.

Your task is to first read the chunks carefully to combine them into a **single, coherent metadata JSON object** with meaningful values.

Infer the **title** and **author** based on the document as a whole, even if not explicitly mentioned.
Generate a meaningful, concise **summary** for the full document.
Merge and deduplicate the keywords intelligently.
Assume the document type is "Article" unless there's a clear reason to choose otherwise.

Return the result in this JSON format:
{{
  "title": "Meaningful title of the whole document",
  "author": "Author name (or 'Not specified' if not found)",
  "date": "Not specified",
  "keywords": ["keyword1", "keyword2", "..."],
  "document_type": "Article",
  "summary": "Clean, concise summary of the full document."
}}

Here are the partial summaries:
\"\"\"{combined_summaries}\"\"\"
"""


In [22]:
import json
import textwrap
import re
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer

In [23]:
def call_llm_merge_summary(prompt):
    headers = {
        "Authorization": f"Bearer {os.getenv('MISTRAL_API_KEY')}",
        "Content-Type": "application/json"
    }
    data = {
        "model": "open-mistral-7b",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.3
    }
    response = requests.post(os.getenv("MISTRAL_API_URL"), headers=headers, json=data)
    return response.json()['choices'][0]['message']['content']

final_output = call_llm_merge_summary(combine_prompt)

In [24]:
import json
import re
import textwrap
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer

# Example input: Replace this with your actual final_output and clean_text


# ============================
# 🎯 Final Output Handling
# ============================
try:
    # ① Try fenced ```json ``` block
    json_match = re.search(r'```json\s+(.*?)\s+```', final_output, re.DOTALL)

    # ② If no fenced block found, try any JSON object in text
    if not json_match:
        json_match = re.search(r'\{.*\}', final_output, re.DOTALL)

    # ③ If JSON found, parse it
    if json_match:
        json_string = json_match.group(0)
        parsed = json.loads(json_string)

        # ✅ Improve keywords using KeyBERT
        if 'clean_text' in locals():
            kw_model = KeyBERT(model=SentenceTransformer('all-MiniLM-L6-v2'))
            kb_keywords = kw_model.extract_keywords(
                clean_text,
                keyphrase_ngram_range=(1, 2),
                stop_words='english',
                top_n=10,
                use_maxsum=True,
                nr_candidates=20
            )
            final_keywords = []
            seen = set()
            for kw, _ in kb_keywords:
                if kw not in seen:
                    seen.add(kw)
                    final_keywords.append(kw)
            parsed["keywords"] = final_keywords
        else:
            print("⚠️ Warning: 'clean_text' not available for keyword extraction using KeyBERT.")

        # ✅ Pretty output
        print("\n✅ Final Metadata:")
        print(json.dumps(parsed, indent=2))

        print("\n✅ Final Summary:")
        if "summary" in parsed and parsed["summary"]:
            print(textwrap.fill(parsed["summary"], width=100))
        else:
            print("Summary not available in the parsed output.")

    else:
        print("⚠️ Could not find the JSON object within the final output string.")
        print("Showing raw output:")
        print(final_output)

except json.JSONDecodeError as e:
    print(f"⚠️ JSONDecodeError: {e}")
    print("Showing raw output:")
    print(final_output)
except KeyError as e:
    print(f"⚠️ KeyError: {e} - Check if expected keys are present in the JSON output.")
    if 'parsed' in locals():
        print("Parsed dictionary (partial):")
        print(json.dumps(parsed, indent=2))
    else:
        print("Parsed dictionary not available.")


⚠️ JSONDecodeError: Expecting value: line 1 column 1 (char 0)
Showing raw output:
Here is the combined metadata JSON object for the given document chunks:

```json
{
  "title": "Discussion on Various Topics in Modern Society",
  "author": "Not specified",
  "date": "Not specified",
  "keywords": ["business", "skin", "service", "election", "research", "speech", "environment", "change", "education", "culture", "politics", "international", "discussion", "decision", "local politics", "community issues", "financial trial", "health", "crime", "music", "economy", "defense", "camera", "control", "investment", "medical", "technology", "policy", "debate", "community", "security", "development", "modern war", "Public Growth", "Economic Challenges", "West", "Democracy", "worker representation", "cultural speak", "market support", "energy", "training", "employee", "democratic accord", "interview"],
  "document_type": "Article",
  "summary": "The combined document discusses a wide range of topics in

In [25]:
#!pip install streamlit pyngrok --quiet


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m62.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m53.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [26]:
! pip install streamlit -q


In [48]:
%%writefile app.py
# app.py
import os
import json
import textwrap
import tempfile
from pathlib import Path
from typing import List

import streamlit as st
import requests
from keybert import KeyBERT
from langchain.text_splitter import RecursiveCharacterTextSplitter
from docx import Document
import pdfplumber

# Load KeyBERT model once
kw_model = KeyBERT()

# Prompt used per chunk
PROMPT_TEMPLATE = """
You are an intelligent assistant. Read this content chunk and return:
- A 1-2 sentence summary
- Five keywords (comma-separated)

Chunk:
\"\"\"{content_chunk}\"\"\"
"""

# ───────────────────────────────
# Utility Functions
# ───────────────────────────────

def extract_text_from_file(uploaded_file) -> str:
    suffix = Path(uploaded_file.name).suffix.lower()
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        tmp.write(uploaded_file.read())
        tmp_path = Path(tmp.name)

    if suffix == ".txt":
        return tmp_path.read_text(encoding="utf-8", errors="ignore")
    elif suffix == ".docx":
        doc = Document(tmp_path)
        return "\n".join([p.text for p in doc.paragraphs])
    elif suffix == ".pdf":
        text = ""
        with pdfplumber.open(tmp_path) as pdf:
            for page in pdf.pages:
                text += page.extract_text() or ""
        return text
    raise ValueError(f"Unsupported file type: {suffix}")

def preprocess_text(text: str) -> str:
    return " ".join(text.split())

def split_text_into_chunks(text: str, size: int = 1700, overlap: int = 50) -> List[str]:
    splitter = RecursiveCharacterTextSplitter(chunk_size=size, chunk_overlap=overlap)
    return splitter.split_text(text)

def call_mistral(prompt: str, temperature: float = 0.3) -> str:
    api_url = os.getenv("MISTRAL_API_URL")
    api_key = os.getenv("MISTRAL_API_KEY")
    if not api_url or not api_key:
        raise ValueError("Please set MISTRAL_API_URL and MISTRAL_API_KEY env vars")

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }
    data = {
        "model": "open-mistral-7b",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": temperature
    }
    resp = requests.post(api_url, headers=headers, json=data, timeout=60)
    resp.raise_for_status()
    return resp.json()["choices"][0]["message"]["content"].strip()

# ───────────────────────────────
# Streamlit UI
# ───────────────────────────────

st.set_page_config(page_title="Metadata & Summary Generator", layout="centered")

st.markdown(
    '<h1 style="color:#4A90E2;text-align:center;">📄 Auto Metadata & Summary Generator</h1>',
    unsafe_allow_html=True
)

file = st.file_uploader("📂 Upload PDF, DOCX, or TXT file", type=["pdf", "docx", "txt"])

if file:
    with st.spinner("⏳ Processing the uploaded file..."):
        raw_text = extract_text_from_file(file)
        clean_text = preprocess_text(raw_text)
        chunks = split_text_into_chunks(clean_text)

        summaries = [call_mistral(PROMPT_TEMPLATE.format(content_chunk=chunk)) for chunk in chunks]
        combined = "\n\n".join(summaries)

        # Final summarization prompt
        final_prompt = f"""
You are a smart metadata assistant. Below are partial summaries of a document generated from different chunks.

Your task is to first read the chunks carefully to combine them into a **single, coherent metadata JSON object** with meaningful values.

Infer the **title** and **author** based on the document as a whole, even if not explicitly mentioned.
Generate a meaningful, concise **summary** for the full document.
Merge and deduplicate the keywords intelligently.
Assume the document type is "Article" unless there's a clear reason to choose otherwise.

Return the result in this JSON format:
{{
  "title": "Meaningful title of the whole document",
  "author": "Author name (or 'Not specified' if not found)",
  "date": "Not specified",
  "keywords": ["keyword1", "keyword2", "..."],
  "document_type": "Article",
  "summary": "Clean, concise summary of the full document."
}}

Here are the partial summaries:
\"\"\"{combined}\"\"\"
"""
        final_output = call_mistral(final_prompt)

    try:
        parsed = json.loads(final_output)

        # Optional: Use KeyBERT for better keywords
        kb_keywords = kw_model.extract_keywords(
            clean_text,
            keyphrase_ngram_range=(1, 2),
            stop_words="english",
            top_n=10,
            use_maxsum=True,
            nr_candidates=20
        )
        parsed["keywords"] = [kw for kw, _ in kb_keywords]

        # ── Styled Output ──
        st.markdown('<h3 style="color:#1f77b4;">📌 <b>Extracted Metadata</b></h3>', unsafe_allow_html=True)
        st.json(parsed)

        st.markdown('<h3 style="color:#2ca02c;">📝 <b>Wrapped Summary</b></h3>', unsafe_allow_html=True)
        st.markdown(
            f"<div style='color:#333333; font-size:16px; line-height:1.6; background-color:#f4f4f4; padding:15px; border-radius:8px'>{parsed['summary']}</div>",
            unsafe_allow_html=True
        )
        st.markdown("<br>", unsafe_allow_html=True)

        st.download_button(
            label="💾 Download Summary",
            data=parsed["summary"],
            file_name="summary.txt",
            mime="text/plain"
        )

        st.markdown("<hr><div style='text-align:center;color:#888'>Made with ❤️ by Arpit · Powered by Mistral AI</div>", unsafe_allow_html=True)

    except Exception as e:
        st.error(f"❌ Failed to parse output: {e}")


Overwriting app.py


In [49]:
# In Colab or local terminal:
!streamlit run app.py & npx localtunnel --port 8501



Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://104.199.121.54:8501[0m
[0m
[1G[0K⠦[1G[0Kyour url is: https://fifty-bees-watch.loca.lt
2025-06-24 02:52:49.246803: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750733569.312134    7962 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750733569.332233    7962 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been 