In [1]:
# !pip install pdfplumber spacy tqdm pandas transformers sentence-transformers faiss-cpu

In [2]:
import pdfplumber

def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"
    return text

pdf_path = "1.pdf" 
extracted_text = extract_text_from_pdf(pdf_path)
print(extracted_text)

MULTI-SCALECROPPINGMECHANISMFORREMOTESENSINGIMAGECAPTIONING
XuetingZhang1,QiWang1,ShangdongChen2,XuelongLi1*
1SchoolofComputerScienceandCenterforOPTicalIMageryAnalysisandLearning(OPTIMAL),
NorthwesternPolytechnicalUniversity,Xi’an710072,Shaanxi,P.R.China.
2SchoolofInformationScienceandTechnology,
NorthwestUniversity,Xi’an710072,Shaanxi,P.R.China.
ABSTRACT Imagecaptioning[3]isacomprehensivetaskwhichcom-
binescomputervisionandnaturallanguageprocessing.Since
Withtherapiddevelopmentofartificialsatellite,alargenum-
that encoder-decoder based method [4] can automaticallty
ber of high resolution remote sensing images can be easily
learn the hign-level semantic features and dig their textual
obtained now. Recently, remote sensing image captioning,
relationships, it has dominated the field of image caption-
whichaimstogenerateaccurateandconcisedescriptivesen-
ing with the best perpormance. The encoder process aims
tences for remote sensing images, has been promoted by
to represent an image with

In [3]:
import re

def clean_text(text):
    text = re.sub(r'\s+', ' ', text)         # Replace multiple whitespace with a single space
    text = re.sub(r'\\[a-z]+', '', text)       # Remove escape sequences like \n, \t, etc.
    text = re.sub(r'[^\x00-\x7F]+', '', text)   # Remove non-ASCII characters
    return text.strip()

def split_into_chunks(text, chunk_size=450):
    words = text.split()
    return [" ".join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]

cleaned_text = clean_text(extracted_text)
chunks = split_into_chunks(cleaned_text, chunk_size=450)

print("Number of chunks created:", len(chunks))
print("First chunk preview:", chunks[0][:500])

Number of chunks created: 4
First chunk preview: MULTI-SCALECROPPINGMECHANISMFORREMOTESENSINGIMAGECAPTIONING XuetingZhang1,QiWang1,ShangdongChen2,XuelongLi1* 1SchoolofComputerScienceandCenterforOPTicalIMageryAnalysisandLearning(OPTIMAL), NorthwesternPolytechnicalUniversity,Xian710072,Shaanxi,P.R.China. 2SchoolofInformationScienceandTechnology, NorthwestUniversity,Xian710072,Shaanxi,P.R.China. ABSTRACT Imagecaptioning[3]isacomprehensivetaskwhichcom- binescomputervisionandnaturallanguageprocessing.Since Withtherapiddevelopmentofartificialsatelli


In [4]:
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

# Load the fine-tuned T5 model and tokenizer
model_path = "./t5_arxiv_full_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

# Create the summarization pipeline
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, framework = "pt")

def summarize_chunks(chunks):
    summaries = []
    for chunk in chunks:
        # Prepend a prompt to guide the model for summarization
        input_text = "summarize: " + chunk
        try:
            summary = summarizer(input_text, max_length=150, min_length=40, do_sample=False)[0]["summary_text"]
        except Exception as e:
            print("Error during summarization:", e)
            summary = ""
        summaries.append(summary)
    return " ".join(summaries)


merged_summary = summarize_chunks(chunks)
print("Merged Summary:\n", merged_summary)

Device set to use cpu
Token indices sequence length is longer than the specified maximum sequence length for this model (1489 > 512). Running this sequence through the model will result in indexing errors


Merged Summary:
 in this paper , we propose a novel method for remote sensing image captioning . the method is based on a multi - modal neural network model , which is used to generate meaningful sentences . in this method , a feature map/vector is decoded into a sentence model with sev - convolutional neural network ( cNNs ) , while in decoder eral related datasets released . this method is applied to generate a large amount of ingimages . it is shown that the sydney-caption is a powerful tool for generating meaningful sentences for a given image . in this paper , we introduce a puter vision , extracting features from images by learning multi-scalecroppingtrainingmechanismtoimprovethegen-based method . based on the eigenvalues of the encoder , a set of encodings is derived , and a recursive method is developed . it is shown that , in the case of a fixed-length feature vector , it is possible to extract features from a given image by a multi - scalecropping mechanism . in this case , t

In [5]:
import spacy

nlp = spacy.load("en_core_web_sm")

def text_processing(text: str) -> str:
    doc = nlp(text.lower())
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]
    return ' '.join(tokens)

processed_text = text_processing(cleaned_text)
print("Processed Text Preview:\n", processed_text[:500])

Processed Text Preview:
 multi scalecroppingmechanismforremotesensingimagecaptioning northwesternpolytechnicaluniversity northwestuniversity abstract withtherapiddevelopmentofartificialsatellite encoder decoder base method automaticallty ber high resolution remote sensing image easily learn hign level semantic feature dig textual obtain recently remote sense image captioning relationship dominate field image ing good perpormance encoder process aim tence remote sense image promote represent image feature map vector temp


In [6]:
import pandas as pd

def process_pdf(pdf_path):
    # 1. Extract raw text from the PDF
    raw_text = extract_text_from_pdf(pdf_path)
    
    # 2. Clean the extracted text
    cleaned_text = clean_text(raw_text)
    
    # 3. Split cleaned text into chunks
    chunks = split_into_chunks(cleaned_text)
    
    # 4. Generate a merged summary for all chunks
    merged_summary = summarize_chunks(chunks)
    
    # 5. Process the cleaned text using spaCy
    processed_text = text_processing(cleaned_text)
    
    # Create a DataFrame to hold all results
    data = {
        "text": [cleaned_text],
        "summary": [merged_summary],
        "processed_text": [processed_text]
    }
    df = pd.DataFrame(data)
    return df


pdf_path = "1.pdf"
df = process_pdf(pdf_path)
print(df.head())

                                                text  \
0  MULTI-SCALECROPPINGMECHANISMFORREMOTESENSINGIM...   

                                             summary  \
0  in this paper , we propose a novel method for ...   

                                      processed_text  
0  multi scalecroppingmechanismforremotesensingim...  


In [7]:
df.head()

Unnamed: 0,text,summary,processed_text
0,MULTI-SCALECROPPINGMECHANISMFORREMOTESENSINGIM...,"in this paper , we propose a novel method for ...",multi scalecroppingmechanismforremotesensingim...


In [8]:
df.to_csv("final.csv", index=False)
print("Data saved to final.csv")

Data saved to final.csv


In [9]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

# Load the CSV that contains the "summary" column
df = pd.read_csv("final.csv")
print("Data loaded:")
print(df[['summary']].head())

# Get the list of summaries
summaries = df['summary'].tolist()

# Load a SentenceTransformer model for embedding
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Generate embeddings for each summary
embeddings = embedder.encode(summaries, convert_to_tensor=False)
embeddings = np.array(embeddings).astype("float32")

# Build a FAISS index using L2 distance
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)
print("FAISS index built with", index.ntotal, "documents.")

Data loaded:
                                             summary
0  in this paper , we propose a novel method for ...
FAISS index built with 1 documents.


In [10]:
def retrieve_summary(query, k=3):
    """
    Retrieves the top-k most relevant summaries based on the query.
    """
    query_embedding = embedder.encode([query], convert_to_tensor=False)
    query_embedding = np.array(query_embedding).astype("float32")
    distances, indices = index.search(query_embedding, k)
    retrieved = [summaries[i] for i in indices[0]]
    return " ".join(retrieved)

# Test the retrieval function
test_query = "What are the main contributions of the research paper?"
print("Retrieved Summary Context:\n", retrieve_summary(test_query))

Retrieved Summary Context:
 in this paper , we propose a novel method for remote sensing image captioning . the method is based on a multi - modal neural network model , which is used to generate meaningful sentences . in this method , a feature map/vector is decoded into a sentence model with sev - convolutional neural network ( cNNs ) , while in decoder eral related datasets released . this method is applied to generate a large amount of ingimages . it is shown that the sydney-caption is a powerful tool for generating meaningful sentences for a given image . in this paper , we introduce a puter vision , extracting features from images by learning multi-scalecroppingtrainingmechanismtoimprovethegen-based method . based on the eigenvalues of the encoder , a set of encodings is derived , and a recursive method is developed . it is shown that , in the case of a fixed-length feature vector , it is possible to extract features from a given image by a multi - scalecropping mechanism . in th

In [11]:
from dotenv import load_dotenv
import os

load_dotenv() 
openai_api_key = os.getenv("OPENAI_API_KEY")
print("OpenAI API Key Loaded:", bool(openai_api_key))

OpenAI API Key Loaded: True


In [12]:
# !pip install tiktoken

In [13]:
import openai
import tiktoken

def truncate_prompt(prompt, max_tokens, model="gpt-3.5-turbo"):
    """
    Truncates the prompt so that its token length does not exceed max_tokens.
    """
    encoding = tiktoken.encoding_for_model(model)
    tokens = encoding.encode(prompt)
    if len(tokens) > max_tokens:
        truncated_tokens = tokens[:max_tokens]
        return encoding.decode(truncated_tokens)
    return prompt

def generate_openai_answer(prompt, model="gpt-3.5-turbo", max_tokens=200, temperature=0.7, max_context_tokens=14000):
    """
    Generates an answer using OpenAI's ChatCompletion API,
    ensuring that the prompt does not exceed allowed token limits.
    
    Args:
        prompt (str): The full prompt.
        model (str): The model to use.
        max_tokens (int): Maximum new tokens to generate.
        temperature (float): Sampling temperature.
        max_context_tokens (int): Maximum allowed tokens for the prompt.
        
    Returns:
        str: The generated answer.
    """
    # Truncate the prompt if it exceeds our max_context_tokens limit.
    prompt = truncate_prompt(prompt, max_context_tokens, model=model)
    
    response = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant that answers only based on the provided context."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=max_tokens,
        temperature=temperature
    )
    
    return response.choices[0].message["content"]


In [14]:
def answer_query(query):
    # Retrieve the relevant summary context (using your retriever function)
    context = retrieve_summary(query, k=3)
    
    # Construct a prompt that instructs the model to answer based only on the provided context
    prompt = (
        "Use only the following context to answer the question. Do not add any information that is not present in the context. "
        "Provide a complete, coherent answer in full sentences.\n\n"
        f"Context: {context}\n\n"
        f"Question: {query}\n\n"
        "Answer:"
    )
    
    # Generate an answer using your OpenAI generation function
    answer = generate_openai_answer(prompt)
    return answer

# Test the integrated pipeline with an interactive loop:
if __name__ == "__main__":
    while True:
        user_query = input("Enter your query (or type 'exit' to quit): ")
        if user_query.lower() == "exit":
            break
        print("Answer:", answer_query(user_query))

Enter your query (or type 'exit' to quit): what is this paper about?
Answer: This paper is about proposing a novel method for remote sensing image captioning using a multi-modal neural network model to generate meaningful sentences. The method involves decoding a feature map/vector into a sentence model with convolutional neural networks and applying a multi-scale cropping mechanism to improve feature extraction from images.
Enter your query (or type 'exit' to quit): Ramaswamy contributions here?
Answer: There is no mention of Ramaswamy or their contributions in the provided context.
Enter your query (or type 'exit' to quit): Technology used?
Answer: The technology used in the proposed method for remote sensing image captioning is a multi-modal neural network model, convolutional neural networks (CNNs), and a multi-scale cropping mechanism.
Enter your query (or type 'exit' to quit): accuracy of cnn?
Answer: The accuracy of the Convolutional Neural Networks (CNNs) is not explicitly ment