In [None]:
!pip install spacy PyPDF2 requests pandas faiss-cpu scikit-learn python-docx sumy
!pip install pdfplumber
!python -m spacy download en_core_web_md  # Medium-sized NLP model

In [None]:
import hashlib
import requests
import PyPDF2
import pdfplumber
from urllib.parse import urlparse
import io
import spacy
import pandas as pd
import faiss
import numpy as np
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
import os
from docx import Document

In [None]:
# --- 1. Enhanced PDF Downloader with Caching ---
def get_url_hash(url):
    """
    Return the hexadecimal MD5 digest of ``url`` with surrounding whitespace removed.

    The digest acts as a stable identifier: the same URL (ignoring leading and
    trailing whitespace) always yields the same string, which is how duplicate
    URLs are recognised.
    """
    trimmed_url = url.strip()
    digest = hashlib.md5(trimmed_url.encode())
    return digest.hexdigest()

def download_paper(url, save_dir="/kaggle/working/papers", force_redownload=False):
    """
    Download the PDF at ``url`` into ``save_dir`` and return its local path.

    The file is named after ``get_url_hash(url)``, so the same URL always maps
    to the same path and repeat calls reuse the cached copy unless
    ``force_redownload`` is true.

    The response body is streamed into a temporary ``.part`` file and only
    moved onto the final path once the transfer has completed. A failed or
    interrupted download therefore never leaves a truncated file that a later
    call would mistake for a cached copy, and a failed forced re-download does
    not destroy the copy that was already there.

    Args:
        url: Address of the PDF to fetch.
        save_dir: Directory that holds the downloads; created if missing.
        force_redownload: Fetch again even when a cached copy exists.

    Returns:
        Path of the downloaded (or cached) PDF, or ``None`` when the request
        failed or the response does not look like a PDF.
    """
    os.makedirs(save_dir, exist_ok=True)
    url_hash = get_url_hash(url)
    pdf_path = os.path.join(save_dir, f"{url_hash}.pdf")
    partial_path = pdf_path + ".part"

    # Skip if already downloaded (unless forced)
    if os.path.exists(pdf_path) and not force_redownload:
        print(f"📁 Already exists: {pdf_path}")
        return pdf_path

    headers = {
        "User-Agent": "Mozilla/5.0",
        "Accept": "application/pdf,text/html"
    }

    try:
        with requests.get(url, headers=headers, stream=True, timeout=30) as response:
            response.raise_for_status()

            # Accept the response if either the server or the URL says "PDF"
            content_type = response.headers.get('Content-Type', '').lower()
            if not ('pdf' in content_type or url.lower().endswith('.pdf')):
                print(f"⚠️ Skipping non-PDF content at {url} (Content-Type: {content_type})")
                return None

            # Stream to the temporary file so the final path only ever holds
            # a complete download
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # Filter out keep-alive chunks
                        f.write(chunk)

        # Publish the finished download in one step
        os.replace(partial_path, pdf_path)
        print(f"✅ Downloaded: {pdf_path}")
        return pdf_path

    except Exception as e:
        # Best-effort pipeline: report the failure and let the caller move on
        print(f"❌ Failed to download {url}: {str(e)}")
        return None

    finally:
        # Runs on every exit path (including Ctrl-C) so no partial file lingers
        if os.path.exists(partial_path):
            os.remove(partial_path)

# --- 2. Robust PDF Processing ---
def validate_pdf(pdf_path):
    """
    Check whether ``pdf_path`` looks like a readable PDF with at least one page.

    Validation happens in two stages:

    1. The first four bytes must be the ``%PDF`` signature.
    2. The file must open with pdfplumber; if pdfplumber raises, PyPDF2 is
       tried instead.

    Args:
        pdf_path: Path of the file to check.

    Returns:
        ``True`` when one of the parsers opens the file and finds at least one
        page, ``False`` otherwise (including when the file cannot be read).
    """
    try:
        # Quick magic number check
        with open(pdf_path, 'rb') as f:
            if f.read(4) != b'%PDF':
                return False

        # Full structural validation
        try:
            with pdfplumber.open(pdf_path) as pdf:
                return len(pdf.pages) > 0
        except Exception:
            # PdfReader is built as a plain object here: wrapping it in a
            # ``with`` statement would raise and defeat this fallback.
            reader = PyPDF2.PdfReader(pdf_path)
            return len(reader.pages) > 0

    except Exception as e:
        print(f"⚠️ PDF validation failed for {pdf_path}: {str(e)}")
        return False

def extract_text(pdf_path, max_pages=50):
    """
    Extract the text of the first ``max_pages`` pages of a PDF.

    pdfplumber is tried first; PyPDF2 is the fallback when pdfplumber raises
    or yields only whitespace. Page texts are joined with single spaces, and a
    page without extractable text contributes an empty string.

    Args:
        pdf_path: Path of the PDF to read.
        max_pages: Upper bound on the number of pages read from the start of
            the document.

    Returns:
        The extracted text, or ``None`` when no extractor produced any
        non-whitespace text.
    """
    def _via_pdfplumber():
        # The context manager closes the file handle pdfplumber opens.
        with pdfplumber.open(pdf_path) as pdf:
            return " ".join(p.extract_text() or "" for p in pdf.pages[:max_pages])

    def _via_pypdf2():
        reader = PyPDF2.PdfReader(pdf_path)
        return " ".join(p.extract_text() or "" for p in reader.pages[:max_pages])

    extractors = [
        # Try pdfplumber first (better formatting preservation)
        ("pdfplumber", _via_pdfplumber),
        # Fallback to PyPDF2
        ("PyPDF2", _via_pypdf2),
    ]

    for name, extractor in extractors:
        try:
            text = extractor()
        except Exception as e:
            # Say why this extractor was skipped instead of failing silently
            print(f"⚠️ {name} extraction failed for {pdf_path}: {e}")
            continue
        if text.strip():
            return text

    print(f"⚠️ All extraction methods failed for {pdf_path}")
    return None

# --- 3. Main Pipeline ---
def process_papers(paper_urls, output_dir="/kaggle/working/papers"):
    """
    Download, validate and extract text for each URL in ``paper_urls``.

    A URL is skipped when its hash matches one that was already processed
    successfully, when the download fails, when the file does not validate as
    a PDF, or when no text can be extracted.

    Args:
        paper_urls: Iterable of PDF URLs.
        output_dir: Directory handed to ``download_paper`` for storing files.

    Returns:
        A list with one dict per successfully processed paper, holding the
        keys ``id``, ``title``, ``source_url``, ``local_path``,
        ``text_length``, ``text_preview`` and ``full_text``.
    """
    seen_hashes = set()
    collected = []

    for url in paper_urls:
        digest = get_url_hash(url)

        # A URL whose hash was already handled successfully is not fetched again
        if digest in seen_hashes:
            print(f"⏩ Skipping duplicate: {url}")
            continue

        local_file = download_paper(url, output_dir)
        if not local_file:
            continue
        if not validate_pdf(local_file):
            continue

        body = extract_text(local_file)
        if not body:
            continue

        # Previews are capped at 1000 characters and marked when truncated
        if len(body) > 1000:
            preview = body[:1000] + "..."
        else:
            preview = body

        record = {
            "id": digest,
            "title": os.path.basename(local_file),
            "source_url": url,
            "local_path": local_file,
            "text_length": len(body),
            "text_preview": preview,
            "full_text": body,  # Warning: may be memory-intensive for many papers
        }
        collected.append(record)

        # Only successful papers are remembered, so a failed URL may be retried
        seen_hashes.add(digest)
        print(f"✔ Processed: {url}")

    return collected

# --- 4. Execution ---
# URLs of the papers to fetch; the list is handed to process_papers() below.
paper_urls = [
    "https://arxiv.org/pdf/2307.12874",    
    "https://arxiv.org/pdf/1802.04351",
    "https://arxiv.org/pdf/2306.08168",
    "https://arxiv.org/pdf/2503.15964",
    "https://www.jetir.org/papers/JETIR2405D82.pdf",
    "https://www.cs.ucf.edu/~czou/research/subWallet-Blockchain-2019.pdf",
    "https://www.cs.ucf.edu/~czou/research/Hossein-TrustCom-2020.pdf",
    "https://www.cs.ucf.edu/~czou/research/HosseinDissertation-2020.pdf",
    "https://dl.gi.de/server/api/core/bitstreams/aaa640a1-f8dd-4514-ad72-b809932072cc/content",
    "https://eprint.iacr.org/2023/062.pdf",
    "https://eprint.iacr.org/2022/075.pdf",    
    "https://eprint.iacr.org/2023/1234.pdf",
    "https://eprint.iacr.org/2020/300.pdf",
    "https://eprint.iacr.org/2023/312.pdf",
    "https://eprint.iacr.org/2016/013.pdf",
    "https://researchmgt.monash.edu/ws/portalfiles/portal/468554595/430334621_oa.pdf",
    "https://openaccess.uoc.edu/bitstream/10609/151551/1/Rahmanikivi_cbt22_empirical.pdf",
    "https://ics.uci.edu/~dabrowsa/dabrowski-defi21-hwwallet.pdf",
    "https://fc19.ifca.ai/preproceedings/93-preproceedings.pdf",
    "https://www.jkroll.com/papers/bitcoin_threshold_signatures.pdf",
    "https://corporates.db.com/files/documents/publications/db-polygo-digital-id-wp-42pp-web-secured.pdf",
    "https://www.napier.ac.uk/-/media/worktribe/output-2839021/smart-contract-attacks-and-protections.ashx",
    "https://www.cyprusbarassociation.org/images/6._Crypto_Wallets.pdf",
    "https://computerscience.unicam.it/marcantoni/tesi/Ethereum%20Smart%20Contracts%20Optimization.pdf",
    "https://cspecc.utsa.edu/publications/files/Refereed_Papers/2020_Choo_BCPPA-blockchain-cond-priv-auth-prot.pdf",
    "https://www.ekonomika.org.rs/sr/PDF/ekonomika/2019/clanci19-3/7.pdf",
    "https://assets.cureusjournals.com/artifacts/upload/review_article/pdf/1099/20250319-214523-194a3z.pdf"
]

# Run pipeline
# `results` receives one dict per paper that was downloaded, validated and
# had text extracted; URLs that fail any step are absent from it.
print("Starting paper processing...")
results = process_papers(paper_urls)

# --- 5. Results Analysis ---
# Banner, then the number of URLs in the input list versus the number of
# papers that made it into `results`.
print(f"\n{'='*40}\nProcessing Complete\n{'='*40}")
print(f"Total URLs processed: {len(paper_urls)}")
print(f"Unique valid papers extracted: {len(results)}")
print("\nSample results:")
# Show at most the first three papers. The preview is cut to 500 characters
# and "..." is appended regardless of whether anything was cut.
for paper in results[:3]:
    print(f"\n📄 {paper['title']}")
    print(f"🔗 {paper['source_url']}")
    print(f"📝 Length: {paper['text_length']} chars")
    print(f"Preview:\n{paper['text_preview'][:500]}...")

In [None]:
def extract_text_from_pdf(pdf_path):
    """
    Return the text of every page of the PDF at ``pdf_path``, joined by spaces.

    Unlike ``extract_text`` this reads all pages (no page limit) and uses
    PyPDF2 only. A page for which PyPDF2 yields no text contributes an empty
    string instead of breaking the join, matching the guard ``extract_text``
    applies.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        The concatenated page texts as a single string.
    """
    with open(pdf_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        # Extraction happens while the file is still open
        return " ".join(page.extract_text() or "" for page in reader.pages)

# Load all papers into a list
# download_paper() stores each file as "<md5 of URL>.pdf", so the paths are
# taken from `results` (the papers that were actually downloaded and validated)
# rather than guessed as "paper_<i>.pdf", which never exists on disk.
papers = []
for paper_info in results:
    # Keep the "Paper_<i>" numbering tied to the URL's position in paper_urls
    i = paper_urls.index(paper_info["source_url"])
    pdf_path = paper_info["local_path"]
    text = extract_text_from_pdf(pdf_path)
    papers.append({"title": f"Paper_{i}", "text": text})

In [None]:
# Load the spaCy pipeline once; `nlp` is reused by the later cells that parse
# the papers and embed search queries.
nlp = spacy.load("en_core_web_md")  # For word vectors & NER

In [None]:
for record in papers:
    # Run the spaCy pipeline over the paper's text and keep the parsed Doc
    # on the same record under the "doc" key.
    parsed = nlp(record["text"])
    record["doc"] = parsed

In [None]:
# One row per paper; the columns come from the record keys
# ("title", "text" and the "doc" added by the parsing cell).
df = pd.DataFrame(papers)

In [None]:
def keyword_search(df, keyword):
    """
    Find the sentences in each paper that contain ``keyword`` (case-insensitive).

    Args:
        df: DataFrame with a ``title`` column and a ``doc`` column whose values
            expose ``.sents``, each sentence having a ``.text`` attribute.
        keyword: Substring to look for.

    Returns:
        A list of ``{"title": ..., "matches": [...]}`` dicts, one per paper
        that has at least one matching sentence, in row order.
    """
    hits = []
    for _, record in df.iterrows():
        matching_sentences = []
        for sentence in record["doc"].sents:
            # Plain substring test on lower-cased text
            if keyword.lower() in sentence.text.lower():
                matching_sentences.append(sentence.text)
        if not matching_sentences:
            continue
        hits.append({"title": record["title"], "matches": matching_sentences})
    return hits

# Example: Search for "blockchain"
# The bare call is the cell's last expression, so the notebook displays the
# returned list of per-paper matches.
keyword_search(df, "blockchain")

In [None]:
# Stack each paper's document vector into one matrix (one row per paper)
doc_vectors = [parsed.vector for parsed in df["doc"]]
vectors = np.array(doc_vectors)

# Exact L2 index sized to the vector dimensionality, filled with all papers
embedding_dim = vectors.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(vectors)

def semantic_search(query, df, top_k=3):
    """
    Return the rows of ``df`` whose document vectors are nearest to ``query``.

    The query is embedded with the module-level ``nlp`` pipeline and looked up
    in the module-level FAISS ``index``; the positions the index returns are
    used to select rows of ``df`` by position.

    Args:
        query: Free-text query.
        df: DataFrame whose row order matches the order vectors were added to
            ``index``.
        top_k: Maximum number of rows to return.

    Returns:
        A DataFrame with at most ``top_k`` rows, nearest first. Fewer rows are
        returned when the index or ``df`` holds fewer than ``top_k`` entries.
    """
    query_doc = nlp(query)
    query_vector = np.array([query_doc.vector])

    # Asking FAISS for more neighbours than it holds pads the result with -1,
    # which ``iloc`` would read as "the last row". Cap k up front.
    k = min(top_k, index.ntotal, len(df))
    if k <= 0:
        return df.iloc[[]]

    _, indices = index.search(query_vector, k)
    positions = indices[0]
    # Drop any padding or positions that do not exist in ``df``
    positions = positions[(positions >= 0) & (positions < len(df))]
    return df.iloc[positions]

# Example: Find papers similar to "privacy in MPC"
# Uses the default top_k=3; the returned DataFrame is displayed as the cell output.
semantic_search("privacy in MPC", df)

In [None]:
def extract_entities(doc):
    """
    List the named entities of a parsed document.

    Args:
        doc: Object exposing ``.ents``, each entity having ``.text`` and
            ``.label_`` attributes.

    Returns:
        A list of ``(text, label)`` tuples in the order the entities appear in
        ``doc.ents``.
    """
    pairs = []
    for entity in doc.ents:
        pairs.append((entity.text, entity.label_))
    return pairs

# Example: Extract entities from the first paper
# df.iloc[0] selects the first row, so at least one paper must have been loaded.
entities = extract_entities(df.iloc[0]["doc"])
print("Entities:", entities)

In [None]:
def summarize_text(text, sentences_count=3):
    """
    Produce an extractive summary of ``text`` with sumy's LSA summarizer.

    Args:
        text: Plain text to summarise.
        sentences_count: Number of sentences requested from the summarizer.

    Returns:
        The selected sentences joined by single spaces.
    """
    english_tokenizer = Tokenizer("english")
    parsed = PlaintextParser.from_string(text, english_tokenizer)

    lsa = LsaSummarizer()
    selected = lsa(parsed.document, sentences_count)

    return " ".join(str(sentence) for sentence in selected)

# Example: Summarize the first paper
# Uses the default of three sentences on the first row's raw text.
summary = summarize_text(df.iloc[0]["text"])
print("Summary:", summary)

In [None]:
# Export the table as CSV without the index column.
# NOTE(review): the "doc" column holds spaCy Doc objects; presumably they are
# written as plain strings and cannot be restored from the CSV — confirm, and
# consider dropping that column before exporting.
df.to_csv("/kaggle/working/papers_database.csv", index=False)

In [None]:
# Bundle everything under /kaggle/working whose name starts with "papers"
# (recursively) into papers_database.zip, created in the current directory.
!zip -r papers_database.zip /kaggle/working/papers*