In [1]:
# Install the third-party packages imported by the cells below.
# NOTE(review): versions are not pinned, so a fresh run may resolve different releases.
!pip install spacy PyPDF2 requests pandas faiss-cpu scikit-learn python-docx sumy
# pdfplumber is the primary PDF backend used for validation and text extraction.
!pip install pdfplumber
# Fetch the spaCy pipeline that setup_spacy() loads by name.
!python -m spacy download en_core_web_md  # Medium-sized NLP model

Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m45.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:
import os
import hashlib
import requests
import PyPDF2
import pdfplumber
import spacy
import pandas as pd
import numpy as np
import faiss
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer

In [3]:
# --- 1. PDF Downloading and Processing Functions ---
def get_url_hash(url):
    """Return the hex MD5 digest of the whitespace-stripped URL.

    The same URL always yields the same digest, which is what lets the
    pipeline use it as a file name and as a duplicate-detection key.
    """
    normalized = url.strip()
    digest = hashlib.md5()
    digest.update(normalized.encode())
    return digest.hexdigest()

def download_paper(url, save_dir="/kaggle/working/papers", force_redownload=False):
    """
    Download the PDF at ``url`` into ``save_dir`` and return its local path.

    Features:
    - Deduplication: the file is named after ``get_url_hash(url)``; an existing
      file is reused unless ``force_redownload`` is true.
    - Content validation: the response is skipped unless its Content-Type
      mentions "pdf" or the URL ends with ".pdf".
    - Error handling: any failure is reported and ``None`` is returned.

    The body is streamed into a temporary ``<hash>.pdf.part`` file and only
    renamed to the final name once the download completed, so the final path
    never holds a truncated file and a failed forced re-download does not
    destroy a previously downloaded copy.

    Args:
        url: Address of the document to fetch.
        save_dir: Directory that receives the file (created if missing).
        force_redownload: When true, fetch again even if the file exists.

    Returns:
        The path of the local PDF, or ``None`` if the content was not a PDF
        or the download failed.
    """
    os.makedirs(save_dir, exist_ok=True)
    url_hash = get_url_hash(url)
    pdf_path = os.path.join(save_dir, f"{url_hash}.pdf")
    # Partial data goes here first; see the docstring for why.
    tmp_path = pdf_path + ".part"

    # Skip if already downloaded (unless forced)
    if os.path.exists(pdf_path) and not force_redownload:
        print(f"📁 Already exists: {pdf_path}")
        return pdf_path

    headers = {
        "User-Agent": "Mozilla/5.0",
        "Accept": "application/pdf,text/html"
    }

    try:
        with requests.get(url, headers=headers, stream=True, timeout=30) as response:
            response.raise_for_status()

            # Validate content type
            content_type = response.headers.get('Content-Type', '').lower()
            if not ('pdf' in content_type or url.lower().endswith('.pdf')):
                print(f"⚠️ Skipping non-PDF content at {url} (Content-Type: {content_type})")
                return None

            # Stream to the temporary file in 8 KiB chunks
            with open(tmp_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # Filter out keep-alive chunks
                        f.write(chunk)

        # Publish the finished download under its final name in one step.
        os.replace(tmp_path, pdf_path)
        print(f"✅ Downloaded: {pdf_path}")
        return pdf_path

    except Exception as e:
        # Deliberately broad: one bad URL must not abort the whole batch.
        print(f"❌ Failed to download {url}: {str(e)}")
        if os.path.exists(tmp_path):
            os.remove(tmp_path)  # Clean up partial downloads only
        return None

def validate_pdf(pdf_path):
    """Check that ``pdf_path`` is a readable PDF with at least one page.

    Steps:
    1. Magic-number check: the file must start with ``%PDF``.
    2. Structural check with pdfplumber; if pdfplumber raises, the same
       check is retried with PyPDF2.

    Args:
        pdf_path: Path of the file to inspect.

    Returns:
        ``True`` if the file starts with the PDF signature and one of the
        backends reports at least one page, ``False`` otherwise (including
        when the file cannot be opened or both backends fail).
    """
    try:
        # Quick magic number check
        with open(pdf_path, 'rb') as f:
            if f.read(4) != b'%PDF':
                return False

        # Full structural validation
        try:
            with pdfplumber.open(pdf_path) as pdf:
                return len(pdf.pages) > 0
        except Exception:
            # Fallback backend. The reader is constructed directly instead of
            # through `with`: PyPDF2's PdfReader is not documented as a context
            # manager, so wrapping it made this fallback fail every time.
            reader = PyPDF2.PdfReader(pdf_path)
            return len(reader.pages) > 0

    except Exception as e:
        print(f"⚠️ PDF validation failed for {pdf_path}: {str(e)}")
        return False

def extract_text(pdf_path, max_pages=50):
    """Extract text from the first ``max_pages`` pages of a PDF.

    pdfplumber is tried first and PyPDF2 second. The first backend that
    yields non-blank text wins. Page texts are joined with single spaces and
    pages without extractable text contribute an empty string.

    Args:
        pdf_path: Path of the PDF file.
        max_pages: Upper bound on the number of pages read from the start.

    Returns:
        The extracted text, or ``None`` when every backend either raised or
        produced only whitespace.
    """
    def _with_pdfplumber():
        # The context manager closes the file handle pdfplumber keeps open;
        # join() consumes the generator before the block exits.
        with pdfplumber.open(pdf_path) as pdf:
            return " ".join(p.extract_text() or "" for p in pdf.pages[:max_pages])

    def _with_pypdf2():
        reader = PyPDF2.PdfReader(pdf_path)
        return " ".join(p.extract_text() or "" for p in reader.pages[:max_pages])

    backends = (("pdfplumber", _with_pdfplumber), ("PyPDF2", _with_pypdf2))

    for backend_name, method in backends:
        try:
            text = method()
            if text.strip():
                return text
        except Exception as e:
            # Report the failure instead of hiding it, then try the next backend.
            print(f"⚠️ {backend_name} extraction failed for {pdf_path}: {e}")
            continue

    print(f"⚠️ All extraction methods failed for {pdf_path}")
    return None

def process_papers(paper_urls, output_dir="/kaggle/working/papers"):
    """Download, validate and extract text for every URL, once per URL.

    A URL is only remembered as processed after its text was extracted, so
    a repeated URL is skipped only when its first occurrence succeeded.

    Returns:
        A list with one dict per successfully processed paper, holding the
        keys ``id``, ``title``, ``source_url``, ``local_path``,
        ``text_length``, ``text_preview`` and ``full_text``.
    """
    seen = set()
    records = []

    for source in paper_urls:
        digest = get_url_hash(source)

        # Already handled successfully earlier in this run
        if digest in seen:
            print(f"⏩ Skipping duplicate: {source}")
            continue

        # Fetch the file and make sure it really is a PDF
        local_file = download_paper(source, output_dir)
        if not (local_file and validate_pdf(local_file)):
            continue

        # Pull the text out; nothing to store if that fails
        body = extract_text(local_file)
        if not body:
            continue

        # The preview is capped at 1000 characters plus an ellipsis marker
        if len(body) > 1000:
            preview = body[:1000] + "..."
        else:
            preview = body

        record = {
            "id": digest,
            "title": os.path.basename(local_file),
            "source_url": source,
            "local_path": local_file,
            "text_length": len(body),
            "text_preview": preview,
            "full_text": body
        }
        records.append(record)
        seen.add(digest)
        print(f"✔ Processed: {source}")

    return records

# --- 2. NLP Processing Functions ---
def setup_spacy():
    """Load the ``en_core_web_md`` spaCy pipeline, downloading it if missing.

    The download goes through spaCy's own Python entry point rather than an
    IPython ``!python`` shell escape, so the function also works outside a
    notebook and installs into the interpreter that is actually running.

    Returns:
        The loaded spaCy pipeline object.

    Raises:
        OSError: If the model still cannot be loaded after the download.
    """
    model_name = "en_core_web_md"
    try:
        return spacy.load(model_name)
    except OSError:
        print("Downloading spaCy model...")
        # Local import: only needed on the (rare) download path.
        from spacy.cli import download
        download(model_name)
        return spacy.load(model_name)

def process_with_spacy(papers, nlp, max_chars=1000000):
    """Attach a processed document to every paper dict, in place.

    The text is taken from ``paper["full_text"]`` if present, otherwise from
    ``paper["text"]``. The result of ``nlp(text[:max_chars])`` is stored under
    ``paper["doc"]``. Papers whose processing raises, or that carry no text at
    all, end up with ``paper["doc"] = None`` so that every paper has a
    ``"doc"`` key afterwards (an already present ``"doc"`` on a text-less
    paper is left untouched).

    Args:
        papers: List of paper dicts; mutated in place.
        nlp: Callable that turns a string into a document object.
        max_chars: Number of leading characters handed to ``nlp``.

    Returns:
        The same ``papers`` list.
    """
    for paper in papers:
        if "full_text" in paper:
            text = paper["full_text"]
        elif "text" in paper:
            text = paper["text"]
        else:
            print(f"Warning: Paper {paper.get('title', 'unknown')} has no text content")
            # Keep the record shape uniform with the failure branch below.
            paper.setdefault("doc", None)
            continue

        try:
            paper["doc"] = nlp(text[:max_chars])  # Limit input size
        except Exception as e:
            print(f"Error processing paper {paper.get('title', 'unknown')}: {str(e)}")
            paper["doc"] = None
    return papers

def create_dataframe(papers):
    """Build a DataFrame from paper dicts and keep only rows with a document.

    Args:
        papers: List of paper dicts, normally carrying a ``"doc"`` entry.

    Returns:
        A new DataFrame containing the rows whose ``"doc"`` value is not
        null. When ``papers`` is empty or no record has a ``"doc"`` key at
        all, an empty DataFrame is returned instead of raising ``KeyError``.
    """
    df = pd.DataFrame(papers)
    if "doc" not in df.columns:
        # No record was processed, so there is nothing valid to keep.
        return df.iloc[0:0].copy()
    return df[df["doc"].notna()].copy()

def setup_faiss_index(df):
    """Build a flat L2 FAISS index over the document vectors in ``df["doc"]``.

    Args:
        df: DataFrame with a ``"doc"`` column whose non-``None`` entries
            expose a ``.vector`` attribute.

    Returns:
        A ``faiss.IndexFlatL2`` holding one vector per non-``None`` document,
        or ``None`` when ``df`` is empty or contains no usable document.
    """
    if len(df) == 0:
        print("Warning: No papers processed successfully")
        return None

    vectors = [doc.vector for doc in df["doc"] if doc is not None]
    if len(vectors) == 0:
        print("Warning: No valid vectors found for similarity search")
        return None

    # FAISS only accepts C-contiguous float32 matrices; make that explicit
    # instead of relying on the dtype the vectors happen to arrive in.
    matrix = np.ascontiguousarray(np.asarray(vectors), dtype=np.float32)

    index = faiss.IndexFlatL2(matrix.shape[1])
    index.add(matrix)
    return index

# --- 3. Analysis Functions ---
def keyword_search(df, keyword):
    """Collect, per paper, the sentences that contain ``keyword``.

    Matching is a case-insensitive substring test on each sentence of the
    row's ``"doc"``. Rows whose ``"doc"`` is not a spaCy ``Doc`` are ignored.

    Returns:
        A DataFrame with one row per paper that has at least one matching
        sentence, with the columns ``title``, ``source_url`` and ``matches``.
    """
    hits = []
    for _, paper in df.iterrows():
        document = paper["doc"]
        if isinstance(document, spacy.tokens.Doc):
            found = []
            for sentence in document.sents:
                if keyword.lower() in sentence.text.lower():
                    found.append(sentence.text)
            if found:
                entry = {
                    "title": paper.get("title", "Untitled"),
                    "source_url": paper.get("source_url", ""),
                    "matches": found
                }
                hits.append(entry)
    return pd.DataFrame(hits)

def semantic_search(query, df, index, nlp, top_k=3):
    """Return the rows of ``df`` whose vectors are nearest to ``query``.

    Args:
        query: Free-text query string.
        df: DataFrame whose row order matches the order in which vectors
            were added to ``index``.
        index: Search index exposing ``search(vectors, k)`` (and, for FAISS
            indexes, ``ntotal``); ``None`` disables the search.
        nlp: Callable turning ``query`` into an object with a ``.vector``.
        top_k: Maximum number of rows to return.

    Returns:
        A copy of the matching rows, nearest first. An empty DataFrame is
        returned when ``df`` is empty, ``index`` is ``None`` or ``top_k`` is
        not positive.
    """
    if len(df) == 0 or index is None:
        return pd.DataFrame()

    # Never ask for more neighbours than exist: FAISS pads the result with -1,
    # and df.iloc[-1] would silently return the last row as a bogus hit.
    k = min(top_k, getattr(index, "ntotal", top_k), len(df))
    if k <= 0:
        return df.iloc[0:0].copy()

    query_vector = np.asarray([nlp(query).vector], dtype=np.float32)
    _distances, indices = index.search(query_vector, k)

    # Defensive filter for any remaining padding / out-of-range ids.
    positions = [int(i) for i in indices[0] if 0 <= int(i) < len(df)]
    return df.iloc[positions].copy()

def extract_entities(doc):
    """List the named entities of ``doc`` as ``(text, label)`` tuples.

    Anything that is not a spaCy ``Doc`` yields an empty list.
    """
    entities = []
    if isinstance(doc, spacy.tokens.Doc):
        for entity in doc.ents:
            entities.append((entity.text, entity.label_))
    return entities

def summarize_text(text, sentences_count=3):
    """Summarize ``text`` with sumy's LSA summarizer.

    Returns the selected sentences joined by single spaces. If anything
    raises along the way, the error is printed and the placeholder string
    "Summary unavailable" is returned instead.
    """
    try:
        english_parser = PlaintextParser.from_string(text, Tokenizer("english"))
        lsa = LsaSummarizer()
        chosen_sentences = lsa(english_parser.document, sentences_count)
        return " ".join(map(str, chosen_sentences))
    except Exception as e:
        print(f"Summarization error: {str(e)}")
        return "Summary unavailable"

In [4]:
# --- 4. Main Execution ---
if __name__ == "__main__":
    # End-to-end run: download the listed PDFs, process them with spaCy,
    # build a FAISS index, print a few example analyses and save the results.

    # Source documents. Each entry is fetched by process_papers(), which names
    # the local file after the MD5 hash of the URL.
    paper_urls = [
        "https://arxiv.org/pdf/2307.12874",
        "https://arxiv.org/pdf/2303.12940",
        "https://arxiv.org/pdf/1802.04351",
        "https://arxiv.org/pdf/2306.08168",
        "https://arxiv.org/pdf/2503.15964",
        "https://www.jetir.org/papers/JETIR2405D82.pdf",
        "https://www.cs.ucf.edu/~czou/research/subWallet-Blockchain-2019.pdf",
        "https://www.cs.ucf.edu/~czou/research/Hossein-TrustCom-2020.pdf",
        "https://www.cs.ucf.edu/~czou/research/HosseinDissertation-2020.pdf",
        "https://dl.gi.de/server/api/core/bitstreams/aaa640a1-f8dd-4514-ad72-b809932072cc/content",
        "https://eprint.iacr.org/2023/062.pdf",
        "https://eprint.iacr.org/2022/075.pdf",    
        "https://eprint.iacr.org/2023/1234.pdf",
        "https://eprint.iacr.org/2020/300.pdf",
        "https://eprint.iacr.org/2023/312.pdf",
        "https://policyreview.info/pdf/policyreview-2016-3-427.pdf",
        "https://eprint.iacr.org/2016/013.pdf",
        "https://arxiv.org/pdf/1906.00245",
        "https://escholarship.org/content/qt7fh678d6/qt7fh678d6.pdf?t=pn651y",
        "https://re.public.polimi.it/bitstream/11311/1056221/6/11311-1056221%20Giudici.pdf",
        "https://research-api.cbs.dk/ws/files/44436178/ole_bjerg_how_is_bitcoin_money_postprint.pdf",
        "https://www.bis.org/fsi/publ/insights49.pdf",
        "https://www.scirp.org/pdf/ojbm_1534496.pdf",
        "https://www.bis.org/publ/work1066.pdf",
        "http://khcnbinhduong.gov.vn/ImageUpload/file/TTTK%20KCN/2019/Nguon%20tin%20KHCN/Blockchain_A3.pdf",
        "https://e-space.mmu.ac.uk/627269/1/Manuscript_Final%20JCLP.pdf",
        "https://pdfs.semanticscholar.org/9900/c9c91f9f78fa0adb6915855084396654363c.pdf?_gl=1*7q1z9h*_gcl_au*MTkxMDg1NzA4NC4xNzQ4MDIxMDA4*_ga*Mjc1MDg5MDkuMTc0ODAyMTAwOA..*_ga_H7P4ZT52H5*czE3NDgwMjEwMDckbzEkZzEkdDE3NDgwMjExNzkkajE1JGwwJGgwJGR1YWNJOGg3VW43bWFscGZjZ056LU5TM0lXc0Jtc0drMW93",
        "https://www.newyorkfed.org/medialibrary/media/research/epr/2024/EPR_2024_digital-assets_azar.pdf",
        "https://journals.law.harvard.edu/hblr/wp-content/uploads/sites/87/2025/03/04_HLB_15_1_Noked171-216.pdf",
        "https://www.stern.nyu.edu/sites/default/files/2024-07/Glucksman_Sak_2024.pdf",
        "https://www.tigta.gov/sites/default/files/reports/2024-07/2024300030fr_0.pdf",
        "https://www.fsb.org/uploads/Crypto-Council-for-Innovation.pdf",
        # NOTE(review): the next URL repeats an earlier entry in this list;
        # process_papers() skips it once the first copy has been processed.
        "https://www.cs.ucf.edu/~czou/research/HosseinDissertation-2020.pdf",
        "https://ndbf.nebraska.gov/sites/default/files/industries/Digital%20Asset%20Depository%20Nebraska%20Custody%20and%20Fiduciary%20Services%20Examination%20Manual.pdf",
        "https://www.swlegal.com/media/filer_public/2d/f7/2df70b84-cb3c-4578-9943-8b3ea024abf9/sw_nl_january_2024_english.pdf",
        "https://www.willkie.com/-/media/files/publications/2024/12/law360---sec-custody-rule-creates-crypto-compliance-conundrum.pdf",
        "https://www.henrystewartpublications.com/sites/default/files/Opportunities%20in%20digital%20assets%20and%20digital%20custody-Tracking%20the%20modernisation%20of%20standard%20custody%20offering%20-%20Ignatowicz%20%26%20Taudes%20JSOC%2015-3.pdf",
        "https://www.gdf.io/wp-content/uploads/2019/02/GDF-Crypto-Asset-Safekeeping_20-April-2019-2-cust-providers-additions-1-2.pdf",
        "https://www.occ.gov/topics/charters-and-licensing/interpretations-and-actions/2020/int1170.pdf",
        "https://www.gemini.com/static/documents/guide-to-crypto-custody.pdf",
        "https://orbilu.uni.lu/bitstream/10993/62083/1/ZetzscheSinnigNikolakopoulou_Crypto%20custody_CMLJ%202024.pdf",
        "https://www.esrb.europa.eu/pub/pdf/reports/esrb.cryptoassetsanddecentralisedfinance202305~9792140acd.en.pdf",
        "https://repository.uel.ac.uk/download/df676586f4e9f8a89df529a36841d83d4750539805189a8951032ee4c2f0c16c/99798/challenges-and-approaches-to-regulating-decentralized-finance.pdf",
        "https://repository.uel.ac.uk/download/ca8bad2f5fab17596c44927643b4da1473ef7ef79862fe3ca05ea9251bd4db8b/1599957/Financial%20Crime%20update%20%282020%29.pdf",
        "https://www.iacpcybercenter.org/wp-content/uploads/2018/03/Bitcoin.pdf",
        "https://www.ussc.gov/sites/default/files/pdf/training/Podcasts/SPT_Emerging-Tech-Terms.pdf",
        "https://www.ussc.gov/sites/default/files/pdf/training/annual-national-training-seminar/2018-materials/emerging-tech_glossary-crypto.pdf",
        "https://www.ussc.gov/sites/default/files/pdf/training/annual-national-training-seminar/2018-materials/emerging-tech_glossary-phishing.pdf",
        "https://www.ussc.gov/sites/default/files/pdf/training/annual-national-training-seminar/2018/Emerging_Tech_Bitcoin_Crypto.pdf",
        "https://www.ussc.gov/sites/default/files/pdf/training/annual-national-training-seminar/2019/emerging-tech_white-paper.pdf",
        "https://openaccess.uoc.edu/bitstream/10609/151551/1/Rahmanikivi_cbt22_empirical.pdf",
        "https://ics.uci.edu/~dabrowsa/dabrowski-defi21-hwwallet.pdf",
        "https://fc19.ifca.ai/preproceedings/93-preproceedings.pdf",
        "https://www.jkroll.com/papers/bitcoin_threshold_signatures.pdf",
        "https://corporates.db.com/files/documents/publications/db-polygo-digital-id-wp-42pp-web-secured.pdf",
        "https://www.napier.ac.uk/-/media/worktribe/output-2839021/smart-contract-attacks-and-protections.ashx",
        "https://www.cyprusbarassociation.org/images/6._Crypto_Wallets.pdf",
        "https://computerscience.unicam.it/marcantoni/tesi/Ethereum%20Smart%20Contracts%20Optimization.pdf",
        "https://cspecc.utsa.edu/publications/files/Refereed_Papers/2020_Choo_BCPPA-blockchain-cond-priv-auth-prot.pdf",
        "https://www.ekonomika.org.rs/sr/PDF/ekonomika/2019/clanci19-3/7.pdf",
        "https://assets.cureusjournals.com/artifacts/upload/review_article/pdf/1099/20250319-214523-194a3z.pdf"
    ]
    
    # Step 1: Download and process papers
    # (files already present in the papers directory are reused, not re-fetched)
    print("Downloading and processing papers...")
    papers = process_papers(paper_urls)
    
    # Step 2: Set up NLP processing
    # Each paper dict gains a "doc" entry; create_dataframe() keeps only rows
    # whose "doc" is not null.
    print("\nSetting up NLP pipeline...")
    nlp = setup_spacy()
    papers = process_with_spacy(papers, nlp)
    df = create_dataframe(papers)
    
    # Step 3: Set up semantic search
    # index is None when there was nothing to index.
    print("\nSetting up semantic search index...")
    index = setup_faiss_index(df)
    
    # Step 4: Example analyses
    # NOTE(review): the example queries below ('transformer', 'neural networks')
    # look unrelated to the topics suggested by the URL list — confirm they are
    # the intended demo queries.
    if len(df) > 0:
        print("\n=== Example Analyses ===")
        
        # Keyword search
        print("\nKeyword search for 'transformer':")
        print(keyword_search(df, "transformer").head())
        
        # Semantic search
        if index is not None:
            print("\nSemantic search for 'neural networks':")
            print(semantic_search("neural networks", df, index, nlp))
        
        # Entity extraction
        print("\nEntities in first paper:")
        print(extract_entities(df.iloc[0]["doc"]))
        
        # Summarization
        print("\nSummary of first paper:")
        summary_text = df.iloc[0].get("full_text", "")
        print(summarize_text(summary_text))
    
    # Save results
    # The frame is written as-is, i.e. including the "doc" and "full_text" columns.
    print("\nSaving results...")
    df.to_csv("/kaggle/working/papers_database.csv", index=False)
    # The papers* glob matches every entry in /kaggle/working whose name starts
    # with "papers": the papers/ directory, and presumably also
    # papers_database.csv and a papers_database.zip left over from an earlier
    # run — verify that this is intended.
    !zip -r /kaggle/working/papers_database.zip /kaggle/working/papers*
    print("\nProcessing complete!")

Downloading and processing papers...
📁 Already exists: /kaggle/working/papers/a009a494d6d220b47368efac02c33672.pdf
✔ Processed: https://arxiv.org/pdf/2307.12874
📁 Already exists: /kaggle/working/papers/e66996b3eb8367ad23957e2a0de9b57a.pdf
✔ Processed: https://arxiv.org/pdf/2303.12940
📁 Already exists: /kaggle/working/papers/4bab294056730051a7655f216addd028.pdf
✔ Processed: https://arxiv.org/pdf/1802.04351
📁 Already exists: /kaggle/working/papers/cb50a62ff6d1ac26b61d93c23b4043fd.pdf
✔ Processed: https://arxiv.org/pdf/2306.08168
📁 Already exists: /kaggle/working/papers/bf3b803f75ef8f205b017db3d01e183c.pdf
✔ Processed: https://arxiv.org/pdf/2503.15964
📁 Already exists: /kaggle/working/papers/3a3647f72884313ec11037b1d2f19999.pdf
✔ Processed: https://www.jetir.org/papers/JETIR2405D82.pdf
📁 Already exists: /kaggle/working/papers/457765beccd59ec2b696dc730490b0e9.pdf
✔ Processed: https://www.cs.ucf.edu/~czou/research/subWallet-Blockchain-2019.pdf
📁 Already exists: /kaggle/working/papers/3aaec7

In [5]:
# Ensure all outputs are saved.
# Relies on `df` created by the main execution cell above; this cell fails on a
# fresh kernel unless that cell has been run first.
# Writes a second CSV copy under a different file name, then refreshes the archive.
df.to_csv("/kaggle/working/database_0522.csv", index=False)
# Same command as at the end of the main cell; it updates the existing zip.
!zip -r /kaggle/working/papers_database.zip /kaggle/working/papers*

updating: kaggle/working/papers/ (stored 0%)
updating: kaggle/working/papers/93fc53e668e5c645db6a68924c36d0cb.pdf (deflated 1%)
updating: kaggle/working/papers/300c4cef5b6de6061dedca96740ebc1c.pdf (deflated 6%)
updating: kaggle/working/papers/f38a1479289b2e58ef8f77b349f820db.pdf (deflated 6%)
updating: kaggle/working/papers/54f2fb48bad263609528332efb956526.pdf (deflated 7%)
updating: kaggle/working/papers/cb50a62ff6d1ac26b61d93c23b4043fd.pdf (deflated 24%)
updating: kaggle/working/papers/0765e228294b673983f63770c17b760e.pdf (deflated 23%)
updating: kaggle/working/papers/52a0d62741c0dd9e67bb1f90e24bafd9.pdf (deflated 5%)
updating: kaggle/working/papers/e66996b3eb8367ad23957e2a0de9b57a.pdf (deflated 9%)
updating: kaggle/working/papers/bf3b803f75ef8f205b017db3d01e183c.pdf (deflated 31%)
updating: kaggle/working/papers/4d772f6071e0223d49470c38975474ec.pdf (deflated 0%)
updating: kaggle/working/papers/457765beccd59ec2b696dc730490b0e9.pdf (deflated 12%)
updating: kaggle/working/papers/74e86f