In [1]:
!pip install spacy PyPDF2 requests pandas faiss-cpu scikit-learn python-docx sumy
!pip install pdfplumber
!python -m spacy download en_core_web_md  # Medium-sized NLP model

Collecting pdfplumber
  Downloading pdfplumber-0.11.6-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20250327 (from pdfplumber)
  Downloading pdfminer_six-20250327-py3-none-any.whl.metadata (4.1 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.6-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.2/60.2 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20250327-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m50.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading 

In [2]:
import requests
import PyPDF2
import pdfplumber
from urllib.parse import urlparse
import io
import spacy
import pandas as pd
import faiss
import numpy as np
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
import os
from docx import Document

In [27]:
# --- 1. Enhanced PDF Downloader with Error Handling ---
def download_paper(url, save_path):
    """Download `url` to `save_path`; return True on success, False on failure.

    The body is streamed into a temporary ``<save_path>.part`` file and only
    moved onto `save_path` once the transfer has finished, so an interrupted
    download never leaves a truncated file at the destination (and never
    clobbers a good file left there by an earlier run).

    Args:
        url: Address of the document to fetch.
        save_path: Destination file path.

    Returns:
        bool: True when the whole body was written to `save_path`; False when
        any error occurred (the error is printed, not raised).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    partial_path = f"{save_path}.part"
    try:
        # The context manager closes the streamed response (releasing its
        # connection) even when an error interrupts the transfer.
        with requests.get(url, headers=headers, stream=True, timeout=10) as response:
            response.raise_for_status()  # Raise HTTP errors

            # Check if content is PDF (by URL or headers); this only warns
            content_type = response.headers.get('Content-Type', '')
            if 'pdf' not in content_type.lower() and not url.lower().endswith('.pdf'):
                print(f"Warning: URL may not be a PDF (Content-Type: {content_type}): {url}")

            # Stream download to avoid memory issues
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # Filter out keep-alive chunks
                        f.write(chunk)

        # Publish the file only after the full body has been written
        os.replace(partial_path, save_path)
        print(f"Downloaded: {save_path}")
        return True
    except Exception as e:
        print(f"Failed to download {url}: {str(e)}")
        # Best-effort cleanup of the incomplete temporary file; it may not
        # exist if the failure happened before any byte was written.
        try:
            os.remove(partial_path)
        except OSError:
            pass
        return False

# --- 2. PDF Validation & Text Extraction ---
def is_valid_pdf(pdf_path):
    """Tell whether `pdf_path` starts with the PDF signature and opens in PyPDF2.

    Returns:
        bool: True when the first four bytes are ``%PDF`` and
        ``PyPDF2.PdfReader`` accepts the file. False when the signature does
        not match, or when any exception is raised (the exception is printed).
    """
    try:
        with open(pdf_path, 'rb') as handle:
            signature = handle.read(4)
            if signature == b'%PDF':
                # Rewind so the parser reads the file from its first byte
                handle.seek(0)
                PyPDF2.PdfReader(handle)  # Full validation
                verdict = True
            else:
                verdict = False
        return verdict
    except Exception as err:
        print(f"Invalid PDF: {pdf_path} - Error: {str(err)}")
        return False

def extract_text(pdf_path):
    """Extract a PDF's text, trying pdfplumber first and PyPDF2 second.

    For each extractor the per-page texts are joined with single spaces; a
    page that yields no text contributes an empty string. An extractor that
    raises has its error printed and is skipped.

    Returns:
        The text from the first extractor whose output is not blank, or None
        when neither extractor produces any text.
    """
    # First choice: pdfplumber (better for complex layouts)
    try:
        with pdfplumber.open(pdf_path) as plumbed:
            pieces = []
            for page in plumbed.pages:
                pieces.append(page.extract_text() or "")  # Handle None
            plumbed_text = " ".join(pieces)
            if plumbed_text.strip():
                return plumbed_text
    except Exception as err:
        print(f"pdfplumber failed: {str(err)}")

    # Second choice: PyPDF2
    try:
        with open(pdf_path, 'rb') as handle:
            pieces = []
            for page in PyPDF2.PdfReader(handle).pages:
                pieces.append(page.extract_text() or "")
            pypdf_text = " ".join(pieces)
            if pypdf_text.strip():
                return pypdf_text
    except Exception as err:
        print(f"PyPDF2 failed: {str(err)}")

    # Neither extractor produced usable text
    return None

# --- 3. Main Processing ---
# Source documents to fetch, in the order that determines each paper's index
paper_urls = [
    "https://www.cs.ucf.edu/~czou/research/HosseinDissertation-2020.pdf",
    "https://arxiv.org/pdf/2307.12874",
    "https://www.jetir.org/papers/JETIR2405D82.pdf",
    "https://www.cs.ucf.edu/~czou/research/Hossein-TrustCom-2020.pdf",
    "https://arxiv.org/pdf/1802.04351",
    "https://eprint.iacr.org/2022/075.pdf",
    "https://assets.cureusjournals.com/artifacts/upload/review_article/pdf/1099/20250319-214523-194a3z.pdf",
]

os.makedirs("/kaggle/working/papers", exist_ok=True)
papers = []

for i, url in enumerate(paper_urls):
    pdf_path = f"/kaggle/working/papers/paper_{i}.pdf"

    # Guard 1: the file must both download and pass validation
    if not (download_paper(url, pdf_path) and is_valid_pdf(pdf_path)):
        print(f"Skipping invalid PDF: {url}")
        continue

    # Guard 2: some text must come out of the PDF
    text = extract_text(pdf_path)
    if not text:
        print(f"Text extraction failed for: {pdf_path}")
        continue

    papers.append({
        "title": f"Paper_{i}",
        "source_url": url,
        # Store only the first 5k chars, flagging truncation with "..."
        "text": text if len(text) <= 5000 else text[:5000] + "...",
    })

# --- 4. Results ---
print(f"\nSuccessfully processed {len(papers)}/{len(paper_urls)} papers:")
for paper in papers:
    print(f"- {paper['title']} (Source: {paper['source_url']})")

Downloaded: /kaggle/working/papers/paper_0.pdf
Downloaded: /kaggle/working/papers/paper_1.pdf
Downloaded: /kaggle/working/papers/paper_2.pdf
Downloaded: /kaggle/working/papers/paper_3.pdf
Downloaded: /kaggle/working/papers/paper_4.pdf
Downloaded: /kaggle/working/papers/paper_5.pdf
Downloaded: /kaggle/working/papers/paper_6.pdf

Successfully processed 7/7 papers:
- Paper_0 (Source: https://www.cs.ucf.edu/~czou/research/HosseinDissertation-2020.pdf)
- Paper_1 (Source: https://arxiv.org/pdf/2307.12874)
- Paper_2 (Source: https://www.jetir.org/papers/JETIR2405D82.pdf)
- Paper_3 (Source: https://www.cs.ucf.edu/~czou/research/Hossein-TrustCom-2020.pdf)
- Paper_4 (Source: https://arxiv.org/pdf/1802.04351)
- Paper_5 (Source: https://eprint.iacr.org/2022/075.pdf)
- Paper_6 (Source: https://assets.cureusjournals.com/artifacts/upload/review_article/pdf/1099/20250319-214523-194a3z.pdf)


In [28]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF joined by single spaces (PyPDF2 only).

    A page whose ``extract_text()`` yields None contributes an empty string
    instead of making ``str.join`` raise TypeError; this mirrors the None
    handling in `extract_text`.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str: The concatenated page texts (may be empty).

    Raises:
        Whatever ``open`` or ``PyPDF2.PdfReader`` raise for a missing or
        unparseable file; no error is swallowed here.
    """
    with open(pdf_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        # Extraction stays inside the `with` so it runs while the file is open
        text = " ".join(page.extract_text() or "" for page in reader.pages)
    return text

# Load all papers into a list.
# NOTE: this rebinds `papers`, replacing the truncated records built by the
# download cell with records that hold each paper's full text.
# Files that are missing or fail validation are skipped, so a single URL that
# could not be downloaded does not abort the whole cell with an exception.
papers = []
for i in range(len(paper_urls)):
    pdf_path = f"/kaggle/working/papers/paper_{i}.pdf"
    if not is_valid_pdf(pdf_path):
        print(f"Skipping unreadable PDF: {pdf_path}")
        continue
    text = extract_text_from_pdf(pdf_path)
    papers.append({"title": f"Paper_{i}", "text": text})

In [29]:
# Load the spaCy pipeline fetched by the install cell. Later cells rely on it for
# sentence splitting (doc.sents), vectors (doc.vector) and entities (doc.ents).
nlp = spacy.load("en_core_web_md")  # For word vectors & NER

In [30]:
# Run the spaCy pipeline over each paper's text and keep the result on the
# record itself under the "doc" key (the dicts in `papers` are mutated in place).
for paper in papers:
    paper["doc"] = nlp(paper["text"])  # Store spaCy doc objects

In [31]:
# One row per paper; the columns come from the record keys ("title", "text", "doc").
df = pd.DataFrame(papers)

In [32]:
def keyword_search(df, keyword):
    """Find the sentences of each paper that contain `keyword`, ignoring case.

    Args:
        df: DataFrame with a "title" column and a "doc" column whose values
            expose `.sents`, each sentence having a `.text` attribute.
        keyword: Substring to look for (compared in lower case).

    Returns:
        list[dict]: One ``{"title": ..., "matches": [...]}`` entry per paper
        with at least one matching sentence, in row order. Papers without a
        match are omitted.
    """
    needle = keyword.lower()
    hits = []
    for _, record in df.iterrows():
        found = []
        for sentence in record["doc"].sents:
            sentence_text = sentence.text
            if needle in sentence_text.lower():
                found.append(sentence_text)
        if found:
            hits.append({"title": record["title"], "matches": found})
    return hits

# Example: case-insensitive search for "blockchain"; as the last expression of
# the cell, the list of per-paper matches is shown as the cell output.
keyword_search(df, "blockchain")

[{'title': 'Paper_0',
  'matches': ['IMPROVING  SECURITY  OF CRYPTO  WALLETS   \nIN BLOCKCHAIN  TECHNOLOGIES  \n \n \n \n \nby \n \nHOSSEIN REZAEIGHALEH   \nM.S. University of Central Florida, 201 8 \n \nA dissertation  submitted in partial fulfillment of the requirements  \nfor the degree of Doctor of Philosophy   \nin the Department of Electrical Engineering and Computer Science  \nin the College of Engineering and Computer Science  \nat the University of Central Florida  \nOrlando,  Florida  \n \n \n \n \n \nFall Term  \n2020 \n \n \nMajor Professor: Cliff C. Zou  \n   ii  \n \n \n \n \n \n \n \n \n \n© 20 20 Hossein Rezaeighaleh  \n \n    iii',
   'A big challenge in blockchain and cryptocurrency is securing the private key from \npotential hackers.',
   '..............  15 \n2.2 Blockchain Technology  ................................ ................................ ...........................  ',
   '..................  16 \n2.2.2  Blockchain Mechanics  ..........................

In [33]:
# Convert spaCy vectors to a matrix: one row per paper, taken from each Doc's `.vector`
vectors = np.array([doc.vector for doc in df["doc"]])
# Build a faiss.IndexFlatL2 sized to the vector width and add every paper vector.
# Rows are added in `df` order, so an index position maps to the same row of `df`.
index = faiss.IndexFlatL2(vectors.shape[1])
index.add(vectors)

def semantic_search(query, df, top_k=3):
    """Return the rows of `df` whose indexed vectors are nearest to `query`.

    The query is embedded with the module-level `nlp` pipeline and looked up
    in the module-level FAISS `index`; index positions are used as row
    positions of `df`.

    Args:
        query: Free-text query string.
        df: DataFrame whose row order matches the order vectors were added to
            `index`.
        top_k: Maximum number of rows to return.

    Returns:
        DataFrame: At most `top_k` rows, in the order `index.search` returns
        them. Fewer rows come back when the index holds fewer than `top_k`
        vectors; an empty frame when `top_k` <= 0 or the index is empty.
    """
    # Never request more neighbours than exist: FAISS pads missing results
    # with the label -1, and `df.iloc[-1]` would silently yield the last row.
    k = min(top_k, index.ntotal)
    if k <= 0:
        return df.iloc[0:0]

    query_doc = nlp(query)
    # FAISS expects a 2-D float32 array with one row per query
    query_vector = np.asarray([query_doc.vector], dtype="float32")
    _, indices = index.search(query_vector, k)

    # Defensive: drop any -1 padding that may still be present
    hits = indices[0]
    return df.iloc[hits[hits >= 0]]

# Example: Find papers similar to "privacy in MPC".
# `top_k` is left at its default of 3, so up to three rows are displayed.
semantic_search("privacy in MPC", df)

Unnamed: 0,title,text,doc
6,Paper_6,Review began\n 11/17/2024 \nReview ended\n 03/...,"(Review, began, \n , 11/17/2024, \n, Review, e..."
0,Paper_0,IMPROVING SECURITY OF CRYPTO WALLETS \nIN...,"(IMPROVING, , SECURITY, , OF, CRYPTO, , WAL..."
4,Paper_4,A First Look at the Usability of Bitcoin Key\n...,"(A, First, Look, at, the, Usability, of, Bitco..."


In [34]:
def extract_entities(doc):
    """List the entities of `doc` as ``(text, label)`` tuples.

    Args:
        doc: Object exposing `.ents`, each entity having `.text` and `.label_`.

    Returns:
        list[tuple]: One pair per entity, in the order `doc.ents` yields them.
    """
    pairs = []
    for entity in doc.ents:
        pairs.append((entity.text, entity.label_))
    return pairs

# Example: Extract entities from the first paper (row 0 of `df`).
# The whole list is printed, which is long for a full paper.
entities = extract_entities(df.iloc[0]["doc"])
print("Entities:", entities)

Entities: [('BLOCKCHAIN', 'ORG'), ('HOSSEIN REZAEIGHALEH', 'LAW'), ('M.S. University of Central Florida', 'ORG'), ('201', 'CARDINAL'), ('the Department of Electrical Engineering and Computer Science', 'ORG'), ('the College of Engineering and Computer Science', 'ORG'), ('the University of Central Florida', 'ORG'), ('Orlando', 'GPE'), ('Florida', 'GPE'), ('2020', 'DATE'), ('C. Zou  \n   ii', 'PERSON'), ('20 20', 'QUANTITY'), ('Hossein Rezaeighaleh  \n \n    iii', 'PERSON'), ('Firstly', 'ORDINAL'), ('two', 'CARDINAL'), ('three', 'CARDINAL'), ('one', 'CARDINAL'), ('ACKNOWL EDGMENT S', 'FAC'), ('Cliff Zou', 'PERSON'), ('Ph.D.', 'WORK_OF_ART'), ('CHAPTER 1', 'LAW'), ('1', 'CARDINAL'), ('1.1', 'CARDINAL'), ('1', 'CARDINAL'), ('1.2', 'CARDINAL'), ('2', 'CARDINAL'), ('1.3', 'CARDINAL'), ('2', 'CARDINAL'), ('1.4', 'CARDINAL'), ('3', 'CARDINAL'), ('1.5', 'CARDINAL'), ('Defense -in-Depth Architecture', 'ORG'), ('4', 'CARDINAL'), ('1.6', 'CARDINAL'), ('5', 'CARDINAL'), ('1.7', 'CARDINAL'), ('6', 'C

In [35]:
def summarize_text(text, sentences_count=3):
    """Build a summary of `text` with sumy's `LsaSummarizer`.

    Args:
        text: Plain text to summarize; parsed with the "english" tokenizer.
        sentences_count: Number of sentences requested from the summarizer.

    Returns:
        str: The sentences returned by the summarizer, each converted with
        ``str()`` and joined by single spaces.
    """
    document = PlaintextParser.from_string(text, Tokenizer("english")).document
    lsa = LsaSummarizer()
    picked = lsa(document, sentences_count)
    return " ".join(map(str, picked))

# Example: Summarize the first paper (default of 3 sentences).
# NOTE(review): the recorded output below consists of bibliography entries, so the
# input probably needs its references section stripped before summarizing — confirm.
summary = summarize_text(df.iloc[0]["text"])
print("Summary:", summary)

Summary: [2] H. Rezaeighaleh, R. Laurens, C. C. Zou, “Secure smart card signing with time -based digital signature”, in Proceedings of the 2018 International Conference on Computing, Networking and Communic ations, ACM, pp. [56] N. De, "Troubled Canadian crypto exchange Quadriga CX owes its customers $190 million and cannot access most of the funds, according to a court filing obtained by CoinDesk," 1 Feb 2019. 104 [57] P. Rakdej, N. Janpitak, M. Warasart an d W. Lilakiatsakun, "Coin Recovery from Inaccessible Cryptocurrency Wallet Using Unspent Transaction Output," in 2019 4th International Conference on Information Technology (InCIT) , Bangkok, Thailand, 2019.


In [36]:
# Persist the table as CSV without the index column.
# NOTE(review): `df` still carries the "doc" column of spaCy objects, which CSV can
# only hold in string form — consider dropping that column before export; confirm.
df.to_csv("/kaggle/working/papers_database.csv", index=False)

In [37]:
# Recursively archive everything matching /kaggle/working/papers* (the PDF folder and
# the CSV export, per the listing below) into papers_database.zip.
# NOTE(review): the listing includes paper_7.pdf although only seven URLs
# (paper_0..paper_6) are configured — likely a leftover from an earlier run; confirm.
!zip -r papers_database.zip /kaggle/working/papers*

updating: kaggle/working/papers/ (stored 0%)
updating: kaggle/working/papers/paper_4.pdf (deflated 9%)
updating: kaggle/working/papers/paper_0.pdf (deflated 14%)
updating: kaggle/working/papers/paper_2.pdf (deflated 3%)
updating: kaggle/working/papers/paper_1.pdf (deflated 56%)
updating: kaggle/working/papers_database.csv (deflated 68%)
updating: kaggle/working/papers/paper_5.pdf (deflated 9%)
updating: kaggle/working/papers/paper_3.pdf (deflated 7%)
updating: kaggle/working/papers/paper_7.pdf (deflated 23%)
updating: kaggle/working/papers/paper_6.pdf (deflated 23%)
