In [1]:
!pip install spacy PyPDF2 requests pandas faiss-cpu scikit-learn python-docx sumy
!python -m spacy download en_core_web_md  # Medium-sized NLP model

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting sumy
  Downloading sumy-0.11.0-py2.py3-none-any.whl.metadata (7.5 kB)
Collecting docopt<0.7,>=0.6.1 (from sumy)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting breadability>=0.1.20 (from sumy)
  Downloading breadability-0.1.20.tar.gz (32 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pycountry>=18.2.23 (from sumy)
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hDownloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28

In [2]:
import requests
import PyPDF2
import io
import spacy
import pandas as pd
import faiss
import numpy as np
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
import os
from docx import Document

In [5]:
def download_paper(url, save_path, timeout=30):
    """Download `url` and write the response body to `save_path`.

    Args:
        url: Address of the file to fetch.
        save_path: Destination path; the file is written in binary mode.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout `requests.get` can block indefinitely on a stalled host.

    Returns:
        True if the file was written, False if the request failed or the
        server answered with a status other than 200.

    Network errors are reported with a printed message rather than raised,
    so one unreachable URL does not abort a loop over several downloads.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to download: {url} ({exc})")
        return False

    if response.status_code != 200:
        print(f"Failed to download: {url}")
        return False

    with open(save_path, 'wb') as f:
        f.write(response.content)
    print(f"Downloaded: {save_path}")
    return True

# Source PDFs; each is saved as paper_<position in this list>.pdf
paper_urls = [
    "https://www.cs.ucf.edu/~czou/research/HosseinDissertation-2020.pdf",
    "https://www.jetir.org/papers/JETIR2405D82.pdf",
    "https://arxiv.org/pdf/2307.12874",
]

os.makedirs("/kaggle/working/papers", exist_ok=True)

# Pair every URL with its numbered destination, then fetch them in order
target_paths = [f"/kaggle/working/papers/paper_{i}.pdf" for i in range(len(paper_urls))]
for source_url, target_path in zip(paper_urls, target_paths):
    download_paper(source_url, target_path)

Downloaded: /kaggle/working/papers/paper_0.pdf
Downloaded: /kaggle/working/papers/paper_1.pdf
Downloaded: /kaggle/working/papers/paper_2.pdf


In [6]:
def extract_text_from_pdf(pdf_path):
    """Return the extracted text of every page of the PDF at `pdf_path`.

    Pages are read in document order and their texts are joined with a
    single space.
    """
    page_texts = []
    with open(pdf_path, 'rb') as pdf_file:
        for page in PyPDF2.PdfReader(pdf_file).pages:
            page_texts.append(page.extract_text())
    return " ".join(page_texts)

# Load every downloaded paper into a list of {"title", "text"} records.
# download_paper only prints a message when a download fails, so a file may be
# absent here; skip it instead of aborting the whole load with FileNotFoundError.
papers = []
for i in range(len(paper_urls)):
    pdf_path = f"/kaggle/working/papers/paper_{i}.pdf"
    if not os.path.isfile(pdf_path):
        print(f"Skipping missing file: {pdf_path}")
        continue
    text = extract_text_from_pdf(pdf_path)
    papers.append({"title": f"Paper_{i}", "text": text})

In [7]:
# Load the spaCy pipeline once; the same `nlp` object parses the papers and the search queries
nlp = spacy.load("en_core_web_md")  # For word vectors & NER

In [8]:
# Parse each paper's text with spaCy and keep the resulting Doc on its record
for record in papers:
    record.update(doc=nlp(record["text"]))

In [9]:
# One row per paper; the columns are the record keys: title, text, doc
df = pd.DataFrame(papers)

In [10]:
def keyword_search(df, keyword):
    """Find the sentences of each paper that contain `keyword`, ignoring case.

    Args:
        df: DataFrame with a "title" column and a "doc" column whose values
            expose `.sents`, an iterable of sentences with a `.text` attribute.
        keyword: Substring to look for.

    Returns:
        A list of {"title": ..., "matches": [...]} dicts, one per row that has
        at least one matching sentence, in row order. An empty or None
        keyword yields an empty list.
    """
    if not keyword:
        # "" is a substring of every sentence, so it would "match" the whole
        # corpus; treat a missing keyword as "nothing to search for".
        return []

    needle = keyword.lower()  # lower-case once instead of once per sentence
    results = []
    for _, row in df.iterrows():
        matches = [sent.text for sent in row["doc"].sents if needle in sent.text.lower()]
        if matches:
            results.append({"title": row["title"], "matches": matches})
    return results

# Example: case-insensitive search for "blockchain"; the cell displays the returned list
keyword_search(df, "blockchain")

[{'title': 'Paper_0',
  'matches': ['IMPROVING  SECURITY  OF CRYPTO  WALLETS   \nIN BLOCKCHAIN  TECHNOLOGIES  \n \n \n \n \nby \n \nHOSSEIN REZAEIGHALEH   \nM.S. University of Central Florida, 201 8 \n \nA dissertation  submitted in partial fulfillment of the requirements  \nfor the degree of Doctor of Philosophy   \nin the Department of Electrical Engineering and Computer Science  \nin the College of Engineering and Computer Science  \nat the University of Central Florida  \nOrlando,  Florida  \n \n \n \n \n \nFall Term  \n2020 \n \n \nMajor Professor: Cliff C. Zou  \n   ii  \n \n \n \n \n \n \n \n \n \n© 20 20 Hossein Rezaeighaleh  \n \n    iii',
   'A big challenge in blockchain and cryptocurrency is securing the private key from \npotential hackers.',
   '..............  15 \n2.2 Blockchain Technology  ................................ ................................ ...........................  ',
   '..................  16 \n2.2.2  Blockchain Mechanics  ..........................

In [11]:
# Collect one spaCy document vector per paper and stack them into a matrix
doc_vectors = [doc.vector for doc in df["doc"]]
vectors = np.array(doc_vectors)

# Flat L2 index sized to the vector dimension, filled with all paper vectors
vector_dim = vectors.shape[1]
index = faiss.IndexFlatL2(vector_dim)
index.add(vectors)

def semantic_search(query, df, top_k=3):
    """Return the rows of `df` whose document vectors are closest to `query`.

    Uses the module-level `nlp` pipeline to embed the query and the
    module-level FAISS `index` to find neighbours. Assumes `index` was built
    from the rows of `df` in order, so FAISS labels are row positions.

    Args:
        query: Free-text query string.
        df: DataFrame whose rows correspond to the indexed vectors.
        top_k: Maximum number of rows to return.

    Returns:
        A DataFrame with at most `top_k` rows, nearest first. Empty when the
        index holds no vectors or `top_k` is not positive.
    """
    # Never ask for more neighbours than exist: FAISS pads the result with
    # label -1, and df.iloc[-1] would silently return the last row instead.
    k = min(top_k, index.ntotal)
    if k <= 0:
        return df.iloc[[]]

    query_vector = np.asarray([nlp(query).vector], dtype="float32")
    _, indices = index.search(query_vector, k)
    hits = [int(i) for i in indices[0] if i >= 0]  # drop any padding labels
    return df.iloc[hits]

# Example: Find papers similar to "privacy in MPC" (top_k is left at its default of 3)
semantic_search("privacy in MPC", df)

Unnamed: 0,title,text,doc
0,Paper_0,IMPROVING SECURITY OF CRYPTO WALLETS \nIN...,"(IMPROVING, , SECURITY, , OF, CRYPTO, , WAL..."
1,Paper_1,"© 2024 JETIR May 2024, Volume 11, Issue 5 ...","(©, 2024, JETIR, May, 2024, ,, Volume, 11, ,, ..."
2,Paper_2,"SoK: Design, Vulnerabilities, and Security Mea...","(SoK, :, Design, ,, Vulnerabilities, ,, and, S..."


In [12]:
def extract_entities(doc):
    """Return a (text, label) tuple for every entity in `doc.ents`, in order."""
    pairs = []
    for entity in doc.ents:
        pairs.append((entity.text, entity.label_))
    return pairs

# Example: Extract entities from the first paper (row 0 of df) and print the full list
entities = extract_entities(df.iloc[0]["doc"])
print("Entities:", entities)

Entities: [('BLOCKCHAIN', 'ORG'), ('HOSSEIN REZAEIGHALEH', 'LAW'), ('M.S. University of Central Florida', 'ORG'), ('201', 'CARDINAL'), ('the Department of Electrical Engineering and Computer Science', 'ORG'), ('the College of Engineering and Computer Science', 'ORG'), ('the University of Central Florida', 'ORG'), ('Orlando', 'GPE'), ('Florida', 'GPE'), ('2020', 'DATE'), ('C. Zou  \n   ii', 'PERSON'), ('20 20', 'QUANTITY'), ('Hossein Rezaeighaleh  \n \n    iii', 'PERSON'), ('Firstly', 'ORDINAL'), ('two', 'CARDINAL'), ('three', 'CARDINAL'), ('one', 'CARDINAL'), ('ACKNOWL EDGMENT S', 'FAC'), ('Cliff Zou', 'PERSON'), ('Ph.D.', 'WORK_OF_ART'), ('CHAPTER 1', 'LAW'), ('1', 'CARDINAL'), ('1.1', 'CARDINAL'), ('1', 'CARDINAL'), ('1.2', 'CARDINAL'), ('2', 'CARDINAL'), ('1.3', 'CARDINAL'), ('2', 'CARDINAL'), ('1.4', 'CARDINAL'), ('3', 'CARDINAL'), ('1.5', 'CARDINAL'), ('Defense -in-Depth Architecture', 'ORG'), ('4', 'CARDINAL'), ('1.6', 'CARDINAL'), ('5', 'CARDINAL'), ('1.7', 'CARDINAL'), ('6', 'C

In [13]:
def summarize_text(text, sentences_count=3):
    """Summarize `text` with sumy's LSA summarizer.

    Args:
        text: Plain text to summarize; tokenized with the "english" tokenizer.
        sentences_count: Number of sentences requested from the summarizer.

    Returns:
        The selected sentences converted to strings and joined by single spaces.
    """
    english_tokenizer = Tokenizer("english")
    parsed = PlaintextParser.from_string(text, english_tokenizer)
    lsa = LsaSummarizer()
    chosen_sentences = lsa(parsed.document, sentences_count)
    return " ".join(map(str, chosen_sentences))

# Example: Summarize the first paper (default of 3 sentences) and print the result
summary = summarize_text(df.iloc[0]["text"])
print("Summary:", summary)

Summary: [2] H. Rezaeighaleh, R. Laurens, C. C. Zou, “Secure smart card signing with time -based digital signature”, in Proceedings of the 2018 International Conference on Computing, Networking and Communic ations, ACM, pp. [56] N. De, "Troubled Canadian crypto exchange Quadriga CX owes its customers $190 million and cannot access most of the funds, according to a court filing obtained by CoinDesk," 1 Feb 2019. 104 [57] P. Rakdej, N. Janpitak, M. Warasart an d W. Lilakiatsakun, "Coin Recovery from Inaccessible Cryptocurrency Wallet Using Unspent Transaction Output," in 2019 4th International Conference on Information Technology (InCIT) , Bangkok, Thailand, 2019.


In [14]:
# The "doc" column holds spaCy Doc objects, which CSV can only store in
# stringified form (not restorable, and redundant with "text"); export the
# plain columns only. errors="ignore" keeps this working if "doc" is absent.
df.drop(columns=["doc"], errors="ignore").to_csv("/kaggle/working/papers_database.csv", index=False)

In [15]:
# Archive everything under /kaggle/working whose name starts with "papers" (the PDF folder and the CSV)
!zip -r papers_database.zip /kaggle/working/papers*

  adding: kaggle/working/papers/ (stored 0%)
  adding: kaggle/working/papers/paper_4.pdf (deflated 56%)
  adding: kaggle/working/papers/paper_0.pdf (deflated 14%)
  adding: kaggle/working/papers/paper_2.pdf (deflated 56%)
  adding: kaggle/working/papers/paper_1.pdf (deflated 3%)
  adding: kaggle/working/papers_database.csv (deflated 69%)
