<a href="https://colab.research.google.com/github/baizhankyzy/academic-knowledge-base/blob/main/knowledge_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
#  Step 1: Replace with your GitHub repo URL
GITHUB_REPO = "https://github.com/baizhankyzy/academic-knowledge-base.git"

#  Step 2: Remove old version if already cloned
!rm -rf academic-knowledge-base

#  Step 3: Clone your GitHub repo
!git clone {GITHUB_REPO}

#  Step 4: Set correct paths for file access
pdf_path = "academic-knowledge-base/data/ai-and-hmi/anticipatory-design.pdf"
output_path = "academic-knowledge-base/output/anticipatory-design.json"


Cloning into 'academic-knowledge-base'...
remote: Enumerating objects: 45, done.[K
remote: Counting objects: 100% (45/45), done.[K
remote: Compressing objects: 100% (41/41), done.[K
remote: Total 45 (delta 12), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (45/45), 2.32 MiB | 7.95 MiB/s, done.
Resolving deltas: 100% (12/12), done.


In [None]:
# 📘 Knowledge Extraction from Academic PDFs to JSON (Google Colab-ready)

# ✅ Section A: Install & Import Packages
!pip install pdfplumber spacy yake -q
import os
import json
import pdfplumber
import spacy
import yake



# ✅ Section C: Load NLP Models
nlp = spacy.load("en_core_web_sm")
kw_extractor = yake.KeywordExtractor()

# ✅ Section D: Extract and Process Metadata from One PDF
def extract_metadata(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        text = "\n".join([page.extract_text() for page in pdf.pages if page.extract_text()])

    doc = nlp(text)
    title = text.split("\n")[0][:200]  # First line as fallback title
    keywords = [kw[0] for kw in kw_extractor.extract_keywords(text)[:10]]
    entities = list(set([ent.text for ent in doc.ents]))

    return {
        "title": title,
        "keywords": keywords,
        "entities": entities,
        "full_text": text,
        "file_name": os.path.basename(pdf_path)
    }

# ✅ Section E: Run on Single File and Save as JSON
data = extract_metadata(pdf_path)

with open(output_path, "w") as f:
    json.dump(data, f, indent=4)

print(f"✅ Extracted metadata from {pdf_path} and saved to {output_path}")




✅ Extracted metadata from academic-knowledge-base/data/ai-and-hmi/anticipatory-design.pdf and saved to academic-knowledge-base/output/anticipatory-design.json


## Upload single PDF

In [None]:
import os
print(os.path.exists("data/ai-and-hmi/anticipatory-design.pdf"))  # Should return True


False


## Extract Raw text with pdf plumber

In [None]:
with pdfplumber.open(pdf_path) as pdf:
    text = "\n".join([page.extract_text() for page in pdf.pages if page.extract_text()])


FileNotFoundError: [Errno 2] No such file or directory: 'data/ai-and-hmi/anticipatory-design.pdf'

## Prerequisites

In [1]:
!pip install langchain
!pip install -U langchain-community
!pip install sentence-transformers
!pip install faiss-gpu
!pip install pypdf

Collecting langchain-community
  Downloading langchain_community-0.3.21-py3-none-any.whl.metadata (2.4 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.9.0-py3-none-any.whl.metadata (3.8 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB

## Data ingestion

In [4]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter

#Load the document
loader = PyPDFLoader("academic-knowledge-base/data/ai-and-hmi/anticipatory-design.pdf")
documents = loader.load()

#Split the document into chunks
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=30, separator="\n")
docs = text_splitter.split_documents(documents=documents)


## Data embedding and Storage with FAISS

In [6]:
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import FAISS

#Load embedding model
embedding_model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {"device": "cuda"}
embeddings = HuggingFaceBgeEmbeddings(
    model_name=embedding_model_name,
    model_kwargs=model_kwargs
)

#Create FAISS vectorstore
vectorstore = FAISS.from_documents(docs, embeddings)

#Save and reload the vector store
vectorstore.save_local("faiss_index")
persisted_vectorstore = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)

#Create a retriever
retriever = persisted_vectorstore.as_retriever()

  embeddings = HuggingFaceBgeEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx