# Web Scraping for PDFs

In [4]:
import os
import requests

def download_single_pdf(url, folder="data"):
    os.makedirs(folder, exist_ok=True)  # create folder if not exists
    filename = os.path.join(folder, os.path.basename(url))  # extract file name from URL
    try:
        r = requests.get(url)
        r.raise_for_status()  # raise error if download fails
        with open(filename, "wb") as f:
            f.write(r.content)
        print(f" Downloaded: {filename}")
    except Exception as e:
        print(f"Failed to download {url}: {e}")

In [3]:
import requests
from bs4 import BeautifulSoup
import os

def scrape_pdfs(base_url, download_folder="data"):
    os.makedirs(download_folder, exist_ok=True)
    res = requests.get(base_url)
    soup = BeautifulSoup(res.text, "html.parser")

    for link in soup.find_all("a"):
        href = link.get("href", "")
        if href.endswith(".pdf"):
            pdf_url = href if href.startswith("http") else base_url + href
            file_name = os.path.join(download_folder, os.path.basename(pdf_url))
            r = requests.get(pdf_url)
            with open(file_name, "wb") as f:
                f.write(r.content)
            print(f"Downloaded: {file_name}")

In [3]:
scrape_pdfs("https://capd.mit.edu/resources/career-handbook/")

Downloaded: data/Career-Handbook-2019.pdf
Downloaded: data/Career-Handbook-2019.pdf
Downloaded: data/selecting-a-medical-school-tips.pdf
Downloaded: data/selecting-a-medical-school-tips.pdf


In [4]:
scrape_pdfs("https://careerservices.fas.harvard.edu/resources/hes-resume-cover-letter-template/")

Downloaded: data/2024-HES_resume-and-letter.pdf


In [5]:
scrape_pdfs("https://careerservices.fas.harvard.edu/channels/create-a-resume-cv-or-cover-letter/")

Downloaded: data/Harvard-College-CS-Resume-Example.pdf
Downloaded: data/Harvard-College-CS-Resume-Example.pdf
Downloaded: data/Harvard-College-Engineering-Example.pdf
Downloaded: data/Harvard-College-Engineering-Example.pdf
Downloaded: data/2024-GSAS_phd_resume_cover_letters-1.pdf
Downloaded: data/2024-GSAS_phd_resume_cover_letters-1.pdf


In [12]:
download_single_pdf("https://www.careereducation.columbia.edu/sites/default/files/2025-01/resume-sample-with-pop-outs.pdf")

✅ Downloaded successfully: data/resume-sample-with-pop-outs.pdf


In [15]:
scrape_pdfs("https://dsel.education.gov.in/careers/health.html")

Downloaded: data/Health and Wellness.pdf


In [16]:
scrape_pdfs("https://dsel.education.gov.in/careers/education.html")

Downloaded: data/Education and Training.pdf


In [17]:
scrape_pdfs("https://dsel.education.gov.in/careers/business.html")

Downloaded: data/Business and Finance.pdf


In [18]:
scrape_pdfs("https://dsel.education.gov.in/careers/ArtaMediaMarketing.html")

Downloaded: data/telephone_directory_moe.pdf
Downloaded: data/GFR-2017.pdf
Downloaded: data/telephone_directory_moe.pdf
Downloaded: data/pgi_hi_1819.pdf


In [19]:
scrape_pdfs("https://dsel.education.gov.in/careers/Agriculture.html")

Downloaded: data/Agriculture and Allied Sciences.pdf


In [20]:
scrape_pdfs("https://dsel.education.gov.in/careers/engineering.html")

Downloaded: data/Engineering.pdf


In [21]:
scrape_pdfs("https://dsel.education.gov.in/careers/government.html")

Downloaded: data/Government Services.pdf


In [22]:
scrape_pdfs("https://dsel.education.gov.in/careers/informationtechnology.html")
scrape_pdfs("https://dsel.education.gov.in/careers/management.html")
scrape_pdfs("https://dsel.education.gov.in/careers/operationsLogistics.html")
scrape_pdfs("https://dsel.education.gov.in/careers/publicPolicyLaw.html")
scrape_pdfs("https://dsel.education.gov.in/careers/researchandDevelopment.html")

Downloaded: data/Information Technology.pdf
Downloaded: data/Management.pdf
Downloaded: data/Operations,Logistics and Hospitality.pdf
Downloaded: data/Public Policy,Law and Safety.pdf
Downloaded: data/Research and Development.pdf


In [23]:
scrape_pdfs("https://dsel.education.gov.in/careers/technicalandskillTrades.html")

Downloaded: data/Technical and Skill Trades.pdf


In [24]:
pdf_urls = [
    "https://careerdevelopment.princeton.edu/sites/g/files/toruqf1041/files/resume_guide_2020.pdf",
    "https://cdn.uconnectlabs.com/wp-content/uploads/sites/123/2021/07/Career-Handbook-2019.pdf",
    "https://www.umkc.edu/recreation/docs/career-development-guide.pdf",
    "https://som.yale.edu/sites/default/files/Yale%20SOM%20CDO%20Resume%20Writing%20Guide-1.pdf",
    "https://career.fsu.edu/sites/g/files/upcbnu746/files/Guides/resume%20writing%20guide%20web.pdf",
    "https://business.ucf.edu/wp-content/uploads/sites/4/2020/03/UPDATED-Combined-Resume-packet.pdf",
    "https://indianastate.edu/sites/default/files/career-center-resume-writing-guide.pdf",
    "https://www.calhr.ca.gov/workforce-planning/Documents/wfp-caltrans-Career-Development-Planning-Workbook-Final.pdf",
    "https://www.doi.gov/sites/default/files/resume-handout-508-compliant.pdf"
]

for url in pdf_urls:
    download_single_pdf(url)

 Downloaded: data/resume_guide_2020.pdf
 Downloaded: data/Career-Handbook-2019.pdf
 Downloaded: data/career-development-guide.pdf
Failed to download https://som.yale.edu/sites/default/files/Yale%20SOM%20CDO%20Resume%20Writing%20Guide-1.pdf: 404 Client Error: Not Found for url: https://som.yale.edu/sites/default/files/Yale%20SOM%20CDO%20Resume%20Writing%20Guide-1.pdf
 Downloaded: data/resume%20writing%20guide%20web.pdf
 Downloaded: data/UPDATED-Combined-Resume-packet.pdf
Failed to download https://indianastate.edu/sites/default/files/career-center-resume-writing-guide.pdf: 404 Client Error: Not Found for url: https://indianastate.edu/sites/default/files/career-center-resume-writing-guide.pdf
Failed to download https://www.calhr.ca.gov/workforce-planning/Documents/wfp-caltrans-Career-Development-Planning-Workbook-Final.pdf: 403 Client Error: Forbidden for url: https://www.calhr.ca.gov/workforce-planning/Documents/wfp-caltrans-Career-Development-Planning-Workbook-Final/
 Downloaded: data/

NameError: name 'process_pdfs' is not defined

In [25]:
download_single_pdf("https://gradschool.uky.edu/sites/default/files/2024-12/stuckert-resume-guide-final-2024-2025-1.pdf?utm_source=chatgpt.com")

 Downloaded: data/stuckert-resume-guide-final-2024-2025-1.pdf?utm_source=chatgpt.com


In [26]:
download_single_pdf("https://som.yale.edu/sites/default/files/2022-01/2022_Your%20Career%20Development%20Guide%20for%20Working%20Professionals_0.pdf")
download_single_pdf("https://harris.uchicago.edu/sites/default/files/Action%20Plan%20-%20Revised%20081315_2.pdf")

 Downloaded: data/2022_Your%20Career%20Development%20Guide%20for%20Working%20Professionals_0.pdf
 Downloaded: data/Action%20Plan%20-%20Revised%20081315_2.pdf


In [27]:
pdf_urls = [
    "https://www.naceweb.org/docs/default-source/default-document-library/2024/resources/nace-career-readiness-competencies-revised-apr-2024.pdf",
    "https://careerdevelopment.princeton.edu/sites/g/files/toruqf1041/files/graduate_school_guide_2020.pdf",
    "https://career.berkeley.edu/wp-content/uploads/Copy-of-Internship-Program-Development-Toolkit_FINAL.pdf",
    "https://academic-senate.berkeley.edu/sites/default/files/internship_guidelines_final.pdf",
    "https://cdn.wou.edu/professional-pathways/files/2022/07/Grad-School-Guide-2022.pdf"
]

for url in pdf_urls:
    download_single_pdf(url)

 Downloaded: data/nace-career-readiness-competencies-revised-apr-2024.pdf
 Downloaded: data/graduate_school_guide_2020.pdf
 Downloaded: data/Copy-of-Internship-Program-Development-Toolkit_FINAL.pdf
 Downloaded: data/internship_guidelines_final.pdf
 Downloaded: data/Grad-School-Guide-2022.pdf


In [28]:
pdf_urls = [
    "https://www.cmu.edu/career/faculty-and-staff/documents-for-career-champions/behavioral-interview-guide.pdf",
    "https://www.cmu.edu/dietrich/students/graduate/docs/evidence-based-interviewing-2022.pdf",
    "https://www.cmu.edu/heinz-shared/_files/img/career-services-pages/career-academy-interviewing-guide.pdf",
    "https://som.yale.edu/sites/default/files/2022-01/2022_Your%20Career%20Development%20Guide%20for%20Working%20Professionals_0.pdf",
    "https://www.kirkwood.edu/_files/pdf/explore/services/job_search_success_workbook_access.pdf"
]

for url in pdf_urls:
    download_single_pdf(url)

 Downloaded: data/behavioral-interview-guide.pdf
 Downloaded: data/evidence-based-interviewing-2022.pdf
Failed to download https://www.cmu.edu/heinz-shared/_files/img/career-services-pages/career-academy-interviewing-guide.pdf: 404 Client Error: Not Found for url: https://www.cmu.edu/heinz-shared/_files/img/career-services-pages/career-academy-interviewing-guide.pdf
 Downloaded: data/2022_Your%20Career%20Development%20Guide%20for%20Working%20Professionals_0.pdf
 Downloaded: data/job_search_success_workbook_access.pdf


In [4]:
download_single_pdf("https://www.researchgate.net/publication/385650743_The_Importance_of_Internships")

Failed to download https://www.researchgate.net/publication/385650743_The_Importance_of_Internships: 403 Client Error: Forbidden for url: https://www.researchgate.net/publication/385650743_The_Importance_of_Internships


In [6]:
pdf_urls = [
    "https://files.eric.ed.gov/fulltext/EJ1263677.pdf",
    "https://assets.publishing.service.gov.uk/media/5b3b5de3ed915d33c7d58e52/Internships.pdf",
    "https://www.tesolunion.org/attachments/files/8MZFL2NWYX6ZJI52YJU35MZQ15ZDRM2MTC4DZJM41MTJK3YTDI8ZGMX9YJFLFYTQ20NZY3CZDAY5YZHI2MTAZ1MGY2DLJC01NZQZ9OTE3FLMYZ.pdf",
    "https://www.montclair.edu/college-of-communication-and-media/wp-content/uploads/sites/20/2020/06/10-Reasons-to-Do-an-Internship.pdf",
    "https://ijssrr.com/journal/article/view/2200/1675f"
]

for url in pdf_urls:
    download_single_pdf(url)

 Downloaded: data/EJ1263677.pdf
 Downloaded: data/Internships.pdf
 Downloaded: data/8MZFL2NWYX6ZJI52YJU35MZQ15ZDRM2MTC4DZJM41MTJK3YTDI8ZGMX9YJFLFYTQ20NZY3CZDAY5YZHI2MTAZ1MGY2DLJC01NZQZ9OTE3FLMYZ.pdf
 Downloaded: data/10-Reasons-to-Do-an-Internship.pdf
Failed to download https://ijssrr.com/journal/article/view/2200/1675f: 404 Client Error: Not Found for url: https://ijssrr.com/journal/article/view/2200/1675f


In [9]:
scrape_pdfs("https://www.researchgate.net/publication/383711697_Importance_of_Internship_Programs_as_a_Strategy_to_Impart_Experiential_Learning_on_Graduate")

In [5]:
scrape_pdfs("https://cbseacademic.nic.in/curriculum_2025.html")

Downloaded: data/Curriculum_Sec_2024-25.pdf
Downloaded: data/Arabic_Sec_2024-25.pdf
Downloaded: data/Assamese_Sec_2024-25.pdf
Downloaded: data/Bahasa_Melayu_Sec_2024-25.pdf
Downloaded: data/Bengali_Sec_2024-25.pdf
Downloaded: data/Bhoti_Sec_2024-25.pdf
Downloaded: data/Bhutia_Sec_2024-25.pdf
Downloaded: data/Bodo_Sec_2024-25.pdf
Downloaded: data/English_LL_2024-25.pdf
Downloaded: data/English_Communicative_Sec_2024-25.pdf
Downloaded: data/French_Sec_2024-25.pdf
Downloaded: data/German_Sec_2024-25.pdf
Downloaded: data/Gujarati_Sec_2024-25.pdf
Downloaded: data/Gurung_Sec_2024-25.pdf
Downloaded: data/Hindi_A_Sec_2024-25.pdf
Downloaded: data/Hindi_B_Sec_2024-25.pdf
Downloaded: data/Japanese_Sec_2024-25.pdf
Downloaded: data/Kannada_Sec_2024-25.pdf
Downloaded: data/Kashmiri_Sec_2024-25.pdf
Downloaded: data/KOKBOROK_Sec_2024-25.pdf
Downloaded: data/Lepcha_Sec_2024-25.pdf
Downloaded: data/Limboo_Sec_2024-25.pdf
Downloaded: data/Malyalam_Sec_2024-25.pdf
Downloaded: data/Manipuri_Sec_2024-25.pdf

# Chunking and Vectorizing

In [11]:
import json

METADATA_PATH = "career_vectordb/vector_db_metadata.json"

def load_metadata():
    if os.path.exists(METADATA_PATH):
        with open(METADATA_PATH, "r") as f:
            return json.load(f)
    else:
        return {"processed_pdfs": []}

def save_metadata(metadata):
    os.makedirs(os.path.dirname(METADATA_PATH), exist_ok=True)
    with open(METADATA_PATH, "w") as f:
        json.dump(metadata, f, indent=4)

In [None]:
import os
import json
from langchain.document_loaders import PyMuPDFLoader
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings  
from langchain.text_splitter import CharacterTextSplitter

METADATA_PATH = "career_vectordb/vector_db_metadata.json"

def load_metadata():
    if os.path.exists(METADATA_PATH):
        with open(METADATA_PATH, "r") as f:
            return json.load(f)
    else:
        return {"processed_pdfs": []}

def save_metadata(metadata):
    os.makedirs(os.path.dirname(METADATA_PATH), exist_ok=True)
    with open(METADATA_PATH, "w") as f:
        json.dump(metadata, f, indent=4)

def process_pdfs_incremental(directory="data/", chunk_size=1000, chunk_overlap=100):
    embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    metadata = load_metadata()
    processed = metadata.get("processed_pdfs", [])
    
    all_new_chunks = []

    for file in os.listdir(directory):
        filepath = os.path.join(directory, file)
        if not file.lower().endswith(".pdf"):
            print(f"⏭ Skipping non-PDF file: {file}")
            continue
        if file in processed:
            print(f"✅ Skipping already processed file: {file}")
            continue

        print(f"🔍 Processing new file: {file}")
        try:
            loader = PyMuPDFLoader(filepath)
            docs = loader.load()
        except Exception as e:
            print(f"❌ Failed to load {file}: {e}")
            continue

        try:
            splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
            chunks = splitter.split_documents(docs)
            all_new_chunks.extend(chunks)
            processed.append(file)
        except Exception as e:
            print(f"❌ Failed to split {file}: {e}")
            continue

    if not all_new_chunks:
        print("📦 No new documents to add.")
        return

    # Load existing vector DB or create a new one
    try:
        if os.path.exists("career_vectordb/index.faiss"):
            vectordb = FAISS.load_local("career_vectordb", embedding_model, allow_dangerous_deserialization=True)
            vectordb.add_documents(all_new_chunks)
        else:
            vectordb = FAISS.from_documents(all_new_chunks, embedding_model)
        vectordb.save_local("career_vectordb")
        save_metadata({"processed_pdfs": processed})
        print("✅ Vector DB updated and saved.")
    except Exception as e:
        print(f"❌ Failed to update vector DB: {e}")

if __name__ == "__main__":
    # Run incremental processing on "data" folder by default
    process_pdfs_incremental()

  embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")


✅ Skipping already processed file: Operations,Logistics and Hospitality.pdf
✅ Skipping already processed file: Agriculture and Allied Sciences.pdf
✅ Skipping already processed file: behavioral-interview-guide.pdf
✅ Skipping already processed file: nace-career-readiness-competencies-revised-apr-2024.pdf
✅ Skipping already processed file: resume-sample-with-pop-outs.pdf
✅ Skipping already processed file: UPDATED-Combined-Resume-packet.pdf
✅ Skipping already processed file: GFR-2017.pdf
✅ Skipping already processed file: Health and Wellness.pdf
✅ Skipping already processed file: 2024-HES_resume-and-letter.pdf
✅ Skipping already processed file: Internships.pdf
✅ Skipping already processed file: 10-Reasons-to-Do-an-Internship.pdf
✅ Skipping already processed file: telephone_directory_moe.pdf
✅ Skipping already processed file: Public Policy,Law and Safety.pdf
✅ Skipping already processed file: Research and Development.pdf
✅ Skipping already processed file: TheImportanceofInternships_LinkedIn

In [13]:
# At the bottom of loader.py or in a separate cell if using Jupyter Notebook
process_pdfs_incremental(directory="data/")

  embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")


✅ Skipping already processed file: Operations,Logistics and Hospitality.pdf
✅ Skipping already processed file: Agriculture and Allied Sciences.pdf
✅ Skipping already processed file: behavioral-interview-guide.pdf
✅ Skipping already processed file: nace-career-readiness-competencies-revised-apr-2024.pdf
✅ Skipping already processed file: resume-sample-with-pop-outs.pdf
✅ Skipping already processed file: UPDATED-Combined-Resume-packet.pdf
✅ Skipping already processed file: GFR-2017.pdf
✅ Skipping already processed file: Health and Wellness.pdf
✅ Skipping already processed file: 2024-HES_resume-and-letter.pdf
🔍 Processing new file: Internships.pdf
🔍 Processing new file: 10-Reasons-to-Do-an-Internship.pdf
✅ Skipping already processed file: telephone_directory_moe.pdf
✅ Skipping already processed file: Public Policy,Law and Safety.pdf
✅ Skipping already processed file: Research and Development.pdf
✅ Skipping already processed file: TheImportanceofInternships_LinkedIn.pdf
✅ Skipping already 

  return forward_call(*args, **kwargs)


✅ Vector DB updated and saved.
