In [21]:
import PyPDF2

def extract_text_from_pdf(pdf_path):
  """
  Extracts text from a PDF document.

  The file is opened in binary mode, read with ``PyPDF2.PdfReader``, and the
  text of every page is concatenated in page order. No separator is inserted
  between pages.

  Args:
      pdf_path (str): Path to the PDF file.

  Returns:
      str: Extracted text content. Pages that yield no text contribute
      nothing, so the result is an empty string if no page has text.
  """
  with open(pdf_path, 'rb') as pdf_file:
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    # Collect the per-page text while the file is still open. A missing
    # (None) result is treated as empty text so that a single page without
    # extractable text cannot abort the whole run with a TypeError.
    page_texts = [page.extract_text() or '' for page in pdf_reader.pages]
  # One join instead of repeated `+=` keeps concatenation linear.
  return ''.join(page_texts)

In [22]:
import re

def extract_name(text):
  """
  Attempts to extract a name from the provided text using regular expressions.

  Patterns are tried in priority order and the first (leftmost) match of the
  first pattern that matches anywhere in the text is returned:

  1. Two capitalised words separated by a single space, where either word
     may carry one hyphenated capitalised part ("John Smith",
     "Anne-Marie Dupont", "John Smith-Jones").
  2. A single capitalised word followed by a hyphen and more word
     characters ("Mary-jane").

  This is a purely lexical heuristic: any text that fits a pattern is
  returned, whether or not it is really a person's name.

  Args:
      text (str): Text content to search for names.

  Returns:
      str: Extracted name (if found), or an empty string. An empty or None
      ``text`` also yields an empty string.
  """
  # Nothing to search; honour the documented "empty string" contract instead
  # of letting re.search raise on None.
  if not text:
    return ""

  # A capitalised word with an optional hyphenated capitalised continuation.
  # Without the optional part, "Anne-Marie Dupont" would be truncated to
  # "Marie Dupont" and "John Smith-Jones" to "John Smith".
  name_part = r"[A-Z][a-z]+(?:-[A-Z][a-z]+)?"
  name_patterns = [name_part + " " + name_part,  # Two (optionally hyphenated) capitalised words
                   r"[A-Z][a-z]+-\w+"]  # Lone hyphenated name
  for pattern in name_patterns:
    match = re.search(pattern, text)
    if match:
      return match.group()
  return ""

In [23]:
import re

def extract_contact_details(text):
  """
  Searches the text for an email address, a phone number and a postal address.

  Each field has one regular expression; the first occurrence found by
  ``re.search`` is kept for that field.

  Args:
      text (str): Text content to search for contact information.

  Returns:
      dict: Keys "email", "phone_number" and "address" (in that order). Each
      value is the matched substring, or None when the pattern is not found.
  """
  # (result key, regex) pairs, listed in the order the keys are inserted.
  field_patterns = (
      ("email", r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"),
      # US-style phone number (modify for other formats)
      ("phone_number", r"[0-9]{3}-[0-9]{3}-[0-9]{4}"),
      ("address",
       r"[0-9]+\s?[a-zA-Z]+(?:\s[a-zA-Z]+)?\s+(?:[A-Z][a-z]+\s?)*,\s?[A-Z]{2}\s+\d{5}(?:-\d{4})?"),
  )

  details = {}
  for field, regex in field_patterns:
    found = re.search(regex, text)
    # Keep the matched text; a miss is recorded as None.
    details[field] = found.group() if found else None
  return details

In [25]:
import nltk
from nltk import word_tokenize

def extract_skills_from_text(text):
  """
  Attempts to extract skills using NLTK and Part-of-Speech tagging.

  Args:
      text (str): Text content"""