In [None]:
import fitz  # PyMuPDF for reading PDFs
import re
import pandas as pd
import os
import nltk
from nltk.tokenize import word_tokenize

# Function to extract text from PDF file and convert it to lowercase
def extract_text_from_pdf(pdf_path):
    """Return the lower-cased text of every page in the PDF at ``pdf_path``.

    The document is opened with PyMuPDF (``fitz``) inside a ``with`` block,
    and the page texts are concatenated in page order.
    """
    with fitz.open(pdf_path) as document:
        page_texts = [page.get_text() for page in document]
    # Lower-casing here means every downstream extractor sees lowercase text.
    return "".join(page_texts).lower()

# Function to extract the candidate's name
def extract_name(text):
    """Guess the candidate's name from ``text``.

    Two regexes are tried in order, both case-insensitively: two alphabetic
    words separated by a single space, then a hyphenated word.  The first
    hit is returned verbatim.  If neither matches, the text is tokenised
    and POS-tagged with NLTK and the first token tagged ``NNP`` is returned.
    Returns ``"Not Found"`` when nothing qualifies.
    """
    candidate_regexes = (
        r"[A-Z][a-z]+ [A-Z][a-z]+",  # first name + last name
        r"[A-Z][a-z]+-\w+",  # hyphenated names like "Anne-Marie"
    )
    for regex in candidate_regexes:
        found = re.search(regex, text, flags=re.IGNORECASE)
        if found is not None:
            return found.group()

    # Fallback: part-of-speech tagging; needs the NLTK data packages.
    try:
        tagged_tokens = nltk.pos_tag(word_tokenize(text))
        proper_noun = next(
            (token for token, label in tagged_tokens if label == 'NNP'), None
        )
        if proper_noun is not None:
            return proper_noun
    except LookupError:
        print("NLTK data not downloaded. Install NLTK and run 'nltk.download()' to download required resources.")

    return "Not Found"

# Function to extract the profile summary
def extract_profile_summary(text):
    """Return the text that follows the first profile/summary heading.

    Headings are located with keyword regexes that tolerate whitespace
    between letters (e.g. "S U M M A R Y").  Keywords are tried in list
    order; for the first keyword that matches any line, the earliest
    matching line is taken as the heading.  Lines after the heading are
    collected, skipping blank lines and lines starting with "*" or "#",
    until a fully upper-case line is reached.  Returns ``"Not Found"`` when
    no heading exists or nothing was collected.
    """
    headings = [
        "summary", "profile", "objective", "professional summary",
        "career summary", "executive summary", "personal summary",
        "summary of qualifications", "overview"
    ]
    # "summary" -> r"s\s*u\s*m\s*m\s*a\s*r\s*y": letter-spaced headings match.
    spaced_regexes = [re.compile(r'\s*'.join(list(word.lower()))) for word in headings]

    rows = text.splitlines()

    # Keyword-major search: an earlier keyword wins over an earlier line.
    heading_row = next(
        (
            row_number
            for regex in spaced_regexes
            for row_number, content in enumerate(rows)
            if regex.search(content.lower())
        ),
        None,
    )
    if heading_row is None:
        return "Not Found"

    kept = []
    for raw in rows[heading_row + 1:]:
        stripped = raw.strip()
        # Blank lines and "*"/"#" lines are skipped without ending the capture.
        if not stripped or stripped.startswith(("*", "#")):
            continue
        # A fully upper-case line is treated as the next section title.
        if stripped.isupper():
            break
        kept.append(stripped)

    joined = "\n".join(kept).strip()
    return joined or "Not Found"

# Function to extract email address
def extract_email(text):
    """Return the first e-mail-like token in ``text``, or ``"Not Found"``."""
    found = re.search(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', text)
    if found is None:
        return "Not Found"
    return found.group(0)

# Function to extract the phone number
def extract_phone_number(text):
    """Return the first phone number in ``text`` as a bare digit string.

    The pattern accepts an optional 1-3 digit country code followed by a
    3-3-4 digit grouping, with "-", ".", spaces and parentheses allowed as
    separators.  Every accepted separator is removed from the result, so
    "555.123.4567" and "(555) 123-4567" both become "5551234567".
    Previously dots were accepted by the pattern but left in the output.
    Returns ``"Not Found"`` when no number matches.
    """
    phone_pattern = r'\b(?:\+?(\d{1,3}))?[-. (]*(\d{3})[-. )]*(\d{3})[-. ]*(\d{4})\b'
    match = re.search(phone_pattern, text)
    if match is None:
        return "Not Found"
    # One pass that deletes every separator character the pattern tolerates.
    return match.group(0).translate(str.maketrans("", "", "()-. "))

# Function to extract address without including the keyword "address"
def extract_address(text):
    """Return the first address-looking passage in ``text``.

    A line qualifies when it contains one of the address keywords as a
    whole word.  The same word-boundary regex is used for detection and for
    removing the keyword, so words such as "capacity" or "statement" no
    longer trigger a match through the substrings "city" / "state".  The
    keyword and any label punctuation left in front of the value (":", "-",
    dashes) are dropped, and up to two following non-blank lines are
    appended in case the address continues.

    Returns the pieces joined with single spaces, or ``"Not Found"``.
    """
    address_keywords = ['address', 'street', 'city', 'state', 'zip', 'postal']
    keyword_re = re.compile(
        r'\b(?:' + '|'.join(address_keywords) + r')\b', re.IGNORECASE
    )
    lines = text.splitlines()

    for idx, line in enumerate(lines):
        if not keyword_re.search(line):
            continue

        extracted_lines = []

        # Drop the keyword itself, then the ":" / "-" that labelled the value.
        clean_line = keyword_re.sub('', line).strip().lstrip(':-–— \t').strip()
        if clean_line:
            extracted_lines.append(clean_line)

        # Slicing never raises, so no explicit bounds check is needed.
        for next_line in lines[idx + 1:idx + 3]:
            next_line = next_line.strip()
            if next_line:
                extracted_lines.append(next_line)

        return " ".join(extracted_lines).strip()

    return "Not Found"

# Function to extract links
def extract_links(text):
    """
    Extracts hyperlinks from text: standard URLs, HTML anchors, markdown
    links and bare domains.

    Args:
        text (str): The text from which to extract links.

    Returns:
        list: Unique links in order of first discovery (standard URLs, then
        HTML, markdown and bare-domain matches), or ["Not Found"] when no
        link is found.

    Note:
        The bare-domain pattern also matches the host part of e-mail
        addresses and the tail of full URLs ("https://a.com" additionally
        yields "a.com").
    """
    # Standard URLs
    link_pattern = r'https?://[^\s]+'

    # HTML links
    html_link_pattern = r'<a\s+(?:[^>]*?\s+)?href=["\'](https?://[^\s"\']+)["\']'

    # Markdown links
    markdown_link_pattern = r'\[.*?\]\((https?://[^\s]+)\)'

    # Plain links like www.example.com or example.com
    plain_link_pattern = r'\b(?:www\.)?[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(?:/[^\s]*)?\b'

    hyperlinks = (
        re.findall(link_pattern, text)
        + re.findall(html_link_pattern, text)
        + re.findall(markdown_link_pattern, text)
        + re.findall(plain_link_pattern, text)
    )

    # dict.fromkeys de-duplicates while keeping insertion order; a set would
    # make the order of the result vary between interpreter runs.
    unique_links = list(dict.fromkeys(hyperlinks))

    return unique_links if unique_links else ["Not Found"]

# Function to extract experience details
def extract_experience(text):
    """Extract one structured entry per line of the experience section.

    The section starts on the line after the first line containing one of
    the header keywords and ends at the first blank or fully upper-case
    line.  Each line in between yields a dict with the keys ``job_title``,
    ``company_name``, ``location``, ``dates`` and ``responsibilities``;
    fields that cannot be found hold ``"Not Found"``.

    Fixes relative to the previous version:
      * responsibilities hold the whole matched phrase; ``re.findall`` with
        a capturing group used to return only the verb, once per pattern;
      * date ranges are matched case-insensitively, so "jan 2020 - present"
        is recognised in lower-cased text.

    Note: the company and location patterns require capitalised words.

    Args:
        text (str): Resume text.

    Returns:
        list[dict] | str: The entries, or ``"Not Found"`` when there are none.
    """
    # Keywords for detecting experience section headers
    experience_section_keywords = [
        "work history", "employment history", "work experience",
        "professional experience", "career summary", "professional background",
        "job experience", "internships", "relevant experience",
        "contract work", "military experience", "volunteer work"
    ]

    # Date patterns for recognizing job durations
    date_patterns = [
        r"\b(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s*\d{4}\s*[-–—to]+\s*(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s*\d{4}",
        r"\b\d{4}\s*[-–—to]+\s*\d{4}\b",
        r"\b(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s*\d{4}\s*[-–—to]+\s*Present",
        r"\b\d{4}\s*[-–—to]+\s*Present\b"
    ]

    # Keywords for recognizing job titles
    job_title_keywords = [
        "manager", "director", "engineer", "analyst", "consultant",
        "assistant", "coordinator", "specialist", "developer", "designer",
        "executive", "advisor", "technician", "officer", "intern", "trainee"
    ]

    # Pattern for extracting company names
    company_patterns = r"(at|with|for)\s+([A-Z][\w&,. ]+)"

    # Patterns for extracting location information
    location_patterns = [
        r"\bin\s+([A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*)\b",
        r"\b([A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*)\s+(USA|Inc|Ltd|LLC|Co)\b"
    ]

    # Patterns for extracting responsibilities
    responsibility_patterns = [
        r"(managed|developed|designed|led|coordinated|executed|oversaw|achieved|conducted|implemented|increased|improved|created|initiated)\s+[^\n]*",
        r"(responsible for|tasked with|in charge of|oversaw|managed|coordinated|led)\s+[^\n]*"
    ]

    experience_sections = []
    experience_section_found = False

    for line in text.splitlines():
        lowered = line.lower()

        # Header lines switch capture on and are never parsed as entries.
        if any(keyword in lowered for keyword in experience_section_keywords):
            experience_section_found = True
            continue
        if not experience_section_found:
            continue

        # A blank or fully upper-case line ends the section.
        if line.strip() == "" or line.isupper():
            break

        # Job title: the whole line, when it mentions a known title word.
        has_title = any(keyword in lowered for keyword in job_title_keywords)
        job_title = line.strip() if has_title else "Not Found"

        # Company name
        company_match = re.search(company_patterns, line)
        company_name = company_match.group(2).strip() if company_match else "Not Found"

        # Location: first pattern that matches wins.
        location = "Not Found"
        for pattern in location_patterns:
            location_match = re.search(pattern, line)
            if location_match:
                location = location_match.group(1).strip()
                break

        # Dates: case-insensitive so lower-cased month names / "present" match.
        dates = "Not Found"
        for pattern in date_patterns:
            date_match = re.search(pattern, line, re.IGNORECASE)
            if date_match:
                dates = date_match.group().strip()
                break

        # Responsibilities: keep the full phrase (group 0), not just the
        # captured verb, and drop phrases both patterns produce identically.
        responsibilities = []
        for pattern in responsibility_patterns:
            for found in re.finditer(pattern, line):
                phrase = found.group(0).strip()
                if phrase not in responsibilities:
                    responsibilities.append(phrase)

        experience_sections.append({
            "job_title": job_title,
            "company_name": company_name,
            "location": location,
            "dates": dates,
            "responsibilities": " | ".join(responsibilities) if responsibilities else "Not Found"
        })

    return experience_sections if experience_sections else "Not Found"

# Function to extract all relevant data from the CV
def extract_data_from_cv(pdf_path):
    """Run every extractor on the CV at ``pdf_path`` and collect the results.

    Returns a dict with the keys ``name``, ``profile_summary``, ``email``,
    ``phone_number``, ``address``, ``links`` and ``experience``.
    """
    text = extract_text_from_pdf(pdf_path)
    # Dict displays evaluate top to bottom, so the extractors still run in
    # the order listed here.
    return {
        "name": extract_name(text),
        "profile_summary": extract_profile_summary(text),
        "email": extract_email(text),
        "phone_number": extract_phone_number(text),
        "address": extract_address(text),
        "links": extract_links(text),
        "experience": extract_experience(text),
    }

# Main block to run the extraction process
if __name__ == "__main__":
    # Placeholder location; point this at a real CV before running.
    pdf_file_path = 'path/to/cv.pdf'
    if not os.path.exists(pdf_file_path):
        print("File not found. Please check the path and try again.")
    else:
        extracted_info = extract_data_from_cv(pdf_file_path)
        # A one-row DataFrame gives a tabular printout of the extracted fields.
        print(pd.DataFrame([extracted_info]))


In [12]:
import fitz  # PyMuPDF for reading PDFs
import re
import pandas as pd
import os
import nltk
from nltk.tokenize import word_tokenize
from datetime import datetime

# Function to extract text from PDF file and convert it to lowercase
def extract_text_from_pdf(pdf_path):
    """Read the PDF at ``pdf_path`` with PyMuPDF and return its text in lower case.

    Pages are visited in document order and their text is concatenated
    without any separator.
    """
    with fitz.open(pdf_path) as pdf:
        combined = "".join(page.get_text() for page in pdf)
    # All extractors that consume this text therefore receive lowercase input.
    return combined.lower()

# Function to extract the candidate's name
def extract_name(text):
    """Return a best-effort candidate name found in ``text``.

    Strategy, in order:
      1. first case-insensitive match of "word word" (letters only);
      2. first case-insensitive match of a hyphenated word;
      3. first token NLTK's POS tagger labels ``NNP``.
    ``"Not Found"`` is returned when all three fail.  A missing NLTK
    resource (``LookupError``) is reported on stdout.
    """
    two_words = r"[A-Z][a-z]+ [A-Z][a-z]+"  # first name + last name
    hyphenated = r"[A-Z][a-z]+-\w+"  # e.g. "Anne-Marie"

    for name_regex in (two_words, hyphenated):
        hit = re.search(name_regex, text, re.IGNORECASE)
        if hit:
            return hit.group()

    # Regexes found nothing: fall back to part-of-speech tags.
    try:
        for token, label in nltk.pos_tag(word_tokenize(text)):
            if label != 'NNP':
                continue
            return token
    except LookupError:
        print("NLTK data not downloaded. Install NLTK and run 'nltk.download()' to download required resources.")

    return "Not Found"

# Function to extract the profile summary
def extract_profile_summary(text):
    """Collect the lines that follow the first summary-style heading.

    Each heading keyword becomes a regex allowing optional whitespace
    between its characters.  Keywords are tried in order, and the first
    line matching the first successful keyword marks the heading.  Every
    later line is kept, except blank lines and lines beginning with "*" or
    "#" (skipped), until a fully upper-case line stops the scan.

    Returns the kept lines joined by newlines, or ``"Not Found"``.
    """
    heading_words = [
        "summary", "profile", "objective", "professional summary",
        "career summary", "executive summary", "personal summary",
        "summary of qualifications", "overview"
    ]
    rows = text.splitlines()

    def _first_row_matching(regex):
        # Index of the earliest row the regex hits, or None.
        for position, content in enumerate(rows):
            if regex.search(content.lower()):
                return position
        return None

    heading_row = None
    for word in heading_words:
        # Letters joined by r'\s*' so "p r o f i l e" is accepted as well.
        heading_row = _first_row_matching(re.compile(r'\s*'.join(list(word.lower()))))
        if heading_row is not None:
            break

    if heading_row is None:
        return "Not Found"

    body = []
    for raw in rows[heading_row + 1:]:
        candidate = raw.strip()
        if candidate == "" or candidate.startswith(("*", "#")):
            continue  # skipped, but the scan goes on
        if candidate.isupper():
            break  # looks like the next section title
        body.append(candidate)

    summary = "\n".join(body).strip()
    if not summary:
        return "Not Found"
    return summary

# Function to extract email address
def extract_email(text):
    """Find the first e-mail address in ``text``; ``"Not Found"`` if none."""
    email_regex = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')
    hit = email_regex.search(text)
    if hit:
        return hit.group(0)
    return "Not Found"

# Function to extract the phone number
def extract_phone_number(text):
    """Return the first phone number in ``text`` as a bare digit string.

    The pattern accepts an optional 1-3 digit country code followed by a
    3-3-4 digit grouping, with "-", ".", spaces and parentheses allowed as
    separators.  Every accepted separator is removed from the result, so
    "555.123.4567" and "(555) 123-4567" both become "5551234567".
    Previously dots were accepted by the pattern but left in the output.
    Returns ``"Not Found"`` when no number matches.
    """
    phone_pattern = r'\b(?:\+?(\d{1,3}))?[-. (]*(\d{3})[-. )]*(\d{3})[-. ]*(\d{4})\b'
    match = re.search(phone_pattern, text)
    if match is None:
        return "Not Found"
    # One pass that deletes every separator character the pattern tolerates.
    return match.group(0).translate(str.maketrans("", "", "()-. "))

def extract_address(text):
    """Return the first address-looking passage in ``text``.

    A line qualifies when it contains an address keyword as a whole word
    or one of the location glyphs that PDF icons are extracted as (for
    example "½ agra, uttar pradesh, india").  Earlier the glyphs were only
    inspected after a keyword had already matched, so glyph-marked lines
    were never found, and a glyph that was present got appended to the
    result a second time.

    Keywords, glyphs and label punctuation in front of the value are
    removed, and up to two following non-blank lines are appended in case
    the address continues.  Keyword detection uses the same word-boundary
    regex as keyword removal, so "capacity" or "statement" do not match.

    Returns the pieces joined with single spaces, or ``"Not Found"``.
    """
    address_keywords = ['address', 'street', 'city', 'state', 'zip', 'postal']
    location_symbols = ['📍', '📌', '🔶', '🏠', '½']  # Add more symbols as needed
    keyword_re = re.compile(
        r'\b(?:' + '|'.join(address_keywords) + r')\b', re.IGNORECASE
    )

    def _strip_symbols(fragment):
        # Replace each glyph by a space, then collapse the leftover whitespace.
        for symbol in location_symbols:
            fragment = fragment.replace(symbol, ' ')
        return ' '.join(fragment.split())

    lines = text.splitlines()

    for idx, line in enumerate(lines):
        has_keyword = keyword_re.search(line) is not None
        has_symbol = any(symbol in line for symbol in location_symbols)
        if not (has_keyword or has_symbol):
            continue

        extracted_lines = []

        # Drop keyword and glyph, then the ":" / "-" that labelled the value.
        clean_line = _strip_symbols(keyword_re.sub('', line)).lstrip(':-–— ').strip()
        if clean_line:
            extracted_lines.append(clean_line)

        # Slicing never raises, so no explicit bounds check is needed.
        for next_line in lines[idx + 1:idx + 3]:
            next_line = _strip_symbols(next_line)
            if next_line:
                extracted_lines.append(next_line)

        return " ".join(extracted_lines).strip()

    return "Not Found"

# Function to extract hyperlinks
def extract_links(text):
    """
    Extracts hyperlinks from text: standard URLs, HTML anchors, markdown
    links and bare domains.

    Args:
        text (str): The text from which to extract links.

    Returns:
        list: Unique links in order of first discovery (standard URLs, then
        HTML, markdown and bare-domain matches), or ["Not Found"] when no
        link is found.

    Note:
        The bare-domain pattern also matches the host part of e-mail
        addresses and the tail of full URLs ("https://a.com" additionally
        yields "a.com").
    """
    # Standard URLs
    link_pattern = r'https?://[^\s]+'

    # HTML links
    html_link_pattern = r'<a\s+(?:[^>]*?\s+)?href=["\'](https?://[^\s"\']+)["\']'

    # Markdown links
    markdown_link_pattern = r'\[.*?\]\((https?://[^\s]+)\)'

    # Plain links like www.example.com or example.com
    plain_link_pattern = r'\b(?:www\.)?[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(?:/[^\s]*)?\b'

    hyperlinks = (
        re.findall(link_pattern, text)
        + re.findall(html_link_pattern, text)
        + re.findall(markdown_link_pattern, text)
        + re.findall(plain_link_pattern, text)
    )

    # dict.fromkeys de-duplicates while keeping insertion order; a set would
    # make the order of the result vary between interpreter runs.
    unique_links = list(dict.fromkeys(hyperlinks))

    return unique_links if unique_links else ["Not Found"]

import re

def extract_experience(text):
    """Split the experience section of ``text`` into dated job entries.

    Everything from the first experience-header keyword onwards is split
    at each date range.  Date ``i`` is paired with the text segment that
    precedes it, and text after the last date is not inspected.
    NOTE(review): this pairing presumably suits resumes that print title
    and company above the dates; confirm against the corpus.

    Fixes relative to the previous version:
      * date ranges are matched case-insensitively, so lower-cased input
        ("june 2021 - aug 2021", "2019 - present") is recognised;
      * the location is taken from whichever alternative matched
        (``group(1)`` alone is ``None`` when the "<Name> Inc/Ltd/..."
        alternative matches);
      * responsibilities hold the full matched lines; ``re.findall`` on a
        pattern with one capturing group returned only that group.

    Args:
        text (str): Resume text.

    Returns:
        list[dict]: One dict per entry with the keys "Job Title",
        "Company", "Location", "Employment Dates" and "Responsibilities".
        An empty list when no experience header exists; a single
        all-"Not Found" entry when a header exists but no entry was built.
    """
    # Keywords for detecting experience section headers
    experience_section_keywords = [
        "work history", "employment history", "work experience",
        "professional experience", "career summary", "professional background",
        "job experience", "internships", "relevant experience",
        "contract work", "military experience", "volunteer work",
        "experience"
    ]

    # Regex pattern for experience section headers
    experience_section_pattern = '|'.join(experience_section_keywords)

    # Date patterns for recognizing job durations (all groups non-capturing,
    # so findall yields whole matches and split yields only the gaps).
    date_patterns = [
        r"\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{4}\s*[-–—to]+\s*(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s*\d{4}",
        r"\b\d{4}\s*[-–—to]+\s*\d{4}\b",
        r"\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{4}\s*[-–—to]+\s*Present",
        r"\b\d{4}\s*[-–—to]+\s*Present\b"
    ]
    date_regex = re.compile('|'.join(date_patterns), re.IGNORECASE)

    # Pattern for extracting company names
    company_patterns = r"(?:at|with|for)\s+([A-Z][\w&,. ]+)"

    # Patterns for extracting location information
    location_patterns = [
        r"\bin\s+([A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*)\b",
        r"\b([A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*)\s+(USA|Inc|Ltd|LLC|Co)\b"
    ]
    location_regex = re.compile('|'.join(location_patterns))

    # Patterns for extracting responsibilities
    responsibility_patterns = [
        r"•\s*.*",  # Bullet point responsibilities
        r"(?<=\n)-\s*.*",  # Responsibilities starting with a hyphen
        r"(Managed|Developed|Led|Coordinated|Implemented|Oversaw|Supervised)\s+.*"
    ]
    responsibility_regex = re.compile('|'.join(responsibility_patterns))

    # Debugging: Print the extracted text
    print("Extracted Text:\n", text)  # Print the full text for review

    # Search for the start of the experience section
    experience_start = re.search(experience_section_pattern, text, re.IGNORECASE)
    if not experience_start:
        print("Experience section not found.")
        return []  # Return empty list if no experience section is found

    # Extract content starting from the identified section
    content = text[experience_start.start():]

    # Dates delimit the job entries
    dates = date_regex.findall(content)
    job_entries = date_regex.split(content)

    experiences = []

    # zip() stops at the shorter list, i.e. the trailing segment is dropped.
    for date, job in zip(dates, job_entries):
        job_clean = job.strip()
        print(f"Extracting job entry: '{job_clean}'")  # Debugging print

        # Skip if the job entry is a line with only hyphens, underscores, or dots
        if re.match(r'^[\-_]+$', job_clean) or re.match(r'^[.]+$', job_clean) or not job_clean:
            print("Skipping entry due to irrelevant line.")
            continue

        # Extract job title
        job_title_match = re.search(r'([A-Z][a-zA-Z\s]+(?:[ -][A-Z][a-zA-Z]*)*)', job_clean)
        job_title = job_title_match.group(0) if job_title_match else 'Not Found'

        # Extract company name
        company_match = re.search(company_patterns, job_clean)
        company = company_match.group(1) if company_match else 'Not Found'

        # Extract location: group 1 belongs to "in <Place>", group 2 to
        # "<Name> Inc/Ltd/..."; only one of them participates in a match.
        location = 'Not Found'
        location_match = location_regex.search(job_clean)
        if location_match:
            location = location_match.group(1) or location_match.group(2) or 'Not Found'

        # Extract responsibilities as whole matched lines (group 0)
        responsibilities = [
            found.group(0) for found in responsibility_regex.finditer(job_clean)
        ]
        responsibilities_text = ' '.join(responsibilities).strip()

        experiences.append({
            "Job Title": job_title,
            "Company": company,
            "Location": location,
            "Employment Dates": date.strip(),
            "Responsibilities": responsibilities_text
        })

    if not experiences:
        print("No experiences found.")
        return [{"Job Title": "Not Found", "Company": "Not Found", "Location": "Not Found", "Employment Dates": "Not Found", "Responsibilities": "Not Found"}]

    return experiences

# Function to extract education details
def extract_education(text):
    """Extract degree / year triples from the education part of ``text``.

    The scan starts at the first (case-insensitive) occurrence of an
    education heading keyword and runs to the end of the text.  Every run
    of capitalised words followed by a space produces one entry; the
    optional start and end years default to ``"Not specified"``.

    Returns ``None`` when no heading keyword occurs, otherwise a list of
    dicts with the keys "Degree", "Start Year" and "End Year" (possibly
    empty).
    """
    section_labels = (
        "education", "academic background", "educational qualifications",
        "degrees", "certifications",
    )
    heading = re.search('|'.join(section_labels), text, flags=re.IGNORECASE)
    if heading is None:
        return None

    tail = text[heading.start():]

    # Simplified: capitalised words, then optional "YYYY", dash/"to", "YYYY".
    entry_regex = r"(\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b) +(\d{4}|\b[0-9]{4})?[-–—to]*(\b[0-9]{4})?"

    def _year_or_default(value):
        # findall reports a non-participating optional group as ''.
        return value.strip() if value else "Not specified"

    return [
        {
            "Degree": degree.strip(),
            "Start Year": _year_or_default(first_year),
            "End Year": _year_or_default(last_year),
        }
        for degree, first_year, last_year in re.findall(entry_regex, tail)
    ]

# Function to extract languages
def extract_languages(text):
    """List capitalised word groups found from the languages heading onwards.

    The scan starts at the first (case-insensitive) occurrence of a
    languages heading keyword and covers the rest of the text, heading
    included.  Returns ``None`` when no heading keyword occurs and
    ``["Not Found"]`` when the heading exists but no capitalised word
    follows it.
    """
    section_labels = (
        "languages", "language proficiency", "spoken languages", "language skills",
    )
    heading = re.search('|'.join(section_labels), text, flags=re.IGNORECASE)
    if heading is None:
        return None

    # One or more consecutive Capitalised words count as a single item.
    word_group_regex = r"\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\b"
    found = [
        item.strip()
        for item in re.findall(word_group_regex, text[heading.start():])
    ]
    if not found:
        return ["Not Found"]
    return found

# Function to extract certificates
def extract_certificates(text):
    """List capitalised phrases found from the certificates heading onwards.

    The scan starts at the first (case-insensitive) occurrence of a
    certificate heading keyword and covers the rest of the text, heading
    included.  Returns ``None`` when no heading keyword occurs and
    ``["Not Found"]`` when the heading exists but nothing matches.
    """
    section_labels = (
        "certificates", "certification", "professional certifications",
        "certification courses",
    )
    heading = re.search('|'.join(section_labels), text, flags=re.IGNORECASE)
    if heading is None:
        return None

    # A capital letter followed by letters/whitespace; simplified on purpose.
    phrase_regex = r"\b([A-Z][a-zA-Z\s]+)\b"
    found = [
        phrase.strip()
        for phrase in re.findall(phrase_regex, text[heading.start():])
    ]
    if not found:
        return ["Not Found"]
    return found

# Main function to extract data from resumes
def extract_data_from_resumes(resume_folder):
    """Run every extractor on each PDF in ``resume_folder``.

    Files are processed in sorted path order, because ``os.listdir``
    returns names in arbitrary order and the rows would otherwise differ
    between runs.  The extension test is case-insensitive, so ".PDF" files
    are no longer skipped.

    Args:
        resume_folder (str): Directory containing the resume PDFs.

    Returns:
        list[dict]: One dict per resume with the keys "Name", "Email",
        "Phone", "Address", "Profile Summary", "Experience", "Education",
        "Languages" and "Certificates".  Empty when the folder holds no
        PDF.
    """
    resume_paths = sorted(
        os.path.join(resume_folder, f)
        for f in os.listdir(resume_folder)
        if f.lower().endswith('.pdf')
    )

    all_extracted_data = []
    for pdf_path in resume_paths:
        text = extract_text_from_pdf(pdf_path)
        name = extract_name(text)
        email = extract_email(text)
        phone = extract_phone_number(text)
        address = extract_address(text)
        profile_summary = extract_profile_summary(text)
        experience = extract_experience(text)
        education = extract_education(text)
        languages = extract_languages(text)
        certificates = extract_certificates(text)

        all_extracted_data.append({
            "Name": name,
            "Email": email,
            "Phone": phone,
            "Address": address,
            "Profile Summary": profile_summary,
            "Experience": experience,
            "Education": education,
            "Languages": languages,
            "Certificates": certificates
        })

    return all_extracted_data

# Function to save extracted data to CSV
def save_to_csv(data, filename):
    """Write ``data`` to ``filename`` as CSV, appending when the file exists.

    A new file gets a header row; an existing file receives only the data
    rows.  The index column is never written.
    """
    frame = pd.DataFrame(data)
    already_there = os.path.exists(filename)
    frame.to_csv(
        filename,
        mode='a' if already_there else 'w',
        header=not already_there,  # header only when the file is created
        index=False,
    )

# Example usage: extract every resume in a folder and store the rows in a CSV.
if __name__ == "__main__":
    # Specify the folder containing resumes and the output CSV file name
    resume_folder = "C:\\Users\\santhoshs.s\\jupyter\\New folder\\resume corpus"  # Replace with the path to your resumes folder
    output_csv_file = "output7.csv"  # Desired CSV output filename
    
    # One dict per PDF found in the folder.
    extracted_info = extract_data_from_resumes(resume_folder)
    # save_to_csv appends to output_csv_file if it already exists, so running
    # this cell again adds the same rows a second time.
    save_to_csv(extracted_info, output_csv_file)

    print(f"Data extracted and saved to {output_csv_file}")


Extracted Text:
 anuva goyal
[ anuvagoyal111@gmail.com
½ agra, uttar pradesh, india
 github.com/anuvagoyal
projects
• mental healthcare chatbot that provides ad-
vice to the user based on diﬀerent categories
of mental health problems using a dataset
webscraped from counselchat.com (nov 2021)
• full stack speech emotion based movie
recommender system using the ravdess
dataset and web scraping techniques (oct
2021)
• finding a perfect fit, a model to parse re-
sumes using pytesseract, nlp and xg boost
and random forest classiﬁcation techniques
(aug 2021)
• face mask detection which detects the face
using haar cascade classiﬁer and classiﬁes
the image into one of the three categories-
without mask, with mask and incorrect mask
(may 2021)
education
b.tech, electrical engineering with
computer science specialisation
dayalbagh educational institute, agra
 july 2019– present (sgpa 9.35 - 4 sem)
higher secondary school certiﬁcate
st. clare’s senior secondary school, agra
 2019 (94%)
seconda

In [17]:
import re
import pandas as pd
from PyPDF2 import PdfReader
import os

def extract_experience(pdf_path):
    """
    Pulls the lines of the experience section out of a PDF resume.

    Capture starts after the first line containing "experience" (any case)
    and stops at the first line containing "projects", "education" or
    "trainings".  Later lines that contain "experience" and lines made up
    only of "-", "•" or "–" are left out.

    Parameters:
    pdf_path (str): The path to the PDF resume file.

    Returns:
    str: The captured lines joined by newlines and stripped; empty when
    nothing was captured.
    """
    # Every page contributes its text followed by a newline.
    pages = PdfReader(pdf_path).pages
    text = ''.join(page.extract_text() + '\n' for page in pages)

    section_enders = ['PROJECTS', 'EDUCATION', 'TRAININGS']
    collected = []
    capturing = False

    for raw_line in text.split('\n'):
        entry = raw_line.strip()
        shouted = entry.upper()

        # Any line mentioning "experience" (re)arms capture and is dropped.
        if 'EXPERIENCE' in shouted:
            capturing = True
            continue
        if not capturing:
            continue

        # Separator rules such as "-----" or "•••" carry no content.
        if re.match(r'^[-•–]+$', entry):
            continue

        # The next section heading ends the capture.
        if any(marker in shouted for marker in section_enders):
            break

        collected.append(entry)

    return "\n".join(collected).strip()

def process_resumes(resumes_folder):
    """
    Processes all PDF resumes in a specified folder to extract experience.

    Files are handled in sorted name order (os.listdir returns names in
    arbitrary order) and the extension test is case-insensitive, so ".PDF"
    files are included.  The result always has the columns 'File Name' and
    'Experience', even when the folder contains no PDF.

    Parameters:
    resumes_folder (str): The path to the folder containing resume PDFs.

    Returns:
    pd.DataFrame: A DataFrame containing the extracted experiences from each resume.
    """
    experience_data = []

    for filename in sorted(os.listdir(resumes_folder)):
        if not filename.lower().endswith('.pdf'):
            continue
        pdf_path = os.path.join(resumes_folder, filename)
        experience = extract_experience(pdf_path)
        experience_data.append({'File Name': filename, 'Experience': experience})

    # Explicit columns keep the frame's shape stable for an empty folder.
    df_experience = pd.DataFrame(experience_data, columns=['File Name', 'Experience'])
    return df_experience

# Example usage. These statements are not under a __main__ guard, so they run
# whenever this cell/module is executed.
resumes_folder = 'C:\\Users\\santhoshs.s\\jupyter\\New folder\\resume corpus'
experience_df = process_resumes(resumes_folder)

# Display the extracted experiences (default DataFrame repr, which may
# truncate long cell values)
print(experience_df)


              File Name                                         Experience
0  AnuvaGoyal_Latex.pdf  Summer Intern\nGenisup India Pvt. Ltd., Hosur,...


In [18]:
import re
import pandas as pd
from PyPDF2 import PdfReader
import os

def extract_experience(pdf_path):
    """
    Pulls the lines of the experience section out of a PDF resume.

    Capture starts after the first line containing "experience" (any case)
    and stops at the first line containing "projects", "education" or
    "trainings".  Later lines that contain "experience" and lines made up
    only of "-", "•" or "–" are left out.

    Parameters:
    pdf_path (str): The path to the PDF resume file.

    Returns:
    str: The captured lines joined by newlines and stripped, or
    "No experience found." when nothing was captured.
    """
    # Every page contributes its text followed by a newline.
    pages = PdfReader(pdf_path).pages
    text = ''.join(page.extract_text() + '\n' for page in pages)

    section_enders = ['PROJECTS', 'EDUCATION', 'TRAININGS']
    collected = []
    capturing = False

    for raw_line in text.split('\n'):
        entry = raw_line.strip()
        shouted = entry.upper()

        # Any line mentioning "experience" (re)arms capture and is dropped.
        if 'EXPERIENCE' in shouted:
            capturing = True
            continue
        if not capturing:
            continue

        # Separator rules such as "-----" or "•••" carry no content.
        if re.match(r'^[-•–]+$', entry):
            continue

        # The next section heading ends the capture.
        if any(marker in shouted for marker in section_enders):
            break

        collected.append(entry)

    joined = "\n".join(collected)
    if not joined:
        return "No experience found."
    return joined.strip()

def process_resumes(resumes_folder):
    """
    Processes all PDF resumes in a specified folder to extract experience.

    Files are handled in sorted name order (os.listdir returns names in
    arbitrary order) and the extension test is case-insensitive, so ".PDF"
    files are included.  The result always has the columns 'File Name' and
    'Experience', even when the folder contains no PDF.

    Parameters:
    resumes_folder (str): The path to the folder containing resume PDFs.

    Returns:
    pd.DataFrame: A DataFrame containing the extracted experiences from each resume.
    """
    experience_data = []

    for filename in sorted(os.listdir(resumes_folder)):
        if not filename.lower().endswith('.pdf'):
            continue
        pdf_path = os.path.join(resumes_folder, filename)
        experience = extract_experience(pdf_path)
        experience_data.append({'File Name': filename, 'Experience': experience})

    # Explicit columns keep the frame's shape stable for an empty folder.
    df_experience = pd.DataFrame(experience_data, columns=['File Name', 'Experience'])
    return df_experience

# Example usage. These statements are not under a __main__ guard, so they run
# whenever this cell/module is executed.
resumes_folder = 'C:\\Users\\santhoshs.s\\jupyter\\New folder\\resume corpus'  # Change this to your folder path
experience_df = process_resumes(resumes_folder)

# Display the extracted experiences; to_string() prints every cell in full
# instead of the truncated default repr.
print(experience_df.to_string(index=False))  # Display without index


           File Name                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            Experience
AnuvaGoyal_Latex.pdf Summer Intern\nGenisup India Pvt. Ltd., Hosur, Tamil Nadu\nὌJune 2021 – Aug 2021 Remote\n•Internship on the topic NLP: Topic Modeling to