In [None]:
# Install required libraries
!pip install pdfplumber
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.4-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.2/59.2 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
!pip install nltk



In [13]:
import datetime
import os
import re

import nltk
import pandas as pd
import pdfplumber
import spacy
from nltk.corpus import wordnet
from nltk.tag import pos_tag
from nltk.tokenize import sent_tokenize, word_tokenize

# Download the NLTK data packages (same packages, same order as before)
for _nltk_package in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
    'averaged_perceptron_tagger',
    'stopwords',
    'wordnet',
    'omw-1.4',
):
    nltk.download(_nltk_package)

# spaCy English pipeline, loaded once and shared through the global `nlp`
nlp = spacy.load("en_core_web_sm")

def get_name(text):
    """
    Guess a candidate's name from resume text.

    The text is split into sentences and words, every word is POS-tagged, and
    the first noun-tagged word that has no WordNet entry is returned. The
    heuristic is that a personal name is a noun that is not a dictionary word.

    :param text: resume text as a single string
    :return: the first matching word, or an empty string when none is found
    """
    for sentence in sent_tokenize(text):
        for word, tag in pos_tag(word_tokenize(sentence)):
            # Noun tags all start with "NN" (NN, NNS, NNP, NNPS). The previous
            # pattern '[NN.*]' was a character class, so it also accepted
            # punctuation tags such as '.', and "." could be returned as a name.
            if tag.startswith('NN') and not wordnet.synsets(word):
                # Only the first candidate is ever used, so stop here.
                return word
    return ''

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    """
    Extract the text of every page of a PDF as a single string.

    Page texts are joined with a newline so the last word of one page does
    not fuse with the first word of the next. Pages that yield no text are
    skipped. Reading is best-effort: on any error a message is printed and
    whatever was extracted before the failure is returned.

    :param pdf_path: path of the PDF file to read
    :return: extracted text; "" when nothing could be read
    """
    page_texts = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                # Guard against pages with no extractable text (empty or None)
                if page_text:
                    page_texts.append(page_text)
    except Exception as e:
        print(f"Error reading PDF: {e}")
    return "\n".join(page_texts)

# Function to preprocess text (cleaning)
def preprocess_text(text):
    """Collapse each run of whitespace into one space and trim both ends."""
    return re.sub(r'\s+', ' ', text).strip()

# Function to extract contact info
def extract_contact_info(text):
    """
    Find the first e-mail address, phone number and GitHub profile in text.

    :param text: resume text as a single string
    :return: dict with keys "email", "phone" and "github"; each value is the
             matched substring, or None when nothing matched
    """
    email_regex = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    # A lookbehind replaces the old leading \b: \b cannot sit in front of
    # "+" or "(", so "+1 (555) 123-4567" used to lose its leading characters.
    # A bare 10-digit number is still covered (prefix and separators are optional).
    phone_regex = r'(?<!\w)(?:\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b'
    # Dot is escaped; scheme and "www." are optional because resumes often
    # print the profile as plain "github.com/user".
    github_regex = r'(?:https?://)?(?:www\.)?github\.com/[A-Za-z0-9_-]+'

    email = re.search(email_regex, text)
    phone = re.search(phone_regex, text)
    github = re.search(github_regex, text, re.IGNORECASE)

    return {
        "email": email.group() if email else None,
        "phone": phone.group() if phone else None,
        "github": github.group() if github else None
    }

# Function to extract skills by keyword matching
def extract_skills(text):
    """
    Return the known skill keywords that occur in text.

    Matching is case-insensitive and works on whole terms: "Java" does not
    match inside "JavaScript", and ".net" does not match inside "asp.net".
    A hyphen in a keyword also matches whitespace, so the keyword
    'machine-learning' is found in "Machine Learning".

    Known limitation: very short keywords such as 'c' also match a stand-alone
    letter in ordinary prose (for example a middle initial).

    :param text: resume text as a single string
    :return: alphabetically sorted list of matched keywords, spelled as in
             the keyword list; [] when nothing matches or text is empty
    """
    skill_keywords =  [
    '.net', '1password', '3d', '3d-reconstruction', 'aboutness',
    'abstract-data-type', 'abstract-interpretation', 'abstract-machine',
    'access-control', 'access-method', 'access-network', 'accounting',
    'active-appearance-model', 'active-database', 'active-networking',
    'active-shape-model', 'apache-activemq', 'activity-recognition',
    'actuarial-science', 'actuator', 'adaboost', 'adaptive-routing',
    'adaptive-system', 'adder', 'adobe-illustrator', 'adobe-photoshop',
    'adobe-xd', 'advertising', 'aerial-photography', 'aeronautics',
    'aerospace-engineering', 'aerospike', 'agile-project-management',
    'agricultural-engineering', 'apache-airflow', 'airtable', 'ajax',
    'akamai', 'akka', 'algolia', 'algorithm', 'algorithm-design',
    'alpine-linux', 'amazon-api-gateway', 'amazon-athena', 'amazon-cloudfront',
    'amazon-cloudwatch', 'amazon-cognito', 'amazon-dynamodb', 'amazon-ebs',
    'amazon-ec2', 'amazon-eks', 'amazon-elasticache', 'amazon-elasticsearch-service',
    'amazon-emr', 'amazon-kinesis', 'amazon-kinesis-firehose', 'amazon-machine-learning',
    'amazon-rds', 'amazon-redshift', 'amazon-route-53', 'amazon-s3',
    'amazon-ses', 'amazon-sns', 'amazon-sqs', 'amazon-vpc', 'ambiguity',
    'am-php', 'amplitude', 'analog-to-digital-converter', 'analysis-of-algorithms',
    'analytics', 'android', 'android-sdk', 'android-studio', 'angular',
    'ansible', 'ant-design', 'apache-ant', 'apache-cordova', 'apache-flink',
    'apache-http-server', 'apache-maven', 'apache-mesos', 'apache-spark',
    'apache-tomcat', 'api', 'api-tools', 'apollo', 'appium', 'arangodb',
    'arduino', 'artificial-intelligence', 'asana', 'asp.net', 'aws',
    'azure', 'babel', 'bash', 'bayesian-inference', 'big-data',
    'bootstrap', 'bot', 'c', 'cakephp', 'celery', 'centos', 'circleci',
    'clojure', 'cloudflare', 'cloudinary', 'cobol', 'codeigniter',
    'coding', 'data-visualization', 'django', 'docker', 'flask', 'git',
    'google-cloud', 'html', 'java', 'javascript', 'jenkins', 'kubernetes',
    'linux', 'machine-learning', 'mysql', 'node.js', 'numpy', 'php',
    'pytorch', 'python', 'react', 'ruby', 'scala', 'scrum', 'sql',
    'tensorflow', 'typescript','vue.js']

    if not text:
        return []

    text_lower = text.lower()
    checked = set()   # normalized keywords already tested
    matched = {}      # normalized keyword -> spelling from the list
    for keyword in skill_keywords:
        parts = [p for p in re.split(r'[-\s]+', keyword.lower()) if p]
        normalized = '-'.join(parts)
        if not parts or normalized in checked:
            continue
        checked.add(normalized)
        # Term boundaries: no letter/digit right before, and no letter, digit,
        # '+' or '#' right after (so 'c' is not reported for "C++" or "C#").
        pattern = (
            r'(?<![a-z0-9])'
            + r'[-\s]+'.join(re.escape(p) for p in parts)
            + r'(?![a-z0-9+#])'
        )
        if re.search(pattern, text_lower):
            matched[normalized] = keyword

    # Sorted so the output is reproducible from run to run
    return sorted(matched.values(), key=str.lower)

# Function to extract total experience in years
def extract_experience(text):
    """
    Estimate total work experience in years from resume text.

    Explicit statements are tried first, in priority order ("5 years of
    experience", "Total experience: 5 years", "5+ yrs professional
    experience", ...). If none is found, year ranges such as "2018 - 2021",
    "2019 to present" or "Jan 2019 – Mar 2021" are collected and the number
    of calendar years they cover is returned; overlapping ranges are counted
    once and end years in the future are capped at the current year.

    :param text: resume text as a single string
    :return: experience in years as a float; 0.0 when nothing is found
    """
    if not text:
        return 0.0

    text_lower = text.lower()

    # A number with an optional trailing "+", as in "5+ years"
    number = r'(\d+(?:\.\d+)?)\+?'
    experience_patterns = [
        number + r'\s*(?:years?|yrs?)(?:\s*of)?\s*(?:total\s*)?(?:work\s*)?experience',
        # [:\s]* rather than [:\s]: "total experience: 5 years" has both a
        # colon and a space in front of the number.
        r'total\s*experience[:\s]*' + number + r'\s*(?:years?|yrs?)',
        number + r'\s*(?:years?|yrs?)\s*(?:of\s*)?(?:professional\s*)?experience',
        r'work\s*experience[:\s]*' + number + r'\s*(?:years?|yrs?)',
    ]

    for pattern in experience_patterns:
        match = re.search(pattern, text_lower)
        if match:
            return float(match.group(1))

    current_year = datetime.datetime.now().year
    # Years are limited to 19xx/20xx and must start on a word boundary, so
    # the tail of a phone number ("...-4567 - 8901") is not taken for a range.
    year = r'((?:19|20)\d{2})'
    month = r'(?:(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]*\.?\s+)?'
    range_pattern = (
        r'\b' + year + r'\s*(?:to|[-\u2013\u2014])\s*'
        r'(?:(present|current|now\b)|' + month + year + r'\b)'
    )

    spans = []
    for match in re.finditer(range_pattern, text_lower):
        start_year = int(match.group(1))
        end_year = current_year if match.group(2) else int(match.group(3))
        end_year = min(end_year, current_year)
        if end_year > start_year:  # ignore reversed or zero-length ranges
            spans.append((start_year, end_year))

    # Length of the union of all ranges, so concurrent roles count once
    total_years = 0
    covered_until = None
    for start_year, end_year in sorted(spans):
        if covered_until is None or start_year > covered_until:
            total_years += end_year - start_year
            covered_until = end_year
        elif end_year > covered_until:
            total_years += end_year - covered_until
            covered_until = end_year

    return float(total_years)

# Function to process all resumes in a folder and extract data
def process_resumes_in_folder(folder_path):
    """
    Parse every PDF resume in a folder into a list of flat records.

    Files are visited in sorted name order so that IDs are reproducible, and
    the ".pdf" extension is matched case-insensitively. A PDF that yields no
    text is reported and skipped without using up an ID.

    :param folder_path: directory containing the resume PDFs
    :return: list of dicts with keys "unique_id", "name", "email", "phone",
             "github", "experience_years" and "skills" (comma-separated string)
    """
    resume_data = []
    unique_id = 1  # Start the ID from 1

    # os.listdir() returns entries in arbitrary order; sort for stable IDs
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith('.pdf'):
            continue

        pdf_path = os.path.join(folder_path, filename)
        print(f"Processing {filename}")

        raw_text = extract_text_from_pdf(pdf_path)
        if not raw_text:
            print(f"Skipping {filename}: no text could be extracted")
            continue

        cleaned_text = preprocess_text(raw_text)

        contact_info = extract_contact_info(cleaned_text)
        skills = extract_skills(cleaned_text)
        experience_years = extract_experience(cleaned_text)

        resume_data.append({
            "unique_id": unique_id,
            "name": get_name(cleaned_text),
            "email": contact_info["email"],
            "phone": contact_info["phone"],
            "github": contact_info["github"],
            "experience_years": experience_years,
            "skills": ", ".join(skills)
        })

        unique_id += 1

    return resume_data

# Save the extracted data to a CSV
def save_to_csv(resume_data, output_file):
    """
    Write the extracted records to a CSV file and print a confirmation.

    :param resume_data: list of per-resume dicts (one CSV row each)
    :param output_file: destination path of the CSV file
    """
    pd.DataFrame(resume_data).to_csv(output_file, index=False)
    print(f"Data saved to {output_file}")

# Main script to run the extraction process
# Guard: run the pipeline only when this code is executed directly
# (as a script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    folder_path = "/content/resumes"  # Set the path to your folder of resumes
    output_file = "/content/resumes/extracted_data.csv"  # Path to save the CSV output

    # Parse every PDF in the folder, then write the collected records to the CSV
    resume_data = process_resumes_in_folder(folder_path)
    save_to_csv(resume_data, output_file)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Processing entry-level-data-analyst-resume-example.pdf
Processing Resume-2 (1) (1).pdf
Processing data-analyst-intern-resume-example.pdf
Processing junior-data-analyst-resume-example.pdf
Processing resume_updated.pdf
Processing data-analyst-resume-example.pdf
Data saved to /content/resumes/extracted_data.csv


In [15]:
import pdfplumber
import re
import spacy
import datetime
import nltk
import os
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import wordnet

# spaCy English pipeline, loaded once and shared through the global `nlp`
nlp = spacy.load("en_core_web_sm")

# Download the NLTK data packages (same packages, same order as before)
for _nltk_package in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
    'averaged_perceptron_tagger',
    'stopwords',
    'wordnet',
    'omw-1.4',
):
    nltk.download(_nltk_package)

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    """
    Extract the text of every page of a PDF as a single string.

    Page texts are joined with a newline so the last word of one page does
    not fuse with the first word of the next. Pages that yield no text are
    skipped. Reading is best-effort: on any error a message is printed and
    whatever was extracted before the failure is returned.

    :param pdf_path: path of the PDF file to read
    :return: extracted text; "" when nothing could be read
    """
    page_texts = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                # Guard against pages with no extractable text (empty or None)
                if page_text:
                    page_texts.append(page_text)
    except Exception as e:
        print(f"Error reading PDF: {e}")
    return "\n".join(page_texts)

# Function to preprocess text (cleaning)
def preprocess_text(text):
    """Normalize whitespace: runs become a single space, ends are trimmed."""
    single_spaced = re.sub(r'\s+', ' ', text)
    return single_spaced.strip()

# Function to extract contact info (email, phone, github)
def extract_contact_info(text):
    """
    Find the first e-mail address, phone number and GitHub profile in text.

    :param text: resume text as a single string
    :return: dict with keys "email", "phone" and "github"; each value is the
             matched substring, or None when nothing matched
    """
    email_regex = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    # A lookbehind replaces the old leading \b: \b cannot sit in front of
    # "+" or "(", so "+1 (555) 123-4567" used to lose its leading characters.
    # A bare 10-digit number is still covered (prefix and separators are optional).
    phone_regex = r'(?<!\w)(?:\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b'
    # Dot is escaped; scheme and "www." are optional because resumes often
    # print the profile as plain "github.com/user".
    github_regex = r'(?:https?://)?(?:www\.)?github\.com/[A-Za-z0-9_-]+'

    email = re.search(email_regex, text)
    phone = re.search(phone_regex, text)
    github = re.search(github_regex, text, re.IGNORECASE)

    return {
        "email": email.group() if email else None,
        "phone": phone.group() if phone else None,
        "github": github.group() if github else None
    }

# Function to extract skills by keyword matching
def extract_skills(text):
    """
    Return the known skill keywords that occur in text.

    Matching is case-insensitive and works on whole terms: "Java" does not
    match inside "JavaScript", and ".net" does not match inside "asp.net".
    A hyphen or space in a keyword matches either separator, so
    'machine-learning' and 'Machine Learning' are the same skill. When the
    list holds several spellings of one skill, the first spelling is reported.

    Known limitation: very short keywords such as 'c' also match a stand-alone
    letter in ordinary prose (for example a middle initial).

    :param text: resume text as a single string
    :return: alphabetically sorted list of matched keywords, spelled as in
             the keyword list; [] when nothing matches or text is empty
    """
    # Note the comma between 'Node.js' and '.net': without it Python joins the
    # two literals into 'Node.js.net' and both skills are lost.
    skill_keywords = ['Python', 'Machine Learning', 'NLP', 'Data Analysis', 'Java', 'SQL',
    'C++', 'AWS', 'Docker', 'TensorFlow', 'Keras', 'React', 'Node.js', '.net', '1password', '3d', '3d-reconstruction', 'aboutness',
    'abstract-data-type', 'abstract-interpretation', 'abstract-machine',
    'access-control', 'access-method', 'access-network', 'accounting',
    'active-appearance-model', 'active-database', 'active-networking',
    'active-shape-model', 'apache-activemq', 'activity-recognition',
    'actuarial-science', 'actuator', 'adaboost', 'adaptive-routing',
    'adaptive-system', 'adder', 'adobe-illustrator', 'adobe-photoshop',
    'adobe-xd', 'advertising', 'aerial-photography', 'aeronautics',
    'aerospace-engineering', 'aerospike', 'agile-project-management',
    'agricultural-engineering', 'apache-airflow', 'airtable', 'ajax',
    'akamai', 'akka', 'algolia', 'algorithm', 'algorithm-design',
    'alpine-linux', 'amazon-api-gateway', 'amazon-athena', 'amazon-cloudfront',
    'amazon-cloudwatch', 'amazon-cognito', 'amazon-dynamodb', 'amazon-ebs',
    'amazon-ec2', 'amazon-eks', 'amazon-elasticache', 'amazon-elasticsearch-service',
    'amazon-emr', 'amazon-kinesis', 'amazon-kinesis-firehose', 'amazon-machine-learning',
    'amazon-rds', 'amazon-redshift', 'amazon-route-53', 'amazon-s3',
    'amazon-ses', 'amazon-sns', 'amazon-sqs', 'amazon-vpc', 'ambiguity',
    'am-php', 'amplitude', 'analog-to-digital-converter', 'analysis-of-algorithms',
    'analytics', 'android', 'android-sdk', 'android-studio', 'angular',
    'ansible', 'ant-design', 'apache-ant', 'apache-cordova', 'apache-flink',
    'apache-http-server', 'apache-maven', 'apache-mesos', 'apache-spark',
    'apache-tomcat', 'api', 'api-tools', 'apollo', 'appium', 'arangodb',
    'arduino', 'artificial-intelligence', 'asana', 'asp.net', 'aws',
    'azure', 'babel', 'bash', 'bayesian-inference', 'big-data',
    'bootstrap', 'bot', 'c', 'cakephp', 'celery', 'centos', 'circleci',
    'clojure', 'cloudflare', 'cloudinary', 'cobol', 'codeigniter',
    'coding', 'data-visualization', 'django', 'docker', 'flask', 'git',
    'google-cloud', 'html', 'java', 'javascript', 'jenkins', 'kubernetes',
    'linux', 'machine-learning', 'mysql', 'node.js', 'numpy', 'php',
    'pytorch', 'python', 'react', 'ruby', 'scala', 'scrum', 'sql',
    'tensorflow', 'typescript','vue.js']

    if not text:
        return []

    text_lower = text.lower()
    checked = set()   # normalized keywords already tested
    matched = {}      # normalized keyword -> first spelling from the list
    for keyword in skill_keywords:
        parts = [p for p in re.split(r'[-\s]+', keyword.lower()) if p]
        normalized = '-'.join(parts)
        if not parts or normalized in checked:
            continue
        checked.add(normalized)
        # Term boundaries: no letter/digit right before, and no letter, digit,
        # '+' or '#' right after (so 'c' is not reported for "C++" or "C#").
        pattern = (
            r'(?<![a-z0-9])'
            + r'[-\s]+'.join(re.escape(p) for p in parts)
            + r'(?![a-z0-9+#])'
        )
        if re.search(pattern, text_lower):
            matched[normalized] = keyword

    # Sorted so the output is reproducible from run to run
    return sorted(matched.values(), key=str.lower)

# Function to extract total experience in years
def extract_experience(text):
    """
    Estimate total work experience in years from resume text.

    Explicit statements are tried first, in priority order ("5 years of
    experience", "Total experience: 5 years", "5+ yrs professional
    experience", ...). If none is found, year ranges such as "2018 - 2021",
    "2019 to present" or "Jan 2019 – Mar 2021" are collected and the number
    of calendar years they cover is returned; overlapping ranges are counted
    once and end years in the future are capped at the current year.

    :param text: resume text as a single string
    :return: experience in years as a float; 0.0 when nothing is found
    """
    if not text:
        return 0.0

    text_lower = text.lower()

    # A number with an optional trailing "+", as in "5+ years"
    number = r'(\d+(?:\.\d+)?)\+?'
    experience_patterns = [
        number + r'\s*(?:years?|yrs?)(?:\s*of)?\s*(?:total\s*)?(?:work\s*)?experience',
        # [:\s]* rather than [:\s]: "total experience: 5 years" has both a
        # colon and a space in front of the number.
        r'total\s*experience[:\s]*' + number + r'\s*(?:years?|yrs?)',
        number + r'\s*(?:years?|yrs?)\s*(?:of\s*)?(?:professional\s*)?experience',
        r'work\s*experience[:\s]*' + number + r'\s*(?:years?|yrs?)',
    ]

    for pattern in experience_patterns:
        match = re.search(pattern, text_lower)
        if match:
            return float(match.group(1))

    current_year = datetime.datetime.now().year
    # Years are limited to 19xx/20xx and must start on a word boundary, so
    # the tail of a phone number ("...-4567 - 8901") is not taken for a range.
    year = r'((?:19|20)\d{2})'
    month = r'(?:(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]*\.?\s+)?'
    range_pattern = (
        r'\b' + year + r'\s*(?:to|[-\u2013\u2014])\s*'
        r'(?:(present|current|now\b)|' + month + year + r'\b)'
    )

    spans = []
    for match in re.finditer(range_pattern, text_lower):
        start_year = int(match.group(1))
        end_year = current_year if match.group(2) else int(match.group(3))
        end_year = min(end_year, current_year)
        if end_year > start_year:  # ignore reversed or zero-length ranges
            spans.append((start_year, end_year))

    # Length of the union of all ranges, so concurrent roles count once
    total_years = 0
    covered_until = None
    for start_year, end_year in sorted(spans):
        if covered_until is None or start_year > covered_until:
            total_years += end_year - start_year
            covered_until = end_year
        elif end_year > covered_until:
            total_years += end_year - covered_until
            covered_until = end_year

    return float(total_years)

# Function to extract promotions count from job descriptions
def extract_promotions(text):
    """
    Count promotion-related keywords in text as a rough seniority signal.

    The count is the number of whole-word, case-insensitive occurrences of
    "promoted", "promotion(s)" and the title words lead, senior, head,
    manager, director and chief. Whole-word matching keeps words such as
    "leadership", "ahead" or "header" from being counted.

    :param text: resume text as a single string
    :return: number of keyword occurrences (0 for empty text)
    """
    if not text:
        return 0

    # Phrases indicating promotion
    promotion_keywords = [
        r'promoted', r'promotions?', r'lead', r'senior', r'head', r'manager', r'director', r'chief'
    ]

    # One pass over the lower-cased text instead of one scan per keyword
    promotion_pattern = r'\b(?:' + '|'.join(promotion_keywords) + r')\b'
    return len(re.findall(promotion_pattern, text.lower()))

# Function to process all resumes in a folder and extract data
def process_resumes_in_folder(folder_path):
    """
    Parse every PDF resume in a folder into a list of flat records.

    Files are visited in sorted name order so that IDs are reproducible, and
    the ".pdf" extension is matched case-insensitively. A PDF that yields no
    text is reported and skipped without using up an ID.

    :param folder_path: directory containing the resume PDFs
    :return: list of dicts with keys "unique_id", "name", "email", "phone",
             "github", "experience_years", "skills" (comma-separated string)
             and "promotions"
    """
    resume_data = []
    unique_id = 1  # Start the ID from 1

    # os.listdir() returns entries in arbitrary order; sort for stable IDs
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith('.pdf'):
            continue

        pdf_path = os.path.join(folder_path, filename)
        print(f"Processing {filename}")

        raw_text = extract_text_from_pdf(pdf_path)
        if not raw_text:
            print(f"Skipping {filename}: no text could be extracted")
            continue

        cleaned_text = preprocess_text(raw_text)

        contact_info = extract_contact_info(cleaned_text)
        skills = extract_skills(cleaned_text)
        experience_years = extract_experience(cleaned_text)
        promotions = extract_promotions(cleaned_text)

        resume_data.append({
            "unique_id": unique_id,
            "name": get_name(cleaned_text),
            "email": contact_info["email"],
            "phone": contact_info["phone"],
            "github": contact_info["github"],
            "experience_years": experience_years,
            "skills": ", ".join(skills),
            "promotions": promotions  # Add promotions count here
        })

        unique_id += 1

    return resume_data

# Save the extracted data to a CSV
def save_to_csv(resume_data, output_file):
    """
    Dump the extracted resume records to CSV and report where they went.

    :param resume_data: list of per-resume dicts (one CSV row each)
    :param output_file: destination path of the CSV file
    """
    records_frame = pd.DataFrame(resume_data)
    records_frame.to_csv(output_file, index=False)
    print(f"Data saved to {output_file}")

# Main script to run the extraction process
# Guard: run the pipeline only when this code is executed directly
# (as a script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    folder_path = "/content/resumes"  # Set the path to your folder of resumes
    output_file = "/content/resumes/extracted_data_with_promotions.csv"  # Path to save the CSV output

    # Parse every PDF in the folder, then write the collected records to the CSV
    resume_data = process_resumes_in_folder(folder_path)
    save_to_csv(resume_data, output_file)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Processing entry-level-data-analyst-resume-example.pdf
Processing Resume-2 (1) (1).pdf
Processing data-analyst-intern-resume-example.pdf
Processing junior-data-analyst-resume-example.pdf
Processing resume_updated.pdf
Processing data-analyst-resume-example.pdf
Data saved to /content/resumes/extracted_data_with_promotions.csv
