In [83]:
# Web scrapping using Selenium
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from webdriver_manager.microsoft import EdgeChromiumDriverManager
import time
import pandas as pd

In [85]:
# Setup Edge WebDriver
options = webdriver.EdgeOptions()  
options.use_chromium = True 
# options.add_argument("--headless")

In [87]:
# Start WebDriver
driver = webdriver.Edge(service=Service(EdgeChromiumDriverManager().install()), options=options)

# Open Indeed Jobs Page
job_title = "Data Analyst"
location = "United States"
url = f"https://www.simplyhired.com/search?q={job_title}&l={location}"

driver.get(url)
time.sleep(10) 

# Extract Job Data
jobs = []
q_jobs = []
max_pages = 3  #Adjust based on requirement
current_page = 1 


while current_page <= max_pages:
    print(f"Scraping Page {current_page}...") 

    job_cards = driver.find_elements(By.CLASS_NAME, "css-0") 

    for job in job_cards:
        try:
            title = job.find_element(By.CLASS_NAME, "css-1djbb1k").text.strip() 
            company = job.find_element(By.CLASS_NAME, "css-1sawo7p").text.strip() 
            description = job.find_element(By.CLASS_NAME, "css-jhqp7z").text.strip() 

            try:
                job.click()
                time.sleep(5) 
                qualifications = job.find_elements(By.XPATH, "//span[@data-testid='viewJobQualificationItem']")
                qualification = [q.text for q in qualifications]
                qualification = ", ".join(qualification)
                brief_jd = job.find_elements(By.XPATH, "//div[@data-testid='viewJobBodyJobFullDescriptionContent']")[0].text
              
            except:
                continue

            jobs.append({"Title": title, "Company": company, "Description": description, "Qualification": qualification, "Brief JD": brief_jd})
  
        except:
            continue

    try:
        next_button = driver.find_element(By.XPATH, "//a[@aria-label='Next page']")
        next_button.click()
        time.sleep(10)  
        current_page += 1
    except:
        print("No more pages available or next button not found.")
        break 

df = pd.DataFrame(jobs)
df.head()

Scraping Page 1...
Scraping Page 2...
Scraping Page 3...


Unnamed: 0,Title,Company,Description,Qualification,Brief JD
0,Data Analyst,Aloden Inc —Remote,"Support *CMS certification activities*, includ...","Jira, Microsoft Excel, Waterfall, Enterprise s...",Job Title: Certification Data Analyst\nLocatio...
1,Data Analyst (Remote),"Latica —Palo Alto, CA",The Latica medical data science platform bring...,"Research, Mid-level, SQL, Pandas, AWS, 2 years...",Who We Are?\nLatica (Formerly Lynx.MD) is buil...
2,Data Warehouse Analyst,"ASSYST, Inc. —Tallahassee, FL",Coach and mentor peers in data warehousing con...,"Computer science, Power BI, 7 years, Business ...",ASSYST is seeking a skilled Data Warehouse Ana...
3,Data Analyst (Excel),"StellarMettle Placements —Duluth, GA","Experience in data analysis, reporting, or bus...","Microsoft Excel, Business intelligence, Data a...",We are seeking a highly skilled Data Analyst w...
4,Entry-Level Data Analyst,"Upen Group Inc —Irving, TX","*Bachelor’s degree* in Data Science, Statistic...","Power BI, Microsoft Excel, Data analysis skill...",Entry-Level Data Analyst – Uncover Insights an...


In [89]:
# Check for missing values
df.isnull().sum()

Title            0
Company          0
Description      0
Qualification    0
Brief JD         0
dtype: int64

In [91]:
# Check for datatypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Title          60 non-null     object
 1   Company        60 non-null     object
 2   Description    60 non-null     object
 3   Qualification  60 non-null     object
 4   Brief JD       60 non-null     object
dtypes: object(5)
memory usage: 2.5+ KB


In [93]:
# Drop duplicates
df = df.drop_duplicates()

In [95]:
# ️Standardize text data to remove extra spaces, lower case
df = df.applymap(lambda x: x.strip().lower() if isinstance(x, str) else x)

  df = df.applymap(lambda x: x.strip().lower() if isinstance(x, str) else x)


In [None]:
# Skill Extraction

In [97]:
# Define skillset for Data Analyst
skills = [
    "Python", "SQL", "R", "SAS", "Power BI", "Tableau", "Looker",
    "QlikView", "Google Data Studio", "MySQL", "PostgreSQL", "Microsoft SQL Server",
    "Oracle", "Snowflake", "BigQuery", "Pandas", "NumPy", "Excel", "VBA", "DAX",
    "T-SQL", "Regression Analysis", "Hypothesis Testing", "Time Series Analysis",
    "Forecasting", "Clustering", "Predictive Analytics", "AWS", "Azure", "Google Cloud",
    "Hadoop", "Spark", "Databricks", "Business Intelligence", "KPI Analysis",
    "Data Warehousing", "ETL", "Reporting Automation", "Problem Solving",
    "Communication", "Critical Thinking", "Storytelling with Data"
]

In [99]:
skills = [skill.lower() for skill in skills]

In [None]:
# Skill Extraction with RegEx

In [101]:
import re

def extract_skills(text, skill_list):
    if pd.isna(text): 
        return []
    found_skills = [skill for skill in skill_list if re.search(rf'\b{re.escape(skill)}\b', text, re.IGNORECASE)]
    return found_skills

df['extracted_skills'] = df['Brief JD'].apply(lambda x: extract_skills(x, skills))

In [None]:
# Skill Extraction with NLP

In [103]:
import spacy
from spacy.matcher import PhraseMatcher

# Load NLP model
nlp = spacy.load("en_core_web_sm")

# Add custom skills to match Spacy's PhraseMatcher
skill_patterns = [
    "hypothesis testing", "power bi", "data warehousing", 
    "business intelligence", "problem solving", 
    "regression analysis", "critical thinking"
]
matcher = PhraseMatcher(nlp.vocab)
patterns = [nlp(skill) for skill in skill_patterns]
matcher.add("SKILL", patterns)

In [105]:
# Function to extract skills using both Spacy and custom matcher
def extract_skills_nlp(text):
    doc = nlp(text.lower())
    found_skills = set()
    
    # Extract skills from Named Entities
    for token in doc:
        if token.text in skills:
            found_skills.add(token.text)
            
    # Extract skills from custom matcher
    matches = matcher(doc)
    for match_id, start, end in matches:
        found_skills.add(doc[start:end].text.lower())

    return list(found_skills)

df['extracted_skills_spacy'] = df['Brief JD'].apply(extract_skills_nlp)

In [None]:
# Compare RegEx and NLP extracted skills

In [107]:
# FROM RegExp
from collections import Counter
all_skills = sum(df['extracted_skills'], [])  # Flatten list
skill_counts = Counter(all_skills)

skill_df = pd.DataFrame(skill_counts.items(), columns=['Skill', 'Count']).sort_values(by="Count", ascending=False)

print(skill_df.head(10))

                    Skill  Count
2                     sql     38
12          communication     30
0                   excel     25
6                 tableau     23
5                power bi     20
1                  python     19
9   business intelligence     13
14                      r     12
4                     aws      8
23            forecasting      6


In [109]:
# FROM NLP
from collections import Counter
all_skills_nlp = sum(df['extracted_skills_spacy'], [])  # Flatten list
skill_counts_ = Counter(all_skills)

skill_df_nlp = pd.DataFrame(skill_counts.items(), columns=['Skill', 'Count']).sort_values(by="Count", ascending=False)

print(skill_df_nlp.head(10))

                    Skill  Count
2                     sql     38
12          communication     30
0                   excel     25
6                 tableau     23
5                power bi     20
1                  python     19
9   business intelligence     13
14                      r     12
4                     aws      8
23            forecasting      6


In [111]:
# Convert both results to sets for easy comparison
regex_skills_set = set(skill_df['Skill']) 
nlp_skills_set = set(skill_df_nlp['Skill'])

In [113]:
# Find missing skills in NLP
missing_in_nlp = regex_skills_set - nlp_skills_set 
missing_in_regex = nlp_skills_set - regex_skills_set

In [115]:
# Print results
print(f"Total skills extracted using RegEx: {len(regex_skills_set)}")
print(f"Total skills extracted using NLP: {len(nlp_skills_set)}")
print(f"Skills missing in NLP: {missing_in_nlp}")
print(f"Skills missing in RegEx: {missing_in_regex}")

Total skills extracted using RegEx: 27
Total skills extracted using NLP: 27
Skills missing in NLP: set()
Skills missing in RegEx: set()


In [None]:
# Extract Skills from Resume

In [117]:
import pdfplumber

def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() + "\n"
    return text

# Example Usage
pdf_path = "data-analyst-resume-example.pdf"
resume_text = extract_text_from_pdf(pdf_path)
print(resume_text) 

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


Ralf Kendal
Phone: (888)555-1111
Email:name@email.com
Professional Objective
To use my decade of experience as a data analyst to manage large sets of data, and produce insights to help the Illinois
Department of Natural Resources achieve its objectives.
Skills
• SQL
• Python
• Evaluating data sets for integrity
• Data cleansing and preparation for analytics
• Gathering and understanding of user requirements
• Visualizing and interpreting reports
• Identifying potential risks and sources of data corruption
Professional Experience
Oleson Fisheries Management
Senior Data Analyst
June 2017 — Present
• Provided data analysis and other support functions to various research teams, and other business areas.
• Cleaned and anonymized nearly 5 terabytes of data.
• Monitors data transfers and backups.
• Uses SQL and other tools to create comprehensive data reports.
• Manages the collection, storage, and analysis of aquatic life for facilities across five states.
University of Iowa: Life Sciences D

In [119]:
# Extract Only the "Skills" Section
pattern = r'Skills(.*?)(?=\n(?:Professional Experience|Education|Certification|Achievement|Project)|$)'

match = re.search(pattern, resume_text, re.DOTALL)

if match:
    skill_resume = match.group(1).strip()
    print(skill_resume)

• SQL
• Python
• Evaluating data sets for integrity
• Data cleansing and preparation for analytics
• Gathering and understanding of user requirements
• Visualizing and interpreting reports
• Identifying potential risks and sources of data corruption


In [121]:
# Convert extracted text into a list
resume_skills = [skill.strip() for skill in skill_resume.split("\n") if skill.strip()]

In [123]:
# Remove special characters (like •) and trim spaces
cleaned_skills = [re.sub(r"[^a-zA-Z0-9\s]", "", skill).strip() for skill in resume_skills]

In [125]:
cleaned_skills

['SQL',
 'Python',
 'Evaluating data sets for integrity',
 'Data cleansing and preparation for analytics',
 'Gathering and understanding of user requirements',
 'Visualizing and interpreting reports',
 'Identifying potential risks and sources of data corruption']

In [None]:
# Find Matching and Missing Skills

In [127]:
cleaned_skills = [x.lower() for x in cleaned_skills]

In [129]:
resume_skills = set(cleaned_skills)
job_skills = set(skill_df_nlp['Skill'])

In [131]:
matching_skills = resume_skills.intersection(job_skills)
missing_skills = job_skills - resume_skills 

In [133]:
# Calculate match percentage
match_percentage = round((len(matching_skills) / len(job_skills)) * 100, 2)

# Display results
print(f"Matching Skills: {list(matching_skills)}")
print(f"Missing Skills: {list(missing_skills)}")
print(f"Skill Match Percentage: {match_percentage}%")

Matching Skills: ['sql', 'python']
Missing Skills: ['forecasting', 'postgresql', 'microsoft sql server', 'data warehousing', 'snowflake', 'looker', 'google cloud', 'critical thinking', 'oracle', 'tableau', 'r', 'aws', 'vba', 'business intelligence', 'dax', 'pandas', 'azure', 'excel', 'predictive analytics', 'communication', 'sas', 'regression analysis', 'power bi', 'problem solving', 'etl']
Skill Match Percentage: 7.41%


In [None]:
# Similarly same steps can be applied to find Experience, Education, Certification 
# and Keywords from both Job description and Resume for comparison