## README:

##### 1. Upload the skills.txt file.
##### 2. Download job_details.xlsx once execution is done.

In [3]:
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import os

In [17]:
# This function finds the last page number from the pagination "Last" link.

def get_page_count(title_link_response):
    """Return the page number that the pagination "Last" link points to.

    The HTML in ``title_link_response.content`` is parsed, the
    ``<ul class="pagination pagination-custom">`` element is located, and
    inside it the ``<a>`` whose text is exactly "Last". The number is read
    from the ``page=<digits>`` part of that link's ``href``.

    Returns:
        The page number as an ``int``, or ``None`` when the pagination list,
        the "Last" link, its ``href`` or the ``page=`` number is missing.
    """
    parsed = BeautifulSoup(title_link_response.content, 'html.parser')

    # No pagination section means the count cannot be determined.
    pager = parsed.find('ul', class_='pagination pagination-custom')
    if not pager:
        return None

    # The link must exist and actually carry an href attribute.
    last_anchor = pager.find('a', string="Last")
    if not last_anchor or 'href' not in last_anchor.attrs:
        return None

    # Pull the digits that follow "page=" out of the link target.
    page_match = re.search(r'page=(\d+)', last_anchor['href'])
    return int(page_match.group(1)) if page_match else None

In [7]:
# Constants

# Site root; relative job links are appended to it to build full URLs.
BASE_URL = "https://www.hireitpeople.com/"

# Text file with one skill per line, read by load_skills().
SKILLS_FILE = "skills.txt"

# Inclusive range of listing pages passed to scrape_job_links().
end_page_count = 2
start_page_count = 1

# NOTE(review): not referenced by any code visible in this notebook -
# confirm whether it is still needed.
given_skills = dict()

In [8]:
def load_skills():
    """Read SKILLS_FILE and return its non-blank lines as a set of skills.

    Each line is stripped of surrounding whitespace and lowercased; lines
    that are empty after stripping are ignored. The file is read as UTF-8.
    """
    normalized = set()
    with open(SKILLS_FILE, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            entry = raw_line.strip()
            if entry:
                normalized.add(entry.lower())
    return normalized

In [9]:
def extract_matching_skills(job_text, skills_list):
    """Return the skills from ``skills_list`` that occur in ``job_text``.

    Matching is done against the lowercased job text, so the skills are
    expected to be lowercase already (as produced by ``load_skills``).
    A skill only counts when it appears as a whole token, i.e. it is not
    directly preceded or followed by a letter, digit or underscore. This
    stops "java" from matching "javascript" and "go" from matching "google",
    which plain substring matching would report.

    Args:
        job_text: Text of the job posting.
        skills_list: Iterable of lowercase skill strings.

    Returns:
        The matched skills, de-duplicated, sorted alphabetically and joined
        with ", ". An empty string when nothing matches.
    """
    job_text_lower = job_text.lower()
    matched_skills = {
        skill
        for skill in skills_list
        # Skip empty entries: an empty pattern would match any text.
        if skill
        # re.escape keeps skills such as "c++" or ".net" literal; the
        # lookarounds work even when the skill starts or ends with punctuation
        # (where \b would not).
        and re.search(r'(?<!\w)' + re.escape(skill) + r'(?!\w)', job_text_lower)
    }
    # Sort so the output is stable between runs (set order is not).
    return ", ".join(sorted(matched_skills))

In [10]:
# Get all the scraped job links

def scrape_job_links(start_page_count, end_page_count, delay_seconds=10, timeout=30):
    """Collect job links from a range of listing pages.

    For every page number from ``start_page_count`` to ``end_page_count``
    (both inclusive) the listing page is fetched and, for each
    ``<article class="post-classic">``, the ``href`` of the first link inside
    its ``<h4>`` is collected.

    Args:
        start_page_count: First page number to scrape.
        end_page_count: Last page number to scrape (inclusive).
        delay_seconds: Pause after each successfully fetched page.
        timeout: Per-request timeout in seconds passed to ``requests.get``.

    Returns:
        List of ``href`` values in the order they were found. Pages that
        fail to download (connection error, timeout or non-200 status) are
        reported and skipped.
    """
    base_url = 'https://www.hireitpeople.com/jobs?page={}'
    job_links = []

    for page in range(start_page_count, end_page_count + 1):
        url = base_url.format(page)
        print(f"Scraping: {url}")

        try:
            # NOTE(review): verify=False disables TLS certificate checks;
            # kept as in the original, but confirm it is really required.
            # The timeout keeps a stalled connection from hanging the run.
            response = requests.get(url, verify=False, timeout=timeout)
        except requests.RequestException as exc:
            # One unreachable page should not abort the whole crawl.
            print(f"Failed to retrieve page {page}: {exc}")
            continue

        if response.status_code != 200:
            print(f"Failed to retrieve page {page}, status code: {response.status_code}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')

        # Each job listing is an <article class="post-classic"> whose title
        # link sits inside an <h4>.
        for article in soup.find_all('article', class_='post-classic'):
            h4_tag = article.find('h4')
            if not h4_tag:
                continue
            a_tag = h4_tag.find('a', href=True)
            if a_tag:
                job_links.append(a_tag['href'])

        # Delay to respect the server (adjust as needed)
        time.sleep(delay_seconds)

    return job_links


In [11]:
# Scrape the job details for the given job links


def scrape_job_details(job_links, output_file='job_details.xlsx', timeout=30, delay_seconds=10):
    """Scrape each job page, match skills, and append the rows to an Excel file.

    For every link the page is downloaded and one row is built containing:
    the full URL, the text of the first ``<h3>`` as "Job Title" ('N/A' when
    absent), every ``<dt>``/``<dd>`` pair found in
    ``<dl class="dl-horizontal">`` elements (the ``<dt>`` text without colons
    is the column name), and a "Skills" column with the skills from
    ``load_skills()`` found in the page text.

    Args:
        job_links: Iterable of job links, normally relative to ``BASE_URL``.
        output_file: Excel file to write. If it already exists, the new rows
            are appended to its current contents.
        timeout: Per-request timeout in seconds passed to ``requests.get``.
        delay_seconds: Pause after each successfully scraped job page.

    Returns:
        None. The result is written to ``output_file``.
    """
    # Local import so the block does not depend on the file's import cell.
    from urllib.parse import urljoin

    job_data = []
    skills_list = load_skills()  # Load skills from file once

    for job_link in job_links:
        # urljoin gives the same result as plain concatenation for links like
        # "jobs/123-title", and also copes with a leading "/" or an already
        # absolute link.
        full_url = urljoin(BASE_URL, job_link)
        print(f"Scraping job details from: {full_url}")

        try:
            # NOTE(review): verify=False disables TLS certificate checks;
            # kept as in the original, but confirm it is really required.
            # The timeout keeps a stalled connection from hanging the run.
            response = requests.get(full_url, verify=False, timeout=timeout)
        except requests.RequestException as exc:
            # Skip this job instead of losing every row collected so far.
            print(f"Failed to retrieve job details from {full_url}: {exc}")
            continue

        if response.status_code != 200:
            print(f"Failed to retrieve job details from {full_url}, status code: {response.status_code}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        job_info = {'URL': full_url}  # Store job URL

        # Extract Job Title from the first <h3>
        job_title_tag = soup.find('h3')
        job_info['Job Title'] = job_title_tag.text.strip() if job_title_tag else 'N/A'

        # Every <dt>/<dd> pair in a <dl class="dl-horizontal"> becomes a column.
        for dl in soup.find_all('dl', class_='dl-horizontal'):
            for dt, dd in zip(dl.find_all('dt'), dl.find_all('dd')):
                key = dt.text.strip().replace(':', '')  # Column name
                job_info[key] = dd.text.strip()  # Value

        # The whole page text is used as the job description for matching.
        job_description = soup.get_text(separator=" ", strip=True)

        # Extract Matching Skills
        job_info['Skills'] = extract_matching_skills(job_description, skills_list)

        job_data.append(job_info)
        time.sleep(delay_seconds)  # Respect the server's request

    # Convert new data to DataFrame
    new_df = pd.DataFrame(job_data)

    # Check if file exists, append to existing data
    if os.path.exists(output_file):
        existing_df = pd.read_excel(output_file)
        final_df = pd.concat([existing_df, new_df], ignore_index=True)  # Append data
    else:
        final_df = new_df  # Create a new file if not found

    # Save updated data to Excel
    final_df.to_excel(output_file, index=False)
    print(f"Job details appended to {output_file}")

In [12]:


# Driver cell: check that the job listings page is reachable, then scrape the
# configured page range and store the details of every job found.
url = 'https://www.hireitpeople.com/jobs'
time.sleep(10)
try:
    # NOTE(review): verify=False disables TLS certificate checks; kept as in
    # the original, but confirm it is really required.
    # The timeout keeps a stalled connection from hanging the notebook.
    response = requests.get(url, verify=False, timeout=30)
except requests.RequestException as exc:
    # Connection errors and timeouts are reported instead of raising.
    print(f"Failed to retrieve the page: {exc}")
else:
    # Check if the request was successful
    if response.status_code == 200:
        # get_page_count(response) can be called here to look up the last
        # page number instead of relying on end_page_count.

        print("all the job libks: ")
        job_links = scrape_job_links(start_page_count, end_page_count)
        print(job_links)

        # Store data from each link to an Excel sheet
        scrape_job_details(job_links)
    else:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")




all the job libks: 
Scraping: https://www.hireitpeople.com/jobs?page=1
Scraping: https://www.hireitpeople.com/jobs?page=2




['jobs/41312-senior-fullstack-ai-ml-principal-consultant-lead', 'jobs/41311-c-net-developer', 'jobs/41310-pharmacist-in-charge-non-retail-pharmacy', 'jobs/41309-software-quality-engineer', 'jobs/41308-software-quality-engineer', 'jobs/41307-help-desk-analyst', 'jobs/41306-programming-language-c-go-lang-developer', 'jobs/41305-fast-developer', 'jobs/41304-it-lead-consultant-data-services-testing-test-data-management', 'jobs/41303-secretary', 'jobs/41302-capital-markets-basics-golden-source-product-master-sme', 'jobs/41301-devops-engineer', 'jobs/41300-it-office-administrator', 'jobs/41299-azure-devops-infrastructure-engineer', 'jobs/41298-data-collection-staff-legal-research', 'jobs/41297-gis-it-support-specialist-esri-arcgis-desktop', 'jobs/41296-programming-language-c-go-lang-developer', 'jobs/41295-it-technology-lead-analytics-functional-business-analyst', 'jobs/41294-it-project-manager', 'jobs/41293-flexplm-senior-developer-consultant-pdm-plm-flexplm']
Scraping job details from: https://www.hireitpeople.com/jobs/41312-senior-fullstack-ai-ml-principal-consultant-lead



Scraping job details from: https://www.hireitpeople.com/jobs/41311-c-net-developer




Scraping job details from: https://www.hireitpeople.com/jobs/41310-pharmacist-in-charge-non-retail-pharmacy




Scraping job details from: https://www.hireitpeople.com/jobs/41309-software-quality-engineer




Scraping job details from: https://www.hireitpeople.com/jobs/41308-software-quality-engineer




Scraping job details from: https://www.hireitpeople.com/jobs/41307-help-desk-analyst




Scraping job details from: https://www.hireitpeople.com/jobs/41306-programming-language-c-go-lang-developer




Scraping job details from: https://www.hireitpeople.com/jobs/41305-fast-developer




Scraping job details from: https://www.hireitpeople.com/jobs/41304-it-lead-consultant-data-services-testing-test-data-management




Scraping job details from: https://www.hireitpeople.com/jobs/41303-secretary




Scraping job details from: https://www.hireitpeople.com/jobs/41302-capital-markets-basics-golden-source-product-master-sme




Scraping job details from: https://www.hireitpeople.com/jobs/41301-devops-engineer




Scraping job details from: https://www.hireitpeople.com/jobs/41300-it-office-administrator




Scraping job details from: https://www.hireitpeople.com/jobs/41299-azure-devops-infrastructure-engineer




Scraping job details from: https://www.hireitpeople.com/jobs/41298-data-collection-staff-legal-research




Scraping job details from: https://www.hireitpeople.com/jobs/41297-gis-it-support-specialist-esri-arcgis-desktop




Scraping job details from: https://www.hireitpeople.com/jobs/41296-programming-language-c-go-lang-developer




Scraping job details from: https://www.hireitpeople.com/jobs/41295-it-technology-lead-analytics-functional-business-analyst




Scraping job details from: https://www.hireitpeople.com/jobs/41294-it-project-manager




Scraping job details from: https://www.hireitpeople.com/jobs/41293-flexplm-senior-developer-consultant-pdm-plm-flexplm




Job details appended to job_details.xlsx
