In [2]:
import ssl
from urllib.request import Request, urlopen
from urllib.error import HTTPError
from html.parser import HTMLParser
from bs4 import BeautifulSoup
from io import StringIO
import requests
import time
import re
import pandas as pd
import json
import random
import os

# Disable SSL verification globally for urllib.
# This swaps the ssl module's default HTTPS context factory for the unverified
# one, so HTTPS requests made through urllib after this line do not validate
# server certificates.
# NOTE(review): this relies on private ssl attributes and removes protection
# against man-in-the-middle attacks for the whole process — confirm it is
# really required before keeping it.
ssl._create_default_https_context = ssl._create_unverified_context

class MLStripper(HTMLParser):
    """HTML parser that discards markup and accumulates only the text nodes.

    Feed markup with ``feed()`` and read the collected text back with
    ``get_data()``.
    """

    def __init__(self):
        # HTMLParser.__init__ stores convert_charrefs and performs the reset.
        super().__init__(convert_charrefs=True)
        self.strict = False
        # Buffer that receives every chunk of character data seen so far.
        self.text = StringIO()

    def handle_data(self, d):
        """Append one chunk of character data to the buffer."""
        self.text.write(d)

    def get_data(self):
        """Return everything collected so far as a single string."""
        return self.text.getvalue()

class LeverWebScraper:
    """Collect and score job postings from Lever-hosted job boards.

    The board URLs are read from a shared Google Sheet, each board page is
    downloaded, every ``div.posting`` element is reduced to a one-line text
    summary, and each summary receives a small keyword-based score.
    """

    # Board URLs on jobs.lever.co and on the EU host jobs.eu.lever.co.
    _LEVER_URL_RE = re.compile(r"(https?://jobs\.(?:eu\.)?lever\.co/[^\s]+)")
    # lower-case letter directly followed by an upper-case one ("EngineerRemote").
    _CAMEL_RE = re.compile(r'([a-z])([A-Z])')
    # [^\W\d_] is "any Unicode letter"; [\W\d_] is "anything that is not a letter".
    _LETTER_THEN_OTHER_RE = re.compile(r'([^\W\d_])([\W\d_])')
    _OTHER_THEN_LETTER_RE = re.compile(r'([\W\d_])([^\W\d_])')
    _WHITESPACE_RE = re.compile(r'\s+')

    def __init__(self, request_timeout=30, delay_seconds=1.0):
        """Create a scraper.

        Args:
            request_timeout: Seconds to wait on each HTTP request before
                giving up.
            delay_seconds: Pause after each successfully fetched board page.
        """
        # Not assigned by any method of this class; kept for callers.
        self.urls = None
        self.request_timeout = request_timeout
        self.delay_seconds = delay_seconds

    def extract_urls(self, text):
        """Return every Lever job-board URL found in *text*, in order.

        Only URLs whose host is ``jobs.lever.co`` or ``jobs.eu.lever.co`` are
        returned; a URL ends at the first whitespace character.
        """
        return self._LEVER_URL_RE.findall(text)

    def remove_tags(self, html):
        """Return the visible text of *html*, joined by single spaces.

        ``<style>`` and ``<script>`` elements are removed before the text is
        extracted.
        """
        soup = BeautifulSoup(html, "html.parser")
        for data in soup(['style', 'script']):
            data.decompose()
        return ' '.join(soup.stripped_strings)

    def get_lever_sites(self):
        """Download the shared Google Sheet page and return the board URLs in it.

        Raises:
            requests.RequestException: If the sheet cannot be downloaded or
                the server answers with an error status.
        """
        lever_gsheet = "https://docs.google.com/spreadsheets/d/18u2sKRKjKz9gwRyob0p9KmcyVC6NX8JaJhjOqsRmbKY/edit?usp=sharing"
        # NOTE(review): verify=False disables certificate validation for this
        # request — confirm it is still needed.
        res = requests.get(url=lever_gsheet, verify=False, timeout=self.request_timeout)
        # Fail loudly instead of silently parsing an error page into "no URLs".
        res.raise_for_status()
        content = self.remove_tags(res.content)
        return self.extract_urls(content)

    def strip_tags(self, html):
        """Return *html* with all markup removed (text nodes only)."""
        s = MLStripper()
        s.feed(html)
        # close() flushes text the parser may still be buffering, e.g. a
        # trailing fragment containing "&" that is not followed by a tag.
        s.close()
        return s.get_data()

    @classmethod
    def _normalize_posting_text(cls, text):
        """Turn the raw text of a posting into a spaced, tagged one-line summary."""
        cleaned = cls._CAMEL_RE.sub(r'\1 \2', text)
        cleaned = cls._LETTER_THEN_OTHER_RE.sub(r'\1 \2', cleaned)
        cleaned = cls._OTHER_THEN_LETTER_RE.sub(r'\1 \2', cleaned)
        cleaned = cls._WHITESPACE_RE.sub(' ', cleaned)
        cleaned = cleaned.replace("( ", "").replace(" )", "")
        # Drop only the leading "Apply" button label, not later occurrences.
        if cleaned.startswith('Apply'):
            cleaned = cleaned[len('Apply'):]
        # Move only the trailing "Remote" marker to the front as a tag.
        if cleaned.endswith('Remote'):
            cleaned = '[REMOTE] - ' + cleaned[:-len('Remote')]
        if "Full - Time" in cleaned:
            cleaned = '[Full-Time]' + cleaned.replace("Full - Time", "")
        return cleaned.replace("  ", " ").strip()

    @staticmethod
    def _score_posting(clean_posting):
        """Return the keyword score of a cleaned posting summary."""
        points = 0
        # Case-insensitive check: both sides are lower-case.
        if "data analytics director " in clean_posting.lower():
            points += 2
        if "Data Analytics Vice President USA" in clean_posting:
            points += 1
        if "REMOTE" in clean_posting:
            points += 1
        if "Business Intelligence" in clean_posting:
            points += 1
        return points

    def clean_job_posting(self, posting):
        """Return a one-line text summary for the HTML of a single posting.

        Markup is stripped, spaces are inserted at letter/non-letter and
        camel-case boundaries, a leading ``Apply`` is dropped, a trailing
        ``Remote`` becomes a ``[REMOTE] - `` prefix, and ``Full - Time``
        becomes a ``[Full-Time]`` prefix.
        """
        return self._normalize_posting_text(self.strip_tags(posting))

    def get_postings(self):
        """Scrape every non-EU board and return the scored postings.

        Boards that cannot be fetched are reported on stdout and skipped.

        Returns:
            dict: Three parallel lists under the keys ``'urls'``,
            ``'descriptions'`` and ``'points'``.
        """
        map0 = {'urls': [], 'descriptions': [], 'points': []}
        # Normalise to a trailing slash, de-duplicate, then drop EU-hosted boards.
        links = {
            link if link.endswith("/") else link + "/"
            for link in self.get_lever_sites()
        }
        links = [link for link in links if ".eu." not in link]
        random.shuffle(links)

        for current_url in links:
            req = Request(current_url, headers={'User-Agent': 'Mozilla/5.0'})
            try:
                with urlopen(req, timeout=self.request_timeout) as response:
                    html_page = response.read()
            except HTTPError as e:
                if e.code == 404:
                    print("HTTP 404 error: Page not found")
                else:
                    print("An HTTP error occurred:", e)
                continue
            except OSError as e:
                # URLError and socket timeouts are OSError subclasses; one
                # unreachable board must not abort the whole run.
                print("A network error occurred:", e)
                continue

            soup = BeautifulSoup(html_page, 'html.parser')
            for posting in soup.find_all("div", {"class": "posting"}):
                clean_posting = self.clean_job_posting(str(posting))
                clean_posting = clean_posting.replace('Sr . ', 'Sr. ')
                map0['urls'].append(current_url)
                map0['descriptions'].append(clean_posting)
                map0['points'].append(self._score_posting(clean_posting))
                print(current_url, '\t', clean_posting)
            time.sleep(self.delay_seconds)
        return map0

# Scrape every board and collect the scored postings.
lever_scraper = LeverWebScraper()
postings_map = lever_scraper.get_postings()

# Tabulate the results with the highest-scoring postings first.
df = pd.DataFrame(postings_map).sort_values(by=['points'], ascending=False)

# Widen the column display so long descriptions are not cut off.
with pd.option_context('display.max_colwidth', 400):
    display(df.head(50))




HTTP 404 error: Page not found
HTTP 404 error: Page not found
https://jobs.lever.co/stackadapt/ 	 [Full-Time] Director , Identity Solutions Remote — United States
https://jobs.lever.co/stackadapt/ 	 [Full-Time] Director , Inventory Development England Remote — United Kingdom
https://jobs.lever.co/stackadapt/ 	 [Full-Time] Director , Inventory Development France Remote — France
https://jobs.lever.co/stackadapt/ 	 [Full-Time] Technical Product Manager - Mobile App Machine Learning Remote — United States
https://jobs.lever.co/stackadapt/ 	 [Full-Time] Technical Product Manager , B 2 B Performance & Machine Learning Remote — United States
https://jobs.lever.co/stackadapt/ 	 [Full-Time] Lead Engineer - Integrations Services Remote — United States
https://jobs.lever.co/stackadapt/ 	 [Full-Time] Lead Engineer - Integrations Services Remote — Canada
https://jobs.lever.co/stackadapt/ 	 [Full-Time] Software Engineer , Backend Intermediate / Senior Remote — Canada
https://jobs.lever.co/stackadapt

Unnamed: 0,urls,descriptions,points
901,https://jobs.lever.co/restaurant365/,"[REMOTE] - Manager , Business Intelligence — Full Time",2
892,https://jobs.lever.co/restaurant365/,[REMOTE] - Customer Success Manager — Full Time,1
975,https://jobs.lever.co/restaurant365/,"[REMOTE] - Account Executive , Strategic Accounts — Full Time",1
2692,https://jobs.lever.co/Hydrow/,"[Full-Time][REMOTE] - Graphic Designer Hybrid — Boston , MA or",1
446,https://jobs.lever.co/floqast/,[REMOTE] - Account Manager — Full - time,1
445,https://jobs.lever.co/floqast/,"[REMOTE] - Product Manager , Intercompany — Full - time",1
444,https://jobs.lever.co/floqast/,[REMOTE] - Product Manager — Full - time,1
442,https://jobs.lever.co/floqast/,[REMOTE] - Alliance Manager — Full - time,1
984,https://jobs.lever.co/restaurant365/,[REMOTE] - Site Reliability Engineer — Full Time,1
980,https://jobs.lever.co/restaurant365/,[REMOTE] - Revenue Enablement Technology Specialist — Full Time,1


In [8]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Board page to scrape.
url = "https://jobs.lever.co/restaurant365/"

# Fetch the page. The timeout keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() surfaces HTTP errors here rather than
# letting an error page be reported below as a selector problem.
response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Each job title is an <h5 data-qa="posting-name"> element.
postings = soup.select("h5[data-qa='posting-name']")

# Report how many title elements the selector matched.
if not postings:
    print("No job postings found. Please check the selector.")
else:
    print(f"Found {len(postings)} job postings.")

def filter_postings_by_keywords(postings, keywords):
    """Return the titles of the postings that mention at least one keyword.

    Args:
        postings: Iterable of elements exposing ``get_text(strip=True)``.
        keywords: Strings matched as case-insensitive substrings of a title.

    Returns:
        list: Matching titles, in the order the postings were given.
    """
    titles = (posting.get_text(strip=True) for posting in postings)
    return [
        title
        for title in titles
        if any(keyword.lower() in title.lower() for keyword in keywords)
    ]

# Destination of the CSV export.
output_file = "filtered_job_postings.csv"

# Keywords a job title must contain to be kept; edit as needed.
keywords = ["Data"]

# Keep only the titles that mention at least one keyword.
filtered_postings = filter_postings_by_keywords(postings, keywords)

# One-column table built from at most the first 50 matching titles.
job_data = {'Job Title': filtered_postings[:50]}
df = pd.DataFrame(data=job_data)

# Write the table without the index column, then confirm where it went.
df.to_csv(path_or_buf=output_file, index=False)
print(f"Filtered job postings saved to {output_file}")

# Show up to 50 rows of the result.
print(df.head(n=50))



Found 94 job postings.
Filtered job postings saved to filtered_job_postings.csv
                   Job Title
0  Manager, Data Engineering
1       Senior Data Engineer


In [13]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# CSV export of the Google Sheet that lists the board URLs.
sheet_url = "https://docs.google.com/spreadsheets/d/18u2sKRKjKz9gwRyob0p9KmcyVC6NX8JaJhjOqsRmbKY/export?format=csv"

# NOTE(review): read_csv treats the first row as the header by default, so a
# sheet without a header row would lose its first URL — confirm the layout.
urls_df = pd.read_csv(sheet_url)

# Assuming the URLs are in the first column (modify the column index if necessary).
# Blank cells are read as NaN, so drop them and strip stray whitespace to leave
# only usable URL strings.
urls = urls_df.iloc[:, 0].dropna().astype(str).str.strip().tolist()
urls = [candidate for candidate in urls if candidate]

# Accumulates the matching job titles from every board.
all_filtered_postings = []

# Keywords a job title must contain to be kept; add or modify as necessary.
keywords = [
    "Data"
]


def filter_postings_by_keywords(postings, keywords):
    """Return the titles of the postings that mention at least one keyword.

    Args:
        postings: Iterable of elements exposing ``get_text(strip=True)``.
        keywords: Strings matched as case-insensitive substrings of a title.

    Returns:
        list: Matching titles, in the order the postings were given.
    """
    # Lower-case the keywords once instead of once per title.
    lowered_keywords = [keyword.lower() for keyword in keywords]
    filtered_postings = []
    for posting in postings:
        title = posting.get_text(strip=True)
        lowered_title = title.lower()
        if any(keyword in lowered_title for keyword in lowered_keywords):
            filtered_postings.append(title)
    return filtered_postings


# Scrape each board in turn; a board that fails is reported and skipped so the
# remaining boards are still processed.
for url in urls:
    # Blank sheet cells can arrive as NaN floats rather than strings.
    if not isinstance(url, str) or not url.strip():
        continue

    print(f"Scraping URL: {url}")
    try:
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Request failed for {url}: {exc}")
        continue
    soup = BeautifulSoup(response.content, 'html.parser')

    # Each job title is an <h5 data-qa="posting-name"> element.
    postings = soup.select("h5[data-qa='posting-name']")

    if not postings:
        print(f"No job postings found at {url}.")
        continue

    # Keep the titles that match and add them to the overall list.
    filtered_postings = filter_postings_by_keywords(postings, keywords)
    all_filtered_postings.extend(filtered_postings)

# Destination of the CSV export.
output_file = "filtered_job_postings.csv"

# One-column table built from at most the first 50 collected titles.
job_data = {'Job Title': all_filtered_postings[:50]}
df = pd.DataFrame(data=job_data)

# Write the table without the index column, then confirm where it went.
df.to_csv(path_or_buf=output_file, index=False)
print(f"Filtered job postings saved to {output_file}")

# Show up to 50 rows of the result.
print(df.head(n=50))



Scraping URL: https://jobs.lever.co/certifid/
Scraping URL: https://jobs.eu.lever.co/yokoy
Scraping URL: https://jobs.eu.lever.co/uptempo
Scraping URL: https://jobs.lever.co/nautical-commerce
No job postings found at https://jobs.lever.co/nautical-commerce.
Scraping URL: https://jobs.lever.co/orum/
Scraping URL: https://jobs.lever.co/secureframe/
Scraping URL: https://jobs.lever.co/stackadapt/
Scraping URL: https://jobs.lever.co/restaurant365/
Scraping URL: https://jobs.lever.co/1password/
Scraping URL: https://jobs.lever.co/dexcarehealth
Scraping URL: https://jobs.lever.co/intenseye/
Scraping URL: https://jobs.lever.co/tegus
Scraping URL: https://jobs.lever.co/aurorasolar/
No job postings found at https://jobs.lever.co/aurorasolar/.
Scraping URL: https://jobs.lever.co/starburstdata
Scraping URL: https://jobs.lever.co/verkada
No job postings found at https://jobs.lever.co/verkada.
Scraping URL: https://jobs.lever.co/canva/
No job postings found at https://jobs.lever.co/canva/.
Scraping