In [20]:
from bs4 import BeautifulSoup
import requests
import re
import time
import random
from html.parser import HTMLParser
from io import StringIO
import ssl
import urllib3
import warnings

# Silence only urllib3's InsecureRequestWarning (other warning categories are
# left alone). Requests made with verification turned off would otherwise emit
# this warning on every call.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# NOTE(review): this swaps the ssl module's default HTTPS context factory for
# the unverified one, process-wide. It relies on private (underscore) names of
# the ssl module and removes certificate checking for code that uses that
# default context - confirm this is really needed before reusing this file.
ssl._create_default_https_context = ssl._create_unverified_context

class MLStripper(HTMLParser):
    """HTML parser that throws away markup and accumulates only text nodes.

    Feed it HTML with ``feed()`` and read the collected text back with
    ``get_data()``. Character references are decoded before the text is
    stored because ``convert_charrefs`` is enabled.
    """

    def __init__(self):
        # HTMLParser.__init__ already performs the initial reset().
        super().__init__(convert_charrefs=True)
        self.strict = False
        # Text chunks are appended here in document order.
        self.text = StringIO()

    def handle_data(self, d):
        """Store one chunk of text found between tags."""
        self.text.write(d)

    def get_data(self):
        """Return all text collected so far as a single string."""
        return self.text.getvalue()


class LeverWebScraper:
    """Scrape job-posting titles from Lever-hosted career sites.

    The list of career sites is read from a public Google Sheet; each site is
    then fetched and every posting title is cleaned, filtered and scored.

    Args:
        verify: Passed to ``requests.get(verify=...)``. Defaults to ``False``
            to keep the original behaviour. NOTE(review): ``False`` disables
            TLS certificate verification - pass ``True`` where possible.
        timeout: Per-request timeout in seconds, so one unresponsive host
            cannot stall the whole run.
    """

    # Spreadsheet that lists the Lever career sites to visit.
    LEVER_SHEET_URL = "https://docs.google.com/spreadsheets/d/18u2sKRKjKz9gwRyob0p9KmcyVC6NX8JaJhjOqsRmbKY/edit?usp=sharing"

    # The host is matched literally. (The previous pattern wrapped the host in
    # square brackets, which is a character class and therefore matched any
    # URL whose host merely started with one of those characters.) The
    # optional "eu." keeps European sites visible to the explicit filter in
    # get_postings().
    _LEVER_URL_RE = re.compile(r"https?://jobs\.(?:eu\.)?lever\.co/[^\s]+")

    # (substring, points) pairs; substrings are matched case-sensitively.
    _KEYWORD_POINTS = (
        ("Data Analytics", 3),
        ("REMOTE", 3),
        ("United States", 3),
    )

    # Lower-case substrings that disqualify a posting.
    _EXCLUDED_KEYWORDS = ("on - site", "onsite", "hybrid")

    def __init__(self, *, verify=False, timeout=30):
        self.urls = None
        self.verify = verify
        self.timeout = timeout

    def extract_urls(self, text):
        """Return every Lever job-board URL found in ``text``, in order."""
        return self._LEVER_URL_RE.findall(text)

    def remove_tags(self, html):
        """Return the visible text of ``html`` joined by single spaces.

        ``<style>`` and ``<script>`` elements are dropped first so their
        contents do not end up in the text.
        """
        soup = BeautifulSoup(html, "html.parser")
        for element in soup(['style', 'script']):
            element.decompose()
        return ' '.join(soup.stripped_strings)

    def get_lever_sites(self):
        """Download the site spreadsheet and return the Lever URLs in it."""
        res = requests.get(
            url=self.LEVER_SHEET_URL,
            verify=self.verify,
            timeout=self.timeout,
        )
        content = self.remove_tags(res.content)
        return self.extract_urls(content)

    def strip_tags(self, html):
        """Return ``html`` with all markup removed."""
        s = MLStripper()
        s.feed(html)
        # close() flushes text the parser is still holding back (for example
        # trailing text containing an unterminated "&"); without it that text
        # would be missing from the result.
        s.close()
        return s.get_data()

    def clean_job_posting(self, posting):
        """Strip tags and put a space at every lower-to-upper case boundary."""
        text = self.strip_tags(posting)
        return re.sub(r'([a-z])([A-Z])', r'\1 \2', text)

    def _score_posting(self, text):
        """Return the sum of points for every scoring keyword in ``text``."""
        return sum(pts for keyword, pts in self._KEYWORD_POINTS if keyword in text)

    def _is_excluded(self, text):
        """Return True if ``text`` mentions an excluded work arrangement."""
        lowered = text.lower()
        return any(keyword in lowered for keyword in self._EXCLUDED_KEYWORDS)

    def get_postings(self):
        """Fetch every non-European Lever site and collect scored postings.

        Sites that fail to load (connection error, timeout or HTTP error
        status) are reported on stdout and skipped. Each kept posting is also
        printed as it is found.

        Returns:
            dict with the parallel lists ``'urls'``, ``'descriptions'`` and
            ``'points'`` - one entry per kept posting.
        """
        results = {
            'urls': [],
            'descriptions': [],
            'points': [],
        }

        links = self.get_lever_sites()
        # Normalise the trailing slash first so duplicates collapse below.
        links = [u if u.endswith("/") else u + "/" for u in links]
        links = [u for u in links if ".eu." not in u]  # Exclude European sites
        links = list(set(links))
        random.shuffle(links)

        for current_url in links:
            try:
                res = requests.get(
                    current_url,
                    headers={'User-Agent': 'Mozilla/5.0'},
                    verify=self.verify,
                    timeout=self.timeout,
                )
                # Treat 4xx/5xx answers as failures instead of parsing an
                # error page that contains no postings.
                res.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"Request error for {current_url}: {e}")
                continue

            soup = BeautifulSoup(res.content, 'html.parser')

            for posting in soup.select("div.posting-title h5.posting-name"):
                clean_posting = self.clean_job_posting(str(posting))
                # Undo the space the case-boundary split puts inside "Sr.".
                clean_posting = clean_posting.replace('Sr . ', 'Sr. ')

                if self._is_excluded(clean_posting):
                    continue

                results['urls'].append(current_url)
                results['descriptions'].append(clean_posting)
                results['points'].append(self._score_posting(clean_posting))
                print(current_url, '\t', clean_posting)

            # Pause between sites to avoid hammering the server.
            time.sleep(1.0)
        return results


# Usage: build a scraper and collect the postings. get_postings() issues the
# HTTP requests itself, so running this cell goes out to the network.
lever_scraper = LeverWebScraper()
# postings_map is a dict with the parallel lists 'urls', 'descriptions' and
# 'points'.
postings_map = lever_scraper.get_postings()


In [22]:
import pandas as pd
from pandas import option_context

# Allow up to 500 rows in notebook output before pandas truncates.
pd.set_option('display.max_rows', 500)

# Turn the scraped postings into a table ordered from highest to lowest score.
df = pd.DataFrame(postings_map).sort_values(by=['points'], ascending=False)

# Keep only the postings that scored more than 5 points.
df_filtered = df.loc[df['points'] > 5]

# Widen text columns for this one display and show the first 50 rows.
with option_context('display.max_colwidth', 400):
    display(df_filtered.head(50))


Unnamed: 0,urls,descriptions,points
