# Webscraping from Linkedin 

This is a surface level implementation that will only look for titles for the time being

In [58]:
import pickle, re, json, hashlib, random, time
from collections import defaultdict
from urllib.parse import quote
from datetime import datetime, timezone
import undetected_chromedriver as uc
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from fake_useragent import UserAgent
from urllib.parse import urljoin

## Cookie Injection for Login Logic:

In [43]:
def login_and_save_cookies(email, password,user_agent, cookie_path = "li_cookies.pkl"):
    """
    Launches a Selenium browser, logs into LinkedIn with the provided credentials,
    waits for the landing page to load, then saves cookies to disk.
    """
    opts = uc.ChromeOptions()
    opts.add_argument(f"--user-agent={user_agent}")
    driver = uc.Chrome(options=opts)
    driver.get("https://www.linkedin.com/login")
    
    # fill in credentials
    driver.find_element(By.ID, "username").send_keys(email)
    driver.find_element(By.ID, "password").send_keys(password)
    driver.find_element(By.CSS_SELECTOR, "button[type=submit]").click()
    
    # wait and enter 2fa if needed
    time.sleep(60)
    driver.get("https://www.linkedin.com")
    # dump cookies
    with open(cookie_path, "wb") as f:
        pickle.dump(driver.get_cookies(), f)
    
    driver.quit()
    print(f"✅ Logged in and saved cookies to {cookie_path}")


def make_driver_with_cookies(cookie_path = "li_cookies.pkl",user_agent = None,proxy = None):
    """
    Spins up a ChromeDriver, injects your saved LinkedIn cookies, and returns
    a logged-in driver.  Optionally overrides UA and/or proxy.
    """
    opts = uc.ChromeOptions()
    if user_agent:
        opts.add_argument(f"--user-agent={user_agent}")
    if proxy:
        opts.add_argument(f"--proxy-server={proxy}")
    driver = uc.Chrome(options=opts)

    # load cookies
    driver.get("https://www.linkedin.com")
    cookies = pickle.load(open(cookie_path, "rb"))
    for c in cookies:
        driver.add_cookie(c)
    driver.refresh()
    time.sleep(3)
    return driver


## Base Facets:

In [3]:
BASE = {
    "keywords": '"AI" OR "Generative AI" OR "LLM" OR "Large Language Model" OR '
                '"Prompt Engineering" OR "Foundation Model" OR "Transformer" OR '
                '"RAG" OR "Reinforcement Learning With Human Feedback" OR "RLHF"',
    "location": "United States",
    "geoId": "103644278",      # US
    "f_TPR": "r604800",        # past week
}

# Facet codes to shard by (adjust as you like)
EXP_CODES = ["1", "2","3","4","5","6"]           # f_E: 1=intern, 2=entry, 3=associate, 4=mid-senior, 5=director, 6=executive
JT_CODES  = ["I", "F","C","T","P","V","O"]           # f_JT: I=internship, F=full_time, (C contract, T temp, P part_time, V volunteer, O other)
WT_CODES  = ["2", "1","3"]           # f_WT: 2=remote, 1=on_site, 3=hybrid

# Optional human-readable labels for logs/UI
EXP_LABEL = {"1":"intern","2":"entry","3":"associate","4":"mid-senior","5":"director","6":"executive"}
JT_LABEL  = {"I":"internship","F":"full_time","C":"contract","T":"temporary","P":"part_time","V":"volunteer","O":"other"}
WT_LABEL  = {"1":"on_site","2":"remote","3":"hybrid"}

# Blacklist
BLACKLIST_COMPANY_IDS = set()
BLACKLIST_RE = re.compile(
    r"(jobright|jooble|talent\.com|ziprecruiter|lensa|adzuna|simplyhired|neuvoo|jora|"
    r"glassdoor|jobs2careers|myjobhelper|careerbuilder|monster|snagajob|"
    r"insight global|teksystems|kforce|aerotek|randstad|robert half|apex systems|experis|actalent)",
    re.I
)

jobs = {}
shards = {}
jobs_for_shard = defaultdict(list)
shards_for_job = defaultdict(list)

## Helper functions:

In [None]:
JOB_UL_XPATH   = "//ul[li[@data-occludable-job-id]]"
EMPTY_SELECTORS = (
    'div[class~="jobs-search-no-results-banner"][role="alert"][aria-live="assertive"], '
    '.jobs-search-results-list__no-jobs-available-card'
)

def wait_for_results_or_empty(driver, timeout=15):
    """
    Returns ("has_results", ul_element) or ("empty", None)
    """
    def _ready(d):
        return (d.find_elements(By.XPATH, JOB_UL_XPATH) or
                d.find_elements(By.CSS_SELECTOR, EMPTY_SELECTORS) or
                d.find_elements(By.CSS_SELECTOR, '[data-results-list-top-scroll-sentinel]'))
    WebDriverWait(driver, timeout).until(_ready)

    uls = driver.find_elements(By.XPATH, JOB_UL_XPATH)
    if uls:
        return "has_results", uls[0]
    # No UL with cards; treat as empty if banner/empty card is present
    if (driver.find_elements(By.CSS_SELECTOR, EMPTY_SELECTORS)):
        return "empty", None
    # Fallback: if sentinel exists but zero cards, also empty
    if driver.find_elements(By.CSS_SELECTOR, '[data-results-list-top-scroll-sentinel]'):
        return "empty", None
    return "empty", None
def is_blacklisted(company_id, company_name):
    if company_id and company_id in BLACKLIST_COMPANY_IDS: 
        return True
    if company_name and BLACKLIST_RE.search(company_name): 
        return True
    return False
def parse_date(iso: str):
    if not iso: 
        return None
    iso = iso.strip()
    # e.g. "2025-08-13" or "2025-08-13T14:22:31Z"
    if iso.endswith('Z'):
        iso = iso[:-1] + '+00:00'
    try:
        dt = datetime.fromisoformat(iso)
        if dt.tzinfo is None:
            dt = dt.replace(tzinfo=timezone.utc)
        return dt
    except ValueError:
        if re.fullmatch(r"\d{4}-\d{2}-\d{2}", iso):
            return datetime.strptime(iso, "%Y-%m-%d").replace(tzinfo=timezone.utc)
        return None
def build_url(p, start):
    return (
        "https://www.linkedin.com/jobs/search"
        f"?keywords={quote(p['keywords'])}"
        f"&location={quote(BASE['location'])}"
        f"&geoId={p['geoId']}"
        f"&f_TPR={p['f_TPR']}"
        f"&f_E={p['f_E']}"
        f"&f_JT={p['f_JT']}"
        f"&f_WT={p['f_WT']}"
        f"&start={start}"
        f"&f_VJ = true" # verfied jobs only
    )

## Shard ID with Hashing

In [5]:
def shard_key(params: dict) -> str:
    """Stable hash for a shard's exact filters."""
    packed = json.dumps(params, sort_keys=True, separators=(",", ":")).encode() # make params a string
    return hashlib.sha1(packed).hexdigest() #hashing the string

## Shard Registration:

In [6]:
def register_shard(shards: dict, params: dict, sig: tuple, rank: int):
    sid = shard_key(params)
    if sid not in shards:
        fE, fJT, fWT = sig
        shards[sid] = {
            "rank": rank,
            "params": params,
            "sig": sig,
            "meta": {
                "kw_lbl": "catch_all",
                "geo_lbl": "US",
                "experience_lbl": EXP_LABEL.get(fE, fE),
                "job_type_lbl":   JT_LABEL.get(fJT, fJT),
                "workplace_lbl":  WT_LABEL.get(fWT, fWT),
                "date_from": None,
                "date_to": None,
            }
        }
    return sid

## Parsing HTML:

In [60]:
JOB_ANCHOR_SEL = 'a[href*="/jobs/view/"]'  # robust across UI variants
COMP_SEL = (
    '.base-search-card__subtitle a, '
    '.artdeco-entity-lockup__subtitle a, '
    'a[href*="/company/"]'
)

def parse_cards_from_ul_html(ul_outer_html, blacklist_companies=None):
    """
    Parse ONLY the LEFT list UL (pass ul.get_attribute('outerHTML')).
    Yields: {job_id, title, posted_dt, company_name, url}
    """
    if blacklist_companies is None:
        try:
            blacklist_companies = BLACKLIST_COMPANY_IDS
        except NameError:
            blacklist_companies = set()

    soup = BeautifulSoup(ul_outer_html, 'html.parser')

    for li in soup.select('li[data-occludable-job-id]'):
        a = li.select_one(JOB_ANCHOR_SEL)
        if not a:
            continue

        href = a.get('href', '')
        url  = urljoin('https://www.linkedin.com', href)
        job_id = li.get('data-occludable-job-id')
        if not job_id:
            continue

        # Title from aria-label first, then visible text
        title = (a.get('aria-label') or a.get_text(" ", strip=True) or "N/A").strip()

        # Company (prefer anchor)
        comp_a = li.select_one(COMP_SEL)
        if comp_a:
            company_name = comp_a.get_text(strip=True)
            urn = comp_a.get('data-entity-hovercard-id', '')
            company_id = urn.split(':')[-1] if ':' in urn else None
        else:
            sp = li.select_one('.base-search-card__subtitle span[dir="ltr"]')
            company_name = sp.get_text(strip=True) if sp else "N/A"
            company_id = None

        # Optional: blacklist check (kept since you had it)
        try:
            if is_blacklisted(company_id, company_name):
                if company_id:
                    blacklist_companies.add(company_id)
                continue
        except NameError:
            pass  # if you removed blacklist funcs, this still works

        # Posted date (present on many list cards, missing on 'Promoted')
        t = li.select_one('time[datetime]')
        posted_dt = parse_date(t.get('datetime')) if t else None

        yield {
            "job_id": job_id,
            "title": title,
            "posted_dt": posted_dt,
            "company_name": company_name,
            "url": url,
        }



## Linking Jobs to Shards

In [8]:
def add_job_links(jid: str, sid: str,
                  jobs_for_shard: dict, shards_for_job: dict):
# shard -> jobs (encounter order)
  if not jobs_for_shard[sid] or jobs_for_shard[sid][-1] != jid:
      jobs_for_shard[sid].append(jid)

  # job -> shards (discovery order)
  if sid not in shards_for_job[jid]:
      shards_for_job[jid].append(sid)

## Humanlike Scrolling Interactions:

In [17]:
from selenium.webdriver import ActionChains
def human_scroll(pane, steps=5): # more random scrolling
    height = driver.execute_script("return arguments[0].scrollHeight", pane)
    for i in range(steps):
        y = height * (i + 1) / steps
        driver.execute_script("arguments[0].scrollTo(0, arguments[1]);", pane, y)
        time.sleep(random.uniform(0.2, 0.6))
def hover_on_job():
    actions = ActionChains(driver)
    card = random.choice(driver.find_elements(By.CSS_SELECTOR, "li.scaffold-layout__list-item"))
    actions.move_to_element(card).pause(random.uniform(0.5, 1.2)).perform()
def back_and_forth(pane):
    driver.execute_script("arguments[0].scrollBy(0, -100);", pane)
    time.sleep(random.uniform(0.3, 0.8))
    driver.execute_script("arguments[0].scrollBy(0, 100);", pane)
    time.sleep(random.uniform(0.3, 0.8))

In [45]:
#ua = UserAgent().random
#login_and_save_cookies("api.authenticate.user@gmail.com", "authenticate",user_agent=ua)
driver = make_driver_with_cookies(user_agent=ua)

python(20392) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(20393) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


we will use nested loops to iterate over all combinations of these codes for sharding

In [61]:
EW = len(WT_CODES)
EJW = len(JT_CODES) * EW
for e_idx, fE in enumerate(EXP_CODES):
    for j_idx, fJT in enumerate(JT_CODES):
        for w_idx, fWT in enumerate(WT_CODES): # grab indexes for ranking logic
                
                params = dict(BASE, **{"f_E": fE, "f_JT": fJT, "f_WT": fWT})
                rank = e_idx * EJW + j_idx * EW + w_idx
                sid = register_shard(shards, params, (fE, fJT, fWT), rank)


                driver.get(build_url(params, start=0))
                if page_has_no_results(driver):
                    print(f"❌ No results for shard {sid[:6]} rank={rank} | {params}")
                    continue
                WebDriverWait(driver, 15).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "li[data-occludable-job-id]"))
                )
                pane = driver.find_element(By.XPATH, "//ul[li[@data-occludable-job-id]]")

                for start in range(0, 1000, 25):
                    if start > 0:
                        driver.get(build_url(params, start))
                        WebDriverWait(driver, 15).until(
                            EC.presence_of_element_located((By.CSS_SELECTOR, "li[data-occludable-job-id]"))
                        )
                        pane = driver.find_element(By.XPATH, "//ul[li[@data-occludable-job-id]]")

                    # load cards with your human-like motions
                    driver.execute_script("arguments[0].scrollTo(0, arguments[0].scrollHeight);", pane)
                    back_and_forth(pane)
                    hover_on_job()
                    html = pane.get_attribute("outerHTML")

                    # parse and store
                    for rec in parse_cards_from_html(html):
                        jid = rec["job_id"]
                        if jid not in jobs:
                            jobs[jid] = rec  # store the whole record
                        add_job_links(jid, sid, jobs_for_shard, shards_for_job)

                    time.sleep(random.uniform(5, 8))
                    print(f' shard {sid[:6]} rank={rank} page {start//25 + 1} | jobs={len(jobs)}')
        

 shard ac23ba rank=0 page 1 | jobs=12


KeyboardInterrupt: 

In [62]:
jobs

{'4281201823': {'job_id': '4281201823',
  'title': 'N/A',
  'posted_dt': None,
  'company_name': 'N/A'},
 '4284518435': {'job_id': '4284518435',
  'title': 'N/A',
  'posted_dt': datetime.datetime(2025, 8, 13, 0, 0, tzinfo=datetime.timezone.utc),
  'company_name': 'N/A'},
 '4184290448': {'job_id': '4184290448',
  'title': 'N/A',
  'posted_dt': None,
  'company_name': 'N/A'},
 '4279550962': {'job_id': '4279550962',
  'title': 'N/A',
  'posted_dt': datetime.datetime(2025, 8, 7, 0, 0, tzinfo=datetime.timezone.utc),
  'company_name': 'N/A'},
 '4284512381': {'job_id': '4284512381',
  'title': 'N/A',
  'posted_dt': datetime.datetime(2025, 8, 13, 0, 0, tzinfo=datetime.timezone.utc),
  'company_name': 'N/A'},
 '4284011495': {'job_id': '4284011495',
  'title': 'N/A',
  'posted_dt': None,
  'company_name': 'N/A'},
 '4282193663': {'job_id': '4282193663',
  'title': 'N/A',
  'posted_dt': None,
  'company_name': 'N/A'},
 '4284543707': {'job_id': '4284543707',
  'title': 'N/A',
  'posted_dt': datetim

In [None]:
#Testing some selectors
s = '''
<li class="ember-view KDTKDKTrDQWiObaNvgmFWztfOOeQlJmzDxw occludable-update p0 relative scaffold-layout__list-item" data-occludable-job-id="4273788657" id="ember159">
<div>
<div aria-current="page" class="display-flex job-card-container relative job-card-list job-card-container--clickable job-card-list--underline-title-on-hover jobs-search-two-pane__job-card-container--viewport-tracking-0" data-job-id="4273788657">
<div>
<div class="job-card-list__entity-lockup artdeco-entity-lockup artdeco-entity-lockup--size-4 ember-view" id="ember160">
<div class="job-card-list__logo artdeco-entity-lockup__image artdeco-entity-lockup__image--type-square ember-view" id="ember161" type="square">
<div class="ivm-image-view-model job-card-list__logo-ivm">
<div class="ivm-view-attr__img-wrapper">
<!-- -->
<!-- --> <img alt="Convergenz logo" class="ivm-view-attr__img--centered EntityPhoto-square-4 evi-image lazy-image ember-view" height="56" id="ember162" loading="lazy" src="https://media.licdn.com/dms/image/v2/D4E0BAQFBDe02KaoXbA/company-logo_100_100/company-logo_100_100/0/1719840173099/convergenz_logo?e=1757548800&amp;v=beta&amp;t=eAjfrb-iA0ZpzScbB6jvFjuPWGvJayv1HkrznKIpHeo" width="56"/>
</div>
</div>
</div>
<div class="flex-grow-1 artdeco-entity-lockup__content ember-view" id="ember163">
<div class="full-width artdeco-entity-lockup__title ember-view" id="ember164">
<a aria-label="AI Developer with verification" class="disabled ember-view job-card-container__link bCqQGuAEWHkPagGRHFqUxtwsgeIGYLnC job-card-list__title--link" data-control-id="rkjF1FDtv21NhPofMnViig==" dir="ltr" href="/jobs/view/4273788657/?eBP=CwEAAAGYd-R54PfY6WeZtOSxTNT3F2CnIPJaBNq95O-9SKcy_uyHyvNoW5d1XE-CZ1QW3ceBWKbxK2EZ_DWuXB7nns_XhiMCDLLTbOHC0Jmb36pMY09ADBHTxSVUtDKPku8qpOJaa8YIhfJ29dxydzxISd0dY04CDT43FRncbWh7MM7KG7iQxCUMD9ox2mbG5QuuJD5ArPsdZ7R-pfgk67we28ZgwjgBu7y2cygHok5kf_ZlNAN71M0Wyre2W_G5AWi6bD3lM7NWNPRjh7GnzN-yOvFAnq5wVibWPMSIqC_Ixq9crbsEtI0uYSby418JFB2cNmO_jKEJmWUTwBlPTJNF5M_B0cKUMCWqAbGDkEvHJVVvO7CZVakD4jN5gKUL59w0cKIuE2V9xZnDUTK7zbJqD7ck1IEMiT4Vq9t4LeVgYuYqgOlpL1MNjIbLVwqKPZMpK2AUvFmcsvF2o3-udCqoSDlBCN1JikW3ubLUNzz79xKGqwZcej8yQGIsM13rgO_nRJbuBoMH5jO8_g&amp;refId=EsWaDqWDjtaby5qyYeQ8Sw%3D%3D&amp;trackingId=rkjF1FDtv21NhPofMnViig%3D%3D&amp;trk=flagship3_search_srp_jobs" id="ember165" tabindex="0">
<span aria-hidden="true"><strong><!-- -->AI Developer<!-- --></strong><span class="white-space-pre"> </span><!-- --><!-- --><span class="tvm__text tvm__text--low-emphasis"><svg aria-hidden="true" class="text-view-model__verified-icon" data-supported-dps="16x16" data-test-icon="verified-small" height="16" role="none" viewbox="0 0 16 16" width="16" xmlns="http://www.w3.org/2000/svg">
<!-- -->
<use height="16" href="#verified-small" width="16"></use>
</svg>
</span></span><span class="visually-hidden"><!-- -->AI Developer with verification<!-- --></span>
</a>
</div>
<div class="artdeco-entity-lockup__subtitle ember-view" id="ember166">
<span class="QvDBVzWZFVqwZaHzJdiwaQHXZllMcgg" dir="ltr">
<!-- -->Convergenz<!-- -->
</span>
</div>
<div class="artdeco-entity-lockup__caption ember-view" id="ember167">
<ul class="job-card-container__metadata-wrapper">
<li class="FIwncFhfpYjDwbtCGptBbflaEUiZtzic">
<span dir="ltr">
<!-- -->Arlington, VA (Remote)<!-- -->
</span>
</li>
<!-- --> </ul>
</div>
<div class="mt1 t-sans t-12 t-black--light t-normal t-roman artdeco-entity-lockup__metadata ember-view" id="ember168">
<ul class="job-card-container__metadata-wrapper">
<li class="FIwncFhfpYjDwbtCGptBbflaEUiZtzic">
<span dir="ltr">
<!-- -->$80/hr - $100/hr<!-- -->
</span>
</li>
</ul>
</div>
<!-- --> </div>
</div>
<div class="job-card-list__insight">
<div class="display-flex align-items-center t-black--light t-12">
<div class="mv1">
<div class="ivm-image-view-model">
<div class="ivm-view-attr__img-wrapper">
<!-- -->
<svg aria-hidden="false" aria-label="Actively reviewing applicants" class="ivm-view-attr__icon ivm-view-attr__icon--signal-positive job-card-container__job-insight-image" data-supported-dps="24x24" data-test-icon="responsive-medium" height="24" role="img" viewbox="0 0 24 24" width="24" xmlns="http://www.w3.org/2000/svg">
<!-- -->
<use height="24" href="#responsive-medium" width="24"></use>
</svg>
</div>
</div>
</div>
<div class="job-card-container__job-insight-text" dir="ltr">
<!-- -->Actively reviewing applicants<!-- -->
</div>
</div>
</div>
<ul class="job-card-list__footer-wrapper job-card-container__footer-wrapper flex-shrink-zero display-flex t-sans t-12 t-black--light t-normal t-roman mt1">
<li class="job-card-container__footer-item job-card-container__footer-job-state t-bold">
                      Viewed
                    </li>
<li class="job-card-container__footer-item inline-flex align-items-center">
<span dir="ltr">
<!-- -->Promoted<!-- -->
</span>
</li>
<!-- --> <li class="jndVwfSckDYsLkOlokqxOnesTSkFMWaNE job-card-container__footer-item inline-flex align-items-center">
<svg aria-hidden="true" class="job-card-list__icon" data-supported-dps="16x16" data-test-icon="linkedin-bug-color-small" height="16" role="none" viewbox="0 0 16 16" width="16" xmlns="http://www.w3.org/2000/svg">
<!-- --> <svg display="var(--hue-web-svg-display-light)">
<image height="16" href="https://static.licdn.com/aero-v1/sc/h/cukxdu7s8ldmqz13xdao5xe75" width="16" x="0" y="0"/>
</svg>
<svg display="var(--hue-web-svg-display-dark)">
<image height="16" href="https://static.licdn.com/aero-v1/sc/h/7qvn5nkkh1mlaqd5xm0radtjv" width="16" x="0" y="0"/>
</svg>
</svg>
<span dir="ltr">
<!-- -->Easy Apply<!-- -->
</span>
</li>
</ul>
<span aria-live="polite" class="visually-hidden">
<!-- --> </span>
</div>
<div class="job-card-list__actions-container">
<div>
<button aria-label="Dismiss AI Developer job" class="job-card-container__action job-card-container__action-small artdeco-button artdeco-button--muted artdeco-button--2 artdeco-button--tertiary ember-view" id="ember169" type="button"><!-- -->
<span class="artdeco-button__text">
<svg aria-hidden="true" data-supported-dps="16x16" data-test-icon="close-small" height="16" role="none" viewbox="0 0 16 16" width="16" xmlns="http://www.w3.org/2000/svg">
<!-- -->
<use height="16" href="#close-small" width="16"></use>
</svg>
<span class="job-card-container__action-text"></span>
</span></button>
</div>
<!-- -->
</div>
</div>
</div>
</li>
'''