In [32]:
from bs4 import BeautifulSoup
import bs4
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver import Safari
import requests
import pandas as pd
from langdetect import detect_langs
from urllib.parse import quote
import re

In [33]:
def getJobInfo(url, driver):
    """
    Retrieves job information from a given URL using a web driver.

    Args:
        url (str): The URL of the job posting. It must contain the 10-digit job ID.
        driver: The web driver used to access the URL.

    Returns:
        list: ``[jobID, company, position, location, description, language, url]``
            when the posting was parsed and its description is in English.
        None: the expected elements did not show up within 5 seconds and no
            top-card row was found either (probably a login wall); the caller
            may retry later.
        -1: the posting cannot be parsed: the page showed a top-card row without
            the full set of expected elements, the driver ended up on the
            search page, or one of the expected tags is missing from the HTML.
        0: the description is not detected as English, or language detection failed.
    """

    driver.get(url)

    try:
        WebDriverWait(driver, 5).until(lambda d: d.find_element(By.CLASS_NAME, 'description__text')
                                        and d.find_element(By.CLASS_NAME, 'topcard__title')
                                        and d.find_element(By.CLASS_NAME, 'topcard__org-name-link')
                                        and d.find_element(By.CLASS_NAME, 'topcard__flavor--bullet'))
    except Exception:
        # `Exception` rather than a bare except, so Ctrl-C still interrupts the run.
        # If this happens, it probably means LinkedIn is asking for a login
        try:
            driver.find_element(By.CLASS_NAME, "topcard__flavor-row")
        except Exception:
            return None
        return -1

    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Landing on the search page means the posting itself is not reachable.
    if soup.find('section', {"class":'two-pane-serp-page__search-header'}):
        return -1

    jobID = re.search(r'\d{10}', url).group(0)
    positionTag = soup.find('h1', {"class":'topcard__title'})
    companyTag = soup.find('a', {"class":'topcard__org-name-link'})
    locationTag = soup.find('span', {"class":'topcard__flavor--bullet'})
    descriptionTag = soup.find('div', {"class":'description__text description__text--rich'})

    # The wait above only checks class names; the lookups here also constrain the
    # tag (and, for the description, the exact class string), so they can still miss.
    if any(tag is None for tag in (positionTag, companyTag, locationTag, descriptionTag)):
        return -1

    position = positionTag.get_text().strip()
    company = companyTag.get_text().strip()
    location = locationTag.get_text().strip()

    # Keep the third comma-separated part when present, otherwise the second,
    # otherwise the whole string.
    locationParts = location.split(', ')
    if len(locationParts) >= 3:
        location = locationParts[2]
    elif len(locationParts) == 2:
        location = locationParts[1]

    # TODO: need to find a better way to get requirements
    # Flatten to one line and drop ';' because the CSV files use it as separator.
    description = descriptionTag.get_text().replace('\n', '  ').replace(';', ', ').strip()
    # Re-insert the space lost where a lowercase word runs into a capitalised one.
    description = re.sub(r'([a-z]+)([A-Z])', r'\1 \2', description)

    # detect description language
    try:
        languages = detect_langs(description)
    except Exception:
        return 0
    if not languages or languages[0].lang != 'en':
        return 0
    descLang = languages[0].lang

    return [jobID, company, position, location, description, descLang, url]
    
    

In [34]:
import queue
from time import sleep

def parseJobsFromUrlList(urlList, driver, maxRetries=5):
    """
    Parses job information from a list of URLs.

    Args:
        urlList (list): A list of URLs to parse job information from.
        driver: The web driver used to retrieve the web pages.
        maxRetries (int | None): How many times a URL is re-queued after
            getJobInfo returned None before it is given up on. None retries
            without limit.

    Returns:
        tuple: A tuple containing three elements:
            - newRows (list): A list of parsed job information rows.
            - failedJobs (int): The number of jobs that failed to parse
              (including jobs that ran out of retries).
            - discardedJobs (set): Job IDs (as strings) that should not be
              visited again: unparseable postings and non-English postings.
    """
    newRows = []
    failedJobs = 0
    discardedJobs = set()
    retryCounts = {}
    jobQueue = queue.Queue()

    for url in urlList:
        jobQueue.put(url)

    while not jobQueue.empty():
        url = jobQueue.get()
        row = getJobInfo(url, driver)

        if row is None:
            retryCounts[url] = retryCounts.get(url, 0) + 1
            if maxRetries is not None and retryCounts[url] > maxRetries:
                # Give up for this run, but do not discard the ID so that a
                # later run can try the posting again.
                print("Job unavailable after retries: " + url)
                failedJobs += 1
                continue
            # wait some time and try again
            sleep(5)
            jobQueue.put(url)
            continue

        if row == -1:
            print("Job failed to parse: " + url)
            discardedJobs.add(re.search(r'\d{10}', url).group(0))
            failedJobs += 1
            continue

        if row == 0:
            print("Job is not in English: " + url)
            discardedJobs.add(re.search(r'\d{10}', url).group(0))
            continue

        print("parsed job: " + row[1] + " " + row[2])
        newRows.append(row)

    return newRows, failedJobs, discardedJobs

In [35]:
from tqdm import tqdm

def saveJobsCsvFromUrls(txtfile='data/urls.txt', urlList=None, csvfile='data/jobs.csv'):
    """
    Parses job information from a list of URLs and saves the data to a CSV file.

    URLs whose job ID is already present in the jobs CSV or in the discarded-IDs
    CSV are skipped. The remaining URLs are processed in chunks of 50, each chunk
    with a fresh Chrome driver, and results are appended to the CSV files after
    every chunk.

    Args:
        txtfile (str): Path to the text file containing the list of URLs. Default is 'data/urls.txt'.
        urlList (list): List of URLs to parse. If not provided, the URLs will be read from the txtfile.
        csvfile (str): Path to the CSV file to save the parsed job data. Default is 'data/jobs.csv'.

    Returns:
        int: The total number of failed jobs during parsing, summed over all chunks.

    """
    columns = ['jobID', 'company', 'position', 'location', 'requirements', 'language', 'jobUrl']
    # NOTE(review): the prefix is prepended to the whole path, so 'data/jobs.csv'
    # maps to 'discarded_data/jobs.csv'. Kept as-is so files written by earlier
    # runs are still found.
    discardedFile = 'discarded_' + csvfile
    todayFile = 'today_' + csvfile

    if urlList is None:
        with open(txtfile, 'r') as f:
            urlList = {line.strip() for line in f}

    # Read the two history files independently: a missing discarded file must
    # not throw away the jobs that were already saved.
    try:
        oldJobs = pd.read_csv(csvfile, sep=';', names=columns)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        print("No previous jobs found")
        oldJobs = pd.DataFrame(columns=columns)
    try:
        discardedJobs = pd.read_csv(discardedFile, sep=';', names=['jobID'])
    except (FileNotFoundError, pd.errors.EmptyDataError):
        discardedJobs = pd.DataFrame(columns=['jobID'])

    # clean url list from duplicates and remove already parsed jobs
    seenids = set(oldJobs['jobID']).union(discardedJobs['jobID'])

    pendingUrls = {}
    for url in urlList:
        match = re.search(r'3+\d{9}', url)
        if match is None:
            # Blank line or URL without a recognisable job ID.
            continue
        jobId = int(match[0])
        if jobId not in seenids and jobId not in pendingUrls:
            pendingUrls[jobId] = url
    urlList = list(pendingUrls.values())

    chunks = [urlList[x:x+50] for x in range(0, len(urlList), 50)]
    failedJobs = 0

    for chunk in tqdm(chunks):
        driver = webdriver.Chrome()
        try:
            rows, chunkFailedJobs, toBeDiscardedJobs = parseJobsFromUrlList(chunk, driver)
        finally:
            # Always release the browser, even if parsing raised.
            driver.quit()
        failedJobs += chunkFailedJobs

        jobs = pd.DataFrame(rows, columns=columns)

        jobs.to_csv(csvfile, mode='a', header=False, index=False, sep=';')
        jobs.to_csv(todayFile, mode='a', header=False, index=False, sep=';')
        toBeDiscardedJobs = pd.DataFrame(list(toBeDiscardedJobs))
        toBeDiscardedJobs.to_csv(discardedFile, mode='a', header=False, index=False, sep=';')

    return failedJobs


In [36]:
def getUrlsFromSearch(args, txtfile='data/urls.txt'):
    """
    Retrieves job URLs from a LinkedIn job search based on the provided arguments.

    Args:
        args (list): The four search parameters, in URL order: keywords, location, f_TPR, f_WT.
            Each one is URL-quoted before being inserted into the search URL.
        txtfile (str): The name of the text file to store the URLs (default is 'data/urls.txt').

    Returns:
        int: The number of new job URLs found and added to the text file
            (0 when the page contains no result list).
        None: the results list did not appear within 10 seconds; the caller may retry.
    """
    idPattern = r'3+\d{9}'

    def extractIds(links):
        """Return the set of job IDs found in the given links."""
        matches = (re.search(idPattern, link) for link in links)
        return {match[0] for match in matches if match is not None}

    try:
        with open(txtfile, 'r') as f:
            oldurls = {line.strip() for line in f}
    except OSError:
        # No URL file yet (or it is unreadable): start from scratch.
        oldurls = set()

    target_url = 'https://www.linkedin.com/jobs/search/?keywords={}&location={}&f_TPR={}&f_WT={}'
    target_url = target_url.format(*map(quote, args))

    driver = webdriver.Chrome()
    try:
        driver.get(target_url)
        try:
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "two-pane-serp-page__results-list")))
        except Exception:
            sleep(5)
            return None

        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        pageSource = driver.page_source
    finally:
        # Close the browser on every path, including the timeout above.
        driver.quit()

    soup = BeautifulSoup(pageSource, 'html.parser')

    resultList = soup.find('ul', {"class":'jobs-search__results-list'})
    if resultList is None:
        return 0

    ids = extractIds(link['href'] for link in resultList.find_all('a', href=True))
    ids -= extractIds(oldurls)

    print(str(len(ids) + len(oldurls)) + ' total ids')
    with open(txtfile, 'a') as f:
        f.writelines('https://www.linkedin.com/jobs/view/' + jobId + '\n' for jobId in ids)

    return len(ids)

In [37]:
# Values for the f_WT (workplace type) search parameter.
REMOTE = {"ON_SITE": "1", "REMOTE": "2", "HYBRID": "3"}
# Value for the f_TPR search parameter (presumably "posted in the last 86400 s" - confirm).
DAY = "r86400"
roles = ["RES engineer", "Energy Market Analyst", "Energy Modeller", "Energy Analyst"]
locations = ["EMEA", "United States", "United Kingdom", "Netherlands", "Denmark", "Germany", "France", "Spain", "Belgium", "Sweden", "Norway", "Finland", "Switzerland", "Austria", "Greece"]

# getUrlsFromSearch fills its URL template positionally as
# keywords, location, f_TPR, f_WT - so DAY has to come before the workplace code.
argList = [[role, loc, DAY, remote] for role in roles for loc in locations for remote in REMOTE.values()]
argQueue = queue.Queue()

for args in argList:
    argQueue.put(args)

# Run every search once; a search that returns None (results never loaded)
# goes back to the end of the queue to be tried again.
while not argQueue.empty():
    args = argQueue.get()
    rc = getUrlsFromSearch(args, 'data/urls_energy.txt')
    if rc is None:
        argQueue.put(args)
        continue
    print(str(argQueue.qsize()) + ' searches left')

4726 total ids
179 searches left
4730 total ids
178 searches left
4731 total ids
177 searches left
4731 total ids
176 searches left
4746 total ids
175 searches left
4749 total ids
174 searches left
4749 total ids
173 searches left
4749 total ids
172 searches left
4751 total ids
171 searches left
4751 total ids
170 searches left
4752 total ids
169 searches left
4752 total ids
168 searches left
4753 total ids
167 searches left
4753 total ids
166 searches left
4753 total ids
165 searches left
4753 total ids
164 searches left
4753 total ids
163 searches left
4753 total ids
162 searches left
4759 total ids
161 searches left
4759 total ids
160 searches left
4763 total ids
159 searches left
4764 total ids
158 searches left
4764 total ids
157 searches left
4764 total ids
156 searches left
4764 total ids
155 searches left
4764 total ids
154 searches left
4768 total ids
153 searches left
4768 total ids
152 searches left
4768 total ids
151 searches left
4771 total ids
150 searches left
4771 total

In [38]:
# Scrape the URLs collected in data/urls_energy.txt and append the parsed postings to data/jobs_energy.csv.
saveJobsCsvFromUrls(txtfile='data/urls_energy.txt', csvfile='data/jobs_energy.csv')

  0%|          | 0/7 [00:00<?, ?it/s]

Job is not in English: https://www.linkedin.com/jobs/view/3828340997
parsed job: MatchaTalent (Largest Energy Chemical) Generative AI Expert
parsed job: Reckitt Business Analyst - Procurement
parsed job: WhiteCrow Research 3D Geological Modeler
Job is not in English: https://www.linkedin.com/jobs/view/3800211932
Job is not in English: https://www.linkedin.com/jobs/view/3829060771
Job is not in English: https://www.linkedin.com/jobs/view/3820388086
Job is not in English: https://www.linkedin.com/jobs/view/3824320494
parsed job: Sunrun Energy Consultant
Job is not in English: https://www.linkedin.com/jobs/view/3824646833
parsed job: Microsoft Silicon Engineering: Internship Opportunities
Job is not in English: https://www.linkedin.com/jobs/view/3808806430
Job is not in English: https://www.linkedin.com/jobs/view/3828630020
Job is not in English: https://www.linkedin.com/jobs/view/3809416941
parsed job: KPMG Belgium IT Project manager - Energy
parsed job: Capgemini Business Analyst
Job is

 14%|█▍        | 1/7 [04:53<29:18, 293.00s/it]

Job is not in English: https://www.linkedin.com/jobs/view/3778652395
Job is not in English: https://www.linkedin.com/jobs/view/3807594223
Job is not in English: https://www.linkedin.com/jobs/view/3814811652
parsed job: CareerAddict Algorithm Developer/Mathematical Modeler (C++)
parsed job: TRACT Sustainability Measurement Senior Engineer
Job is not in English: https://www.linkedin.com/jobs/view/3810222556
parsed job: Sasol Market & Product Analyst
Job is not in English: https://www.linkedin.com/jobs/view/3828161668
parsed job: Rentokil Initial Business Analyst
parsed job: AILY LABS DevSevOps Engineer
Job is not in English: https://www.linkedin.com/jobs/view/3828054274
parsed job: Alliant Engineering, Inc. Senior Professional Engineer - Roadway
Job is not in English: https://www.linkedin.com/jobs/view/3814811397
parsed job: DTE Energy Engineer - Senior
parsed job: WSP in the U.S. Renewables Engineer
parsed job: Deloitte Business Analyst
parsed job: Spokane Regional Clean Air Agency Engi

 29%|██▊       | 2/7 [08:37<21:04, 252.84s/it]

parsed job: State of Minnesota Permit Engineer - Engineer 1 or 2 Graduate
parsed job: Munkahelyeink.hu Finnish Vattenfall Energy Advisor in Chania, Crete
parsed job: Acro Service Corp Engineer
Job is not in English: https://www.linkedin.com/jobs/view/3828337656
parsed job: DNV Approval Engineer
parsed job: EyeTech Solutions Business Analyst
parsed job: The Whiting-Turner Contracting Company Morristown - Entry-Level Engineer
Job failed to parse: https://www.linkedin.com/jobs/view/3800849249
parsed job: Progressive Recruitment Senior MES Engineer
parsed job: TRIJIT Business Analyst Utilities P&S
parsed job: Spencer Ogden Business Development Analyst
parsed job: Head Energy AS Head Energy is looking for a skilled/experienced Pipeline engineer.
Job is not in English: https://www.linkedin.com/jobs/view/3824796237
parsed job: CREO Senior Manager, Energy
Job is not in English: https://www.linkedin.com/jobs/view/3808277858
Job is not in English: https://www.linkedin.com/jobs/view/3824754594
Jo

 43%|████▎     | 3/7 [13:49<18:39, 279.95s/it]

parsed job: Norsk Hydro Investor Relations Officer
parsed job: Cutting Edge Search Media Market Analyst  (Saudi National Only)
Job is not in English: https://www.linkedin.com/jobs/view/3824795945
parsed job: Likga Consultancy National Account Manager Belgium/ UK - Renewable energy solutions
Job is not in English: https://www.linkedin.com/jobs/view/3812883384
parsed job: Damen Business Analyst
parsed job: Swedbank Business Analyst
parsed job: Smart Energy LAB Business Analyst
parsed job: GELITA (Junior) Global Energy Manager (all genders)
parsed job: Energy Jobline Market Analyst
parsed job: emagine Business analyst and Process designer for KYC
parsed job: MatchaTalent (Global Refinery Petrochemical) Contract Advisor
parsed job: RBW Consulting CSV Engineer
Job is not in English: https://www.linkedin.com/jobs/view/3827534070
parsed job: CMCC Foundation - Centro Euro Mediterraneo sui Cambiamenti Climatici 12539 - Integrated Assessment Modeler
parsed job: Energy Jobline Downstream Market A

 57%|█████▋    | 4/7 [18:23<13:51, 277.31s/it]

Job is not in English: https://www.linkedin.com/jobs/view/3828058178
parsed job: Intellectt Inc Urgent hiring for the role Manufacture Engineer
parsed job: Universal Music Group Business Analyst
parsed job: Elisa (Senior) Data Scientist, Smart Energy Solutions (Helsinki or other)
parsed job: CareerAddict ETRM BA - €800pd - Geneva
Job is not in English: https://www.linkedin.com/jobs/view/3800580358
parsed job: CapturePoint LLC CCUS Geo-Modeler
parsed job: Meta Connectivity Engineer
Job is not in English: https://www.linkedin.com/jobs/view/3814809685
Job is not in English: https://www.linkedin.com/jobs/view/3828044983
Job is not in English: https://www.linkedin.com/jobs/view/3820315616
parsed job: Evolution Business Analyst
Job is not in English: https://www.linkedin.com/jobs/view/3828137999
Job is not in English: https://www.linkedin.com/jobs/view/3812876826
Job is not in English: https://www.linkedin.com/jobs/view/3828052848
parsed job: The NPD Group Business Insights Analyst
parsed jo

 71%|███████▏  | 5/7 [22:03<08:33, 256.77s/it]

parsed job: Novo Nordisk Digital Transformation Business Analyst
parsed job: Mackinnon Bruce International Market Analyst
parsed job: Global Water Technology, Inc. Analyst, Water & Energy Services
Job is not in English: https://www.linkedin.com/jobs/view/3782029387
Job is not in English: https://www.linkedin.com/jobs/view/3828112055
parsed job: Sia Partners Senior Manager Energy & Industry
Job is not in English: https://www.linkedin.com/jobs/view/3828161290
parsed job: Poudouleveis.gr Finnish Vattenfall Energy Advisor in Chania, Crete
parsed job: Western Digital Pricing Analyst - Business Engagement
parsed job: Energy Jobline Energieberater (d/w/m) (Homeoffice möglich) in Hamburg, Deutschland
Job is not in English: https://www.linkedin.com/jobs/view/3785018630
parsed job: Twain Financial Partners Asset Manager- Solar/Renewable Energy
Job is not in English: https://www.linkedin.com/jobs/view/3828106762
Job is not in English: https://www.linkedin.com/jobs/view/3807127307
parsed job: US T

 86%|████████▌ | 6/7 [25:54<04:07, 247.94s/it]

Job is not in English: https://www.linkedin.com/jobs/view/3819940805
parsed job: ALTEN Sweden Business Controller and Analyst to ALTEN Nordics
Job is not in English: https://www.linkedin.com/jobs/view/3744737731
parsed job: CFP Energy Business Analyst
Job is not in English: https://www.linkedin.com/jobs/view/3828051717
parsed job: Wella Company Senior Consumer and Market Insights Analyst
parsed job: INDI Staffing Services Work From Home Business Analyst - 8083
Job is not in English: https://www.linkedin.com/jobs/view/3824144197
Job is not in English: https://www.linkedin.com/jobs/view/3828347338
parsed job: ENGIE Business Analyst Field Service Management M/F
parsed job: FPS Studios 3D Modeler
Job is not in English: https://www.linkedin.com/jobs/view/3770321032
Job is not in English: https://www.linkedin.com/jobs/view/3579586355
parsed job: Ashley Ellis, Inc Turbine Engineer
Job is not in English: https://www.linkedin.com/jobs/view/3815725134
Job is not in English: https://www.linkedin.

100%|██████████| 7/7 [28:42<00:00, 246.09s/it]

Job is not in English: https://www.linkedin.com/jobs/view/3819992813





2