In [1]:
from bs4 import BeautifulSoup
import bs4
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver import Safari
import requests
import pandas as pd
from langdetect import detect_langs
from urllib.parse import quote
import re

In [2]:
def getJobInfo(url, driver):
    """
    Retrieves job information from a given URL using a web driver.

    Args:
        url (str): The URL of the job posting.
        driver: The web driver used to access the URL.

    Returns:
        list: A list containing the job information, including job ID, company, position, location, description, language, and URL.
            If the job information cannot be retrieved, None is returned.
            If the description language is not English, -1 is returned.
    """
    
    driver.get(url)

    try:
        WebDriverWait(driver, 5).until(lambda d: d.find_element(By.CLASS_NAME, 'description__text') 
                                        and d.find_element(By.CLASS_NAME, 'topcard__title') 
                                        and d.find_element(By.CLASS_NAME, 'topcard__org-name-link') 
                                        and d.find_element(By.CLASS_NAME, 'topcard__flavor--bullet')) 
    except:
        # If this happens, it probably means LinkedIn is asking for a login
        try:
            if driver.find_element(By.CLASS_NAME, "topcard__flavor-row"):
                return -1
        except:
            return None

    soup = BeautifulSoup(driver.page_source, 'html.parser')

    if soup.find('section', {"class":'two-pane-serp-page__search-header'}):
        return -1

    jobID = re.search(r'\d{10}', url).group(0)
    position = soup.find('h1', {"class":'topcard__title'}).get_text().strip()
    company = soup.find('a', {"class":'topcard__org-name-link'}).get_text().strip()
    location = soup.find('span', {"class":'topcard__flavor--bullet'}).get_text().strip()

        
    try:
        location = location.split(', ')[2]
    except:
        try:
            location = location.split(', ')[1]
        except:
            pass

    # TODO: need to find a better way to get requirements
    keywords = ["require", "qualifi", "skill", "have", "looking for", "must be", "ideal", "succ"]
    description = soup.find('div', {"class":'description__text description__text--rich'})
    # get it in text format
    description = description.get_text().replace('\n', '  ').replace(';', ', ').strip()
    description = re.sub(r'([a-z]+)([A-Z])', r'\1 \2', description)
    # detect description language
    try:
        lang = detect_langs(description)
        if lang[0].lang == 'en':
            descLang = lang[0].lang
        else:
            return 0
    except:
        return 0
    
    row = [jobID, company, position, location, description, descLang, url]

    return row
    
    

In [3]:
import queue
from time import sleep

def parseJobsFromUrlList(urlList, driver):
    """
    Parses job information from a list of URLs.

    Args:
        urlList (list): A list of URLs to parse job information from.
        driver: The web driver used to retrieve the web pages.

    Returns:
        tuple: A tuple containing two elements:
            - newRows (list): A list of parsed job information.
            - failedJobs (int): The number of jobs that failed to parse.
    """
    newRows = []
    failedJobs = 0
    discardedJobs = set()
    jobQueue = queue.Queue()

    for url in urlList:
        jobQueue.put(url)

    while not jobQueue.empty():
        url = jobQueue.get()
        row = getJobInfo(url, driver)
        
        if row == -1:
            print("Job failed to parse: " + url)
            discardedJobs.add(str(re.search(r'\d{10}', url).group(0)))
            failedJobs += 1
            continue

        if row == 0:
            print("Job is not in English: " + url)
            discardedJobs.add(str(re.search(r'\d{10}', url).group(0)))
            continue

        if row is None:
            #wait some time and try again
            sleep(5)
            jobQueue.put(url)

            continue

        print("parsed job: " + row[1] + " " + row[2])
        newRows.append(row)    

    return newRows, failedJobs, discardedJobs

In [12]:
from tqdm import tqdm

def saveJobsCsvFromUrls(txtfile='data/urls.txt', urlList=None, csvfile='data/jobs.csv'):
    """
    Parses job information from a list of URLs and saves the data to a CSV file.

    Args:
        txtfile (str): Path to the text file containing the list of URLs. Default is 'urls.txt'.
        urlList (list): List of URLs to parse. If not provided, the URLs will be read from the txtfile.
        csvfile (str): Path to the CSV file to save the parsed job data. Default is 'jobs.csv'.

    Returns:
        int: The number of failed jobs during parsing.

    """
    
    if urlList == None:
        f = open(txtfile, 'r')
        urlList = f.readlines()
        urlList = {x.strip() for x in urlList}
        f.close()

    disccardedJobsFile = 'discarded_'+csvfile if len(csvfile.split('/')) == 1 else "/".join(csvfile.split('/')[:-1])+'/discarded_'+csvfile.split('/')[-1]
    todayJobsFile = 'today_'+csvfile if len(csvfile.split('/')) == 1 else "/".join(csvfile.split('/')[:-1])+'/today_'+csvfile.split('/')[-1]

    try:
        oldJobs = pd.read_csv(csvfile, sep=';', names=['jobID', 'company', 'position', 'location', 'requirements', 'language', 'jobUrl'])
        discardedJobs = pd.read_csv(disccardedJobsFile, sep=';', names=['jobID'])
    except:
        print("No previous jobs found")
        oldJobs = pd.DataFrame(columns=['jobID', 'company', 'position', 'location', 'requirements', 'language', 'jobUrl'])
        discardedJobs = pd.DataFrame(columns=['jobID'])
        
    # clean url list from duplicates and remove already parsed jobs
    seenids = set(oldJobs['jobID']).union(set(discardedJobs['jobID']))
    
    urlList = [x for x in urlList if int(re.search(r'3+\d{9}', x)[0]) not in seenids]
    chunks = [urlList[x:x+50] for x in range(0, len(urlList), 50)]
    failedJobs = 0

    for chunk in tqdm(chunks):
        #headless mode
        options = Options()
        options.add_argument('--headless')
        driver = webdriver.Chrome(options=options)
        rows, failedJobs, toBeDiscardedJobs = parseJobsFromUrlList(chunk, driver)
        driver.quit()
        failedJobs += failedJobs
        
        jobs = pd.DataFrame(rows, columns=['jobID', 'company', 'position', 'location', 'requirements', 'language', 'jobUrl'])

        jobs.to_csv(csvfile, mode='a', header=False, index=False, sep=';')
        if len(csvfile.split('/')) > 1:
            jobs.to_csv("/".join(csvfile.split('/')[:-1])+'/today_'+csvfile.split('/')[-1], mode='a', header=False, index=False, sep=';')
        else:
            jobs.to_csv(todayJobsFile, mode='a', header=False, index=False, sep=';')
        toBeDiscardedJobs = pd.DataFrame(list(toBeDiscardedJobs))
        toBeDiscardedJobs.to_csv(disccardedJobsFile, mode='a', header=False, index=False, sep=';')

    
    return failedJobs


In [5]:
def getUrlsFromSearch(args, txtfile='data/urls.txt'):
    """
    Retrieves job URLs from a LinkedIn job search based on the provided arguments.

    Args:
        args (list): A list of search parameters, including keywords, location, f_TPR, and f_WT.
        txtfile (str): The name of the text file to store the URLs (default is 'urls.txt').

    Returns:
        int: The number of new job URLs found and added to the text file.
    """
    try:
        f = open(txtfile, 'r')
        oldurls = f.readlines()
        oldurls = {x.strip() for x in oldurls}
        f.close()
    except:
        oldurls = set()
    urls = set()
    target_url='https://www.linkedin.com/jobs/search/?keywords={}&location={}&f_TPR={}&f_WT={}'
    target_url = target_url.format(*list(map(quote, args)))

    options = Options()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)
    
    driver.get(target_url)
    try:
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "two-pane-serp-page__results-list")))
    except:
        sleep(5)
        return None
    
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    driver.quit()

    resultList = soup.find('ul', {"class":'jobs-search__results-list'})
    if resultList is None:
        return 0
    
    newUrls = {x['href'] for x in resultList.find_all('a', href=True)}
    urls = urls.union(newUrls)
    
    ids = {re.search(r'3+\d{9}', x)[0] for x in urls if re.search(r'3+\d{9}', x) is not None}
    oldids = {re.search(r'3+\d{9}', x)[0] for x in oldurls if re.search(r'3+\d{9}', x) is not None}

    ids = ids.difference(oldids)

    f = open(txtfile, 'a')
    print(str(len(ids) + len(oldurls)) + ' total ids') 
    for id in ids:
        f.write('https://www.linkedin.com/jobs/view/' + id + '\n')

    f.close()
    return len(ids)

In [6]:
REMOTE= {"ON_SITE": "1", "REMOTE" : "2", "HYBRID" : "3"}
DAY = "r86400"
roles = ["RES engineer", "Energy Market Analyst", "Energy Modeller", "Energy Analyst"]
locations = ["EMEA", "United States", "United Kingdom", "Netherlands", "Denmark", "Germany", "France", "Spain", "Belgium", "Sweden", "Norway", "Finland", "Switzerland", "Austria", "Greece"]

argList = [[role, loc, remote, DAY] for role in roles for loc in locations for remote in list(REMOTE.values())]
argQueue = queue.Queue()

for args in argList:
    argQueue.put(args)

while not argQueue.empty():
    args = argQueue.get()
    rc = getUrlsFromSearch(args, 'data/urls_energy.txt')
    if rc is None:
        argQueue.put(args)
        continue
    print(str(argQueue.qsize()) + ' searches left')

4990 total ids
179 searches left
4994 total ids
178 searches left
4996 total ids
177 searches left
4996 total ids
176 searches left
4996 total ids
175 searches left
5001 total ids
174 searches left
5018 total ids
173 searches left
5019 total ids
172 searches left
5027 total ids
171 searches left
5031 total ids
170 searches left
5041 total ids
169 searches left
5041 total ids
168 searches left
5041 total ids
167 searches left
5049 total ids
166 searches left
5058 total ids
165 searches left
5060 total ids
164 searches left
5085 total ids
163 searches left
5097 total ids
162 searches left
5098 total ids
161 searches left
5106 total ids
160 searches left
5112 total ids
159 searches left
5112 total ids
158 searches left
5113 total ids
157 searches left
5113 total ids
156 searches left
5116 total ids
155 searches left
5116 total ids
154 searches left
5122 total ids
153 searches left
5147 total ids
152 searches left
5156 total ids
151 searches left
5156 total ids
150 searches left
5162 total

In [13]:
saveJobsCsvFromUrls(txtfile='data/urls_energy.txt', csvfile='data/jobs_energy.csv')

  0%|          | 0/11 [00:00<?, ?it/s]

Job is not in English: https://www.linkedin.com/jobs/view/3816166044
Job is not in English: https://www.linkedin.com/jobs/view/3829329219
Job is not in English: https://www.linkedin.com/jobs/view/3826708305
parsed job: Expert Hub Market Analyst
Job is not in English: https://www.linkedin.com/jobs/view/3830228009
parsed job: Energy Jobline Clean Energy CEO & Entrepreneur at your own start-up (100 % remote) (m/f/d) in Frankfurt, Deutschland
parsed job: Dastur Energy Principal Consultant (Oil & Energy)
parsed job: JE Dunn Construction M/E Engineer 1 - Temple TX
Job is not in English: https://www.linkedin.com/jobs/view/3812727854
Job is not in English: https://www.linkedin.com/jobs/view/3829983716
parsed job: Page Southerland Page, Inc. Energy Modeler
parsed job: Antaes Consulting CQV / C&Q Engineer (F/M)
parsed job: TradeHeader Business Development and Marketing Analyst
Job is not in English: https://www.linkedin.com/jobs/view/3826021620
parsed job: Poriferous, LLC Associate Engineer - En

  9%|▉         | 1/11 [05:48<58:04, 348.49s/it]

parsed job: Time2Market - Market Management Services Market Manager | Market Access Specialist in the Energy Industry
Job is not in English: https://www.linkedin.com/jobs/view/3736705612
parsed job: M-PRIME ADVISORY SERVICES Associate (Engineer) in management consulting
parsed job: Obrela SOC Engineer
Job is not in English: https://www.linkedin.com/jobs/view/3806810095
Job is not in English: https://www.linkedin.com/jobs/view/3817391977
Job is not in English: https://www.linkedin.com/jobs/view/3830205676
Job is not in English: https://www.linkedin.com/jobs/view/3811817748
parsed job: Goodwin Recruiting Entry Level Engineer
parsed job: Emmett Green Ingenieur
parsed job: Chery Europe GmbH (Senior) Engineer – HMI
parsed job: Solar Alternatives Solar Energy Consultant
Job is not in English: https://www.linkedin.com/jobs/view/3830583131
Job failed to parse: https://www.linkedin.com/jobs/view/3818011377
parsed job: Picnic Technologies Business Analyst (m/w/d)
parsed job: Athenix Solutions Gr

 18%|█▊        | 2/11 [10:27<46:06, 307.44s/it]

Job is not in English: https://www.linkedin.com/jobs/view/3830064197
Job is not in English: https://www.linkedin.com/jobs/view/3826717795
parsed job: D&T Group Engineering and Contracting Co. Engineer
Job is not in English: https://www.linkedin.com/jobs/view/3830143844
parsed job: Technoform Raw Material Market Analyst
Job failed to parse: https://www.linkedin.com/jobs/view/3830474508
Job is not in English: https://www.linkedin.com/jobs/view/3828056002
parsed job: Jobot Groundwater Modeler
parsed job: Engineering International Belgium Business Analyst | European Institutions
parsed job: ALTEN Business Controller & Analyst
Job is not in English: https://www.linkedin.com/jobs/view/3828926447
parsed job: De'Longhi Group BUSINESS ANALYST
Job is not in English: https://www.linkedin.com/jobs/view/3814532821
parsed job: ALC Evolution QARA Engineer
parsed job: IBM iX DACH Digital Business Analyst/Consultant (f/m/x)
Job is not in English: https://www.linkedin.com/jobs/view/3808956842
parsed job

 27%|██▋       | 3/11 [17:35<48:22, 362.80s/it]

Job is not in English: https://www.linkedin.com/jobs/view/3810387813
Job is not in English: https://www.linkedin.com/jobs/view/3826726112
Job is not in English: https://www.linkedin.com/jobs/view/3808732583
parsed job: MatchaTalent (Global Refinery Petrochemical) Contract Advisor
parsed job: Tibber Conversion Rate Optimization Specialist
parsed job: Nordea Temenos Transact Senior Engineer
Job is not in English: https://www.linkedin.com/jobs/view/3829565105
parsed job: Hitachi Energy Energy System Project Management Senior Professional
Job is not in English: https://www.linkedin.com/jobs/view/3826716904
Job is not in English: https://www.linkedin.com/jobs/view/3766653861
parsed job: ecocareers Energy Analyst
parsed job: Damen Market Intelligence Analyst
Job is not in English: https://www.linkedin.com/jobs/view/3759321285
Job is not in English: https://www.linkedin.com/jobs/view/3824827355
Job is not in English: https://www.linkedin.com/jobs/view/3825829836
Job is not in English: https:/

 36%|███▋      | 4/11 [23:41<42:27, 363.88s/it]

Job is not in English: https://www.linkedin.com/jobs/view/3825879608
parsed job: Energy Jobline Sales Program Manager - Energy and Sustainability Programs 80-100%
Job is not in English: https://www.linkedin.com/jobs/view/3830023887
parsed job: NielsenIQ Research Analyst - Internal
Job is not in English: https://www.linkedin.com/jobs/view/3830784006
parsed job: Nova B2B / FTTS Engineer
Job is not in English: https://www.linkedin.com/jobs/view/3829949919
parsed job: RED Global Business Analyst
Job is not in English: https://www.linkedin.com/jobs/view/3829338474
parsed job: INEOS Module Engineer - Interior
Job is not in English: https://www.linkedin.com/jobs/view/3817352577
parsed job: Blueprint Staffing Building Energy Inspector
Job is not in English: https://www.linkedin.com/jobs/view/3830095486
parsed job: Verto People Generator Engineer
parsed job: Teoresi Group Sign-Off Engineer for BEV Vehicles
Job is not in English: https://www.linkedin.com/jobs/view/3813726319
parsed job: Deloitte

 45%|████▌     | 5/11 [28:37<33:57, 339.60s/it]

Job is not in English: https://www.linkedin.com/jobs/view/3830483116
parsed job: Whitetree Supportability Engineer
parsed job: PM Group Turnover Engineer
parsed job: International Atomic Energy Agency (IAEA) Energy and Climate Advisor(P5)
parsed job: Energy Jobline Bioenergy Engineer
Job is not in English: https://www.linkedin.com/jobs/view/3807748187
parsed job: Ørsted DevFinOps Engineer
Job is not in English: https://www.linkedin.com/jobs/view/3814814305
parsed job: Teespro Recruitment Engineer
parsed job: Scania Sverige TRATON  Group Product Management - Business Analyst Customer Requirements
parsed job: Canonical Developer Relations Engineer
parsed job: Head Energy AS On behalf of one of our clients, we are looking for Business Analyst.
Job is not in English: https://www.linkedin.com/jobs/view/3825828872
parsed job: ADM MARKET ANALYST, ANIMAL NUTRITION
parsed job: TotalEnergies Oil Market Analyst
Job is not in English: https://www.linkedin.com/jobs/view/3829945320
Job is not in Eng

 55%|█████▍    | 6/11 [34:20<28:23, 340.73s/it]

Job is not in English: https://www.linkedin.com/jobs/view/3824626895
Job is not in English: https://www.linkedin.com/jobs/view/3830078320
Job is not in English: https://www.linkedin.com/jobs/view/3826719346
parsed job: Infosys Consulting Biofuels/Low Carbon Business Analyst - Principal Consultant
parsed job: Canonical Kernel Engineer
Job is not in English: https://www.linkedin.com/jobs/view/3802250818
parsed job: Energy Jobline Waste Engineer
Job is not in English: https://www.linkedin.com/jobs/view/3830211476
parsed job: Athenix Solutions Group GFT 3d Modeler/Videographer
Job is not in English: https://www.linkedin.com/jobs/view/3704148433
parsed job: Axpo Group Senior Application Engineer / IT Business Analyst (f/m/d)
Job is not in English: https://www.linkedin.com/jobs/view/3829823570
Job is not in English: https://www.linkedin.com/jobs/view/3830481639
parsed job: InterEx Group D365 F&O Business Analyst (SCM)
Job is not in English: https://www.linkedin.com/jobs/view/3825274853
Job i

 64%|██████▎   | 7/11 [39:28<22:00, 330.03s/it]

Job is not in English: https://www.linkedin.com/jobs/view/3829397676
Job is not in English: https://www.linkedin.com/jobs/view/3814821858
parsed job: Damen Autonomy Engineer
Job is not in English: https://www.linkedin.com/jobs/view/3830130560
parsed job: Cartier B.E Beyond Engineering Graduate Program
Job is not in English: https://www.linkedin.com/jobs/view/3780068051
parsed job: Krila Consultancy & Recruitment Steel Market Analyst
Job is not in English: https://www.linkedin.com/jobs/view/3830480730
Job is not in English: https://www.linkedin.com/jobs/view/3828999491
Job is not in English: https://www.linkedin.com/jobs/view/3812301326
Job is not in English: https://www.linkedin.com/jobs/view/3819384377
parsed job: Subsea7 Group SCM Market Analyst
Job is not in English: https://www.linkedin.com/jobs/view/3828188712
Job is not in English: https://www.linkedin.com/jobs/view/3807744402
Job is not in English: https://www.linkedin.com/jobs/view/3815557085
parsed job: SSE plc Wholesale Energ

 73%|███████▎  | 8/11 [45:16<16:47, 335.73s/it]

Job is not in English: https://www.linkedin.com/jobs/view/3811642294
parsed job: Clarity Recruiting WC Meter Engineer (Cornwall) 35k  3k Bonus  Vehicle
Job is not in English: https://www.linkedin.com/jobs/view/3826723088
Job is not in English: https://www.linkedin.com/jobs/view/3829095503
Job is not in English: https://www.linkedin.com/jobs/view/3816681447
Job is not in English: https://www.linkedin.com/jobs/view/3807742319
Job is not in English: https://www.linkedin.com/jobs/view/3826726478
parsed job: #twiceasnice Recruiting Air Quality / Emissions Modeler (Up to $130K) - REMOTE
parsed job: VeroTech IVVQ Engineer
Job is not in English: https://www.linkedin.com/jobs/view/3829950678
Job is not in English: https://www.linkedin.com/jobs/view/3807589343
Job is not in English: https://www.linkedin.com/jobs/view/3830463644
Job failed to parse: https://www.linkedin.com/jobs/view/3829309265
Job is not in English: https://www.linkedin.com/jobs/view/3830482397
Job is not in English: https://www

 82%|████████▏ | 9/11 [50:05<10:42, 321.06s/it]

Job is not in English: https://www.linkedin.com/jobs/view/3789284092
Job is not in English: https://www.linkedin.com/jobs/view/3814304263
parsed job: MatchaTalent (Global Refinery Petrochemical) Contract Advisor
parsed job: SPECTRAFORCE Associate Scientist/Engineer
parsed job: ANOTECH Pressure Vessel Engineer
parsed job: Canonical Kernel Engineer
parsed job: Canonical Microservices Engineer
Job is not in English: https://www.linkedin.com/jobs/view/3825834193
Job is not in English: https://www.linkedin.com/jobs/view/3814810560
parsed job: Faststream Recruitment Group Plan Approval Engineer - Hull
parsed job: Capgemini Engineering CSV Engineer
parsed job: Vita Business Development Analyst - Venture Capital
Job is not in English: https://www.linkedin.com/jobs/view/3830136417
parsed job: DTE Energy Manager - Renewable Energy Operations
parsed job: Energy Jobline Tender Engineer in Liverpool, UK
parsed job: Canonical Microservices Engineer
Job is not in English: https://www.linkedin.com/job

 91%|█████████ | 10/11 [56:41<05:44, 344.20s/it]

parsed job: Athenix Solutions Group GFT 3d Modeler/Videographer
parsed job: Leonardo B2 Engineer
parsed job: Capgemini Engineering CQV Engineer
Job is not in English: https://www.linkedin.com/jobs/view/3829362152
parsed job: Athenix Solutions Group GFT 3d Modeler/Videographer
parsed job: Pro Informatik AG Business Analyst Trading/EMS
parsed job: Belmont Lavan Metaverse Engineer
Job is not in English: https://www.linkedin.com/jobs/view/3829902142
Job failed to parse: https://www.linkedin.com/jobs/view/3830109206
Job is not in English: https://www.linkedin.com/jobs/view/3830209290
Job is not in English: https://www.linkedin.com/jobs/view/3828107340
Job is not in English: https://www.linkedin.com/jobs/view/3830011127
Job is not in English: https://www.linkedin.com/jobs/view/3789669896
Job is not in English: https://www.linkedin.com/jobs/view/3826144980
Job is not in English: https://www.linkedin.com/jobs/view/3829496068
Job is not in English: https://www.linkedin.com/jobs/view/3808727929


100%|██████████| 11/11 [59:34<00:00, 324.96s/it]

parsed job: International Atomic Energy Agency (IAEA) Energy Systems Analyst(P4)





2