In [19]:
import os
import logging
import requests
from bs4 import BeautifulSoup
import logging
from linkedin_jobs_scraper import LinkedinScraper
from linkedin_jobs_scraper.events import Events, EventData, EventMetrics
from linkedin_jobs_scraper.query import Query, QueryOptions, QueryFilters
from linkedin_jobs_scraper.filters import RelevanceFilters, TimeFilters, TypeFilters, ExperienceLevelFilters, \
    OnSiteOrRemoteFilters, SalaryBaseFilters
import pandas as pd
from linkedin_jobs_scraper.utils.chrome_driver import build_driver
import time
from datetime import datetime
import csv

In [35]:
FieldName = ['job_id', 'title', 'company','date','link','insight','description']
today = datetime.now().strftime("%y_%m_%d")
target_csv = f"job_scan_{today}.csv"

# Create the output csv if not exist
if target_csv and not os.path.exists(target_csv):
    with open(target_csv, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file,fieldnames=FieldName)
        writer.writeheader()


In [31]:
# Function to append each result to the CSV file
def write_output(result, csv_file,FieldName = FieldName):
    with open(csv_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=FieldName)
        writer.writerow(result)


# Change root logger level (default is WARN)
logging.basicConfig(level=logging.INFO)


# Fired once for each successfully processed job
def on_data(data: EventData):
    print(target_csv)
    print('[ON_DATA]',data.job_id, data.title, data.company, data.date, data.link, data.insights,
          len(data.description))
    output = {'job_id':data.job_id, 'title':data.title, 'company':data.company,'date':data.date,'link':data.link,'insight':data.insights,'description':data.description}
    write_output(output, target_csv)

# Fired once for each page (25 jobs)
def on_metrics(metrics: EventMetrics):
    print('[ON_METRICS]', str(metrics))


def on_error(error):
    print('[ON_ERROR]', error)


def on_end():
    print('[ON_END]')


scraper = LinkedinScraper(
    chrome_executable_path=None,  # Custom Chrome executable path (e.g. /foo/bar/bin/chromedriver)
    chrome_binary_location=None,  
    chrome_options=None,  # Custom Chrome options here
    headless=True,  # Overrides headless mode only if chrome_options is None
    max_workers=1,  # How many threads will be spawned to run queries concurrently (one Chrome driver for each thread)
    slow_mo=0.5,  # Slow down the scraper to avoid 'Too many requests 429' errors (in seconds)
    page_load_timeout=40,
  # Page load timeout (in seconds)    
)

# Add event listeners
scraper.on(Events.DATA, on_data)
scraper.on(Events.ERROR, on_error)
scraper.on(Events.END, on_end)


INFO:li:scraper:('Using strategy AuthenticatedStrategy',)


In [None]:
linked_page = pd.read_csv('linkedin_pages.csv')
execute_list = linked_page.dropna()
sample = execute_list.iloc[10,:]
for link in execute_list['job_link']:
    query = Query(    
        options=QueryOptions(
            locations=['United States'],
            skip_promoted_jobs=True,
            limit = 1000,      
            filters=QueryFilters(
                company_jobs_url= link
            )
        )
    )
    scraper.run(query)

INFO:li:scraper:('Starting new query', "Query(query= options=QueryOptions(limit=1000 locations=['United States'] filters=QueryFilters(company_jobs_url=https://www.linkedin.com/jobs/search/?f_C=95311953&geoId=92000000&origin=COMPANY_PAGE_JOBS_CLUSTER_EXPANSION&originToLandingJobPostings=3966683952) apply_link=False skip_promoted_jobs=True page_offset=0))")
INFO:li:scraper:('Chrome debugger url', 'http://localhost:51647')
INFO:li:scraper:('Websocket debugger url: ', 'ws://localhost:51647/devtools/page/EBCBF3AEF496FBE5BEE9A188BBED873B')
INFO:li:scraper:('[][United States]', 'Setting authentication cookie')
INFO:li:scraper:('[][United States]', 'Opening https://www.linkedin.com/jobs/search?location=United+States&f_C=95311953&start=0')
INFO:li:scraper:('[][United States]', 'Session is valid')
INFO:li:scraper:('[][United States][1]', 'Processed')


job_scan_24_07_18.csv
[ON_DATA] 3966683952 Senior Research Associate, RNA Sciences (Application Sciences) Addition Therapeutics  https://www.linkedin.com/jobs/view/3966683952/?eBP=NON_CHARGEABLE_CHANNEL&refId=BshJlRjmvkK6wqc7gSabTg%3D%3D&trackingId=R11U3U6B16BteFZt42QXJw%3D%3D&trk=flagship3_search_srp_jobs ['$95K/yr - $110K/yr On-site Full-time Entry level', 'Research', '11-50 employees · Biotechnology Research', 'Skills: RNA, Molecular Biology, +8 more', 'See how you compare to over 100 other applicants. Try Premium for $0'] 3780


INFO:li:scraper:('[][United States][2]', 'Processed')


job_scan_24_07_18.csv
[ON_DATA] 3978695218 Associate Scientist/Scientist, RNA Process Development Addition Therapeutics  https://www.linkedin.com/jobs/view/3978695218/?eBP=NON_CHARGEABLE_CHANNEL&refId=BshJlRjmvkK6wqc7gSabTg%3D%3D&trackingId=OkxNKvHy0JYs%2BvsUP%2F2PSQ%3D%3D&trk=flagship3_search_srp_jobs ['$115K/yr - $135K/yr On-site Full-time Mid-Senior level', '11-50 employees · Biotechnology Research', 'Skills: Purification, Molecular Biology, +8 more', 'See how you compare to over 100 other applicants. Try Premium for $0', '', 'Am I a good fit for this job?', 'How can I best position myself for this job?', 'Tell me more about Addition ...'] 3310


INFO:li:scraper:('[][United States][3]', 'Skipped because promoted')
INFO:li:scraper:('[][United States][4]', 'Skipped because promoted')
INFO:li:scraper:('[][United States][5]', 'Skipped because promoted')
INFO:li:scraper:('[][United States]', 'No more jobs to process in this page')
INFO:li:scraper:('[][United States]', 'Metrics:', '{ processed: 2, failed: 0, missed: 20, skipped: 3 }')
INFO:li:scraper:('[][United States]', 'Pagination requested [1]')
INFO:li:scraper:('[][United States]', 'Opening https://www.linkedin.com/jobs/search?location=United+States&f_C=95311953&start=25')
INFO:li:scraper:('[][United States]', 'Waiting for new jobs to load')
INFO:li:scraper:('[][United States]', "Couldn't find more jobs for the running query")


[ON_END]
