Web scraping jobs on LinkedIn

In [1]:
import os

## Working directory.
## Set the LINKEDIN_PROJECT_DIR environment variable to run the notebook on another
## machine; when it is not set the original location is used.
parent_dir = os.environ.get(
    'LINKEDIN_PROJECT_DIR',
    r'C:\Users\garcia38\Dropbox\Projects\project_linkedin\web-scraping-linkedin',
)
os.chdir(parent_dir)

## Create a folder where to store the code (this cell only creates the folder).
## exist_ok=True makes the cell safe to re-run without a separate existence check.
os.makedirs('code', exist_ok=True)

## Create a folder where to store the raw data
os.makedirs('data-raw', exist_ok=True)

In [2]:
## Import libraries
import requests
from bs4 import BeautifulSoup
import csv 
import re

Define a function that scrapes job postings from LinkedIn and writes them to a CSV file

In [3]:
## Matches an ISO date (YYYY-MM-DD); compiled once instead of once per job card
_DATE_PATTERN = re.compile('([0-9]{4})-([0-9]{2})-([0-9]{2})')

## Seconds to wait for LinkedIn before giving up, so a stalled connection cannot hang the notebook
_REQUEST_TIMEOUT_SECONDS = 30


def _text_or_none(tag):
    """Return the stripped text of a parsed tag, or the string 'None' when the tag is missing."""
    return tag.text.strip() if tag is not None else 'None'


def webscraper_linkedin(webpage_base, page_number, max_page_number):
    """Scrape LinkedIn job cards page by page and write one csv row per job.

    The function relies on two names defined at notebook level before it is
    called: `writer` (the csv writer receiving the rows) and `file` (the open
    file behind that writer). `file` is closed when scraping ends, whether it
    finished normally or raised.

    Parameters
    ----------
    webpage_base : str
        Search URL ending in ``start=``; the offset is appended to it.
    page_number : int
        Offset of the first page to request.
    max_page_number : int
        Pages are requested in steps of 25 until the offset reaches this value;
        the first page is always requested.

    Each row is [title, company, location, date, link]. A field whose tag is
    not found in a job card is written as the string 'None'.
    """
    try:
        while True:
            ## Define URL
            webpage = webpage_base + str(page_number)
            print(str(webpage))

            ## Send HTTP request to retrieve the HTML code and store it
            response = requests.get(str(webpage), timeout=_REQUEST_TIMEOUT_SECONDS)
            print(response)

            ## Parse the HTML code
            soup = BeautifulSoup(response.content, 'html.parser')

            ## Access the data needed (all job listings are wrapped in a <div> with the following class)
            ## NOTE(review): the full class string looks order-sensitive, so a markup change
            ## on LinkedIn's side would presumably yield zero jobs - confirm if results are empty.
            jobs = soup.find_all('div', class_='base-card relative w-full hover:no-underline focus:no-underline base-card--link base-search-card base-search-card--link job-search-card')

            ## Loop over the job cards
            for job in jobs:
                job_title = _text_or_none(job.find('h3', class_='base-search-card__title'))
                job_company = _text_or_none(job.find('h4', class_='base-search-card__subtitle'))
                job_location = _text_or_none(job.find('span', class_='job-search-card__location'))

                ## The date is taken from the tag's HTML; a missing tag renders as 'None' and does not match
                job_date = job.find('time', class_='job-search-card__listdate')
                date_match = _DATE_PATTERN.search(str(job_date))
                job_datem = date_match[0] if date_match else 'None'

                link_tag = job.find('a', class_='base-card__full-link')
                job_link = link_tag.get('href', 'None') if link_tag is not None else 'None'

                # Write extracted data into csv file
                writer.writerow([job_title, job_company, job_location, job_datem, job_link])

            if page_number >= max_page_number:
                break
            page_number = page_number + 25 # increase the start parameter by 25
    finally:
        ## Always release the csv file, even when a request or a parse step fails
        file.close()

Web scraping Data Analyst jobs

In [4]:
## Start from a clean slate: delete the output csv left over from a previous run
import os

filename = os.path.join(parent_dir, 'data-raw', 'linkedin_jobs_dataanalyst.csv')
output_already_exists = os.path.exists(filename)
if output_already_exists:
    os.remove(filename)

In [5]:
## Define URL
url_linkedin = 'https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=Data+Analyst&location=Innere+Stadt,+Vienna,+Austria&geoId=104916553&f_TPR=r2592000&f_PP=105890822,104916553&distance=50&f_JT=F,P,C&f_E=3,2,4&currentJobId=3452827396&position=10&pageNum=1&start='

## Open csv file and write headers.
## Mode 'w' truncates the file, so re-running this cell never appends duplicate headers or rows.
## The with block guarantees the file is closed even if the scraper raises; webscraper_linkedin
## also closes `file` when it finishes, and closing an already closed file has no effect.
with open(os.path.join(parent_dir, 'data-raw', 'linkedin_jobs_dataanalyst.csv'), 'w', newline='', encoding="utf-8-sig") as file:
    writer = csv.writer(file)
    headers = ['Title', 'Company', 'Location', 'Date', 'Link']
    writer.writerow(headers)

    ## Run function (start at offset 0 and stop there, i.e. only the first page)
    webscraper_linkedin(url_linkedin, 0, 0)

https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=Data+Analyst&location=Innere+Stadt,+Vienna,+Austria&geoId=104916553&f_TPR=r2592000&f_PP=105890822,104916553&distance=50&f_JT=F,P,C&f_E=3,2,4&currentJobId=3452827396&position=10&pageNum=1&start=0
<Response [200]>


In [6]:
## Load the csv file.
## The file is written with encoding "utf-8-sig" (UTF-8 with a byte order mark), so it is
## read back with the same codec instead of relying on the parser to drop the mark.
import pandas as pd

dataanalyst = pd.read_csv(os.path.join(parent_dir, 'data-raw', 'linkedin_jobs_dataanalyst.csv'), encoding="utf-8-sig")

In [7]:
## Preview the first rows of the scraped data (head() returns 5 rows by default)
dataanalyst.head()

Unnamed: 0,Title,Company,Location,Date,Link
0,Data Analyst (f/m/x),Mondi Group,"Vienna, Vienna, Austria",2023-02-07,https://at.linkedin.com/jobs/view/data-analyst...
1,Project and Data Analyst (m/f/d),Netconomy,"Vienna, Vienna, Austria",2023-01-31,https://at.linkedin.com/jobs/view/project-and-...
2,Cloud Data Analyst (m/f/d),Siemens,"Vienna, Vienna, Austria",2023-02-09,https://at.linkedin.com/jobs/view/cloud-data-a...
3,Cloud Data Analyst (m/f/d),Siemens,"Vienna, Vienna, Austria",2023-02-09,https://at.linkedin.com/jobs/view/cloud-data-a...
4,Cloud Data Analyst (m/f/d),Siemens,"Vienna, Vienna, Austria",2023-02-06,https://at.linkedin.com/jobs/view/cloud-data-a...
