# Scraper of available Bolt jobs

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from html import unescape
import re
from time import sleep
from tqdm import tqdm

- Scraper class

In [2]:
class boltScraper():
    """Scraper for the job postings exposed by the Bolt careers portal.

    Typical use: call ``get_job_list()`` to load the list of open positions
    into ``jobs_df``, then ``get_job_info()`` to add a ``description`` column
    (one HTTP request per job), and finally ``save_data()`` to write a CSV.

    Attributes:
        jobs_df: DataFrame holding one row per job; empty until
            ``get_job_list()`` succeeds.
        timeout: seconds to wait for each HTTP response before giving up.
    """

    # Endpoints and API version used by every request, kept in one place.
    JOBS_URL = 'https://node.bolt.eu/careers-portal/careersPortal/v2/getJobs/'
    JOB_URL = 'https://node.bolt.eu/careers-portal/careersPortal/v2/getJob/'
    JOB_PAGE_URL = 'https://bolt.eu/en/careers/positions/'
    API_VERSION = 'CP.3.72'

    def __init__(self, timeout=30):
        """Create a scraper with an empty jobs table.

        Args:
            timeout: seconds passed to ``requests.get`` so that a stalled
                connection cannot block the crawl indefinitely.
        """
        self.jobs_df = pd.DataFrame()
        self.timeout = timeout

    def get_job_list(self):
        """Method to get all positions available on site (without details).

        On success ``jobs_df`` is replaced by the returned job list plus a
        ``link`` column pointing at each job page. On any failure a message
        is printed and ``jobs_df`` is left unchanged.
        """
        try:
            response = requests.get(
                self.JOBS_URL,
                params={'version': self.API_VERSION},
                timeout=self.timeout,
            )
        except requests.RequestException as error:
            print(f'Request ERROR!! ({error})')
            return

        if response.status_code != 200:
            print('Request ERROR!!')
            return

        try:
            jobs = response.json()['data']['jobs']
        except (ValueError, KeyError, TypeError) as error:
            # ValueError: body is not JSON; KeyError/TypeError: unexpected layout.
            print(f'Request ERROR!! (unexpected response: {error!r})')
            return

        jobs_df = pd.DataFrame(jobs)
        # add a column with the job page; astype(str) keeps the concatenation
        # valid even if ids are not strings, and the guard covers an empty list
        if 'id' in jobs_df.columns:
            jobs_df['link'] = self.JOB_PAGE_URL + jobs_df['id'].astype(str)
        self.jobs_df = jobs_df
        print(f'Success! There are {len(self.jobs_df)} jobs available!')

    def clean_description(self, text):
        """Helper function to clean html text from response.

        Removes anything that looks like an HTML tag and replaces
        non-breaking spaces, newlines and zero-width spaces with a space.
        """
        text_without_tags = re.sub('<[^<]+?>', '', text)
        clean_text = re.sub('\xa0|\n|\u200b', ' ', text_without_tags)
        return clean_text

    def request_job_info(self, job_id):
        """Helper function to get jobs details by request.

        Args:
            job_id: identifier of the job, as found in ``jobs_df['id']``.

        Returns:
            The cleaned plain-text description, or ``None`` when the request
            fails or the response carries no usable description.
        """
        # random sleep time to not get caught by anti-crawlers
        sleep(np.random.chisquare(2))

        try:
            # params= lets requests URL-encode the id instead of trusting it
            response = requests.get(
                self.JOB_URL,
                params={'id': job_id, 'version': self.API_VERSION},
                timeout=self.timeout,
            )
        except requests.RequestException as error:
            # one failing job must not abort a crawl of hundreds of jobs
            print(f'Request ERROR ({error})')
            return None

        if response.status_code != 200:
            print('Request ERROR')
            return None

        try:
            description_text = response.json()['data']['description']
        except (ValueError, KeyError, TypeError) as error:
            print(f'Request ERROR (unexpected response: {error!r})')
            return None

        if not isinstance(description_text, str):
            # e.g. a null description: nothing to parse
            return None

        # parse text using beautifulSoup; naming the parser avoids the
        # "no parser was explicitly specified" warning and keeps results
        # independent of which optional parsers are installed
        soup = BeautifulSoup(unescape(description_text), 'html.parser')
        return self.clean_description(soup.text)

    def get_job_info(self):
        """Method to make all job infos requests and storing them in a column of the DataFrame.

        Adds a ``description`` column to ``jobs_df``; entries are ``None``
        for jobs whose request failed. Does nothing (besides printing a
        hint) when no job list has been loaded yet.
        """
        if 'id' not in self.jobs_df.columns:
            print('No jobs loaded: run get_job_list() first.')
            return

        self.jobs_df['description'] = [
            self.request_job_info(job_id) for job_id in tqdm(self.jobs_df['id'])
        ]

    def save_data(self, path='bolt_jobs_info.csv'):
        """Method to save the dataFrame in a csv format.

        Args:
            path: destination file; defaults to ``bolt_jobs_info.csv`` in
                the current working directory.
        """
        self.jobs_df.to_csv(path, index=False)

- Let's get the data! (I hope no one blocks me)

In [3]:
# Create the scraper; its jobs table starts out as an empty DataFrame.
bolt_jobs = boltScraper()

In [4]:
# Download the list of open positions into bolt_jobs.jobs_df (descriptions are fetched separately).
bolt_jobs.get_job_list()

Success! There are 968 jobs available!


In [5]:
# Fetch the description of every listed job (one request per job, with a random pause before each),
# then write the resulting table to 'bolt_jobs_info.csv'.
bolt_jobs.get_job_info()
bolt_jobs.save_data()

100%|██████████| 968/968 [47:40<00:00,  2.96s/it]  
