In [162]:
#imports
import csv
import os
from datetime import datetime
from urllib.parse import urlencode

import requests
from bs4 import BeautifulSoup

#url
def get_url(position, location, page_start):
    """(Specifically for se.Indeed.com) Build the search-results url for one results page.

    Parameters
    ----------
    position : str
        Free-text job query, e.g. 'backend developer' or 'C#'.
    location : str
        Location filter, e.g. 'sweden' or 'göteborg'.
    page_start : int
        1-based number of the results page to open. It is converted to the
        `start` result offset assuming 10 results per page; values below 1
        are treated as page 1.

    Returns
    -------
    str
        The fully percent-encoded search url.
    """
    # urlencode escapes every reserved or non-ASCII character, so inputs such
    # as 'C#', 'R&D' or 'göteborg' cannot truncate or corrupt the query string.
    query = urlencode({
        'q': position,
        'l': location,
        'start': max(page_start - 1, 0) * 10,
    })
    return 'https://se.indeed.com/jobs?' + query

_INDEED_BASE_URL = 'https://se.indeed.com'
# Sent with every request so listing pages and job pages are fetched the same way.
_REQUEST_HEADERS = {'User-Agent': 'Mozilla/5.0'}


def _fetch_soup(url):
    """Download `url` and return its HTML parsed with BeautifulSoup."""
    response = requests.get(url, headers=_REQUEST_HEADERS)
    return BeautifulSoup(response.text, 'html.parser')


def _extract_requirements(job_html):
    """Return the text of every `ul > li` in the job description joined by spaces ('' if none)."""
    description = job_html.find('div', class_='jobsearch-jobDescriptionText')
    if description is None:
        # The page has a title but no description container: nothing to extract.
        return ''
    return ' '.join(item.get_text(strip=True) for item in description.select('ul > li'))


def _append_rows_to_csv(path, rows):
    """Append `rows` to the csv at `path`, writing the header only when the file is new or empty."""
    needs_header = not os.path.exists(path) or os.path.getsize(path) == 0
    with open(path, 'a', newline='', encoding="utf-8", errors='replace') as f:
        writer = csv.writer(f)
        if needs_header:
            writer.writerow(['title', 'requirements'])
        writer.writerows(rows)


def web_scrape(position, location, pages):
    """(Specifically for se.Indeed.com) Scrape job titles and requirement bullet points.

    Scraping starts at the module-level `current_page` and advances it after
    each listing page, so a later call resumes where an interrupted one stopped.
    Collected rows are appended to '<position>_<YYYY-MM-DD>.csv' (spaces in
    `position` become underscores) in the working directory.

    Parameters
    ----------
    position : str
        Search query. Only posts whose page title contains it
        (case-insensitively) are kept.
    location : str
        Location filter passed to `get_url`.
    pages : int
        Last results page (1-based, inclusive) to scrape.

    Returns
    -------
    list[list[str]]
        One `[title, requirements]` row per kept job post from this call.

    Notes
    -----
    The loop stops early when a job page comes back without a <head>/<title>
    (taken as a captcha page) or when the listing page has no 'Nästa' (next)
    link. In both cases `current_page` is left pointing at the unfinished page.
    """
    global current_page  # If captcha stops scraping, current_page keeps track of page number
    listing_html = _fetch_soup(get_url(position, location, current_page))
    # Anchors carrying data-hiring-event="false" are the job-post links on a listing page.
    job_links = listing_html.find_all('a', {'data-hiring-event': 'false'})
    wanted = position.lower()
    data_rows = []
    captcha_exit = False

    while current_page <= pages:
        print('Page: ' + str(current_page))
        for atag in job_links:
            print('\n')
            href = atag.get('href')
            if not href:
                # An anchor without a target cannot be followed; skip it.
                continue
            print('href: ' + href)
            job_html = _fetch_soup(_INDEED_BASE_URL + href)

            # A page without <head>/<title> is taken to be a captcha page.
            try:
                title = job_html.head.title.get_text(strip=True)
            except AttributeError as err:
                captcha_exit = True
                print('Captcha suspected, stopping: ' + repr(err))
                break

            print("title = " + title)
            # Better to have accurate data than false positives.
            if wanted in title.lower():
                requirements = _extract_requirements(job_html)
                if requirements != '':
                    data_rows.append([title, requirements])

        if captcha_exit:
            break

        # Follow the listing page's own "next" link rather than rebuilding the url.
        next_link = listing_html.find('a', {'aria-label': 'Nästa'})
        next_href = next_link.get('href') if next_link is not None else None
        if not next_href:
            print('No next-page link found on page ' + str(current_page) + ', stopping.')
            break
        listing_html = _fetch_soup(_INDEED_BASE_URL + next_href)
        job_links = listing_html.find_all('a', {'data-hiring-event': 'false'})
        current_page += 1

    # Writing data_rows to a csv file.
    today = datetime.today().strftime('%Y-%m-%d')
    _append_rows_to_csv(position.replace(' ', '_') + '_' + today + '.csv', data_rows)
    return data_rows



import pyautogui
from time import sleep
#To get past the captcha, install a VPN client of your choice; the helpers below click its connect button to switch VPN automatically whenever a captcha stops the scraping process
def connect_to_vpn(vpn_button_x,vpn_button_y):
    """Click twice at the given screen coordinates of the VPN connect button."""
    for _ in range(2):
        pyautogui.click(vpn_button_x, vpn_button_y)
def run_cell(cell_x,cell_y):
    """Click twice at the given screen coordinates, then send the Ctrl+Enter hotkey."""
    for _ in range(2):
        pyautogui.click(cell_x, cell_y)
    # Presumably Ctrl+Enter executes the notebook cell that was just clicked.
    pyautogui.hotkey('ctrl',"enter")
    
#Initialize the positions of vpn button and jupyter cell position
def _capture_mouse_position(prompt, label):
    """Print `prompt`, count down from 3 (one second per step), then print and return the pointer position."""
    print(prompt)
    sleep(1)
    for remaining in (3, 2, 1):
        print(str(remaining) + '..')
        sleep(1)
    # Sample the pointer once so the value printed is exactly the value stored.
    point = pyautogui.position()
    print(label)
    print(point)
    return point


def init():
    """Record the screen positions of the VPN connect button and of the main notebook cell.

    The user is prompted to hover the mouse over each target in turn. After a
    short countdown the pointer position is stored in the module-level
    globals `vpn_button_x`, `vpn_button_y`, `cell_x` and `cell_y`.
    """
    global vpn_button_x
    global vpn_button_y
    global cell_x
    global cell_y
    vpn_button_x, vpn_button_y = _capture_mouse_position(
        'Put your mouse over the vpn connect button',
        'VPN connect button position:',
    )
    cell_x, cell_y = _capture_mouse_position(
        'Put your mouse over this cell that runs the main script',
        'Cell position:',
    )
    
def main(position,location,pages_to_scrape = 20, auto_repeat = True):
    """Scrape up to `pages_to_scrape` result pages, then optionally switch VPN and rerun the cell.

    Parameters
    ----------
    position, location : str
        Forwarded to `web_scrape`.
    pages_to_scrape : int
        Last results page to scrape.
    auto_repeat : bool
        When True and the scrape stopped before passing `pages_to_scrape`,
        click the recorded VPN button, wait 5 seconds and re-run the recorded
        notebook cell. This needs the coordinates stored by `init()`.
    """
    # `global` applies to the whole function wherever it is written, so the
    # declarations are gathered here.
    global current_page #For when captcha stops the scraping
    global vpn_button_x, vpn_button_y, cell_x, cell_y

    # Start from page 1 only when no earlier run left a resume point behind.
    globals().setdefault('current_page', 1)

    web_scrape(position, location,pages_to_scrape)

    finished = current_page > pages_to_scrape
    if auto_repeat != True or finished:
        return

    connect_to_vpn(vpn_button_x,vpn_button_y)
    sleep(5)
    run_cell(cell_x,cell_y)

In [154]:
# Restart scraping from the first results page, then record the screen
# positions of the VPN connect button and of the cell that runs main().
# (A `global` statement is a no-op at module/cell scope, so none is needed here.)
current_page = 1
init()

Put your mouse over the vpn connect button
3..
2..
1..
VPN connect button position:
Point(x=263, y=399)
Put your mouse over this cell that runs the main script
3..
2..
1..
Cell position:
Point(x=1064, y=714)


In [161]:
# Scrape 'backend' jobs in Sweden up to results page 120, switching VPN and re-running on interruption.
main('backend', 'sweden', pages_to_scrape=120, auto_repeat=True)

60


/rc/clk?jk=9ecc0130d3d305e7&fccid=c3c8df96df173714&vjs=3
title = Senior Java Developer with AWS experience - 411 03 Göteborg - Indeed.com


/rc/clk?jk=ad2c3790e3e0cff6&fccid=377ff150a39aeb0a&vjs=3
title = Systemutvecklare till Gävle-Dalarna - 784 33 Borlänge - Indeed.com


/rc/clk?jk=38ec7525bc00521d&fccid=f452ff89851e4578&vjs=3
title = Full stack GoLang-utvecklare - Göteborg - Indeed.com


/rc/clk?jk=e1fcc336324c8018&fccid=ed21d9e9151873ed&vjs=3
title = Senior utvecklare - Falun - Indeed.com


/rc/clk?jk=5d151a84785a1caf&fccid=6e45afc8910cab9e&vjs=3
title = Senior Software Engineer - Stockholm - Indeed.com


/rc/clk?jk=0885a56e29043c66&fccid=fa0ca3b638673d62&vjs=3
title = Java Developer, Stockholm - Solna - Indeed.com


/rc/clk?jk=bf60686669e75e83&fccid=f8b265243da9f9d7&vjs=3
title = Backend Developer to ADB Safegate - Malmö kommun - Indeed.com
im in


/rc/clk?jk=56c71f123ed7f46d&fccid=79355ba9d85aec2a&vjs=3
title = Senior backend Engineer (Java/Kotlin/AWS) - Build brand... - 111

title = Fullstack Developer looking for their next challange - Stockholm - Indeed.com


/rc/clk?jk=7d951374840ba153&fccid=23da706355bae1fb&vjs=3
title = Utvecklare till samhällsnyttiga projekt - 652 26 Karlstad - Indeed.com


/rc/clk?jk=4ca33d725c6bbd20&fccid=3e24dd0731ba58b3&vjs=3
title = Java Developer - Fintech - Fully Remote (Swedish Speaking) - Västerås - Indeed.com


/rc/clk?jk=d84ffcf739d2ed09&fccid=d224ef43689d6e3b&vjs=3
title = C#/.NET - Backend lead developer - 111 44 Stockholm - Indeed.com
im in


/rc/clk?jk=62199ebfa029643e&fccid=639bee6393c60c73&vjs=3
title = Electrical Engineer - 417 05 Göteborg - Indeed.com


/rc/clk?jk=91f3a8f27c9900cd&fccid=a71f63892a265108&vjs=3
title = Systemutvecklare till snabbväxande Health-Tech Startup - Stockholm - Indeed.com


/rc/clk?jk=f459fb9a579a3db8&fccid=2bcb74f46843a3b9&vjs=3
title = Senior Financial Consultant – NetSuite - Sverige - Indeed.com


/rc/clk?jk=b841a3efc29c3c9c&fccid=505a9f8310876030&vjs=3
title = Junior mjukvaruutvecklare t

title = Erfaren Fullstack Engineer - Göteborg - Indeed.com


/pagead/clk?mo=r&ad=-6NYlbfkN0B2WYB5nUIf-nu8IexjBrTCOI-EIfgJEpLl-C_Vi97ErftAXPNx-epAvB1zkxmaGTFT1aKIgiONPX-XJOeu9qFgm3Ej2UyFMZ-MUl7yQRBHjV6dzkIVj1GQ2Uz1TzJyHDZsZt7uBIxqTeEqqjoj-by831IwAtasC_BaRV8btSVXVFZ3kPrDPwmFIqeoWcNf_9IQ4ioAVL7KJhbYk2OHF8L8_KLAmkzAc50BKjojzVbUvQzIxTySy8cCrBiyBVvfHA2V39eHSdw3RQpGXFwnI31RwPjZdNf1-Sr3LjwGuWp6UiJ72FuCLIsDVFsOnKwUXGamSw1ghfwWNI1WuXrlbpSY3hICLn_w8fZ3zaDUuD_fxsFIQN46aJ-P_iWZ421a1X-eDkNifA9Tm0S_gvEVa0Ad97sjn6ISm9SpHdw8XPaB2AFky13Hx-A2B8ucSvT39kYkJSKWHULk5w==&p=6&fvj=0&vjs=3
title = Senior Backend Engineer - Stockholm - Indeed.com
im in
<class 'AttributeError'>
