In [162]:
#imports
import csv
import requests
from datetime import datetime
from bs4 import BeautifulSoup

#url
def get_url(position, location,page_start):
    """(Specifically for se.Indeed.com) Build the search URL for a results page.

    Args:
        position: Search phrase for the job (e.g. 'backend').
        location: Search phrase for the location (e.g. 'sweden').
        page_start: 1-based number of the results page to open.

    Returns:
        The search URL as a string. The ``start`` query parameter is
        ``(page_start - 1) * 10``, i.e. the offset grows by 10 per page.
    """
    # Imported locally so the function does not rely on a notebook-level import.
    from urllib.parse import urlencode

    # urlencode escapes spaces, '#', '&', non-ASCII letters, etc. Pasting the raw
    # text into the URL would let characters such as '#' (as in "c#") or '&'
    # truncate or corrupt the query.
    query = urlencode({
        'q': position,
        'l': location,
        'start': (page_start - 1) * 10,
    })
    return 'https://se.indeed.com/jobs?' + query

def web_scrape(position, location,pages):
    """(Specifically for se.Indeed.com) Scrape job titles and requirement bullets.

    Scraping starts at the results page stored in the module-level
    ``current_page`` and follows the "Nästa" (next) link page by page. It stops
    when ``current_page`` passes ``pages`` or when a fetched page lacks the
    expected elements (an AttributeError), which this code treats as the
    captcha having blocked the request. ``current_page`` is left at the page
    that was reached so a later call can resume from there.

    Args:
        position: Search phrase. Only postings whose page title contains it
            (compared case-insensitively) are kept.
        location: Location phrase passed on to ``get_url``.
        pages: Number of the last results page to scrape.

    Returns:
        A list of ``[title, requirements]`` rows collected during this call.
        The same rows are appended to ``<position>_<YYYY-MM-DD>.csv`` (spaces
        in ``position`` replaced by underscores).
    """
    global current_page                            # If captcha stops scraping, current_page keeps track of page number
    base_url = 'https://se.indeed.com'
    # Use the same browser-like User-Agent for every request; previously only
    # the "next page" request sent it.
    headers = {'User-Agent': 'Mozilla/5.0'}
    # Anchors carrying this attribute are the job-post links on a results page.
    job_link_attrs = {'data-hiring-event': 'false'}
    wanted = position.lower()

    front_page_url = get_url(position, location, current_page)
    front_page_response = requests.get(front_page_url, headers=headers)
    front_page_html = BeautifulSoup(front_page_response.text, 'html.parser')
    atag_list = front_page_html.find_all('a', job_link_attrs)
    data_rows = []
    captcha_exit = False
    while current_page <= pages:
        # current_page is an int, so it must be converted before concatenation.
        print('Page: ' + str(current_page))
        for atag in atag_list:
            print('\n')
            href = atag.get('href')
            if href is None:
                # An anchor without a target cannot be followed; skip it.
                continue
            # For each job post (atag), open its page and save the title and
            # requirements as a row in data_rows.
            print('href: ' + href)
            job_page_response = requests.get(base_url + href, headers=headers)
            job_html = BeautifulSoup(job_page_response.text, 'html.parser')

            # If the page is not a regular job page (e.g. captcha), one of the
            # lookups below returns None and the attribute access raises
            # AttributeError.
            try:
                title = job_html.head.title.get_text(strip=True)
                print("title = " + title)
                if wanted in title.lower():  # Better to have accurate data than false positives
                    description = job_html.find('div', class_='jobsearch-jobDescriptionText')

                    # Requirements are taken from the <ul><li> items of the
                    # description. This sometimes yields nothing, but what it
                    # does yield is usually the desired info.
                    requirements = ' '.join(
                        item.get_text(strip=True) for item in description.select('ul > li')
                    )

                    if requirements != '':
                        data_rows.append([title, requirements])
            except AttributeError as error:
                captcha_exit = True
                print(repr(error))  # show the actual error, not just the class
                break

        if captcha_exit:
            break
        try:  # Go back to the results page and follow its "next page" link.
            next_page_url = base_url + front_page_html.find('a', {'aria-label': 'Nästa'}).get('href')
            next_page_response = requests.get(next_page_url, headers=headers)
            next_page_html = BeautifulSoup(next_page_response.text, 'html.parser')
            atag_list = next_page_html.find_all('a', job_link_attrs)  # job links of the next page
            front_page_html = next_page_html
            current_page += 1
        except AttributeError as error:
            # No "Nästa" link was found on the current results page.
            print(repr(error))
            break

    # Append data_rows to the csv file for this position and date.
    today = datetime.today().strftime('%Y-%m-%d')
    file_name = '_'.join(position.split(' ')) + '_' + today + '.csv'
    with open(file_name, 'a', newline='', encoding="utf-8", errors='replace') as f:
        writer = csv.writer(f)
        # The file is opened in append mode, so only write the header when the
        # file is still empty; otherwise every resumed run adds another
        # 'title,requirements' row in the middle of the data.
        if f.tell() == 0:
            writer.writerow(['title', 'requirements'])
        writer.writerows(data_rows)
    return data_rows



import pyautogui
from time import sleep
#To bypass captcha, download desired VPN and switch vpn automatically when captcha stops the scraping process
def connect_to_vpn(vpn_button_x,vpn_button_y):
    """Click twice at the screen position of the VPN connect button.

    Args:
        vpn_button_x: Screen x coordinate of the button.
        vpn_button_y: Screen y coordinate of the button.
    """
    # NOTE(review): presumably the first click disconnects and the second
    # reconnects to get a new address - confirm against the VPN client used.
    for _ in range(2):
        pyautogui.click(vpn_button_x, vpn_button_y)
def run_cell(cell_x,cell_y):
    """Click twice on a notebook cell and press Ctrl+Enter to run it.

    Args:
        cell_x: Screen x coordinate of the cell.
        cell_y: Screen y coordinate of the cell.
    """
    target = (cell_x, cell_y)
    pyautogui.click(*target)
    pyautogui.click(*target)
    pyautogui.hotkey('ctrl', "enter")
    
#Initialize the positions of vpn button and jupyter cell position
def _prompt_and_capture(prompt, label):
    """Print a prompt, count down for four seconds, then print and return the mouse position."""
    print(prompt)
    sleep(1)
    for remaining in (3, 2, 1):
        print('{}..'.format(remaining))
        sleep(1)
    print(label)
    # Read the position once so the printed value is exactly the stored value;
    # reading it twice lets the two differ if the mouse moves in between.
    point = pyautogui.position()
    print(point)
    return point


def init():
    """Record the screen positions of the VPN button and of the main notebook cell.

    The user is asked to hover the mouse over each target in turn. After each
    countdown the current mouse position is printed and stored in the
    module-level variables ``vpn_button_x``/``vpn_button_y`` and
    ``cell_x``/``cell_y``.
    """
    global vpn_button_x
    global vpn_button_y
    global cell_x
    global cell_y
    vpn_button_x, vpn_button_y = _prompt_and_capture(
        'Put your mouse over the vpn connect button',
        'VPN connect button position:',
    )
    cell_x, cell_y = _prompt_and_capture(
        'Put your mouse over this cell that runs the main script',
        'Cell position:',
    )
    
def main(position,location,pages_to_scrape = 20, auto_repeat = True):
    """Run one scraping pass and, if it stopped early, switch VPN and re-run the cell.

    Args:
        position: Search phrase passed to ``web_scrape``.
        location: Location phrase passed to ``web_scrape``.
        pages_to_scrape: Last results page number to scrape.
        auto_repeat: When True and ``current_page`` has not passed
            ``pages_to_scrape`` after scraping, the VPN button is clicked, the
            function waits 5 seconds, and the notebook cell at
            ``cell_x``/``cell_y`` is run again.

    NOTE(review): the re-run happens for any early stop, not only a captcha -
    confirm this cannot loop when the search simply has no further pages.
    """
    global current_page, vpn_button_x, vpn_button_y, cell_x, cell_y

    # current_page survives between runs of the cell so scraping can resume;
    # start from page 1 only when it has never been set.
    if 'current_page' not in globals():
        current_page = 1

    web_scrape(position, location, pages_to_scrape)

    if auto_repeat == True and current_page <= pages_to_scrape:
        connect_to_vpn(vpn_button_x, vpn_button_y)
        sleep(5)
        run_cell(cell_x, cell_y)

In [154]:
# Start scraping from the first results page, then record the screen positions
# used for switching VPN and re-running the main cell.
current_page = 1
init()

Put your mouse over the vpn connect button
3..
2..
1..
VPN connect button position:
Point(x=263, y=399)
Put your mouse over this cell that runs the main script
3..
2..
1..
Cell position:
Point(x=1064, y=714)


In [161]:
# Scrape 'backend' postings in Sweden up to results page 120, re-running automatically after an early stop.
main('backend', 'sweden', pages_to_scrape=120, auto_repeat=True)

60


/rc/clk?jk=9ecc0130d3d305e7&fccid=c3c8df96df173714&vjs=3
title = Senior Java Developer with AWS experience - 411 03 Göteborg - Indeed.com


/rc/clk?jk=ad2c3790e3e0cff6&fccid=377ff150a39aeb0a&vjs=3
title = Systemutvecklare till Gävle-Dalarna - 784 33 Borlänge - Indeed.com


/rc/clk?jk=38ec7525bc00521d&fccid=f452ff89851e4578&vjs=3
title = Full stack GoLang-utvecklare - Göteborg - Indeed.com


/rc/clk?jk=e1fcc336324c8018&fccid=ed21d9e9151873ed&vjs=3
title = Senior utvecklare - Falun - Indeed.com


/rc/clk?jk=5d151a84785a1caf&fccid=6e45afc8910cab9e&vjs=3
title = Senior Software Engineer - Stockholm - Indeed.com


/rc/clk?jk=0885a56e29043c66&fccid=fa0ca3b638673d62&vjs=3
title = Java Developer, Stockholm - Solna - Indeed.com


/rc/clk?jk=bf60686669e75e83&fccid=f8b265243da9f9d7&vjs=3
title = Backend Developer to ADB Safegate - Malmö kommun - Indeed.com
im in


/rc/clk?jk=56c71f123ed7f46d&fccid=79355ba9d85aec2a&vjs=3
title = Senior backend Engineer (Java/Kotlin/AWS) - Build brand... - 111

title = Fullstack Developer looking for their next challange - Stockholm - Indeed.com


/rc/clk?jk=7d951374840ba153&fccid=23da706355bae1fb&vjs=3
title = Utvecklare till samhällsnyttiga projekt - 652 26 Karlstad - Indeed.com


/rc/clk?jk=4ca33d725c6bbd20&fccid=3e24dd0731ba58b3&vjs=3
title = Java Developer - Fintech - Fully Remote (Swedish Speaking) - Västerås - Indeed.com


/rc/clk?jk=d84ffcf739d2ed09&fccid=d224ef43689d6e3b&vjs=3
title = C#/.NET - Backend lead developer - 111 44 Stockholm - Indeed.com
im in


/rc/clk?jk=62199ebfa029643e&fccid=639bee6393c60c73&vjs=3
title = Electrical Engineer - 417 05 Göteborg - Indeed.com


/rc/clk?jk=91f3a8f27c9900cd&fccid=a71f63892a265108&vjs=3
title = Systemutvecklare till snabbväxande Health-Tech Startup - Stockholm - Indeed.com


/rc/clk?jk=f459fb9a579a3db8&fccid=2bcb74f46843a3b9&vjs=3
title = Senior Financial Consultant – NetSuite - Sverige - Indeed.com


/rc/clk?jk=b841a3efc29c3c9c&fccid=505a9f8310876030&vjs=3
title = Junior mjukvaruutvecklare t

title = Erfaren Fullstack Engineer - Göteborg - Indeed.com


/pagead/clk?mo=r&ad=-6NYlbfkN0B2WYB5nUIf-nu8IexjBrTCOI-EIfgJEpLl-C_Vi97ErftAXPNx-epAvB1zkxmaGTFT1aKIgiONPX-XJOeu9qFgm3Ej2UyFMZ-MUl7yQRBHjV6dzkIVj1GQ2Uz1TzJyHDZsZt7uBIxqTeEqqjoj-by831IwAtasC_BaRV8btSVXVFZ3kPrDPwmFIqeoWcNf_9IQ4ioAVL7KJhbYk2OHF8L8_KLAmkzAc50BKjojzVbUvQzIxTySy8cCrBiyBVvfHA2V39eHSdw3RQpGXFwnI31RwPjZdNf1-Sr3LjwGuWp6UiJ72FuCLIsDVFsOnKwUXGamSw1ghfwWNI1WuXrlbpSY3hICLn_w8fZ3zaDUuD_fxsFIQN46aJ-P_iWZ421a1X-eDkNifA9Tm0S_gvEVa0Ad97sjn6ISm9SpHdw8XPaB2AFky13Hx-A2B8ucSvT39kYkJSKWHULk5w==&p=6&fvj=0&vjs=3
title = Senior Backend Engineer - Stockholm - Indeed.com
im in
<class 'AttributeError'>


### Explanation

In [8]:
# Fetch and parse the first results page of a sample query so its HTML can be inspected.
front_page_url = get_url('backend', 'sweden', page_start=1)
front_page_response = requests.get(front_page_url)
all_html = BeautifulSoup(front_page_response.text, 'html.parser')

In [151]:
# Display the parsed results page.
# NOTE(review): the saved output of this cell is an hCaptcha page, not job results.
all_html

<html lang="en">
<head>
<title>hCaptcha solve page</title>
<script async="" defer="" src="https://www.hcaptcha.com/1/api.js"></script>
</head>
<body>
<form action="/jobs?q=backend&amp;l=sweden&amp;start=0" method="POST">
<div class="h-captcha" data-sitekey="eb27f525-f936-43b4-91e2-95a426d4a8bd"></div>
<br/>
<input type="submit" value="Submit"/>
</form>
</body>
</html>

In [5]:
# All job-title anchors on the results page.
all_html.find_all('a', class_='jcs-JobTitle')

[<a aria-label="all information om Senior Backend Developer - Code With A Purpose" class="jcs-JobTitle" data-hide-spinner="true" data-hiring-event="false" data-jk="93a16f4f2307af1e" data-mobtk="1g2lcmtjmj9h0800" href="/rc/clk?jk=93a16f4f2307af1e&amp;fccid=7fe31231d8054866&amp;vjs=3" id="job_93a16f4f2307af1e" role="button" target="_blank"><span title="Senior Backend Developer - Code With A Purpose">Senior Backend Developer - Code With A Purpose</span></a>,
 <a aria-label="all information om Knowit Experience Traineeprogram 2022 – Utvecklare" class="jcs-JobTitle" data-ci="367488318" data-empn="6681184386882833" data-hide-spinner="true" data-hiring-event="false" data-jk="4b37bbe1bb69059f" data-mobtk="1g2lcmtjmj9h0800" href="/pagead/clk?mo=r&amp;ad=-6NYlbfkN0AXbwWRkNJwq2372GYLd0pY970pZElQAAwGosnC6oHaSx7SJWJmQhzcxmEMjAjdA2N8IIaww9ZU9-pEOHyX-DiQKlyg5jpYxY1obHEXoBJdbnxiacWe65JA2TQh34FdiIxpfT-N9wsFIVsoLUPkwjGOSTlgJ2HXLNZGjr5DUGA5hipxYoJqdVBDLI-A23T4y72AkzibAp3amnUEf-MpzD2ogU0z-outNYRJ3FqCEhLcR

In [6]:
# Narrow the search to a single posting by also matching the anchor's id.
all_html.find_all('a', class_='jcs-JobTitle', id='job_93a16f4f2307af1e')

[<a aria-label="all information om Senior Backend Developer - Code With A Purpose" class="jcs-JobTitle" data-hide-spinner="true" data-hiring-event="false" data-jk="93a16f4f2307af1e" data-mobtk="1g2lcmtjmj9h0800" href="/rc/clk?jk=93a16f4f2307af1e&amp;fccid=7fe31231d8054866&amp;vjs=3" id="job_93a16f4f2307af1e" role="button" target="_blank"><span title="Senior Backend Developer - Code With A Purpose">Senior Backend Developer - Code With A Purpose</span></a>]

In [135]:
import pandas as pd

#Specific NLP imports
from scipy.stats import hmean
import scattertext as st
import requests     
# Download the stop-word list (one word per line) used when cleaning requirement texts.
stopwords_response = requests.get("https://gist.githubusercontent.com/rg089/35e00abf8941d72d419224cfd5b5925d/raw/12d899b70156fd0041fa9778d657330b024b959c/stopwords.txt")
# Fail loudly on an HTTP error instead of treating an error page as stop words.
stopwords_response.raise_for_status()
stopwords_list = stopwords_response.content
stopwords = set(stopwords_list.decode().splitlines())
# These abbreviations must be kept in the text, so make sure they are not
# treated as stop words. difference_update does not raise if one is absent.
special_words = ['ml','ci','cd']
stopwords.difference_update(special_words)

#Regex for text processing
import re
def remove_special_characters(word_list,remove_stop_words = True):
    """Lower-case a text and reduce it to space-separated word tokens.

    A token is a run of word characters with leading and trailing underscores
    stripped; everything else (punctuation, whitespace) acts as a separator.

    Args:
        word_list: The text to clean (a string, despite the name).
        remove_stop_words: When true, tokens found in the module-level
            ``stopwords`` set are dropped.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Raw string: '\w' in a normal string literal is an invalid escape sequence.
    tokens = re.findall(r'(?!_)\w+(?<!_)', word_list.lower())
    if remove_stop_words:
        tokens = [token for token in tokens if token not in stopwords]
    return " ".join(tokens)
def csv_string_list_to_df_list(csv_list,drop_duplicates=True,drop_rows_to_lowest = True):
    """Read csv files and convert each into a cleaned data frame (one per role).

    Args:
        csv_list (list of str): csv file names including .csv
            (e.g. data_scientist.csv). Each file needs a 'requirements' column.
        drop_duplicates (bool): Drop rows whose requirements text repeats
            within the same file, keeping the first occurrence.
        drop_rows_to_lowest (bool): Trim every frame to the row count of the
            shortest one so all roles contribute the same amount of data.

    Returns:
        list of pandas.DataFrame: one frame per file with the columns 'title'
        (the file name up to its first '.') and 'parsed' (the cleaned and
        tokenized requirements). An empty ``csv_list`` gives an empty list.
    """
    # Read each csv file and prune it.
    df_list = []
    for csv_name in csv_list:  # not named `csv`, which would shadow the csv module
        category_df = pd.read_csv(csv_name)
        category_df = category_df.dropna(subset=['requirements'])   # drop rows with null
        # Drop header rows that were appended again in the middle of the file.
        # This must happen before 'title' is overwritten below, otherwise the
        # comparison can never match.
        if 'title' in category_df.columns:
            category_df = category_df[category_df['title'] != 'title']
        # The category gets named based on file name. fullstack.csv => fullstack
        category_name = csv_name.split('.')[0]
        category_df = category_df.assign(title=category_name)
        if drop_duplicates:
            category_df = category_df.drop_duplicates(subset=['title', 'requirements'])
        df_list.append(category_df)

    # To remove participation bias, all roles should have the same amount of data.
    # Cut down rows so that all roles have as many rows as the smallest role.
    if drop_rows_to_lowest and df_list:
        lowest_len = min(len(df) for df in df_list)
        df_list = [df.iloc[:lowest_len] for df in df_list]

    # Add the parsed column; only 'title' and 'parsed' are kept for the model.
    parsed_frames = []
    for df in df_list:
        cleaned = df['requirements'].apply(remove_special_characters)
        parsed = cleaned.apply(st.whitespace_nlp_with_sentences)   # NLP tokenizing
        # assign() builds a new frame instead of writing into a slice.
        parsed_frames.append(df.assign(parsed=parsed)[['title', 'parsed']])
    return parsed_frames

In [136]:
# Load and parse the scraped frontend postings into a one-element list of data frames.
x = csv_string_list_to_df_list(csv_list=['frontend_2022-05-13.csv'])

In [137]:
# Display the first (and only) data frame in x: the parsed frontend postings.
x[0]

Unnamed: 0,title,parsed
0,frontend_2022-05-13,"(har, minst, 3, års, erfarenhet, frontend, utv..."
1,frontend_2022-05-13,"(erfarenhet, react, ett, bra, öga, för, design..."
2,frontend_2022-05-13,"(arbeta, med, nyutveckling, javascript, samt, ..."
3,frontend_2022-05-13,"(javascript, react, kan, skapa, engagerande, d..."
4,frontend_2022-05-13,"(tjänst, frontend, utvecklaretjänstgöringsgrad..."
...,...,...
187,frontend_2022-05-13,"(minst, 3, års, erfarenhet, ramverk, angular, ..."
188,frontend_2022-05-13,"(att, tillsammans, med, teamet, följa, våra, u..."
189,frontend_2022-05-13,"(payments, people, save, time, waiting, automa..."
191,frontend_2022-05-13,"(tjänst, på, ett, familjärt, och, entreprenörs..."
