This is the web-scraping file of the 'Indeed AI Job' project.

In [1]:
%cd drive/MyDrive/Colab Notebooks/Programming/Gather_Data

/content/drive/MyDrive/Colab Notebooks/Programming/Gather_Data


In [2]:
import os
import re
import urllib.parse
import urllib.request
from datetime import date

import pandas as pd
from bs4 import BeautifulSoup

In [3]:
def user_input():
    """
    Ask the user on the console whether more job data should be collected.

    The prompt is repeated until the answer is the integer 0 or 1. The
    terminal is cleared before the first prompt and after every answer, so
    only the latest feedback message is visible.

    Returns:
        int: 0 when the user answered "No", 1 when the user answered "Yes".
    """
    # Same clear-screen command is needed several times; pick it once.
    clear_cmd = 'cls' if os.name == 'nt' else 'clear'
    os.system(clear_cmd)
    text = """Would you like to collect even more job data?
    Answer with 0 (No) or 1 (Yes) please: """

    while True:
        answer = input(text)
        os.system(clear_cmd)

        # Only the conversion can fail on bad input, so only that is
        # guarded; anything else (e.g. Ctrl-C) must propagate.
        try:
            yes_or_no = int(answer)
        except ValueError:
            print("Please give a correct input.")
            continue

        if yes_or_no not in (0, 1):
            print("Answer with '0' or '1'.")
            continue

        if yes_or_no == 0:
            print("Not collecting more job data.")
        else:
            print("Collecting more job data.")
        return yes_or_no

In [4]:
def html_cycle(url="https://www.indeed.com/jobs?q=artificial+intelligence&l=", search_depth=1000):
    """
    Cycle through multiple search pages and store every job link found.

    Starting at `url`, `get_jobs` is called repeatedly; each call adds the
    job links of one page to a shared set and returns the URL of the next
    page. The loop ends when `get_jobs` returns a falsy value, after which
    the collected links are handed to `store_data`.

    Args:
        url: Search URL to start from. A trailing "l=" is replaced by
            "start=0" so the page offset is the last part of the URL.
        search_depth: Passed to `get_jobs` as the number of collected jobs
            at which gathering stops.

    Raises:
        SystemExit: When `get_jobs` returns 2, which it does for a page
            without any links. Nothing is stored in that case.
    """
    # Make sure the page offset is easy to locate at the end of the URL
    if url.endswith("l="):
        url = url[:-2] + "start=0"

    # Generating job list
    print("\nAll possible jobs are now being gathered.")
    job_list = set()
    while url:
        url = get_jobs(job_list, search_depth, url)
        if url == 2:
            print("\nCheck for Captcha on website.")
            # `quit()` is an interactive helper injected by `site` and is
            # not meant for programs; raise SystemExit directly instead.
            raise SystemExit
    print("\nDone with retrieving data.")
    store_data(job_list)

In [5]:
def _next_page_url(url, step=10):
    """Return `url` with its `start=` page offset increased by `step`."""
    match = re.search(r"[?&]start=(\d+)", url)
    if match is None:
        # No offset in the URL: treat it as offset 0 and add the parameter,
        # otherwise the "next" page would be the current page again.
        separator = "&" if "?" in url else "?"
        return url + separator + "start=" + str(step)
    offset = int(match.group(1)) + step
    return url[:match.start(1)] + str(offset) + url[match.end(1):]


def _path_and_query(url):
    """Return the part of `url` after scheme and host (path plus query)."""
    parts = urllib.parse.urlsplit(url)
    if parts.query:
        return parts.path + "?" + parts.query
    return parts.path


def get_jobs(job_list, search_depth, url):
    """
    Collect the job links of one Indeed search page and find the next page.

    The page at `url` is downloaded and every anchor whose href contains
    'clk?jk' is added to `job_list` (prefixed with 'www.indeed.com').

    Args:
        job_list: Set of job links collected so far; modified in place.
        search_depth: Gathering stops once `job_list` holds at least this
            many links.
        url: Search page to download. Its `start=` value is taken as the
            page offset; the next page is that offset plus 10.

    Returns:
        str: URL of the next search page, when the current page links to it.
        False: When `search_depth` is reached or no link to the next page
            is found.
        2: When the page contains no anchors at all (the caller treats this
            as a captcha page).
    """
    # Only the `start=` offset is advanced; other digits in the URL
    # (for example in the search term) are left untouched.
    new_page_url = _next_page_url(url)
    # Pagination hrefs are compared against the host-less part of the URL
    next_page_ref = _path_and_query(new_page_url)

    # Retrieves html data from URL
    with urllib.request.urlopen(url) as response:
        html = response.read()
    soup = BeautifulSoup(html, "lxml")

    # Searches between all the links on the given html page
    links = soup.find_all('a')
    for link in links:
        potential = str(link.get('href'))
        if 'clk?jk' in potential:
            new = 'www.indeed.com' + potential
            if new not in job_list:
                job_list.add(new)
                # Report progress only when a new job reaches a multiple
                # of 100, so the same count is never announced twice.
                amount_jobs = len(job_list)
                if amount_jobs % 100 == 0:
                    print("\t> So far", amount_jobs, "jobs gathered.")
            if len(job_list) >= search_depth: # Check search depth
                return False
        # Original author's note: the pagination link always comes after
        # the job URLs on the page.
        if next_page_ref in potential:
            return new_page_url

    # Error when captcha is prompted
    if not links:
        return 2
    return False

In [6]:
def store_data(job_list):
    """
    Store job links in a separate CSV document.

    Links already present in the 'URL' column of the existing file are
    skipped; every new link is written with today's date. The file is
    rewritten as a whole and always carries the columns 'date' and 'URL'.

    Args:
        job_list: Iterable of job links without scheme; 'https://' is
            prepended to each before it is compared and stored.
    """
    link_csv = "../Data_Indeed/AI_jobs.csv"
    columns = ['date', 'URL']

    # Retrieves already stored URLs, else starts from an empty table
    try:
        stored = pd.read_csv(link_csv, index_col=0)
    except FileNotFoundError:
        stored = pd.DataFrame(columns=columns)
    # A set keeps the per-link duplicate check cheap
    known_urls = set(stored['URL'])

    # Obtain current date
    today = date.today()

    # Cycling through jobs (sorted so the row order is reproducible)
    new_rows = []
    for job_link in sorted(job_list):
        job_link = 'https://' + job_link

        # Check if job already stored
        if job_link in known_urls:
            continue
        known_urls.add(job_link)
        new_rows.append({'date': today, 'URL': job_link})

    # DataFrame.append no longer exists in pandas 2.x, so the new rows are
    # turned into one frame and concatenated in a single step.
    if new_rows:
        fresh = pd.DataFrame(new_rows, columns=columns)
        if stored.empty:
            df = fresh
        else:
            df = pd.concat([stored, fresh], ignore_index=True)
    else:
        df = stored

    # Stores in csv file; make sure the target directory is there first
    os.makedirs(os.path.dirname(link_csv), exist_ok=True)
    df.to_csv(link_csv, sep=',', float_format='%.2f')
    print("Data is now stored in '{}'.\n".format(link_csv))