## Webscrape for Data Related Jobs in the United States
----
Four queries were made on Indeed.com to obtain job postings for Data Analyst, Data Engineer, Data Scientist, and Machine Learning roles. Due to the large number of job postings, a limit of 70 pages was extracted (around 1,000 job posts) for each role. 

A job title index was also assigned to each job posting during the web scrape. For example, every posting scraped for Data Analyst roles was assigned an index of 1. This index will help with the analysis. 

In [1]:
# Dependencies
from bs4 import BeautifulSoup
import requests
import pymongo
import pandas as pd
import time 

In [2]:
# Accumulators filled by every scrape cell below: one independent, initially
# empty list per field, so entry i of each list describes the same posting.
(
    job_title_list,
    job_title_index,
    company_list,
    job_id_list,
    location_list,
) = ([] for _ in range(5))

## Data Analyst (index = 1)

In [3]:
# Search query for Data Analyst roles (the `l` parameter is left empty)
url = "https://www.indeed.com/jobs?q=data+analyst&l="
print(url)

# Retrieve page with the requests module
response = requests.get(url)

# Create BeautifulSoup object; parse with 'lxml'
soup = BeautifulSoup(response.text, 'lxml')

# Collect the text of the element reporting the total number of job postings.
# find() returns None when the element is absent, so guard before reading
# .text instead of failing with an opaque AttributeError.
count_tag = soup.find('div', id='searchCountPages')
if count_tag is None:
    results = None
    print(f"No 'searchCountPages' element found (HTTP {response.status_code})")
else:
    results = count_tag.text.strip()
    print(results)

https://www.indeed.com/jobs?q=data+analyst&l=
Page 1 of 16,728 jobs


In [4]:
# Scrape Data Analyst postings up to the page limit of 70 pages.
# The `start` query parameter is stepped by 10 per page, so offsets
# 0, 10, ..., 690 give exactly 70 requests (range(0, 710, 10) made 71).
job_index = 1  # job title index assigned to every Data Analyst posting

for start in range(0, 700, 10):
    url = f"https://www.indeed.com/jobs?q=data+analyst&start={start}"
    print(url)
    # Pause before each request
    time.sleep(5)

    # Retrieve page with the requests module
    response = requests.get(url)
    if not response.ok:
        # Report failed pages instead of silently parsing an error response
        print(f"Skipping {url}: HTTP {response.status_code}")
        continue

    # Create BeautifulSoup object; parse with 'lxml'
    soup = BeautifulSoup(response.text, 'lxml')

    # Retrieve the parent divs for all job postings
    results = soup.find_all('div', class_='result')

    # Loop over results to get job post data
    for result in results:
        title_tag = result.find('a', class_='jobtitle')
        company_tag = result.find('span', class_='company')
        location_tag = result.find(class_='location')

        # find() returns None for a missing element; skip incomplete postings
        # explicitly rather than hiding every possible error behind a bare except.
        if title_tag is None or company_tag is None or location_tag is None:
            continue

        job_title = title_tag.text.strip()
        company = company_tag.text.strip()
        job_id = result.get('id')
        location = location_tag.text

        # All five lists are appended together so they stay aligned by position
        job_title_list.append(job_title)
        job_title_index.append(job_index)
        company_list.append(company)
        location_list.append(location)
        job_id_list.append(job_id)

https://www.indeed.com/jobs?q=data+analyst&start=0
https://www.indeed.com/jobs?q=data+analyst&start=10
https://www.indeed.com/jobs?q=data+analyst&start=20
https://www.indeed.com/jobs?q=data+analyst&start=30
https://www.indeed.com/jobs?q=data+analyst&start=40
https://www.indeed.com/jobs?q=data+analyst&start=50
https://www.indeed.com/jobs?q=data+analyst&start=60
https://www.indeed.com/jobs?q=data+analyst&start=70
https://www.indeed.com/jobs?q=data+analyst&start=80
https://www.indeed.com/jobs?q=data+analyst&start=90
https://www.indeed.com/jobs?q=data+analyst&start=100
https://www.indeed.com/jobs?q=data+analyst&start=110
https://www.indeed.com/jobs?q=data+analyst&start=120
https://www.indeed.com/jobs?q=data+analyst&start=130
https://www.indeed.com/jobs?q=data+analyst&start=140
https://www.indeed.com/jobs?q=data+analyst&start=150
https://www.indeed.com/jobs?q=data+analyst&start=160
https://www.indeed.com/jobs?q=data+analyst&start=170
https://www.indeed.com/jobs?q=data+analyst&start=180
http

In [5]:
# Count the distinct job IDs gathered in job_id_list so far
unique = {posting_id for posting_id in job_id_list}
len(unique)

681

## Data Scientist (index = 2)

In [None]:
# Search query for Data Scientist roles (the `l` parameter is left empty)
url = "https://www.indeed.com/jobs?q=data+scientist&l="

# Retrieve page with the requests module
response = requests.get(url)

# Create BeautifulSoup object; parse with 'lxml'
soup = BeautifulSoup(response.text, 'lxml')

# Collect the text of the element reporting the total number of job postings.
# find() returns None when the element is absent, so guard before reading
# .text instead of failing with an opaque AttributeError.
count_tag = soup.find(id='searchCountPages')
if count_tag is None:
    results = None
    print(f"No 'searchCountPages' element found (HTTP {response.status_code})")
else:
    results = count_tag.text.strip()
    print(results)

In [None]:
# Scrape Data Scientist postings up to the page limit of 70 pages.
# The `start` query parameter is stepped by 10 per page, so offsets
# 0, 10, ..., 690 give exactly 70 requests (range(0, 710, 10) made 71).
job_index = 2  # job title index assigned to every Data Scientist posting

for start in range(0, 700, 10):
    url = f"https://www.indeed.com/jobs?q=data+scientist&start={start}"
    print(url)
    # Pause before each request
    time.sleep(5)

    # Retrieve page with the requests module
    response = requests.get(url)
    if not response.ok:
        # Report failed pages instead of silently parsing an error response
        print(f"Skipping {url}: HTTP {response.status_code}")
        continue

    # Create BeautifulSoup object; parse with 'lxml'
    soup = BeautifulSoup(response.text, 'lxml')

    # Retrieve the parent divs for all postings
    results = soup.find_all('div', class_='result')

    # Loop over results to get job posting data
    for result in results:
        title_tag = result.find('a', class_='jobtitle')
        company_tag = result.find('span', class_='company')
        location_tag = result.find(class_='location')

        # find() returns None for a missing element; skip incomplete postings
        # explicitly rather than hiding every possible error behind a bare except.
        if title_tag is None or company_tag is None or location_tag is None:
            continue

        job_title = title_tag.text.strip()
        company = company_tag.text.strip()
        job_id = result.get('id')
        location = location_tag.text

        # All five lists are appended together so they stay aligned by position
        job_title_list.append(job_title)
        job_title_index.append(job_index)
        company_list.append(company)
        location_list.append(location)
        job_id_list.append(job_id)

In [None]:
# Count the distinct job IDs gathered in job_id_list so far
# (the list is shared by every role scraped up to this point)
unique = set()
unique.update(job_id_list)
len(unique)

## Data Engineer (index = 3)

In [None]:
# Search query for Data Engineer roles (the `l` parameter is left empty)
url = "https://www.indeed.com/jobs?q=data+engineer&l="

# Retrieve page with the requests module
response = requests.get(url)

# Create BeautifulSoup object; parse with 'lxml'
soup = BeautifulSoup(response.text, 'lxml')

# Collect the text of the element reporting the total number of job postings.
# find() returns None when the element is absent, so guard before reading
# .text instead of failing with an opaque AttributeError.
count_tag = soup.find(id='searchCountPages')
if count_tag is None:
    results = None
    print(f"No 'searchCountPages' element found (HTTP {response.status_code})")
else:
    results = count_tag.text.strip()
    print(results)

In [None]:
# Scrape Data Engineer postings up to the page limit of 70 pages.
# The `start` query parameter is stepped by 10 per page, so offsets
# 0, 10, ..., 690 give exactly 70 requests (range(0, 710, 10) made 71).
job_index = 3  # job title index assigned to every Data Engineer posting

for start in range(0, 700, 10):
    url = f"https://www.indeed.com/jobs?q=data+engineer&start={start}"
    print(url)
    # Pause before each request
    time.sleep(5)

    # Retrieve page with the requests module
    response = requests.get(url)
    if not response.ok:
        # Report failed pages instead of silently parsing an error response
        print(f"Skipping {url}: HTTP {response.status_code}")
        continue

    # Create BeautifulSoup object; parse with 'lxml'
    soup = BeautifulSoup(response.text, 'lxml')

    # Retrieve the parent divs for all postings
    results = soup.find_all('div', class_='result')

    # Loop over results to get job posting data
    for result in results:
        title_tag = result.find('a', class_='jobtitle')
        company_tag = result.find('span', class_='company')
        location_tag = result.find(class_='location')

        # find() returns None for a missing element; skip incomplete postings
        # explicitly rather than hiding every possible error behind a bare except.
        if title_tag is None or company_tag is None or location_tag is None:
            continue

        job_title = title_tag.text.strip()
        company = company_tag.text.strip()
        job_id = result.get('id')
        location = location_tag.text

        # All five lists are appended together so they stay aligned by position
        job_title_list.append(job_title)
        job_title_index.append(job_index)
        company_list.append(company)
        location_list.append(location)
        job_id_list.append(job_id)

In [None]:
# Count the distinct job IDs gathered in job_id_list so far
# (the list is shared by every role scraped up to this point)
unique = {*job_id_list}
len(unique)

## Machine Learning (index = 4)

In [None]:
# Search query for Machine Learning roles (the `l` parameter is left empty)
url = "https://www.indeed.com/jobs?q=machine+learning&l="

# Retrieve page with the requests module
response = requests.get(url)

# Create BeautifulSoup object; parse with 'lxml'
soup = BeautifulSoup(response.text, 'lxml')

# Collect the text of the element reporting the total number of job postings.
# find() returns None when the element is absent, so guard before reading
# .text instead of failing with an opaque AttributeError.
count_tag = soup.find(id='searchCountPages')
if count_tag is None:
    results = None
    print(f"No 'searchCountPages' element found (HTTP {response.status_code})")
else:
    results = count_tag.text.strip()
    print(results)

In [None]:
# Scrape Machine Learning postings up to the page limit of 70 pages.
# The `start` query parameter is stepped by 10 per page, so offsets
# 0, 10, ..., 690 give exactly 70 requests (range(0, 710, 10) made 71).
job_index = 4  # job title index assigned to every Machine Learning posting

for start in range(0, 700, 10):
    url = f"https://www.indeed.com/jobs?q=machine+learning&start={start}"
    print(url)
    # Pause before each request; this cell previously had no delay, unlike
    # the Data Analyst, Data Scientist and Data Engineer scrapes.
    time.sleep(5)

    # Retrieve page with the requests module
    response = requests.get(url)
    if not response.ok:
        # Report failed pages instead of silently parsing an error response
        print(f"Skipping {url}: HTTP {response.status_code}")
        continue

    # Create BeautifulSoup object; parse with 'lxml'
    soup = BeautifulSoup(response.text, 'lxml')

    # Retrieve the parent divs for all postings
    results = soup.find_all('div', class_='result')

    # Loop over results to get job posting data
    for result in results:
        title_tag = result.find('a', class_='jobtitle')
        company_tag = result.find('span', class_='company')
        location_tag = result.find(class_='location')

        # find() returns None for a missing element; skip incomplete postings
        # explicitly rather than hiding every possible error behind a bare except.
        if title_tag is None or company_tag is None or location_tag is None:
            continue

        job_title = title_tag.text.strip()
        company = company_tag.text.strip()
        job_id = result.get('id')
        location = location_tag.text

        # All five lists are appended together so they stay aligned by position
        job_title_list.append(job_title)
        job_title_index.append(job_index)
        company_list.append(company)
        location_list.append(location)
        job_id_list.append(job_id)

In [None]:
# Count the distinct job IDs gathered in job_id_list so far
# (the list is shared by every role scraped up to this point)
unique = set().union(job_id_list)
len(unique)

## Data Compilation into one table 

In [None]:
# Gather the scraped lists into one mapping of column name -> values,
# in the column order wanted for the final table.
US_jobmarket = dict(
    [
        ("Job ID", job_id_list),
        ("Job Title Index", job_title_index),
        ("Job Title", job_title_list),
        ("Company Name", company_list),
        ("Company Location", location_list),
    ]
)

In [None]:
# Build the table: each key of US_jobmarket becomes a column
US_jobmarket_df = pd.DataFrame(data=US_jobmarket)

In [None]:
# Preview the first five rows of the compiled table
US_jobmarket_df.head(n=5)

In [None]:
# Write the compiled table to a CSV file (path is relative to the notebook)
US_jobmarket_df.to_csv(path_or_buf="../Clean Data/US-JobMarket.csv")