In [1]:
#Importing the required libraries
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlopen 
from requests import get
import numpy as np

In [2]:
#libraries imported for handling rate limits and console formatting
from time import time, sleep
from random import randint
from warnings import warn
# clear_output is imported from its public location, IPython.display;
# importing it from IPython.core.display is deprecated and emits a warning.
from IPython.display import clear_output

  from IPython.core.display import clear_output


In [3]:
# Accumulators for the scraped fields: seven independent, initially empty
# lists, one per column of the final table.
(job_title, company_name, job_location, salary,
 job_description, date_posted, job_rating) = ([] for _ in range(7))

In [4]:
# Monitoring state for the scraping loop: the number of requests sent so far
# and the wall-clock time at which monitoring started.
request = 0
start_time = time()

In [5]:
# Values substituted into the search URL's query string. Multi-word values
# use '+' between words so they can be concatenated into the URL as-is.
job_type = [
    'Data+Analyst',
    'Data+Scientist',
    'Database+Administrator',
    'Machine+Learning+Engineer',
    'Data+Engineer',
]
the_state = [
    'Virginia',
    'New+York',
    'California',
    'Texas',
    'Washington+State',
]

In [6]:
# Scrape one results page for every (job, state) combination and append one
# entry per job card to each of the seven field lists.

# Safety cap on the total number of page requests for the whole scrape.
MAX_REQUESTS = 100


def _card_text(card, tag, css_class, default='missing'):
    """Return the text of the first element in `card` matching tag and class.

    `tag` is a tag name or a list of tag names, `css_class` is the CSS class
    to match. When nothing matches, `default` is returned instead of raising,
    so absent elements are recorded with a placeholder.
    """
    element = card.find(tag, class_=css_class)
    return default if element is None else element.text


def _parse_card(card):
    """Extract the seven fields of one job card.

    Returns a tuple (title, company, location, salary, description, date,
    rating). Any field whose element is absent is reported as 'missing'.
    All fields are read before anything is stored, so the caller can append
    them together and keep the seven lists the same length.
    """
    title = _card_text(card, 'h2', 'jobTitle')
    # NOTE(review): the leading three characters are presumably the "new"
    # badge label rendered inside the heading - confirm against live markup.
    # Strip it only when present so other titles are not truncated.
    if title.startswith('new'):
        title = title[3:]

    return (
        title,
        _card_text(card, 'span', 'companyName'),
        _card_text(card, ['div', 'span'], 'companyLocation'),
        _card_text(card, 'span', 'salary-snippet'),
        _card_text(card, 'div', 'job-snippet'),
        _card_text(card, 'span', 'date'),
        _card_text(card, 'span', 'ratingNumber'),
    )


# Destination lists, in the same order as the tuple from _parse_card.
_field_lists = (job_title, company_name, job_location, salary,
                job_description, date_posted, job_rating)

# The job/state nesting is flattened into a single loop so that `break`
# stops the whole scrape, not just the states of the current job.
for job, state in ((j, s) for j in job_type for s in the_state):

    # Make a get request
    response = get('https://www.indeed.com/jobs?q='+ str(job) + '&l=' + str(state) + '&sort=date')

    # Pause the loop for a random 8-15 seconds between requests
    sleep(randint(8,15))

    # Monitor the requests
    request += 1
    elapsed_time = time() - start_time
    print('Request:{}; Frequency: {} requests/s'.format(request, request/elapsed_time))
    clear_output(wait = True)

    # Throw a warning for non-200 status codes
    page_ok = response.status_code == 200
    if not page_ok:
        warn('Request: {}; Status code: {}'.format(request, response.status_code))

    # Stop scraping entirely if the number of requests is greater than expected
    if request > MAX_REQUESTS:
        warn('Number of requests was greater than expected.')
        break

    # A failed request carries no job cards; skip parsing it
    if not page_ok:
        continue

    # Parse the content of the request with BeautifulSoup
    bs = BeautifulSoup(response.text, 'html.parser')

    # Select all the jobs on the page
    job_containers = bs.find_all('div', class_='job_seen_beacon')

    for container in job_containers:
        # Read every field first, then append them together
        for field_list, value in zip(_field_lists, _parse_card(container)):
            field_list.append(value)

Request:25; Frequency: 0.08266287420122864 requests/s


In [7]:
# Assemble the scraped lists into a single table, one column per field
column_data = {
    'Job': job_title,
    'Company': company_name,
    'Location': job_location,
    'Salary': salary,
    'Job Description': job_description,
    'Date Posted': date_posted,
    'Job Rating': job_rating,
}
job_df = pd.DataFrame(column_data)

In [8]:
# Remove newline characters from every string value in the table
job_df = job_df.replace(to_replace='\n', value='', regex=True)

# Preview the first rows of the result
job_df.head()

Unnamed: 0,Job,Company,Location,Salary,Job Description,Date Posted,Job Rating
0,Business Intelligence Analyst/Training Coordin...,Peraton,"McLean, VA 22102",missing,Employs data warehouse analysis and design exp...,PostedJust posted,3.2
1,Data Analyst,Chenega Corporation,"Alexandria, VA 22202 (Aurora Highlands area)",missing,Perform data entry and metadata entry for elec...,PostedToday,3.6
2,Data Processing Analyst I,"Arrow Electronics, Inc.","Remote in Shenandoah, VA 22849",missing,Identifies and resolves key issues and pattern...,PostedToday,3.5
3,Business Analyst III,Fairfax County Government,"Fairfax, VA",missing,At least 5 years of experience in providing ap...,PostedToday,4.0
4,Financial Analyst 2-5803,Huntington Ingalls Industries Inc.,"Virginia Beach, VA 23462 (Northwest area)",missing,Strong computer skills to quickly decompose an...,PostedToday,3.7


In [9]:
# Split each location at its first comma into two parts and join them back
# onto job_df as 'Location0' and 'Location1'.
# n=1 keeps the number of new columns fixed at two at most, however many
# commas a location contains; the remainder after the first comma stays
# together in 'Location1'. Each part is stripped so the second one does not
# keep the space that follows the comma. Locations without a comma get NaN
# in 'Location1'.
location_parts = (
    job_df['Location']
    .str.split(',', n=1, expand=True)
    .apply(lambda part: part.str.strip())
    .add_prefix('Location')
    .fillna(np.nan)
)
job_df = job_df.join(location_parts)
job_df.head()

Unnamed: 0,Job,Company,Location,Salary,Job Description,Date Posted,Job Rating,Location0,Location1
0,Business Intelligence Analyst/Training Coordin...,Peraton,"McLean, VA 22102",missing,Employs data warehouse analysis and design exp...,PostedJust posted,3.2,McLean,VA 22102
1,Data Analyst,Chenega Corporation,"Alexandria, VA 22202 (Aurora Highlands area)",missing,Perform data entry and metadata entry for elec...,PostedToday,3.6,Alexandria,VA 22202 (Aurora Highlands area)
2,Data Processing Analyst I,"Arrow Electronics, Inc.","Remote in Shenandoah, VA 22849",missing,Identifies and resolves key issues and pattern...,PostedToday,3.5,Remote in Shenandoah,VA 22849
3,Business Analyst III,Fairfax County Government,"Fairfax, VA",missing,At least 5 years of experience in providing ap...,PostedToday,4.0,Fairfax,VA
4,Financial Analyst 2-5803,Huntington Ingalls Industries Inc.,"Virginia Beach, VA 23462 (Northwest area)",missing,Strong computer skills to quickly decompose an...,PostedToday,3.7,Virginia Beach,VA 23462 (Northwest area)


In [10]:
# Give the two split-out location columns their final names in one rename
job_df = job_df.rename(columns={'Location0': 'City', 'Location1': 'State/Zip'})
job_df

Unnamed: 0,Job,Company,Location,Salary,Job Description,Date Posted,Job Rating,City,State/Zip
0,Business Intelligence Analyst/Training Coordin...,Peraton,"McLean, VA 22102",missing,Employs data warehouse analysis and design exp...,PostedJust posted,3.2,McLean,VA 22102
1,Data Analyst,Chenega Corporation,"Alexandria, VA 22202 (Aurora Highlands area)",missing,Perform data entry and metadata entry for elec...,PostedToday,3.6,Alexandria,VA 22202 (Aurora Highlands area)
2,Data Processing Analyst I,"Arrow Electronics, Inc.","Remote in Shenandoah, VA 22849",missing,Identifies and resolves key issues and pattern...,PostedToday,3.5,Remote in Shenandoah,VA 22849
3,Business Analyst III,Fairfax County Government,"Fairfax, VA",missing,At least 5 years of experience in providing ap...,PostedToday,4.0,Fairfax,VA
4,Financial Analyst 2-5803,Huntington Ingalls Industries Inc.,"Virginia Beach, VA 23462 (Northwest area)",missing,Strong computer skills to quickly decompose an...,PostedToday,3.7,Virginia Beach,VA 23462 (Northwest area)
...,...,...,...,...,...,...,...,...,...
370,Data Governance Engineer (Remote),CSAA Insurance Group,"Remote in Seattle, WA",missing,Helps in ensuring that Data Governance policie...,Posted1 day ago,3.4,Remote in Seattle,WA
371,"Software Dev Engineer II, Advertising Data Man...",Amazon.com Services LLC,"Seattle, WA+28 locations",missing,Mentor and assist in the career development of...,PostedToday,3.5,Seattle,WA+28 locations
372,Data Analyst/Engineer,"Naylor, LLC",United States,missing,Experience building and optimizing ‘big data’ ...,Posted1 day ago,3.2,United States,
373,Sr. Data Engineer,DISYS,United States,missing,Integrate data from multiple sources to produc...,Posted1 day ago,3.4,United States,


In [11]:
# Number of listings per distinct value of the 'City' column
get_percentage = job_df['City'].value_counts()
# Share of each city: its count over the total count, scaled to a percentage
percentage = get_percentage.div(get_percentage.sum()).mul(100)
percentage

New York                          9.866667
Seattle                           8.266667
Dallas                            4.800000
McLean                            4.000000
Austin                            3.200000
                                    ...   
Richmond                          0.266667
Fredericksburg                    0.266667
Herndon                           0.266667
Remote in Virginia+4 locations    0.266667
Remote in Vancouver               0.266667
Name: City, Length: 143, dtype: float64

In [12]:
# Write the cleaned job listings to 'job.csv' (path relative to the working directory)
job_df.to_csv(path_or_buf='job.csv')