In [2]:
#importing libraries
import pandas as pd
from bs4 import BeautifulSoup
from requests import get

from time import time, sleep
from random import randint
from warnings import warn
from IPython.core.display import clear_output

In [5]:
# Search terms substituted into the Indeed query URL (spaces already written as '+').
jobs = [
    "data+analyst",
    "data+scientist",
    "database+administrator",
    "machine+learning+engineer",
    "data+engineer",
]
states = [
    "Virginia",
    "New+York",
    "California",
    "Texas",
    "Washington+State",
]

In [105]:
# Accumulators for the scraped fields: one independent list per output column.
titles, company_names, locations = [], [], []
salaries, descriptions = [], []
dates, ratings = [], []

# Progress tracking for the scraping loop: request counter and wall-clock start.
request = 0
start_time = time()

In [106]:
#scrape one Indeed results page per (job, state) pair and append every job card's
#fields to the module-level lists (titles, company_names, locations, salaries,
#descriptions, dates, ratings).
#
#All fields are read card by card so that the seven lists always grow by exactly
#one entry per posting; a field that is absent from a card is stored as 'Missing'.
#(Collecting each field with its own page-wide find_all lets one missing company
#name or date shift every later row of that column.)
#NOTE(review): assumes every field sits inside the 'jobsearch-SerpJobCard' div,
#which the salary and rating lookups already relied on - confirm against live HTML.

#for every job
for job in jobs:
    #for every state
    for state in states:

        #make a get request; the timeout stops a stalled connection from hanging the loop
        response = get('https://www.indeed.com/jobs?q=' + job + '&l=' + state + '&sort=date',
                       timeout=30)

        #pause loop (random 7-14 s) to throttle the request rate
        sleep(randint(7,14))

        #monitor requests
        request += 1
        elapsed_time = time() - start_time
        print('Request:{}; Frequency: {} requests/s'.format(request, request/elapsed_time))

        #throw warning and skip the page: a failed response has no job cards worth parsing
        if response.status_code != 200:
            warn('Request: {}; Status code: {}'.format(request, response.status_code))
            continue

        #parse content with BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        #one iteration per job posting on the page
        for card in soup.find_all('div', attrs={'class': 'jobsearch-SerpJobCard'}):

            #job title (stored in the link's title attribute)
            title_a = card.find(name='a', attrs={'data-tn-element': 'jobTitle'})
            if title_a is not None:
                titles.append(title_a.get('title', 'Missing'))
            else:
                titles.append('Missing')

            #company name
            company_span = card.find('span', attrs={'class': 'company'})
            if company_span is not None:
                company_names.append(company_span.text.strip())
            else:
                company_names.append('Missing')

            #location (stored in the data-rc-loc attribute)
            location_div = card.find('div', attrs={'class': 'recJobLoc'})
            if location_div is not None:
                locations.append(location_div.get('data-rc-loc', 'Missing'))
            else:
                locations.append('Missing')

            #salary; .text is used instead of .string because .string is None
            #whenever the span contains nested tags
            salary_span = card.find('span', attrs={'class': 'salaryText'})
            if salary_span is not None:
                salaries.append(salary_span.text.strip())
            else:
                salaries.append('Missing')

            #job description
            descr_div = card.find('div', attrs={'class': 'summary'})
            if descr_div is not None:
                descriptions.append(descr_div.text)
            else:
                descriptions.append('Missing')

            #date of posting
            date_span = card.find('span', attrs={'class': 'date'})
            if date_span is not None:
                dates.append(date_span.text.strip())
            else:
                dates.append('Missing')

            #company rating; a decimal comma is normalised to a point before parsing
            rating_span = card.find('span', attrs={'class':  'ratingsContent'})
            rating_value = 'Missing'
            if rating_span is not None:
                try:
                    rating_value = float(rating_span.text.strip().replace(',', '.'))
                except ValueError:
                    #unparseable rating text is treated like an absent rating
                    rating_value = 'Missing'
            ratings.append(rating_value)

Request:1; Frequency: 0.12378764266827266 requests/s
Request:2; Frequency: 0.10043627105813545 requests/s
Request:3; Frequency: 0.08693692685645958 requests/s
Request:4; Frequency: 0.09034308795828623 requests/s
Request:5; Frequency: 0.09476671123150818 requests/s
Request:6; Frequency: 0.09311048104242585 requests/s
Request:7; Frequency: 0.08873187257452665 requests/s
Request:8; Frequency: 0.09154054598019234 requests/s
Request:9; Frequency: 0.09186104321365793 requests/s
Request:10; Frequency: 0.08778272680109722 requests/s
Request:11; Frequency: 0.09059524352225178 requests/s
Request:12; Frequency: 0.08821418628534802 requests/s
Request:13; Frequency: 0.08934602785851124 requests/s
Request:14; Frequency: 0.09153493848272566 requests/s
Request:15; Frequency: 0.09292415530277583 requests/s
Request:16; Frequency: 0.09460583553107071 requests/s
Request:17; Frequency: 0.09361086570895671 requests/s
Request:18; Frequency: 0.09364590722821665 requests/s
Request:19; Frequency: 0.095128063393

In [124]:
# Assemble the scraped lists into a single table: each list becomes one column,
# in the order given below.
indeed_dataset = pd.DataFrame(dict(zip(
    ['Job Title', 'Company Name', 'Location', 'Salary',
     'Job Description', 'Date of Posting', 'Rating of the Company'],
    [titles, company_names, locations, salaries,
     descriptions, dates, ratings],
)))

In [125]:
# Remove newline characters from string values, then preview the first rows.
indeed_dataset = indeed_dataset.replace(to_replace='\n', value='', regex=True)
indeed_dataset.head()

Unnamed: 0,Job Title,Company Name,Location,Salary,Job Description,Date of Posting,Rating of the Company
0,"Associate Data Analyst, Advancement",EAB,"Richmond, VA",Missing,Utilize data visualization tools to create rep...,Just posted,3.8
1,Sr. Business Analyst - Customer Protection,Capital One - US,"Glen Allen, VA",Missing,The Retail and Direct Bank (RDB) Bank Customer...,Just posted,3.9
2,Senior Business Analyst - Bank Customer Protec...,Capital One - US,"Glen Allen, VA",Missing,The Retail and Direct Bank (RDB) Bank Customer...,Just posted,3.9
3,PROGRAM ANALYST,US Department of the Air Force,"Langley AFB, VA","$92,588 - $120,365 a year","Ability to plan, organize, and lead others in ...",Just posted,4.4
4,Analyst - Data Practice,Enterprise Knowledge (EK),"Arlington, VA",Missing,Enterprise Knowledge (EK) is seeking a full-ti...,Today,Missing


In [134]:
#splitting job location column into 'City' and 'State' at the first comma
#the guard makes the cell safe to re-run after 'Location' has already been dropped
if 'Location' in indeed_dataset.columns:
    #n must be passed by keyword: pandas 2.x rejects it as a positional argument
    location_parts = indeed_dataset['Location'].str.split(',', n=1, expand=True)

    #the split yields fewer than two columns when no location contains a comma,
    #so each target column is filled individually instead of via a 2-column assignment
    for position, column in enumerate(['City', 'State']):
        if position in location_parts.columns:
            #strip removes the space that follows the comma (e.g. ' VA' -> 'VA')
            indeed_dataset[column] = location_parts[position].str.strip()
        else:
            indeed_dataset[column] = None

    indeed_dataset = indeed_dataset.drop(columns='Location')

indeed_dataset.head()

Unnamed: 0,Job Title,Company Name,Salary,Job Description,Date of Posting,Rating of the Company,City,State
0,"Associate Data Analyst, Advancement",EAB,Missing,Utilize data visualization tools to create rep...,Just posted,3.8,Richmond,VA
1,Sr. Business Analyst - Customer Protection,Capital One - US,Missing,The Retail and Direct Bank (RDB) Bank Customer...,Just posted,3.9,Glen Allen,VA
2,Senior Business Analyst - Bank Customer Protec...,Capital One - US,Missing,The Retail and Direct Bank (RDB) Bank Customer...,Just posted,3.9,Glen Allen,VA
3,PROGRAM ANALYST,US Department of the Air Force,"$92,588 - $120,365 a year","Ability to plan, organize, and lead others in ...",Just posted,4.4,Langley AFB,VA
4,Analyst - Data Practice,Enterprise Knowledge (EK),Missing,Enterprise Knowledge (EK) is seeking a full-ti...,Today,Missing,Arlington,VA


In [157]:
#calculating and printing percentage of jobs in each city
cities = indeed_dataset['City']

#share of postings per city, expressed in percent
city_percentages = cities.value_counts(normalize = True)*100

#lift the row-display limit only for this print so that every city is listed
with pd.option_context('display.max_rows', None):
    print(city_percentages)

New York                            13.066667
Seattle                             12.533333
San Francisco                        5.066667
Austin                               4.533333
Arlington                            4.000000
McLean                               3.200000
Dallas                               3.200000
Reston                               2.400000
Plano                                2.400000
Houston                              2.133333
San Antonio                          2.133333
California                           1.600000
Chantilly                            1.600000
Washington State                     1.600000
Carlsbad                             1.333333
Mountain View                        1.333333
Springfield                          1.066667
Buffalo                              1.066667
San Diego                            1.066667
Los Angeles                          1.066667
Bellevue                             1.066667
Texas                             

In [158]:
# Write the dataset to a CSV file in the current working directory.
indeed_dataset.to_csv(path_or_buf='Chang_Calvin_INST447_PA2.csv')