### Retrieving Web Pages

In [1]:
import urllib.request, urllib.parse, urllib.error

In [2]:
# Open the remote text file; the returned response object is iterated line by line in the next cell.
fhand = urllib.request.urlopen('http://data.pr4e.org/romeo.txt')

In [3]:
# Decode each line read from the response and trim surrounding whitespace before printing it.
for raw_line in fhand:
    decoded = raw_line.decode()
    print(decoded.strip())

But soft what light through yonder window breaks
It is the east and Juliet is the sun
Arise fair sun and kill the envious moon
Who is already sick and pale with grief


### Parsing Web Pages

In [4]:
from bs4 import BeautifulSoup

In [5]:
# Level-1 page: a LinkedIn job-search URL carrying geoId and location query parameters.
url = 'https://www.linkedin.com/jobs/search/?geoId=90000070&location=New%20York%20City%20Metropolitan%20Area'

In [6]:
# Download the full body of the page at `url`.
html = urllib.request.urlopen(url).read()

In [7]:
# Parse the downloaded page using the 'html.parser' backend.
soup = BeautifulSoup(html, 'html.parser')

In [8]:
# Gather every <a> element on the page, then show each one's href
# (None is printed for anchors that have no href attribute).
tags = soup.find_all('a')
for anchor in tags:
    print(anchor.get('href'))

/?trk=guest_job_search_nav-header-logo
https://www.linkedin.com/start/join?source=jobs_registration&session_redirect=https%3A%2F%2Fwww.linkedin.com%2Fjobs%2Fsearch%2F%3FgeoId%3D90000070%26location%3DNew%2520York%2520City%2520Metropolitan%2520Area&trk=guest_job_search_nav-header-join
https://www.linkedin.com/uas/login?emailAddress=&fromSignIn=true&session_redirect=https%3A%2F%2Fwww.linkedin.com%2Fjobs%2Fsearch%2F%3FgeoId%3D90000070%26location%3DNew%2520York%2520City%2520Metropolitan%2520Area&trk=guest_job_search_nav-header-signin
https://www.linkedin.com/uas/login?session_redirect=https%3A%2F%2Fwww%2Elinkedin%2Ecom%2Fjobs%2Fsearch%2F%3FgeoId%3D90000070%26location%3DNew%2520York%2520City%2520Metropolitan%2520Area&amp;emailAddress=&amp;fromSignIn=&trk=guest_job_search_sign-in-hidden-module
https://www.linkedin.com/jobs/view/coordinator-content-distribution-at-mgm-1704097851?refId=0bf9743b-afd5-405c-a93d-b4b41de9718a&position=1&pageNum=0&trk=guest_job_search_job-result-card_result-card_ful

## An example walkthrough

### Defining selection criteria

### 1. In the level 1 page

**Selecting the post link**

In [9]:
import re

# Keep only the hrefs that point at an individual job post.
# Anchors without an href give None from .get(), and re.search(pattern, None)
# raises TypeError, so those anchors are skipped explicitly.
Link = []
for tag in tags:
    href = tag.get('href')
    if href is not None and re.search('https://www.linkedin.com/jobs/view/', href):
        Link.append(href)

**Selecting the Company Name**

In [10]:
# All elements on the results page carrying the class value below; their text is the company name.
company_list = soup.find_all(class_='result-card__subtitle-link job-result-card__subtitle-link')

In [11]:
# Text content of each company element, in page order.
Company = [node.text for node in company_list]

**Selecting the Title Name**

In [12]:
# All elements on the results page carrying the class value below; their text is the job title.
# NOTE: the variable name is misspelled ("tilte"); it is kept because the following cell reads it.
tilte_list = soup.find_all(class_='result-card__title job-result-card__title')

In [13]:
# Text content of each title element, in page order.
Title = [node.text for node in tilte_list]

**Selecting the Location**

In [14]:
# All elements on the results page with the location class; their text is the job location.
location_list = soup.find_all(class_='job-result-card__location')

In [15]:
# Text content of each location element, in page order.
Location = [node.text for node in location_list]

### 2. In the level 2 page -- Detailed post information

In [16]:
# Use the first collected job-post link as the level-2 example (raises IndexError if Link is empty).
url = Link[0]

In [17]:
# Download the job-post page; this rebinds `html`, replacing the search-results page.
html = urllib.request.urlopen(url).read()

In [18]:
# Re-parse: from here on `soup` refers to the job-post page, not the search results.
soup = BeautifulSoup(html, 'html.parser')

In [19]:
# Company name, selected by class. If find() returns None (no such element), .text raises AttributeError.
company_name = soup.find(class_='topcard__org-name-link topcard__flavor--black-link').text

In [20]:
# Job title, selected by class. If find() returns None (no such element), .text raises AttributeError.
title = soup.find(class_='topcard__title').text

In [21]:
# The location fields appear in the page text as "key":"value" pairs; keep the first hit of each.
# Indexing with [0] raises IndexError when a key does not occur in the page.
page_text = soup.text
city = re.findall('"addressLocality":"(.+?)"', page_text)[0]
state = re.findall('"addressRegion":"(.+?)"', page_text)[0]
country = re.findall('"addressCountry":"(.+?)"', page_text)[0]

In [22]:
# Experience level: first value of the "experienceRequirements" key in the page text (IndexError if absent).
level = re.findall('"experienceRequirements":"(.+?)"',soup.text)[0]

In [23]:
# Industry: every "industry" match is kept, so this is a list (empty when the key does not occur).
industry = re.findall('"industry":"(.+?)"',soup.text)

### Put them together

In [24]:
def getlink(url):
    """Return the job-post links found on one search-results page.

    Parameters
    ----------
    url : str
        Address of the page to download and scan.

    Returns
    -------
    list of str
        Every anchor ``href`` on the page that matches the
        ``https://www.linkedin.com/jobs/view/`` pattern, in document order.

    Raises
    ------
    urllib.error.URLError
        Propagated from ``urlopen`` when the page cannot be fetched.
    """
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(url) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')

    Link = []
    for tag in soup('a'):
        href = tag.get('href')
        # Anchors without an href give None, which re.search cannot accept.
        if href is not None and re.search('https://www.linkedin.com/jobs/view/', href):
            Link.append(href)
    return Link

In [25]:
def _first_match(pattern, text, default='Not Available'):
    """Return the first captured group of ``pattern`` in ``text``, or ``default`` if there is no match."""
    found = re.search(pattern, text)
    return found.group(1) if found else default


def _text_or_default(element, default='Not Available'):
    """Return ``element.text``, or ``default`` when the element lookup returned None."""
    return default if element is None else element.text


def getjd(url):
    """Scrape one job-post page and return its fields as a single row.

    Parameters
    ----------
    url : str
        Address of the job-post page.

    Returns
    -------
    list
        ``[company_name, title, city, state, country, level, industry, url]``.
        Any field that cannot be found is the string ``'Not Available'``.
        ``city``, ``state``, ``country`` and ``level`` are the first matching
        value; ``industry`` is the list of all matches when at least one exists.

    Raises
    ------
    urllib.error.URLError
        Propagated from ``urlopen`` when the page cannot be fetched.
    """
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(url) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')

    company_name = _text_or_default(
        soup.find(class_='topcard__org-name-link topcard__flavor--black-link'))

    # find() gives None when the element is missing. The earlier
    # `len(title) is None` check could never be true and raised TypeError on None.
    title = _text_or_default(soup.find(class_='topcard__title'))

    # Extract the page text once; every regex below scans the same string.
    page_text = soup.text

    # Always take the first hit so a page with several matches still yields a
    # plain string rather than leaking a list into the row.
    city = _first_match('"addressLocality":"(.+?)"', page_text)
    state = _first_match('"addressRegion":"(.+?)"', page_text)
    country = _first_match('"addressCountry":"(.+?)"', page_text)
    level = _first_match('"experienceRequirements":"(.+?)"', page_text)

    # Industry keeps every match (a list); only the empty case is replaced.
    industry = re.findall('"industry":"(.+?)"', page_text) or 'Not Available'

    return [company_name, title, city, state, country, level, industry, url]

In [26]:
import time

# Request 40 search-result pages, advancing the `start` offset by 25 each time,
# and pool every job-post link that getlink() finds.
linklist = []
for i in range(40):
    value = i*25
    url = 'https://www.linkedin.com/jobs/search/?start='+str(value)
    try:
        urllist = getlink(url)
    except urllib.error.URLError as err:
        # One failed request must not throw away the links already collected.
        print('Skipping', url, '-', err)
        urllist = []
    # extend() appends in place instead of copying the whole list every iteration.
    linklist.extend(urllist)
    # Pause between requests to avoid hammering the server.
    time.sleep(2)

In [29]:
import csv

# newline='' is what the csv module expects of its file object (otherwise extra
# blank rows can appear on some platforms); utf-8 keeps non-ASCII text writable.
with open('LinkedIn Job posts.csv', 'a', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    # The file is opened for appending, so position 0 means it is new or empty.
    # Writing the header only then avoids a duplicate header row on every re-run.
    if f.tell() == 0:
        writer.writerow(["Company","Title","City","State","Country","Level","Industry","URL"])
    for link in linklist:
        try:
            result = getjd(link)
        except urllib.error.URLError as err:
            # Skip a post that cannot be fetched rather than aborting the whole export.
            print('Skipping', link, '-', err)
        else:
            writer.writerow(result)
        # Pause between requests to avoid hammering the server.
        time.sleep(10)