# Scraping LinkedIn Jobs list

## Description:
In this project we scrape a number of job postings from LinkedIn that contain the keyword 'Python' in the USA.
We are using both Selenium and Beautiful Soup.

In [1]:
# Import our libraries
import time
from urllib.parse import quote_plus

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver

## Accessing the website

In [2]:
keyword = 'python' # We can change it to anything eg. mechanical engineer
# URL-encode the keyword so that multi-word searches (spaces, '&', '+', ...)
# still produce a valid query string; single plain words are left unchanged.
url = f'https://www.linkedin.com/jobs/search?keywords={quote_plus(keyword)}'
# NOTE(review): the chromedriver path below is machine-specific, and newer
# Selenium releases may expect a Service object instead of `executable_path`
# -- confirm against the installed Selenium version.
driver = webdriver.Chrome(executable_path='/usr/local/bin/chromedriver')
# Open the job-search results page in the controlled browser
driver.get(url)
# Action-chain helper bound to this browser session
action = webdriver.ActionChains(driver)

## Scrolling down the page and clicking the See More button

In [3]:
SCROLL_PAUSE_TIME = 5      # seconds to wait after each scroll so new results can load
MAX_SEE_MORE_CLICKS = 10   # stop after this many successful "See more" clicks
MAX_STALLED_ROUNDS = 5     # give up after this many consecutive rounds without progress

see_more_button = driver.find_element_by_xpath('//*[@id="main-content"]/section[2]/button')
i = 0  # This variable limits our search results (counts successful "See more" clicks)
stalled_rounds = 0  # consecutive rounds where the page did not grow and the click failed

# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")

# Keep loading results until we have clicked "See more" often enough, or the
# page has stopped making progress (e.g. no more results are available).
# Without the stall limit the loop would spin forever once the page can no
# longer grow and the button can no longer be clicked.
while i < MAX_SEE_MORE_CLICKS and stalled_rounds < MAX_STALLED_ROUNDS:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)
    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")

    if new_height != last_height:
        # The page grew, so scrolling alone is still loading new results
        stalled_rounds = 0
        last_height = new_height
        continue

    # What if the website got stuck? we scroll up then down to refresh
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    driver.execute_script("window.scrollTo(0, -100);")
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    try:
        # Clicking see more if visible
        see_more_button.click()
    except Exception:
        # The click failed (e.g. the button is not clickable right now); count
        # the failed round so the loop ends if this keeps happening.
        # `Exception` rather than a bare `except` keeps KeyboardInterrupt working.
        stalled_rounds += 1
    else:
        i = i + 1
        stalled_rounds = 0

## Scraping the page source after loading the data

In [4]:
# Snapshot the HTML of the page as currently loaded in the browser and parse it
source = driver.page_source
soup = BeautifulSoup(source, 'html.parser')
# The HTML is now held in memory and no later cell uses the browser, so close
# it to avoid leaving Chrome/chromedriver processes running. To scrape again,
# re-run the notebook from the "Accessing the website" cell.
driver.quit()

In [5]:
# Locate the <ul> element that holds the search results
all_results = soup.find('ul', class_="jobs-search__results-list")
if all_results is None:
    # Fail early with a clear message instead of an opaque
    # "'NoneType' object has no attribute 'find_all'" error.
    raise RuntimeError(
        "Could not find the 'jobs-search__results-list' element in the page source; "
        "the page layout may have changed or the results did not load."
    )
# Every <li> under the results list is treated as one job card below
cards = all_results.find_all('li')

In [6]:
# Sanity check: number of <li> elements collected from the results list
len(cards)

274

## Constructing the Data Frame

In [7]:
def _child_text(card, tag, css_class):
    """Return the stripped text of the first `tag` with class `css_class` inside `card`, or '' if absent."""
    node = card.find(tag, class_=css_class)
    return node.text.strip() if node is not None else ''


def _job_link(card):
    """Return the href of the first <a> inside the card's first <div>, or '' if any part is missing."""
    container = card.div
    anchor = container.a if container is not None else None
    return anchor.get('href', '') if anchor is not None else ''


def _date_posted(card):
    """Return the `datetime` attribute of the card's first <time> element, or '' if missing."""
    stamp = card.time
    return stamp.get('datetime', '') if stamp is not None else ''


def parse_card(card):
    """Extract one job posting from a result card.

    Args:
        card: a BeautifulSoup tag for a single entry of the results list.

    Returns:
        dict with the keys 'title', 'company', 'location', 'salary', 'date'
        and 'link'. A field that is not present in the card is returned as an
        empty string, so one incomplete card never aborts the whole scrape.
    """
    return {
        'title': _child_text(card, 'h3', "base-search-card__title"),           #JobTitle
        'company': _child_text(card, 'h4', 'base-search-card__subtitle'),      #CompanyName
        'location': _child_text(card, 'span', "job-search-card__location"),    #JobLocation
        'salary': _child_text(card, 'span', "job-search-card__salary-info"),   #JobSalary
        'date': _date_posted(card),                                            #DatePosted
        'link': _job_link(card),                                               #JobLink
    }


# This list will contain all the data: one dict per job card.
# Missing elements are detected explicitly (None checks) rather than with bare
# `except:` clauses, so genuine errors are no longer silently swallowed.
results = [parse_card(card) for card in cards]


In [8]:
# Build the table from the list of per-card dicts. The column list is given
# explicitly so the frame has the expected columns even when `results` is
# empty; otherwise the later `df.date` access would fail on a column-less frame.
df = pd.DataFrame(
    results,
    columns=['title', 'company', 'location', 'salary', 'date', 'link'],
)

In [9]:
# Preview the first rows of the scraped data
df.head()

Unnamed: 0,title,company,location,salary,date,link
0,Python Developer Entry Level,TEKtalent Inc,"Denver, CO","$80,000 - $100,000",2021-12-29,https://www.linkedin.com/jobs/view/python-deve...
1,Junior Data Analyst,National Instruments,United States,,2021-12-30,
2,Artificial Intelligence Intern,Stats Perform,"Chicago, IL",,2021-12-30,https://www.linkedin.com/jobs/view/artificial-...
3,Software Engineer I,Vericast,"Austin, TX",,2021-12-30,https://www.linkedin.com/jobs/view/software-en...
4,"Remote - Urgent Hiring - Job Role - ""Python De...",TechFetch.com - On Demand Tech Workforce hirin...,"Home, KS",,2021-12-30,https://www.linkedin.com/jobs/view/remote-urge...


## Sorting by date posted and exporting to CSV

In [10]:
# Parse the date strings into datetimes, order the postings newest-first and
# renumber the rows from 0 -- written as one chain instead of in-place steps.
df = (
    df
    .assign(date=pd.to_datetime(df['date']))
    .sort_values(by='date', ascending=False)
    .reset_index(drop=True)
)

In [11]:
# Write the table to linkedin_jobs.csv (path is relative to the working directory).
# NOTE(review): no `index` argument is passed, so pandas' default applies and the
# row index is also written as an unnamed first column -- confirm that is wanted.
df.to_csv('linkedin_jobs.csv')