# Web Scraping Data Analyst Job Listings on LinkedIn with Python

Requirements:
- Python > 3.0 installed
- Jupyter Notebook installed using pip
- Selenium installed using pip
- Pandas installed using pip
- Openpyxl installed using pip

### 1. Import packages

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from datetime import datetime
import time
import pandas as pd

### 2. Open and scroll through the job listings

In [None]:
# Start an Edge WebDriver session.
browser = webdriver.Edge()

# LinkedIn job search URL (keywords "data analyst", location Indonesia).
url_linkedin = "https://www.linkedin.com/jobs/search/?keywords=data%20analyst&location=Indonesia"

# Open the search page in a maximized window.
browser.maximize_window()
browser.get(url_linkedin)

# Decide how many jobs we want to scrape and how many scroll rounds that takes.
# NOTE(review): the division by 25 presumably reflects 25 listings loaded per
# scroll -- confirm against the live page.
no_of_jobs = 1000
n_scroll = no_of_jobs // 25 + 1
print(n_scroll)

# Absolute XPath of the button that loads more results once scrolling alone
# stops extending the list.
load_more_xpath = "/html/body/div[1]/div/main/section[2]/button"

for i in range(n_scroll):
    time.sleep(2)  # pause before scrolling
    # Jump to the bottom of the page.
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(3)  # pause after scrolling before looking for the button
    try:
        browser.find_element(By.XPATH, load_more_xpath).click()
    except Exception:
        # Best effort: the button may be absent or not clickable in this round.
        # Catching Exception (not a bare except) keeps Ctrl+C / kernel
        # interrupts working during the long scroll loop.
        time.sleep(3)
    else:
        time.sleep(1)
        print("Load More...")

print("total jobs:")
# Every <li> inside the results list is one job card.
jobs = browser.find_element(By.CLASS_NAME, "jobs-search__results-list").find_elements(By.TAG_NAME, 'li')  # a list of WebElements
print(len(jobs))

### 3. Get main attributes of each job

In [None]:
# One output list per attribute read from the result cards.
job_title, company_name, location, date = [], [], [], []

# (output list, locator strategy, selector, attribute to read) per field,
# in the order the values are collected from each card.
card_fields = (
    (job_title, By.CSS_SELECTOR, 'h3', 'innerText'),                            # title
    (company_name, By.CSS_SELECTOR, 'h4', 'innerText'),                         # company
    (location, By.CLASS_NAME, 'job-search-card__location', 'innerText'),        # location
    (date, By.CSS_SELECTOR, 'div>div>time', 'datetime'),                        # date
)

for job in jobs:
    for column, strategy, selector, attribute in card_fields:
        # Locate the field inside this card and store the requested attribute.
        column.append(job.find_element(strategy, selector).get_attribute(attribute))

In [None]:
# Detail columns collected by opening each job card in turn. Each list receives
# exactly one entry per job that is processed.
jd = []
seniority = []
emp_type = []
job_func = []
industries = []

# Common prefix of the absolute XPaths used inside the job-detail pane.
detail_root = '/html/body/div[1]/div/section/div[2]/div'

for item in range(len(jobs)):
    # Click the job card to open its details. All lookups go through `browser`:
    # the XPaths are absolute, so they do not depend on any particular card
    # element (and must not rely on a loop variable left over from another cell).
    job_click_path = f'/html/body/div[1]/div/main/section[2]/ul/li[{item+1}]/div/a'
    try:
        browser.find_element(By.XPATH, job_click_path).click()
        clicked = True
    except Exception:
        # Primary link missing or not clickable: fall back to the card image.
        job_click_path = f'/html/body/div[1]/div/main/section[2]/ul/li[{item+1}]/a/div/img'
        clicked = False

    print(job_click_path)
    if not clicked:
        # Only click the fallback; a successful primary click is not repeated.
        browser.find_element(By.XPATH, job_click_path).click()
    time.sleep(3)  # give the detail pane time to load

    # Job description: try the first section, then the second one.
    # If neither exists the lookup error propagates, as before.
    try:
        jd_path = detail_root + '/section[1]/div/div/section'
        jd0 = browser.find_element(By.XPATH, jd_path).get_attribute('innerText')
    except Exception:
        jd_path = detail_root + '/section[2]/div/div/section/div'
        jd0 = browser.find_element(By.XPATH, jd_path).get_attribute('innerText')

    # Recomputed for every job so the flag is never undefined or stale.
    # NOTE(review): text containing 'Base pay range' presumably means the
    # pay/benefit section was matched instead of the description -- confirm.
    is_benefit = 'Base pay range' in (jd0 or '')

    if is_benefit:
        # Re-read the description from the second section and take the
        # criteria list from there as well.
        jd_path = detail_root + '/section[2]/div/div/section/div'
        jd0 = browser.find_element(By.XPATH, jd_path).get_attribute('innerText')
        jd_path2 = detail_root + '/section[2]/div'
    else:
        jd_path2 = detail_root + '/section[1]/div'
    jd.append(jd0)

    # Criteria list items, by position: 1 = seniority, 2 = employment type,
    # 3 = job function, 4 = industry. A field that cannot be read is stored
    # as '' so all columns keep the same length.
    criteria_columns = (
        (seniority, 1),
        (emp_type, 2),
        (job_func, 3),
        (industries, 4),
    )
    for column, position in criteria_columns:
        try:
            value = browser.find_element(
                By.XPATH, f'{jd_path2}/ul/li[{position}]/span'
            ).get_attribute('innerText')
        except Exception:
            value = ''  # field not available for this job
        column.append(value)

### 4. Insert data into a DataFrame

In [None]:
# Map each output column name to the list scraped for it; the insertion order
# here is the column order of the resulting table.
scraped_columns = {
    'Date': date,
    'Company': company_name,
    'Title': job_title,
    'Location': location,
    'Description': jd,
    'Level': seniority,
    'Type': emp_type,
    'Function': job_func,
    'Industry': industries,
}

# Build the table: one row per job, one column per attribute.
job_data = pd.DataFrame(scraped_columns)

In [None]:
# Preview the first five rows of the scraped table.
job_data.head(n=5)

### 5. Export data to an Excel file

In [None]:
from pathlib import Path

# Destination workbook; adjust this path for your own machine.
output_file_path = "C:\\Users\\Alvin\\Documents\\Output\\DataScraping.xlsx"

# Create the output folder if it does not exist yet, so the export does not
# fail on a missing directory.
Path(output_file_path).parent.mkdir(parents=True, exist_ok=True)

# Write the table to Excel without the DataFrame index column.
job_data.to_excel(output_file_path, index=False)