# "Web Scraping with Selenium to Find a Job" 

We will go through 3 main tasks to implement our project:

Task 1: Importing libraries.

Task 2: Define functions.

Task 3: Web scraping with selenium.

# Task 1 : Importing libraries

In [1]:
import time
from datetime import datetime

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains

import pandas as pd
import numpy as np

import warnings
warnings.simplefilter(action='ignore')

# Task 2 : Define Functions

In [2]:
# Sleep function 
def sleep(x):
    time.sleep(x)

# Wait for a certain measure of time before throwing an exception
def wait(x):
    driver.implicitly_wait(x)

# Click Function
def click_bann_byID(ID):
    actions = ActionChains(driver)
    akzeptieren = driver.find_element(By.ID, ID)
    actions.click(akzeptieren).perform()
    wait(10)
    sleep(0.5)

# Find Elements Function
def find_elements_HPCO(H,P,C,O):
    if website_name == 'jobware':
        header = driver.find_elements(By.TAG_NAME, H)
    else:
        header = driver.find_elements(By.CLASS_NAME, H)
    publish = driver.find_elements(By.CLASS_NAME, P)
    company = driver.find_elements(By.CLASS_NAME, C)
    ort = driver.find_elements(By.CLASS_NAME, O) 

    list_header = [title.text for title in header]
    list_publish = [pub.text for pub in publish]
    list_company = [comp.text for comp in company]
    list_ort = [o.text for o in ort]
    return list_header, list_publish, list_company, list_ort

# Scroll Down Function
def scroll_down(x):
    n=0
    while n < x:
        n+=1
        actions.key_down(Keys.PAGE_DOWN).perform()
        sleep(1.5)
        actions.key_down(Keys.PAGE_DOWN).perform()
        sleep(1.5)
        actions.key_down(Keys.PAGE_DOWN).perform()
        sleep(1.5)
        actions.key_down(Keys.PAGE_UP).perform()
        sleep(0.10)
        actions.key_down(Keys.PAGE_DOWN).perform()
        wait(10)
        sleep(2.5)

# Web Scraping with Selenium

## 01 - STEPSTONE

In [3]:
'''
Title : Web Scrapping by Selenium 
Project Purpose: From StepStone scrap data for some Job Titels
1 - Create Driver
2 - Go to Website
3 - Create ActionChain Object
    3.1 - Click Banned 
4 - Take Title and Infos from Page
    4.1 - Create Lists 
    4.2 - Create DataFrame
    4.3 - Repeat Process
    4.4 - Print and Save DataFrame
'''

print('---------------------- StepStone Job Searching Selenium Project ----------------------')
start=datetime.now()  
# Link Descriptions
link_original_stepstone = 'https://www.stepstone.de/jobs/data-analyst/in-rietberg?radius=50&page=2'

website_name = 'stepstone'
job_name = 'Data Analyst'
#job_name = 'Business Analyst'
ort_ = 'Rietberg'
radius = 50
page_number = 1

#  1 - Create Driver
Path = '/Users/macbook/Desktop/projects/Github_Repositories/Portfolio Projects/02 - Web_Scraping_Job_Search/chromedriver'
driver = webdriver.Chrome(Path)

#  2 - Go to Website
job_link = job_name.replace(' ', '-').lower()
ort_link = ort_.lower()
link = f'https://www.stepstone.de/jobs/{job_link}/in-{ort_link}?radius={radius}&page={page_number}'

driver.get(link)
wait(10)
sleep(2)

#  3 - ActionChain Object created
# 3.1 - Click Banned Accept
ID = 'ccmgt_explicit_accept'
click_bann_byID(ID)

# 4 -  Take Infos from Page
# 4.1 - Headers, Publish_Time ,Company, City
H, P, C, O = 'resultlist-12iu5pk', 'resultlist-3asi6i', 'resultlist-1v262t5', 'resultlist-dettfq'
list_header, list_publish, list_company, list_ort = find_elements_HPCO(H,P,C,O)

# 4.2 - Description and Page number of results
description = driver.find_elements(By.CLASS_NAME, 'resultlist-1pq4x2u')
result = driver.find_elements(By.CLASS_NAME, 'resultlist-xeyevn')


# 4.3 - Get Links
header = driver.find_elements(By.CLASS_NAME, H)
list_link = [link.get_attribute('href') for link in header]

# 4.4 - Get Texts for each finding
list_description = [des.text for des in description]
print('Header',len(list_header), 'Publish',len(list_publish), 'Company',len(list_company), 'Ort',len(list_ort), 'Desc', len(list_description), 'Link',len(list_link))

# 4.5 - Total Search Page Number
list_result = [res.text for res in result]
number_of_page = int(list_result[0].split(' ')[-1])
print(f'Number of Jobs Pages = {number_of_page}')

# 4.6 - DataFrame df
d = dict(job_title=np.array(list_header), publish=np.array(list_publish), company=np.array(list_company), city=np.array(list_ort) , description=np.array(list_description), link=np.array(list_link))
df = pd.DataFrame.from_dict(d, orient='index')
df = df.T


# 4.7 Repeat Process for every Web Page
while  page_number < number_of_page:
    page_number+=1
    
    # 4.7.1 - Go to another page
    link = f'https://www.stepstone.de/jobs/{job_link}/in-{ort_link}?radius={radius}&page={page_number}'
    driver.get(link)
    wait(10)
    sleep(1.5)
    
    # 4.7.2 - Find the elements and get the Texts
    list_header, list_publish, list_company, list_ort = find_elements_HPCO(H,P,C,O) 
    description = driver.find_elements(By.CLASS_NAME, 'resultlist-1pq4x2u')
    list_description = [des.text for des in description]
    header = driver.find_elements(By.CLASS_NAME, H)
    list_link = [link.get_attribute('href') for link in header]
 
    # 4.7.3 - Create new page Dataframe
    d = dict(job_title=np.array(list_header), publish=np.array(list_publish), company=np.array(list_company), city=np.array(list_ort) , description=np.array(list_description), link=np.array(list_link))
    df2 = pd.DataFrame.from_dict(d, orient='index')
    df2 = df2.T
    
    # 4.7.4 - Concatenate the DataFrames
    df = pd.concat([df,df2], axis=0, ignore_index=True)
    print(f'Page Number : {page_number}, DataFrame Shape : {df2.shape}')
    

# 5.1 - Save Data as csv 
print(f'DataFrame End : {df.shape}')
df['website'] = website_name

path = '/Users/macbook/Desktop/projects/Github_Repositories/Portfolio Projects/02 - Web_Scraping_Job_Search/data'
job_name2 = job_name.replace(' ', '-')
time_ = datetime.today().strftime('%Y-%m-%d')
df.to_csv(f'{path}/{job_name2}-{time_}.csv', index=False)

# 6 - Quit
end =datetime.now() 
print('Code Runned No Problem')
print(f'Time = {end - start}')
sleep(5)
driver.quit()

---------------------- StepStone Job Searching Selenium Project ----------------------
Header 25 Publish 25 Company 25 Ort 25 Desc 25 Link 25
Number of Jobs Pages = 21
Page Number : 2, DataFrame Shape : (25, 6)
Page Number : 3, DataFrame Shape : (25, 6)
Page Number : 4, DataFrame Shape : (25, 6)
Page Number : 5, DataFrame Shape : (25, 6)
Page Number : 6, DataFrame Shape : (25, 6)
Page Number : 7, DataFrame Shape : (25, 6)
Page Number : 8, DataFrame Shape : (25, 6)
Page Number : 9, DataFrame Shape : (25, 6)
Page Number : 10, DataFrame Shape : (25, 6)
Page Number : 11, DataFrame Shape : (25, 6)
Page Number : 12, DataFrame Shape : (25, 6)
Page Number : 13, DataFrame Shape : (25, 6)
Page Number : 14, DataFrame Shape : (25, 6)
Page Number : 15, DataFrame Shape : (25, 6)
Page Number : 16, DataFrame Shape : (25, 6)
Page Number : 17, DataFrame Shape : (25, 6)
Page Number : 18, DataFrame Shape : (25, 6)
Page Number : 19, DataFrame Shape : (25, 6)
Page Number : 20, DataFrame Shape : (25, 6)
Page

## 02 - JOBWARE

In [4]:
print('---------------------- Jobware Job Searching Selenium Project ----------------------')

start=datetime.now()  
# 0 Link Descriptions
link_original = 'https://www.jobware.de/jobsuche?jw_jobname=data%20analyst&jw_jobort=333**%20Rietberg&jw_ort_distance=50'

website_name = 'jobware'
radius = 50
page_number = 0

#  1 - Create Driver
Path = '/Users/macbook/Desktop/projects/Github_Repositories/Portfolio Projects/02 - Web_Scraping_Job_Search/chromedriver'
driver = webdriver.Chrome(Path)

#  2 - Go to Website
job_link = job_name.replace(' ', '%20').lower()
ort_link = ort_.capitalize()
link = f'https://www.jobware.de/jobsuche?jw_jobname={job_link}&jw_jobort=333**%20{ort_}&jw_ort_distance={radius}'

driver.get(link)
wait(10)
sleep(2)

#  3 - ActionChain Object created
# 3.1 - Click Banned Accept
actions = ActionChains(driver)
akzeptieren = driver.find_element(By.XPATH, '/html/body/div[1]/div/div[3]/div[2]/button')
actions.click(akzeptieren).perform()
wait(10)
sleep(0.5)


# 4 -  Take Infos from Page
# 4.1 - Headers, Company, City, Description
H, P, C, O = 'h2', 'date', 'company', 'location'
list_header, list_publish, list_company, list_ort = find_elements_HPCO(H,P,C,O)
description = driver.find_elements(By.CLASS_NAME, 'task')
list_description = [des.text for des in description]

links = driver.find_elements(By.CLASS_NAME, 'job')
list_link = [link.get_attribute('href') for link in links]

print('Header',len(list_header), 'Publish',len(list_publish), 'Company',len(list_company), 'Ort',len(list_ort), 'Desc', len(list_description), 'Link',len(list_link))

# 4.2 - Total Search Page Number
result = driver.find_elements(By.CLASS_NAME, 'result-sort')
list_result = [res.text for res in result]
print(list_result)

# 4.3 - DataFrame df
d = dict(job_title=np.array(list_header), publish=np.array(list_publish), company=np.array(list_company), city=np.array(list_ort) , description=np.array(list_description), link=np.array(list_link))
df = pd.DataFrame.from_dict(d, orient='index')
df = df.T

# 5.1 - Save Data as csv
print(f'DataFrame End : {df.shape}')
df['website'] = website_name

df.to_csv(f'{path}/{job_name2}-{time_}.csv', mode='a', index=False, header=False)

# 6.1 Quit
end =datetime.now() 
print('Code Runned No Problem')
print(f'Time = {end - start}')
sleep(5)
driver.quit()

---------------------- Jobware Job Searching Selenium Project ----------------------
Header 15 Publish 15 Company 15 Ort 15 Desc 14 Link 15
['15 Treffer\nSortierung: Relevanz - Datum']
DataFrame End : (15, 6)
Code Runned No Problem
Time = 0:00:12.665651


## 03 - LINKEDIN

In [5]:
print('---------------------- Linkedin Job Searching Selenium Project ----------------------')
   

start=datetime.now()  
# 0 Link Descriptions
link_original = 'https://www.linkedin.com/jobs/search/?currentJobId=3199974140&distance=25&keywords=data%20analyst&location=Rietberg' 

website_name =  'linkedin'
radius = 40
page_number = 1

#  1 - Create Driver
Path = '/Users/macbook/Desktop/projects/Github_Repositories/Portfolio Projects/02 - Web_Scraping_Job_Search/chromedriver'
driver = webdriver.Chrome(Path)

#  2 - Go to Website
job_link = job_name.replace(' ', '%20').lower()

link2 = f'https://www.linkedin.com/jobs/search/?distance=25&keywords={job_link}&location={ort_}'
driver.get(link2)
wait(10)
sleep(2)

#  3 - ActionChain Object created
# 3.1 - Click Banned Accept
actions = ActionChains(driver)
akzeptieren = driver.find_element(By.TAG_NAME, 'button')
actions.click(akzeptieren).perform()
wait(10)
sleep(0.5)

# 3.2 - Scroll Down Function
scroll_down(4)

# 4 -  Take Infos from Page
# 4.1 - Headers, Company, City, Description
H, P, C, O = 'base-search-card__title', 'job-search-card__listdate', 'hidden-nested-link', 'job-search-card__location'
list_header, list_publish, list_company, list_ort = find_elements_HPCO(H,P,C,O)

# 4.2 - Link Lists
links = driver.find_elements(By.CLASS_NAME, 'base-card__full-link')
list_link = [link.get_attribute('href') for link in links]

print('Header',len(list_header), 'Publish',len(list_publish), 'Company',len(list_company), 'Ort',len(list_ort), 'Desc', len(list_description), 'Link',len(list_link))

# 4.3 - Total Search Page Number
result = driver.find_elements(By.CLASS_NAME, 'results-context-header__context')
list_result = [res.text for res in result]
print(f'Number of Jobs Pages = {list_result}')


# 4.4 - DataFrame df
d = dict(job_title=np.array(list_header), publish=np.array(list_publish), company=np.array(list_company), city=np.array(list_ort) , description=np.array(list_description), link=np.array(list_link))
df = pd.DataFrame.from_dict(d, orient='index')
df = df.T
df['description'] = None
df['website'] = website_name

# 5.1 - Save Data as csv 
print(f'DataFrame End : {df.shape}')
df.loc[df.website =='linkedin', 'city'] = df.loc[df.website =='linkedin', 'city'].str.replace(', Kuzey Ren-Vestfalya, Almanya', '')
df.to_csv(f'{path}/{job_name2}-{time_}.csv', mode='a', index=False, header=False)

# 4.5 Quit
end =datetime.now() 
print('Code Runned No Problem')
print(f'Time = {end - start}')
sleep(5)
driver.quit()

df.head()

---------------------- Linkedin Job Searching Selenium Project ----------------------
Header 150 Publish 147 Company 135 Ort 150 Desc 14 Link 135
Number of Jobs Pages = ['Rietberg, Kuzey Ren-Vestfalya, Almanya konumunda 869 Data Analyst iş ilanı (49 yeni)']
DataFrame End : (150, 7)
Code Runned No Problem
Time = 0:00:47.908240


Unnamed: 0,job_title,publish,company,city,description,link,website
0,Data Analyst (w/m/d) Power BI mit Remote-Anteil,1 hafta önce,ATLAS TITAN Mitte GmbH,Gütersloh,,https://de.linkedin.com/jobs/view/data-analyst...,linkedin
1,Data Analyst / Scientist (m/w/d),3 hafta önce,Dr. Wolff Group,Bielefeld,,https://de.linkedin.com/jobs/view/data-analyst...,linkedin
2,Data / Business Analyst (m/w/d),3 hafta önce,Dr. Wolff Group,Bielefeld,,https://de.linkedin.com/jobs/view/data-busines...,linkedin
3,Data Analyst für Datenmodellierung und Kundenb...,1 ay önce,Lurse,Salzkotten,,https://de.linkedin.com/jobs/view/data-analyst...,linkedin
4,"Online Data Analyst (m,f,d)",3 gün önce,TELUS International,Bielefeld,,https://de.linkedin.com/jobs/view/online-data-...,linkedin
