In [1]:
import time
from datetime import datetime

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains

import pandas as pd
import numpy as np

import warnings
warnings.simplefilter(action='ignore')

In [2]:
def sleep(x):
    """Block the notebook for ``x`` seconds.

    Thin convenience wrapper around ``time.sleep`` so the scraping cells can
    pause with a short call.
    """
    seconds = x
    time.sleep(seconds)
    
def wait(x):
    """Set the implicit-wait timeout of the module-level ``driver`` to ``x`` seconds.

    Note: ``implicitly_wait`` configures how long element lookups keep
    retrying; it does not pause execution by itself (use ``sleep`` for that).
    """
    timeout_seconds = x
    driver.implicitly_wait(timeout_seconds)
    
def click_bann_byID(ID):
    """Click the element whose HTML id is ``ID`` and give the page a moment to settle.

    Used by the scraping cells to accept a cookie/consent banner. Operates on
    the module-level ``driver``.
    """
    chain = ActionChains(driver)
    chain.click(driver.find_element(By.ID, ID))
    chain.perform()

    # Refresh the implicit-wait timeout, then pause briefly after the click.
    wait(10)
    sleep(0.5)

def find_elements_HPCO(H,P,C,O):
    """Collect the visible text of the header, publish, company and city elements.

    Parameters
    ----------
    H : locator value for the job-title elements. It is treated as a tag name
        when the module-level ``website_name`` is ``'jobware'`` and as a CSS
        class name otherwise.
    P, C, O : CSS class names of the publish-date, company and location
        elements.

    Returns
    -------
    tuple of four lists of str: (headers, publish dates, companies, locations),
    each in the order the elements were found on the current page of ``driver``.
    """
    title_locator = By.TAG_NAME if website_name == 'jobware' else By.CLASS_NAME

    # Locate all four element groups first, then read their text.
    element_groups = [
        driver.find_elements(title_locator, H),
        driver.find_elements(By.CLASS_NAME, P),
        driver.find_elements(By.CLASS_NAME, C),
        driver.find_elements(By.CLASS_NAME, O),
    ]
    titles, dates, companies, places = (
        [element.text for element in group] for group in element_groups
    )
    return titles, dates, companies, places


def scroll_down(x):
    """Page down through the current page ``x`` times to trigger lazy loading.

    Each round sends three PAGE_DOWN presses, a short PAGE_UP nudge and a
    final PAGE_DOWN, pausing between the presses. Relies on the module-level
    ``actions`` (an ``ActionChains`` object), which must exist before the call.
    """
    # (key, pause in seconds after the key press) for the first part of a round
    key_steps = (
        (Keys.PAGE_DOWN, 1.5),
        (Keys.PAGE_DOWN, 1.5),
        (Keys.PAGE_DOWN, 1.5),
        (Keys.PAGE_UP, 0.10),
    )

    rounds_done = 0
    while rounds_done < x:
        rounds_done += 1
        for key, pause in key_steps:
            actions.key_down(key).perform()
            sleep(pause)

        # Final page-down of the round, followed by a longer settle time.
        actions.key_down(Keys.PAGE_DOWN).perform()
        wait(10)
        sleep(2.5)


def scroll_down_first(x):
    """Scroll the window down by one full document height, ``x`` times.

    Runs a ``window.scrollBy`` script on the module-level ``driver`` and
    pauses after every jump.
    """
    scroll_script = "window.scrollBy(0,document.body.scrollHeight)"

    jumps_done = 0
    while jumps_done < x:
        jumps_done += 1
        driver.execute_script(scroll_script, "")
        wait(10)
        sleep(2)

# 01 - STEPSTONE

In [3]:
# StepStone job search - web scraping with Selenium
# Purpose: scrape the StepStone result list for one job title / city and
# store it as a CSV file.
#   1 - Create the driver
#   2 - Open the search URL
#   3 - Accept the cookie banner
#   4 - Scrape the result list
#       4.1 - First page (also yields the total number of result pages)
#       4.2 - Remaining pages
#       4.3 - Combine all pages into one DataFrame
#   5 - Save the DataFrame as CSV

print('---------------------- StepStone Job Searching Selenium Project ----------------------')
start = datetime.now()

# 0 - Search parameters
# Example of the URL pattern that is built below.
link_original_stepstone = 'https://www.stepstone.de/jobs/data-analyst/in-rietberg?radius=50&page=2'

website_name = 'stepstone'  # also read by find_elements_HPCO
# job_name and ort_ are read again by the Jobware and LinkedIn cells below.
job_name = 'Data Analyst'
#job_name = 'Business Analyst'
ort_ = 'Rietberg'
radius = 50
page_number = 1

# 1 - Create the driver
# NOTE(review): passing the chromedriver path positionally is the legacy
# Selenium call style; newer Selenium 4 releases expect a Service object -
# confirm against the installed Selenium version.
Path = '/Users/macbook/Desktop/projects/Github_Repositories/Portfolio Projects/02 - Web_Scraping_Job_Search/chromedriver'
driver = webdriver.Chrome(Path)

# Everything that talks to the browser runs inside try/finally so the browser
# and chromedriver process are shut down even when a step fails half-way.
try:
    # 2 - Open the search URL
    job_link = job_name.replace(' ', '-').lower()
    ort_link = ort_.lower()
    link = f'https://www.stepstone.de/jobs/{job_link}/in-{ort_link}?radius={radius}&page={page_number}'

    driver.get(link)
    wait(10)
    sleep(2)

    # 3 - Accept the cookie banner
    ID = 'ccmgt_explicit_accept'
    click_bann_byID(ID)

    # 4.1 - First page: headers, publish time, company, city
    H, P, C, O = 'resultlist-12iu5pk', 'resultlist-3asi6i', 'resultlist-1v262t5', 'resultlist-dettfq'
    list_header, list_publish, list_company, list_ort = find_elements_HPCO(H,P,C,O)

    # Description elements and the element holding the page counter
    description = driver.find_elements(By.CLASS_NAME, 'resultlist-1pq4x2u')
    result = driver.find_elements(By.CLASS_NAME, 'resultlist-xeyevn')

    # The header elements carry the link to the job posting
    header = driver.find_elements(By.CLASS_NAME, H)
    list_link = [anchor.get_attribute('href') for anchor in header]

    list_description = [des.text for des in description]
    print('Header',len(list_header), 'Publish',len(list_publish), 'Company',len(list_company), 'Ort',len(list_ort), 'Desc', len(list_description), 'Link',len(list_link))

    # Total number of result pages: last space-separated token of the counter text.
    list_result = [res.text for res in result]
    try:
        number_of_page = int(list_result[0].split(' ')[-1])
    except (IndexError, ValueError):
        # Counter element missing or not ending in a number: keep what was
        # scraped from the first page instead of aborting the whole run.
        number_of_page = page_number
        print('Page counter not found - only the first page is scraped')
    print(f'Number of Jobs Pages = {number_of_page}')

    # from_dict(orient='index') + transpose tolerates lists of unequal length
    # (shorter columns are padded with missing values).
    d = dict(job_title=np.array(list_header), publish=np.array(list_publish), company=np.array(list_company), city=np.array(list_ort) , description=np.array(list_description), link=np.array(list_link))
    df = pd.DataFrame.from_dict(d, orient='index').T

    # One frame per page; concatenated once after the loop instead of growing
    # df inside the loop.
    page_frames = [df]

    # 4.2 - Remaining pages
    while page_number < number_of_page:
        page_number += 1

        link = f'https://www.stepstone.de/jobs/{job_link}/in-{ort_link}?radius={radius}&page={page_number}'
        driver.get(link)
        wait(10)
        sleep(1.5)

        list_header, list_publish, list_company, list_ort = find_elements_HPCO(H,P,C,O)
        description = driver.find_elements(By.CLASS_NAME, 'resultlist-1pq4x2u')
        list_description = [des.text for des in description]
        header = driver.find_elements(By.CLASS_NAME, H)
        list_link = [anchor.get_attribute('href') for anchor in header]

        d = dict(job_title=np.array(list_header), publish=np.array(list_publish), company=np.array(list_company), city=np.array(list_ort) , description=np.array(list_description), link=np.array(list_link))
        df_page = pd.DataFrame.from_dict(d, orient='index').T
        page_frames.append(df_page)
        print(f'Page Number : {page_number}, DataFrame Shape : {df_page.shape}')

    # 4.3 - Combine all pages
    df = pd.concat(page_frames, axis=0, ignore_index=True)
    print(f'DataFrame End : {df.shape}')
    df['website'] = website_name

    # 5 - Save the DataFrame as CSV
    # path, job_name2 and time_ are reused by the Jobware and LinkedIn cells,
    # which append their rows to this same file.
    path = '/Users/macbook/Desktop/projects/Github_Repositories/Portfolio Projects/02 - Web_Scraping_Job_Search/data'
    job_name2 = job_name.replace(' ', '-')
    time_ = datetime.today().strftime('%Y-%m-%d')
    df.to_csv(f'{path}/{job_name2}-{time_}.csv', index=False)

    end = datetime.now()
    print('Code Runned No Problem')
    print(f'Time = {end - start}')
    sleep(5)
finally:
    driver.quit()

---------------------- StepStone Job Searching Selenium Project ----------------------
Header 25 Publish 25 Company 25 Ort 25 Desc 25 Link 25
Number of Jobs Pages = 21
Page Number : 2, DataFrame Shape : (25, 6)


NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=107.0.5304.110)
Stacktrace:
0   chromedriver                        0x00000001064e62c8 chromedriver + 4752072
1   chromedriver                        0x0000000106466463 chromedriver + 4228195
2   chromedriver                        0x00000001060c9b18 chromedriver + 441112
3   chromedriver                        0x00000001060a6210 chromedriver + 295440
4   chromedriver                        0x000000010612be3d chromedriver + 843325
5   chromedriver                        0x000000010613f719 chromedriver + 923417
6   chromedriver                        0x0000000106127b33 chromedriver + 826163
7   chromedriver                        0x00000001060f89fd chromedriver + 633341
8   chromedriver                        0x00000001060fa051 chromedriver + 639057
9   chromedriver                        0x00000001064b330e chromedriver + 4543246
10  chromedriver                        0x00000001064b7a88 chromedriver + 4561544
11  chromedriver                        0x00000001064bf6df chromedriver + 4593375
12  chromedriver                        0x00000001064b88fa chromedriver + 4565242
13  chromedriver                        0x000000010648e2cf chromedriver + 4391631
14  chromedriver                        0x00000001064d75b8 chromedriver + 4691384
15  chromedriver                        0x00000001064d7739 chromedriver + 4691769
16  chromedriver                        0x00000001064ed81e chromedriver + 4782110
17  libsystem_pthread.dylib             0x00007fff20521950 _pthread_start + 224
18  libsystem_pthread.dylib             0x00007fff2051d47b thread_start + 15


# 02 - JOBWARE

In [None]:
# Jobware job search - web scraping with Selenium
# Scrapes the first Jobware result page and appends it to the CSV written by
# the StepStone cell. This cell reads job_name, ort_, path, job_name2 and
# time_ from that cell, so the StepStone cell must have run first.
print('---------------------- Jobware Job Searching Selenium Project ----------------------')

start = datetime.now()

# 0 - Search parameters
# Example of the URL pattern that is built below.
link_original = 'https://www.jobware.de/jobsuche?jw_jobname=data%20analyst&jw_jobort=333**%20Rietberg&jw_ort_distance=50'

website_name = 'jobware'  # makes find_elements_HPCO look up headers by tag name
radius = 50
page_number = 0

# 1 - Create the driver
Path = '/Users/macbook/Desktop/projects/Github_Repositories/Portfolio Projects/02 - Web_Scraping_Job_Search/chromedriver'
driver = webdriver.Chrome(Path)

# Run all browser work inside try/finally so the browser is closed even when
# a lookup or the CSV write fails.
try:
    # 2 - Open the search URL
    job_link = job_name.replace(' ', '%20').lower()
    # NOTE(review): ort_link is computed but the URL below interpolates ort_
    # directly - confirm which one is intended.
    ort_link = ort_.capitalize()
    link = f'https://www.jobware.de/jobsuche?jw_jobname={job_link}&jw_jobort=333**%20{ort_}&jw_ort_distance={radius}'

    driver.get(link)
    wait(10)
    sleep(2)

    # 3 - Accept the cookie banner (button located by absolute XPath)
    actions = ActionChains(driver)
    akzeptieren = driver.find_element(By.XPATH, '/html/body/div[1]/div/div[3]/div[2]/button')
    actions.click(akzeptieren).perform()
    wait(10)
    sleep(0.5)
    # Presumably the class names of the accept button, kept for reference:
    #dsgvo-1B76C4DA4B-orange dsgvo-1B76C4DA4B-accept

    # 4 - Scrape the result list: headers, publish date, company, city, description
    H, P, C, O = 'h2', 'date', 'company', 'location'
    list_header, list_publish, list_company, list_ort = find_elements_HPCO(H,P,C,O)
    description = driver.find_elements(By.CLASS_NAME, 'task')
    list_description = [des.text for des in description]

    links = driver.find_elements(By.CLASS_NAME, 'job')
    list_link = [anchor.get_attribute('href') for anchor in links]

    print('Header',len(list_header), 'Publish',len(list_publish), 'Company',len(list_company), 'Ort',len(list_ort), 'Desc', len(list_description), 'Link',len(list_link))

    # Text of the result/sort bar (printed for inspection only)
    result = driver.find_elements(By.CLASS_NAME, 'result-sort')
    list_result = [res.text for res in result]
    print(list_result)

    # 4.1 - Build the DataFrame; from_dict(orient='index') + transpose
    # tolerates lists of unequal length.
    d = dict(job_title=np.array(list_header), publish=np.array(list_publish), company=np.array(list_company), city=np.array(list_ort) , description=np.array(list_description), link=np.array(list_link))
    df = pd.DataFrame.from_dict(d, orient='index').T

    print(f'DataFrame End : {df.shape}')
    df['website'] = website_name

    # 5 - Append to the CSV created by the StepStone cell (no header row)
    df.to_csv(f'{path}/{job_name2}-{time_}.csv', mode='a', index=False, header=False)

    end = datetime.now()
    print('Code Runned No Problem')
    print(f'Time = {end - start}')
    sleep(5)
finally:
    driver.quit()

# 03 - LINKEDIN (Chrome)

In [None]:
# LinkedIn job search - web scraping with Selenium
# Scrapes the public LinkedIn job list and appends it to the CSV written by
# the StepStone cell. This cell reads job_name, ort_, path, job_name2 and
# time_ from that cell, so the StepStone cell must have run first.
print('---------------------- Linkedin Job Searching Selenium Project ----------------------')


start = datetime.now()

# 0 - Search parameters
# Example of the URL pattern that is built below.
link_original = 'https://www.linkedin.com/jobs/search/?currentJobId=3199974140&distance=25&keywords=data%20analyst&location=Rietberg'

website_name = 'linkedin'
# NOTE(review): radius is not used below; the URL hard-codes distance=25 -
# confirm which value is intended.
radius = 40
page_number = 1

# 1 - Create the driver
Path = '/Users/macbook/Desktop/projects/Github_Repositories/Portfolio Projects/02 - Web_Scraping_Job_Search/chromedriver'
driver = webdriver.Chrome(Path)

# Run all browser work inside try/finally so the browser is closed even when
# a lookup or the CSV write fails.
try:
    # 2 - Open the search URL
    job_link = job_name.replace(' ', '%20').lower()

    link2 = f'https://www.linkedin.com/jobs/search/?distance=25&keywords={job_link}&location={ort_}'
    driver.get(link2)
    wait(10)
    sleep(2)

    # 3 - Accept the cookie banner (clicks the first <button> on the page)
    # `actions` is also the object scroll_down() sends its key presses through.
    actions = ActionChains(driver)
    akzeptieren = driver.find_element(By.TAG_NAME, 'button')
    actions.click(akzeptieren).perform()
    wait(10)
    sleep(0.5)

    # 3.1 - Look up the "show more" button (raises if it is not present),
    # then page down to load more results.
    scroller = driver.find_element(By.CLASS_NAME, 'infinite-scroller__show-more-button')

    scroll_down(4)

    # 4 - Scrape the result list: headers, publish date, company, city
    H, P, C, O = 'base-search-card__title', 'job-search-card__listdate', 'hidden-nested-link', 'job-search-card__location'
    list_header, list_publish, list_company, list_ort = find_elements_HPCO(H,P,C,O)

    # No descriptions are scraped from the LinkedIn list. Define the list
    # explicitly: it used to be picked up from whatever an earlier cell left
    # in the kernel (NameError on a fresh kernel, or foreign-length data that
    # could pad the frame with empty rows).
    list_description = []
    #description = driver.find_elements(By.CLASS_NAME, 'resultlist-1pq4x2u')
    result = driver.find_elements(By.CLASS_NAME, 'results-context-header__context')

    # Links to the job postings
    links = driver.find_elements(By.CLASS_NAME, 'base-card__full-link')
    list_link = [anchor.get_attribute('href') for anchor in links]

    print('Header',len(list_header), 'Publish',len(list_publish), 'Company',len(list_company), 'Ort',len(list_ort), 'Desc', len(list_description), 'Link',len(list_link))

    # Text of the results header (printed for inspection only)
    list_result = [res.text for res in result]
    print(f'Number of Jobs Pages = {list_result}')

    # 4.1 - Build the DataFrame. The description key is kept so the column
    # order matches the CSV the rows are appended to.
    d = dict(job_title=np.array(list_header), publish=np.array(list_publish), company=np.array(list_company), city=np.array(list_ort) , description=np.array(list_description), link=np.array(list_link))
    df = pd.DataFrame.from_dict(d, orient='index').T
    df['description'] = None
    df['website'] = website_name

    print(f'DataFrame End : {df.shape}')

    # 5 - Append to the CSV created by the StepStone cell (no header row)
    # Strip the region/country suffix from the city text first.
    df.loc[df.website =='linkedin', 'city'] = df.loc[df.website =='linkedin', 'city'].str.replace(', Kuzey Ren-Vestfalya, Almanya', '')
    df.to_csv(f'{path}/{job_name2}-{time_}.csv', mode='a', index=False, header=False)

    # Optional Excel export (requires openpyxl)
    #df.to_excel(f'{path}/{job_name2}-{time_}.xlsx', sheet_name='Sheet3')

    end = datetime.now()
    print('Code Runned No Problem')
    print(f'Time = {end - start}')
    sleep(5)
finally:
    driver.quit()

df.head()