# "Web Scraping with Selenium to Find a Job" 

We will go through 3 main tasks to implement our project:

Task 1: Importing libraries.

Task 2: Define functions.

Task 3: Web scraping with selenium.

# Task 1 : Importing libraries

In [2]:
import time
from datetime import datetime

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains

import pandas as pd
import numpy as np

import warnings
warnings.simplefilter(action='ignore')

# Task 2 : Define Functions

In [3]:
# Pause helper
def sleep(x):
    """Block the running cell for ``x`` seconds.

    Thin alias around ``time.sleep`` so the scraping cells can pause with a
    short call; returns ``None``.
    """
    seconds = x
    time.sleep(seconds)

# Implicit-wait helper
def wait(x):
    """Set the implicit wait of the module-level ``driver`` to ``x`` seconds.

    The value is handed unchanged to ``driver.implicitly_wait``; ``driver``
    must already exist in the notebook namespace when this is called.
    """
    timeout_seconds = x
    driver.implicitly_wait(timeout_seconds)

# Click helper (used for the cookie-consent banners)
def click_bann_byID(ID):
    """Click the element whose HTML id equals ``ID`` on the current page.

    The click is sent through an ``ActionChains`` built on the module-level
    ``driver``. Afterwards the implicit wait is set to 10 seconds and the
    cell pauses for half a second.
    """
    # Build the chain first, then look the element up, then fire the click.
    ActionChains(driver).click(driver.find_element(By.ID, ID)).perform()
    wait(10)
    sleep(0.5)

# Collect the text of four groups of elements, one group per CSS class
def find_elements_HPCO(H,P,C,O):
    """Return four lists of element texts for the CSS classes H, P, C and O.

    Callers use the classes for header (job title), publish date, company and
    "Ort" (location). All four lookups run on the module-level ``driver``
    before any text is read. The lists are independent of each other and may
    have different lengths.
    """
    matched_groups = [
        driver.find_elements(By.CLASS_NAME, css_class)
        for css_class in (H, P, C, O)
    ]
    list_header, list_publish, list_company, list_ort = (
        [element.text for element in group] for group in matched_groups
    )
    return list_header, list_publish, list_company, list_ort

# Scroll Down Function
def scroll_down(x):
    """Scroll the current page down in ``x`` rounds of key presses.

    Each round presses PAGE_DOWN three times (1.5 s pause after each), then
    PAGE_UP once (0.1 s pause) and PAGE_DOWN once more, then sets the implicit
    wait to 10 seconds and pauses for 2.5 s.

    Unlike the earlier version this does not read a module-level ``actions``
    object: every key press goes through a fresh ``ActionChains`` built on the
    module-level ``driver``, the same way ``click_bann_byID`` does it. Keys
    are sent with ``send_keys`` so each press is also released.
    """
    def press(key):
        # A new chain per press keeps the call independent of any chain the
        # calling cell may have built for its own clicks.
        ActionChains(driver).send_keys(key).perform()

    n = 0
    while n < x:
        n += 1
        for _ in range(3):
            press(Keys.PAGE_DOWN)
            sleep(1.5)
        press(Keys.PAGE_UP)
        sleep(0.10)
        press(Keys.PAGE_DOWN)
        wait(10)
        sleep(2.5)

# Task 3 : Web Scraping with Selenium

## 01 - STEPSTONE

In [18]:
'''
Title : Web Scrapping by Selenium 
Project Purpose: From StepStone scrap data for some Job Titels
1 - Create Driver
2 - Go to Website
3 - Create ActionChain Object
    3.1 - Click Banned 
4 - Take Title and Infos from Page
    4.1 - Create Lists 
    4.2 - Create DataFrame
    4.3 - Repeat Process
    4.4 - Print and Save DataFrame
'''

print('---------------------- StepStone Job Searching Selenium Project ----------------------')
start = datetime.now()
# Example StepStone result URL, kept only as a reference for the link format
link_original_stepstone = 'https://www.stepstone.de/jobs/data-analyst/in-rietberg?radius=50&page=2'

website_name = 'stepstone'
job_name = 'Data Engineer'
#job_name = 'Data Analyst'
#job_name = 'Data Scientist'
ort_ = 'Rietberg'
radius = 100
page_number = 1
# Number of result pages to visit. Fixed by hand: reading the page count from
# the pagination element ('resultlist-1jx3vjx') is disabled in this cell.
number_of_page = 6

#  1 - Create Driver
Path = '/Users/macbook/Desktop/projects/Github_Repositories/Portfolio Projects/02 - Web_Scraping_Job_Search/chromedriver'
driver = webdriver.Chrome(Path)
print('Create Driver')

# Everything that talks to the browser runs inside try/finally so the Chrome
# session is closed even when a page load or element lookup raises.
try:
    #  2 - Go to Website
    job_link = job_name.replace(' ', '-').lower()
    ort_link = ort_.lower()
    link = f'https://www.stepstone.de/jobs/{job_link}/in-{ort_link}?radius={radius}&page={page_number}&sort=2&action=sort_publish'

    driver.get(link)
    wait(5)
    sleep(2)
    print('Go to Website')

    #  3 - Accept the cookie-consent banner
    ID = 'ccmgt_explicit_accept'
    click_bann_byID(ID)
    print('Banned')

    # 4 -  Take Infos from Page
    # 4.1 - CSS classes for header, publish time, company and city
    H, P, C, O = ('res-29pyh9', 'res-rf8k2x', 'res-hbyqhf', 'res-1wf9en7')
    list_header, list_publish, list_company, list_ort = find_elements_HPCO(H,P,C,O)
    print('h pc o')

    # 4.2 - Description elements
    description = driver.find_elements(By.CLASS_NAME, 'res-17md5or')

    # 4.3 - Links to the individual job postings
    list_link01 = driver.find_elements(By.CLASS_NAME, 'res-1dwe62q')
    list_link = [anchor.get_attribute('href') for anchor in list_link01]

    # 4.4 - Text of each description
    list_description = [des.text for des in description]
    print('Header',len(list_header), 'Publish',len(list_publish), 'Company',len(list_company[1:]), 'Ort',len(list_ort), 'Desc', len(list_description), 'Link',len(list_link))

    # 4.6 - DataFrame df
    # from_dict(orient='index') followed by .T tolerates columns of different
    # length (shorter ones are padded with missing values).
    # NOTE(review): list_company[1:] drops the first matched element,
    # presumably a non-result match of the company class - confirm on the page.
    d = dict(job_title=np.array(list_header), publish=np.array(list_publish), company=np.array(list_company[1:]), city=np.array(list_ort) , description=np.array(list_description), link=np.array(list_link))
    df = pd.DataFrame.from_dict(d, orient='index')
    df = df.T

    # 4.7 Repeat Process for every Web Page
    while page_number < number_of_page:
        page_number += 1

        # 4.7.1 - Go to the next page. The sort parameters are the same as on
        # page 1; without them the later pages come back in a different order
        # than the first one, so postings can repeat or be skipped.
        link = f'https://www.stepstone.de/jobs/{job_link}/in-{ort_link}?radius={radius}&page={page_number}&sort=2&action=sort_publish'
        driver.get(link)
        wait(5)
        sleep(1.5)

        # 4.7.2 - Find the elements and get the Texts
        list_header, list_publish, list_company, list_ort = find_elements_HPCO(H,P,C,O)
        description = driver.find_elements(By.CLASS_NAME, 'res-17md5or')
        list_description = [des.text for des in description]
        list_link01 = driver.find_elements(By.CLASS_NAME, 'res-1dwe62q')
        list_link = [anchor.get_attribute('href') for anchor in list_link01]

        # 4.7.3 - Create new page Dataframe
        d = dict(job_title=np.array(list_header), publish=np.array(list_publish), company=np.array(list_company[1:]), city=np.array(list_ort) , description=np.array(list_description), link=np.array(list_link))
        df2 = pd.DataFrame.from_dict(d, orient='index')
        df2 = df2.T

        # 4.7.4 - Concatenate the DataFrames
        df = pd.concat([df,df2], axis=0, ignore_index=True)
        print(f'Page Number : {page_number}, DataFrame Shape : {df2.shape}')

    # 5.1 - Save Data as csv
    print(f'DataFrame End : {df.shape}')
    df['website'] = website_name
    # The 'date' column carries a full timestamp; the file name only the day.
    time_ = datetime.today().strftime('%Y-%m-%d %H:%M:%S')
    df['date'] = time_
    job_name2 = job_name.replace(' ', '_')
    df['search_title'] = job_name2

    path = '/Users/macbook/Desktop/projects/Github_Repositories/Portfolio Projects/02 - Web_Scraping_Job_Search/data'
    job_name3 = job_name.replace(' ', '-')
    time_ = datetime.today().strftime('%Y-%m-%d')
    df.to_csv(f'{path}/{job_name3}-{website_name}-{time_}.csv', index=False)

    # 6 - Report the run time, then let the finally clause close the browser
    end = datetime.now()
    print('Code Runned No Problem')
    print(f'Time = {end - start}')
    sleep(5)
finally:
    driver.quit()


---------------------- StepStone Job Searching Selenium Project ----------------------
Create Driver
Go to Website
Banned
h pc o
Header 25 Publish 25 Company 24 Ort 25 Desc 25 Link 25
Page Number : 2, DataFrame Shape : (25, 6)
Page Number : 3, DataFrame Shape : (25, 6)
Page Number : 4, DataFrame Shape : (25, 6)
Page Number : 5, DataFrame Shape : (25, 6)
Page Number : 6, DataFrame Shape : (25, 6)
DataFrame End : (150, 6)
Code Runned No Problem
Time = 0:01:04.508044


In [19]:
df.head()

Unnamed: 0,job_title,publish,company,city,description,link,website,date,search_title
0,Junior Data Engineer (m/w/d),vor 1 Tag,Amprion GmbH,Halle,Zur Unterstützung unserer Bereiche Anwendungsb...,https://www.stepstone.de/stellenangebote--Juni...,stepstone,2023-03-10 10:38:15,Data_Engineer
1,Data Expert (m/w/d),vor 2 Tagen,Ratiodata SE,Dortmund,Abhängig von der Rolle verstärken Sie unser Te...,https://www.stepstone.de/stellenangebote--Data...,stepstone,2023-03-10 10:38:15,Data_Engineer
2,Client Engineer (m/w/d),vor 2 Tagen,FALKE KGaA,"Hannover (Wedemark), Münster, Duisburg",Ratiodata SE - Die Ratiodata SE zählt zu den g...,https://www.stepstone.de/stellenangebote--Clie...,stepstone,2023-03-10 10:38:15,Data_Engineer
3,Data Engineer DWH / BI (m/w/d),vor 5 Tagen,DACHSER SE,Schmallenberg,Verstärken Sie das Team unseres Hauptsitzes am...,https://www.stepstone.de/stellenangebote--Data...,stepstone,2023-03-10 10:38:15,Data_Engineer
4,Data Integration Engineer (m/w/d) EDI,vor 6 Tagen,BabyOne Franchise- und Systemzentrale GmbH,"Berlin, Hamburg, München, Köln, Frankfurt a. M...","Berlin, Hamburg, München, Köln, Frankfurt a. M...",https://www.stepstone.de/stellenangebote--Data...,stepstone,2023-03-10 10:38:15,Data_Engineer


In [15]:
list_link

[]

## 03 - LINKEDIN

In [10]:
import os

print('---------------------- Linkedin Job Searching Selenium Project ----------------------')

job_name = 'Business Analyst'
start = datetime.now()
# 0 - Example LinkedIn search URL, kept only as a reference for the link format
link_original = 'https://www.linkedin.com/jobs/search/?currentJobId=3199974140&distance=25&keywords=data%20analyst&location=Rietberg'

website_name = 'linkedin'
radius = 40
page_number = 1

# Values this cell used to pick up from whichever cell ran before it. They are
# defined here so the CSV is named after THIS search and the cell can run on
# its own.
ort_ = 'Rietberg'
path = '/Users/macbook/Desktop/projects/Github_Repositories/Portfolio Projects/02 - Web_Scraping_Job_Search/data'
time_ = datetime.today().strftime('%Y-%m-%d')
job_name2 = job_name.replace(' ', '_')
job_name3 = job_name.replace(' ', '-')
# No descriptions are scraped from the LinkedIn result list; an empty list
# keeps descriptions from another site out of this frame.
list_description = []

#  1 - Create Driver
Path = '/Users/macbook/Desktop/projects/Github_Repositories/Portfolio Projects/02 - Web_Scraping_Job_Search/chromedriver'
driver = webdriver.Chrome(Path)

# try/finally: close the browser even if a lookup or page load raises.
try:
    #  2 - Go to Website
    job_link = job_name.replace(' ', '%20').lower()

    link2 = f'https://www.linkedin.com/jobs/search/?distance=25&keywords={job_link}&location={ort_}'
    driver.get(link2)
    wait(10)
    sleep(2)

    #  3 - Accept the consent banner
    # NOTE(review): this clicks the first <button> on the page, presumably the
    # consent button - confirm it is still the first one.
    actions = ActionChains(driver)
    akzeptieren = driver.find_element(By.TAG_NAME, 'button')
    actions.click(akzeptieren).perform()
    wait(10)
    sleep(0.5)

    # 3.2 - Scroll down four rounds
    scroll_down(4)

    # 4 -  Take Infos from Page
    # 4.1 - CSS classes for header, publish date, company and city
    H, P, C, O = 'base-search-card__title', 'job-search-card__listdate', 'hidden-nested-link', 'job-search-card__location'
    list_header, list_publish, list_company, list_ort = find_elements_HPCO(H,P,C,O)

    # 4.2 - Link Lists
    links = driver.find_elements(By.CLASS_NAME, 'base-card__full-link')
    list_link = [anchor.get_attribute('href') for anchor in links]

    print('Header',len(list_header), 'Publish',len(list_publish), 'Company',len(list_company), 'Ort',len(list_ort), 'Desc', len(list_description), 'Link',len(list_link))

    # 4.3 - Result summary text shown above the list
    result = driver.find_elements(By.CLASS_NAME, 'results-context-header__context')
    list_result = [res.text for res in result]
    print(f'Number of Jobs Pages = {list_result}')

    # 4.4 - DataFrame df (columns of unequal length are padded with missing values)
    d = dict(job_title=np.array(list_header), publish=np.array(list_publish), company=np.array(list_company), city=np.array(list_ort) , description=np.array(list_description), link=np.array(list_link))
    df = pd.DataFrame.from_dict(d, orient='index')
    df = df.T
    df['description'] = None
    df['website'] = website_name
    df['date'] = time_
    df['search_title'] = job_name2

    # 5.1 - Save Data as csv
    print(f'DataFrame End : {df.shape}')
    # Strip the region/country suffix from the city. The literal is the
    # Turkish-locale text seen in the captured output of this cell.
    df['city'] = df['city'].str.replace(', Kuzey Ren-Vestfalya, Almanya', '', regex=False)
    # Append to the per-search file; write the header row only when the file
    # does not exist yet so the CSV never ends up without column names.
    csv_file = f'{path}/{job_name3}-{time_}.csv'
    df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

    # 4.5 - Report the run time, then let the finally clause close the browser
    end = datetime.now()
    print('Code Runned No Problem')
    print(f'Time = {end - start}')
    sleep(5)
finally:
    driver.quit()

df.head()

---------------------- Linkedin Job Searching Selenium Project ----------------------
Header 48 Publish 48 Company 43 Ort 48 Desc 0 Link 43
Number of Jobs Pages = ['Rietberg, Kuzey Ren-Vestfalya, Almanya konumunda 48 Business Analyst iş ilanı']
DataFrame End : (48, 9)
Code Runned No Problem
Time = 0:00:39.972423


Unnamed: 0,job_title,publish,company,city,description,link,website,date,search_title
0,Business Analyst (M/W/D),2 gün önce,Positive Thinking Company,Bielefeld,,https://de.linkedin.com/jobs/view/business-ana...,linkedin,2023-01-23,Business_Analyst
1,Business Analyst (m/w/d) Sales,6 gün önce,Würth Industrie Service GmbH & Co. KG,Bielefeld,,https://de.linkedin.com/jobs/view/business-ana...,linkedin,2023-01-23,Business_Analyst
2,Business Analyst (m/w/d) Sales,6 gün önce,Würth Industrie Service GmbH & Co. KG,Paderborn,,https://de.linkedin.com/jobs/view/business-ana...,linkedin,2023-01-23,Business_Analyst
3,"Business Analyst agree21 , Junior oder Profi (...",2 ay önce,Beckmann & Partner CONSULT,Bielefeld,,https://de.linkedin.com/jobs/view/business-ana...,linkedin,2023-01-23,Business_Analyst
4,Product Owner / Business Analyst (m/w/d) I E-C...,2 ay önce,Arvato Systems,Bielefeld,,https://de.linkedin.com/jobs/view/product-owne...,linkedin,2023-01-23,Business_Analyst


# Stepstone, Jobware and Linkedin 

In [8]:
'''
Title : Web Scrapping by Selenium 
Project Purpose: From StepStone scrap data for some Job Titels
1 - Create Driver
2 - Go to Website
3 - Create ActionChain Object
    3.1 - Click Banned 
4 - Take Title and Infos from Page
    4.1 - Create Lists 
    4.2 - Create DataFrame
    4.3 - Repeat Process
    4.4 - Print and Save DataFrame
'''

job_list = ['Business Analyst', 'Data Scientist'] #, 'Data Analyst'
start_01 = datetime.now()

# Paths and search location shared by all three scrapers below
Path = '/Users/macbook/Desktop/projects/Github_Repositories/Portfolio Projects/02 - Web_Scraping_Job_Search/chromedriver'
path = '/Users/macbook/Desktop/projects/Github_Repositories/Portfolio Projects/02 - Web_Scraping_Job_Search/data'
ort_ = 'Rietberg'

# enumerate(..., start=1) gives the same 1-based counter the manual
# "n += 1 / job_list[n-1]" bookkeeping produced.
for n, job_name in enumerate(job_list, start=1):
    # Per-job values for the CSV name and the bookkeeping columns. All three
    # sites write into the same file: StepStone creates it, the others append.
    time_ = datetime.today().strftime('%Y-%m-%d')
    job_name2 = job_name.replace(' ', '_')
    job_name3 = job_name.replace(' ', '-')

    print('---------------------- StepStone Job Searching Selenium Project ----------------------')
    start = datetime.now()
    # Example StepStone result URL, for reference only
    link_original_stepstone = 'https://www.stepstone.de/jobs/data-analyst/in-rietberg?radius=50&page=2'

    website_name = 'stepstone'
    print(n, job_name)
    radius = 50
    page_number = 1

    #  1 - Create Driver
    driver = webdriver.Chrome(Path)
    # try/finally: close the browser even if a lookup or page load raises.
    try:
        #  2 - Go to Website
        job_link = job_name.replace(' ', '-').lower()
        ort_link = ort_.lower()
        link = f'https://www.stepstone.de/jobs/{job_link}/in-{ort_link}?radius={radius}&page={page_number}'

        driver.get(link)
        wait(10)
        sleep(2)

        #  3 - Accept the cookie-consent banner
        ID = 'ccmgt_explicit_accept'
        click_bann_byID(ID)

        # 4 -  Take Infos from Page
        # 4.1 - CSS classes for header, publish time, company and city
        H, P, C, O = 'resultlist-1uvdp0v', 'resultlist-w7zbt7', 'resultlist-1va1dj8', 'resultlist-suri3e'
        list_header, list_publish, list_company, list_ort = find_elements_HPCO(H,P,C,O)

        # 4.2 - Description and pagination elements
        description = driver.find_elements(By.CLASS_NAME, 'resultlist-1fp8oay')
        result = driver.find_elements(By.CLASS_NAME, 'resultlist-1jx3vjx')

        # 4.3 - Links are read from the header elements themselves
        header = driver.find_elements(By.CLASS_NAME, H)
        list_link = [el.get_attribute('href') for el in header]

        # 4.4 - Text of each description
        list_description = [des.text for des in description]
        print('Header',len(list_header), 'Publish',len(list_publish), 'Company',len(list_company[1:]), 'Ort',len(list_ort), 'Desc', len(list_description), 'Link',len(list_link))

        # 4.5 - Total number of result pages, read from the second-to-last
        # pagination text. When there is no pagination (or the text is not a
        # number) only the first page is scraped instead of crashing.
        list_result = [res.text for res in result]
        try:
            number_of_page = int(list_result[-2])
        except (IndexError, ValueError):
            number_of_page = 1
        print(f'Number of Jobs Pages = {number_of_page}')

        # 4.6 - DataFrame df (columns of unequal length are padded with missing values)
        # NOTE(review): list_company[1:] drops the first matched element,
        # presumably a non-result match of the company class - confirm.
        d = dict(job_title=np.array(list_header), publish=np.array(list_publish), company=np.array(list_company[1:]), city=np.array(list_ort) , description=np.array(list_description), link=np.array(list_link))
        df = pd.DataFrame.from_dict(d, orient='index')
        df = df.T

        # 4.7 Repeat Process for every Web Page
        while page_number < number_of_page:
            page_number += 1

            # 4.7.1 - Go to another page
            link = f'https://www.stepstone.de/jobs/{job_link}/in-{ort_link}?radius={radius}&page={page_number}'
            driver.get(link)
            wait(10)
            sleep(1.5)

            # 4.7.2 - Find the elements and get the Texts
            list_header, list_publish, list_company, list_ort = find_elements_HPCO(H,P,C,O)
            # NOTE(review): the description class differs from the one used on
            # page 1 ('resultlist-1fp8oay') - confirm this is intended.
            description = driver.find_elements(By.CLASS_NAME, 'resultlist-1pq4x2u')
            list_description = [des.text for des in description]
            header = driver.find_elements(By.CLASS_NAME, H)
            list_link = [el.get_attribute('href') for el in header]

            # 4.7.3 - Create new page Dataframe
            d = dict(job_title=np.array(list_header), publish=np.array(list_publish), company=np.array(list_company[1:]), city=np.array(list_ort) , description=np.array(list_description), link=np.array(list_link))
            df2 = pd.DataFrame.from_dict(d, orient='index')
            df2 = df2.T

            # 4.7.4 - Concatenate the DataFrames
            df = pd.concat([df,df2], axis=0, ignore_index=True)
            print(f'Page Number : {page_number}, DataFrame Shape : {df2.shape}')

        # 5.1 - Save Data as csv (creates/overwrites the per-job file with a header)
        print(f'DataFrame End : {df.shape}')
        df['website'] = website_name
        df['date'] = time_
        df['search_title'] = job_name2
        df.to_csv(f'{path}/{job_name3}-{time_}.csv', index=False)

        # 6 - Report the run time
        end = datetime.now()
        print('Code Runned No Problem')
        print(f'Time = {end - start}')
        sleep(1)
    finally:
        driver.quit()

    print('---------------------- Jobware Job Searching Selenium Project ----------------------')

    start = datetime.now()
    # 0 - Example Jobware search URL, for reference only
    link_original = 'https://www.jobware.de/jobsuche?jw_jobname=data%20analyst&jw_jobort=333**%20Rietberg&jw_ort_distance=50'

    website_name = 'jobware'
    radius = 50
    page_number = 0

    #  1 - Create Driver
    driver = webdriver.Chrome(Path)
    try:
        #  2 - Go to Website
        job_link = job_name.replace(' ', '%20').lower()
        ort_link = ort_.capitalize()
        link = f'https://www.jobware.de/jobsuche?jw_jobname={job_link}&jw_jobort=333**%20{ort_link}&jw_ort_distance={radius}'

        driver.get(link)
        wait(10)
        sleep(2)

        #  3 - Accept the consent banner (located by absolute XPath)
        actions = ActionChains(driver)
        akzeptieren = driver.find_element(By.XPATH, '/html/body/div[1]/div/div[3]/div[2]/button')
        actions.click(akzeptieren).perform()
        wait(10)
        sleep(0.5)

        # 4 -  Take Infos from Page
        # 4.1 - CSS classes for header, publish date, company and location
        H, P, C, O = 'h2', 'date', 'company', 'location'
        list_header, list_publish, list_company, list_ort = find_elements_HPCO(H,P,C,O)
        description = driver.find_elements(By.CLASS_NAME, 'task')
        list_description = [des.text for des in description]

        links = driver.find_elements(By.CLASS_NAME, 'job')
        list_link = [el.get_attribute('href') for el in links]

        print('Header',len(list_header), 'Publish',len(list_publish), 'Company',len(list_company), 'Ort',len(list_ort), 'Desc', len(list_description), 'Link',len(list_link))

        # 4.2 - Text of the result/sort header
        result = driver.find_elements(By.CLASS_NAME, 'result-sort')
        list_result = [res.text for res in result]
        print(list_result)

        # 4.3 - DataFrame df
        d = dict(job_title=np.array(list_header), publish=np.array(list_publish), company=np.array(list_company), city=np.array(list_ort) , description=np.array(list_description), link=np.array(list_link))
        df = pd.DataFrame.from_dict(d, orient='index')
        df = df.T

        # 5.1 - Append to the file StepStone created for this job (no second header)
        print(f'DataFrame End : {df.shape}')
        df['website'] = website_name
        df['date'] = time_
        df['search_title'] = job_name2
        df.to_csv(f'{path}/{job_name3}-{time_}.csv', mode='a', index=False, header=False)

        # 6.1 - Report the run time
        end = datetime.now()
        print('Code Runned No Problem')
        print(f'Time = {end - start}')
        sleep(5)
    finally:
        driver.quit()

    print('---------------------- Linkedin Job Searching Selenium Project ----------------------')

    start = datetime.now()
    # 0 - Example LinkedIn search URL, for reference only
    link_original = 'https://www.linkedin.com/jobs/search/?currentJobId=3199974140&distance=25&keywords=data%20analyst&location=Rietberg'

    website_name = 'linkedin'
    radius = 40
    page_number = 1
    # No descriptions are scraped from the LinkedIn result list. Resetting the
    # list keeps the Jobware descriptions out of the LinkedIn frame, where
    # they could otherwise add rows that belong to no LinkedIn posting.
    list_description = []

    #  1 - Create Driver
    driver = webdriver.Chrome(Path)
    try:
        #  2 - Go to Website
        job_link = job_name.replace(' ', '%20').lower()

        link2 = f'https://www.linkedin.com/jobs/search/?distance=25&keywords={job_link}&location={ort_}'
        driver.get(link2)
        wait(10)
        sleep(2)

        #  3 - Accept the consent banner
        # NOTE(review): clicks the first <button> on the page, presumably the
        # consent button - confirm.
        actions = ActionChains(driver)
        akzeptieren = driver.find_element(By.TAG_NAME, 'button')
        actions.click(akzeptieren).perform()
        wait(10)
        sleep(0.5)

        # 3.2 - Scroll down four rounds
        scroll_down(4)

        # 4 -  Take Infos from Page
        # 4.1 - CSS classes for header, publish date, company and city
        H, P, C, O = 'base-search-card__title', 'job-search-card__listdate', 'hidden-nested-link', 'job-search-card__location'
        list_header, list_publish, list_company, list_ort = find_elements_HPCO(H,P,C,O)

        # 4.2 - Link Lists
        links = driver.find_elements(By.CLASS_NAME, 'base-card__full-link')
        list_link = [el.get_attribute('href') for el in links]

        print('Header',len(list_header), 'Publish',len(list_publish), 'Company',len(list_company), 'Ort',len(list_ort), 'Desc', len(list_description), 'Link',len(list_link))

        # 4.3 - Result summary text shown above the list
        result = driver.find_elements(By.CLASS_NAME, 'results-context-header__context')
        list_result = [res.text for res in result]
        print(f'Number of Jobs Pages = {list_result}')

        # 4.4 - DataFrame df
        d = dict(job_title=np.array(list_header), publish=np.array(list_publish), company=np.array(list_company), city=np.array(list_ort) , description=np.array(list_description), link=np.array(list_link))
        df = pd.DataFrame.from_dict(d, orient='index')
        df = df.T
        df['description'] = None
        df['website'] = website_name
        df['date'] = time_
        df['search_title'] = job_name2

        # 5.1 - Save Data as csv
        print(f'DataFrame End : {df.shape}')
        # Strip the region/country suffix (Turkish-locale text) from the city.
        df.loc[df.website =='linkedin', 'city'] = df.loc[df.website =='linkedin', 'city'].str.replace(', Kuzey Ren-Vestfalya, Almanya', '')
        df.to_csv(f'{path}/{job_name3}-{time_}.csv', mode='a', index=False, header=False)

        # 4.5 - Report the run time
        end = datetime.now()
        print('Code Runned No Problem')
        print(f'Time = {end - start}')
        sleep(5)
    finally:
        driver.quit()

end_01 = datetime.now()
print(f'Total Time: {end_01 - start_01}')

---------------------- StepStone Job Searching Selenium Project ----------------------
1 Business Analyst
Header 25 Publish 25 Company 25 Ort 25 Desc 25 Link 25
Number of Jobs Pages = 3
Page Number : 2, DataFrame Shape : (25, 6)
Page Number : 3, DataFrame Shape : (8, 6)
DataFrame End : (58, 6)
Code Runned No Problem
Time = 0:01:06.876843
---------------------- Jobware Job Searching Selenium Project ----------------------


NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=109.0.5414.87)
Stacktrace:
0   chromedriver                        0x0000000105b937a8 chromedriver + 4896680
1   chromedriver                        0x0000000105b12db3 chromedriver + 4369843
2   chromedriver                        0x000000010575ca37 chromedriver + 477751
3   chromedriver                        0x0000000105731fb5 chromedriver + 303029
4   chromedriver                        0x00000001057cc77f chromedriver + 935807
5   chromedriver                        0x00000001057e1923 chromedriver + 1022243
6   chromedriver                        0x00000001057c7233 chromedriver + 913971
7   chromedriver                        0x000000010579145c chromedriver + 693340
8   chromedriver                        0x00000001057929de chromedriver + 698846
9   chromedriver                        0x0000000105b61ace chromedriver + 4692686
10  chromedriver                        0x0000000105b65da1 chromedriver + 4709793
11  chromedriver                        0x0000000105b665aa chromedriver + 4711850
12  chromedriver                        0x0000000105b6d62f chromedriver + 4740655
13  chromedriver                        0x0000000105b66caa chromedriver + 4713642
14  chromedriver                        0x0000000105b3a992 chromedriver + 4532626
15  chromedriver                        0x0000000105b853c8 chromedriver + 4838344
16  chromedriver                        0x0000000105b85545 chromedriver + 4838725
17  chromedriver                        0x0000000105b9b6ef chromedriver + 4929263
18  libsystem_pthread.dylib             0x00007fff204ff950 _pthread_start + 224
19  libsystem_pthread.dylib             0x00007fff204fb47b thread_start + 15


# Stepstone , Linkedin

In [15]:
'''
Title : Web Scrapping by Selenium 
Project Purpose: From StepStone scrap data for some Job Titels
1 - Create Driver
2 - Go to Website
3 - Create ActionChain Object
    3.1 - Click Banned 
4 - Take Title and Infos from Page
    4.1 - Create Lists 
    4.2 - Create DataFrame
    4.3 - Repeat Process
    4.4 - Print and Save DataFrame
'''

import os

job_list = ['Business Analyst', 'Data Scientist'] #, 'Data Analyst'
start_01 = datetime.now()

# Paths and search location shared by both scrapers below
Path = '/Users/macbook/Desktop/projects/Github_Repositories/Portfolio Projects/02 - Web_Scraping_Job_Search/chromedriver'
path = '/Users/macbook/Desktop/projects/Github_Repositories/Portfolio Projects/02 - Web_Scraping_Job_Search/data'
ort_ = 'Rietberg'

# enumerate(..., start=1) gives the same 1-based counter the manual
# "n += 1 / job_list[n-1]" bookkeeping produced.
for n, job_name in enumerate(job_list, start=1):
    # Per-job values for the CSV names and the bookkeeping columns
    time_ = datetime.today().strftime('%Y-%m-%d')
    job_name2 = job_name.replace(' ', '_')
    job_name3 = job_name.replace(' ', '-')

    print('---------------------- StepStone Job Searching Selenium Project ----------------------')
    start = datetime.now()
    # Example StepStone result URL, for reference only
    link_original_stepstone = 'https://www.stepstone.de/jobs/data-analyst/in-rietberg?radius=50&page=2'

    website_name = 'stepstone'
    print(n, job_name)
    radius = 50
    page_number = 1

    #  1 - Create Driver
    driver = webdriver.Chrome(Path)
    # try/finally: close the browser even if a lookup or page load raises.
    try:
        #  2 - Go to Website
        job_link = job_name.replace(' ', '-').lower()
        ort_link = ort_.lower()
        link = f'https://www.stepstone.de/jobs/{job_link}/in-{ort_link}?radius={radius}&page={page_number}'

        driver.get(link)
        wait(10)
        sleep(2)

        #  3 - Accept the cookie-consent banner
        ID = 'ccmgt_explicit_accept'
        click_bann_byID(ID)

        # 4 -  Take Infos from Page
        # 4.1 - CSS classes for header, publish time, company and city
        H, P, C, O = 'resultlist-1uvdp0v', 'resultlist-w7zbt7', 'resultlist-1va1dj8', 'resultlist-suri3e'
        list_header, list_publish, list_company, list_ort = find_elements_HPCO(H,P,C,O)

        # 4.2 - Description and pagination elements
        description = driver.find_elements(By.CLASS_NAME, 'resultlist-1fp8oay')
        result = driver.find_elements(By.CLASS_NAME, 'resultlist-1jx3vjx')

        # 4.3 - Links are read from the header elements themselves
        header = driver.find_elements(By.CLASS_NAME, H)
        list_link = [el.get_attribute('href') for el in header]

        # 4.4 - Text of each description
        list_description = [des.text for des in description]
        print('Header',len(list_header), 'Publish',len(list_publish), 'Company',len(list_company[1:]), 'Ort',len(list_ort), 'Desc', len(list_description), 'Link',len(list_link))

        # 4.5 - Total number of result pages, read from the second-to-last
        # pagination text. When there is no pagination (or the text is not a
        # number) only the first page is scraped instead of crashing.
        list_result = [res.text for res in result]
        try:
            number_of_page = int(list_result[-2])
        except (IndexError, ValueError):
            number_of_page = 1
        print(f'Number of Jobs Pages = {number_of_page}')

        # 4.6 - DataFrame df (columns of unequal length are padded with missing values)
        # NOTE(review): list_company[1:] drops the first matched element,
        # presumably a non-result match of the company class - confirm.
        d = dict(job_title=np.array(list_header), publish=np.array(list_publish), company=np.array(list_company[1:]), city=np.array(list_ort) , description=np.array(list_description), link=np.array(list_link))
        df = pd.DataFrame.from_dict(d, orient='index')
        df = df.T

        # 4.7 Repeat Process for every Web Page
        while page_number < number_of_page:
            page_number += 1

            # 4.7.1 - Go to another page
            link = f'https://www.stepstone.de/jobs/{job_link}/in-{ort_link}?radius={radius}&page={page_number}'
            driver.get(link)
            wait(10)
            sleep(0.5)

            # 4.7.2 - Find the elements and get the Texts
            list_header, list_publish, list_company, list_ort = find_elements_HPCO(H,P,C,O)
            # NOTE(review): the description class differs from the one used on
            # page 1 ('resultlist-1fp8oay') - confirm this is intended.
            description = driver.find_elements(By.CLASS_NAME, 'resultlist-1pq4x2u')
            list_description = [des.text for des in description]
            header = driver.find_elements(By.CLASS_NAME, H)
            list_link = [el.get_attribute('href') for el in header]

            # 4.7.3 - Create new page Dataframe
            d = dict(job_title=np.array(list_header), publish=np.array(list_publish), company=np.array(list_company[1:]), city=np.array(list_ort) , description=np.array(list_description), link=np.array(list_link))
            df2 = pd.DataFrame.from_dict(d, orient='index')
            df2 = df2.T

            # 4.7.4 - Concatenate the DataFrames
            df = pd.concat([df,df2], axis=0, ignore_index=True)
            print(f'Page Number : {page_number}, DataFrame Shape : {df2.shape}')

        # 5.1 - Save Data as csv
        print(f'DataFrame End : {df.shape}')
        df['website'] = website_name
        df['date'] = time_
        df['search_title'] = job_name2
        df.to_csv(f'{path}/{job_name3}-{time_}.csv', index=False)

        # 6 - Report the run time
        end = datetime.now()
        print('Code Runned No Problem')
        print(f'Time = {end - start}')
        sleep(1)
    finally:
        driver.quit()

    print('---------------------- Linkedin Job Searching Selenium Project ----------------------')

    start = datetime.now()
    # 0 - Example LinkedIn search URL, for reference only
    link_original = 'https://www.linkedin.com/jobs/search/?currentJobId=3199974140&distance=25&keywords=data%20analyst&location=Rietberg'

    website_name = 'linkedin'
    radius = 40
    page_number = 1
    # No descriptions are scraped from the LinkedIn result list. Resetting the
    # list keeps the StepStone descriptions out of the LinkedIn frame, where
    # they could otherwise add rows that belong to no LinkedIn posting.
    list_description = []

    #  1 - Create Driver
    driver = webdriver.Chrome(Path)
    try:
        #  2 - Go to Website
        job_link = job_name.replace(' ', '%20').lower()

        link2 = f'https://www.linkedin.com/jobs/search/?distance=25&keywords={job_link}&location={ort_}'
        driver.get(link2)
        wait(10)
        sleep(2)

        #  3 - Accept the consent banner
        # NOTE(review): clicks the first <button> on the page, presumably the
        # consent button - confirm.
        actions = ActionChains(driver)
        akzeptieren = driver.find_element(By.TAG_NAME, 'button')
        actions.click(akzeptieren).perform()
        wait(10)
        sleep(0.5)

        # 3.2 - Scroll down four rounds
        scroll_down(4)

        # 4 -  Take Infos from Page
        # 4.1 - CSS classes for header, publish date, company and city
        H, P, C, O = 'base-search-card__title', 'job-search-card__listdate', 'hidden-nested-link', 'job-search-card__location'
        list_header, list_publish, list_company, list_ort = find_elements_HPCO(H,P,C,O)

        # 4.2 - Link Lists
        links = driver.find_elements(By.CLASS_NAME, 'base-card__full-link')
        list_link = [el.get_attribute('href') for el in links]

        print('Header',len(list_header), 'Publish',len(list_publish), 'Company',len(list_company), 'Ort',len(list_ort), 'Desc', len(list_description), 'Link',len(list_link))

        # 4.3 - Result summary text shown above the list
        result = driver.find_elements(By.CLASS_NAME, 'results-context-header__context')
        list_result = [res.text for res in result]
        print(f'Number of Jobs Pages = {list_result}')

        # 4.4 - DataFrame df
        d = dict(job_title=np.array(list_header), publish=np.array(list_publish), company=np.array(list_company), city=np.array(list_ort) , description=np.array(list_description), link=np.array(list_link))
        df = pd.DataFrame.from_dict(d, orient='index')
        df = df.T
        df['description'] = None
        df['website'] = website_name
        df['date'] = time_
        df['search_title'] = job_name2

        # 5.1 - Save Data as csv
        print(f'DataFrame End : {df.shape}')
        # Strip the region/country suffix (Turkish-locale text) from the city.
        df.loc[df.website =='linkedin', 'city'] = df.loc[df.website =='linkedin', 'city'].str.replace(', Kuzey Ren-Vestfalya, Almanya', '')
        # The LinkedIn rows go to their own per-website file in append mode.
        # Write the header row when that file does not exist yet, so a fresh
        # file is not left without column names.
        csv_file = f'{path}/{job_name3}-{website_name}-{time_}.csv'
        df.to_csv(csv_file, mode='a', index=False, header=not os.path.exists(csv_file))

        # 4.5 - Report the run time
        end = datetime.now()
        print('Code Runned No Problem')
        print(f'Time = {end - start}')
        sleep(5)
    finally:
        driver.quit()

end_01 = datetime.now()
print(f'Total Time: {end_01 - start_01}')

---------------------- StepStone Job Searching Selenium Project ----------------------
1 Business Analyst
Header 25 Publish 25 Company 25 Ort 25 Desc 25 Link 25
Number of Jobs Pages = 3
Page Number : 2, DataFrame Shape : (25, 6)
Page Number : 3, DataFrame Shape : (6, 6)
DataFrame End : (56, 6)
Code Runned No Problem
Time = 0:01:02.657421
---------------------- Linkedin Job Searching Selenium Project ----------------------
Header 25 Publish 25 Company 23 Ort 25 Desc 0 Link 23
Number of Jobs Pages = ['Rietberg, Kuzey Ren-Vestfalya, Almanya konumunda 47 Business Analyst iş ilanı']
DataFrame End : (25, 9)
Code Runned No Problem
Time = 0:00:37.934671
---------------------- StepStone Job Searching Selenium Project ----------------------
2 Data Scientist
Header 25 Publish 25 Company 25 Ort 25 Desc 25 Link 25
Number of Jobs Pages = 2
Page Number : 2, DataFrame Shape : (1, 6)
DataFrame End : (26, 6)
Code Runned No Problem
Time = 0:00:44.833504
---------------------- Linkedin Job Searching Selen

In [None]:
df.shape

In [14]:
# Print every job title together with its 1-based position in the list.
job_list = ['Business Analyst', 'Data Scientist', 'Data Analyst' ]
n = 0

# Iterate over the list itself instead of a hard-coded count of 3, so the
# loop stays correct when titles are added to or removed from job_list.
for n, job_name in enumerate(job_list, start=1):
    print(n)
    print(job_name)

1
Business Analyst
2
Data Scientist
3
Data Analyst


In [1]:
pwd

'/Users/macbook/Desktop/projects/Github_Repositories/Portfolio Projects/02 - Web_Scraping_Job_Search'

# Links with searching status

In [None]:
https://www.stepstone.de/jobs/data-engineer/in-rietberg?radius=100

https://www.stepstone.de/jobs/data-engineer/in-rietberg?radius=100&page=1&sort=2&action=sort_publish

https://www.stepstone.de/jobs/data-engineer/in-rietberg?radius=100&page=2&sort=2&action=sort_publish

https://www.stepstone.de/jobs/data-engineer/in-rietberg?radius=100&page=2&sort=1&action=facet_selected%3bage%3bage_7&ag=age_7 # last 7 days

https://www.stepstone.de/jobs/data-engineer/in-rietberg?radius=100&page=1&sort=1&action=facet_selected%3bage%3bage_1&ag=age_1 # last 24 hours


https://www.xing.com/jobs/search?keywords=Data%20Engineer&location=Rietberg&radius=100&sort=relevance 

https://www.xing.com/jobs/search?keywords=Data%20Engineer&location=Rietberg&radius=100&sort=date # newest first

In [3]:
# Find Element Function
def find_element(H):
    """Return the visible text of every element whose CSS class is ``H``.

    Uses the module-level ``driver``; the result is a list of strings, one
    per matching element, in the order ``find_elements`` returns them.
    """
    return [element.text for element in driver.find_elements(By.CLASS_NAME, H)]

# Xing 

In [4]:
# Scrape the first page of Xing job search results, append the rows to a CSV
# file named after the search title, and preview the resulting DataFrame.
# Relies on the notebook helpers wait(), sleep() and click_bann_byID() and on
# the find_element() helper defined in the cell above.
from urllib.parse import quote  # local import: only this cell builds a query string

print('---------------------- Xing Job Searching Selenium Project ----------------------')
start = datetime.now()

# Reference URL copied from the browser; not used by the code below.
link_original_xing = 'https://www.xing.com/jobs/search?keywords=Data%20Engineer&location=Rietberg&page=1&radius=100'

website_name = 'xing'
job_name = 'Data Engineer'
#job_name = 'Data Analyst'
#job_name = 'Data Scientist'
ort_ = 'Rietberg'
radius = 50
page_number = 1

#  1 - Create Driver
# NOTE(review): passing the driver path positionally is the older Selenium
# call style; confirm it matches the installed Selenium version.
Path = '/Users/macbook/Desktop/projects/Github_Repositories/Portfolio Projects/02 - Web_Scraping_Job_Search/chromedriver'
driver = webdriver.Chrome(Path)

# Everything that can fail while the browser is open runs inside try/finally
# so the Chrome process is always shut down, even when scraping raises.
try:
    #  2 - Go to Website
    # job_link / ort_link are kept for the rest of the notebook; the Xing URL
    # below does not use them.
    job_link = job_name.replace(' ', '-').lower()
    ort_link = ort_.lower()
    # Build the query from the settings above so that changing job_name or
    # ort_ really changes the search (the URL used to hard-code both).
    # NOTE(review): the query has always asked for radius=100 although
    # `radius` above is 50; the literal is kept so results do not change.
    # Confirm which value is intended.
    link = (
        'https://www.xing.com/jobs/search'
        f'?keywords={quote(job_name, safe="")}'
        f'&location={quote(ort_, safe="")}'
        f'&page={page_number}&radius=100&sort=date'
    )

    driver.get(link)
    wait(10)
    sleep(2)

    #  3 - Accept the cookie-consent banner
    ID = 'consent-accept-button'
    click_bann_byID(ID)

    # 4 -  Take Infos from Page
    # 4.1 - CSS class names of the elements to read
    H = 'utils-line-clamp-lineClamp2-dfe26aab'
    D = 'list-item-job-teaser-list-item-highlight-bb8ddbb6'
    L = 'list-item-job-teaser-list-item-location-a5b28738'
    ALL = 'list-item-job-teaser-list-item-listItem-f04c772e'

    list_header = find_element(H)
    list_description = find_element(D)
    list_ort = find_element(L)
    list_all = find_element(ALL)

    # 4.2 - Publish time: the second-to-last line of each card's full text
    # (the last line is the "Save job" button in the sampled cards).
    # The third-to-last line is collected too, but it is not always the
    # employment type and is not stored in the DataFrame.
    list_publish = []
    list_full_time = []
    for card_text in list_all:
        lines = card_text.split('\n')
        # A card with fewer lines than expected yields None instead of
        # raising IndexError, which keeps the lists aligned with the cards.
        list_publish.append(lines[-2] if len(lines) >= 2 else None)
        list_full_time.append(lines[-3] if len(lines) >= 3 else None)

    # The header class matches title and company alternately, so even
    # positions are titles and odd positions are companies. Slicing cannot
    # run past the end when the list has an odd length.
    list_title = list_header[0::2]
    list_company = list_header[1::2]

    # 4.3 - Get Links
    Link = 'list-item-job-teaser-list-item-listItem-f04c772e'
    header = driver.find_elements(By.CLASS_NAME, Link)
    list_link = [element.get_attribute('href') for element in header]

    # 4.4 - DataFrame df
    # orient='index' followed by a transpose tolerates columns of unequal
    # length (shorter ones are padded with missing values).
    d = dict(job_title=np.array(list_title), publish=np.array(list_publish), company=np.array(list_company), city=np.array(list_ort), description=np.array(list_description), link=np.array(list_link))
    df = pd.DataFrame.from_dict(d, orient='index')
    df = df.T
    df['website'] = website_name
    time_now = datetime.today().strftime('%Y-%m-%d %H:%M:%S')
    df['date'] = time_now
    df['search_title'] = job_name

    # 5.1 - Save Data as csv (appended without a header row)
    print(f'DataFrame End : {df.shape}')
    path = '/Users/macbook/Desktop/projects/Github_Repositories/Portfolio Projects/02 - Web_Scraping_Job_Search/data'
    df.to_csv(f'{path}/{job_name}.csv', mode='a', index=False, header=False)

    # Element counts per list, useful for spotting misaligned columns
    list_of_list = [list_header, list_description, list_ort, list_publish, list_link]
    print([len(i) for i in list_of_list])

    sleep(2)
finally:
    driver.quit()

print('Finish', time_now)

df.head()

---------------------- Xing Job Searching Selenium Project ----------------------
DataFrame End : (20, 9)
[40, 20, 20, 20, 20]
Finish 2023-03-10 09:04:55


Unnamed: 0,job_title,publish,company,city,description,link,website,date,search_title
0,Senior Software Engineer (m/w/d),Vor 7 Stunden,inserve GmbH,Osnabrück,Engineering und Data Streaming Dein Spielfeld?,https://www.xing.com/jobs/osnabrueck-senior-so...,xing,2023-03-10 09:04:55,Data Engineer
1,"Lead Engineer, Powertrain Verification",Vor 8 Stunden,Vestas Wind Systems A/S,Dortmund,Responsibilities As Lead Engineer - Powertrain...,https://www.xing.com/jobs/dortmund-lead-engine...,xing,2023-03-10 09:04:55,Data Engineer
2,Data Engineer Sports (all genders),Vor 15 Stunden,adesso SE,Dortmund,"Pipelines, Data Ingest und Date Processing.",https://www.xing.com/jobs/dortmund-data-engine...,xing,2023-03-10 09:04:55,Data Engineer
3,Data Architect (all genders),Vor 15 Stunden,adesso SE,Dortmund,Das Competence Center Data Engineering ist ver...,https://www.xing.com/jobs/dortmund-data-archit...,xing,2023-03-10 09:04:55,Data Engineer
4,Data Engineer (all genders),Vor 15 Stunden,adesso SE,Münster,Als Data Engineer bist du Teil eines innovativ...,https://www.xing.com/jobs/muenster-data-engine...,xing,2023-03-10 09:04:55,Data Engineer


# Connect to the PostgreSQL Database

In [7]:
# Store the scraped DataFrame `df` in the PostgreSQL database "JOB" (table
# `dataeng`) and print the companies whose publish text contains "hours".
import psycopg2
#### conda install -c anaconda sqlalchemy
from sqlalchemy import create_engine

# NOTE(review): the database credentials are hard-coded here and again in the
# SQLAlchemy URL below; consider loading them from the environment or a
# config file that is not committed.
conn = psycopg2.connect(database="JOB",
                        user='postgres', password=1984,
                        host='127.0.0.1', port='5432'
)

# With autocommit on, every statement is committed immediately, so no
# explicit conn.commit() is needed.
conn.autocommit = True

engine = None
try:
    cursor = conn.cursor()

    # Create the target table on first use.
    sql = '''CREATE TABLE IF NOT EXISTS dataeng(job_title varchar(300) NOT NULL,\
publish varchar(30),\
company varchar(300),\
city varchar(300),\
description varchar(300),\
link varchar(300),\
website varchar(30),\
date timestamp,\
search_title varchar(20));'''

    cursor.execute(sql)

    # connection string: driver://username:password@server/database
    engine = create_engine('postgresql+psycopg2://postgres:1984@localhost/JOB')

    #  Note:  if_exists can be append, replace, fail.
    df.to_sql('dataeng', engine, if_exists='append', index=False)

    # No parameters are passed to execute(), so the % wildcards in LIKE are
    # sent to the server unchanged.
    sql2 = '''select company from dataeng Where publish LIKE '%hours%' '''
    cursor.execute(sql2)
    for i in cursor.fetchall():
        print(i)
finally:
    # Release database resources even when the insert or a query fails.
    if engine is not None:
        engine.dispose()
    conn.close()

('Brunel',)
('Campusjäger by Workwise',)
('Campusjäger by Workwise',)
('Campusjäger by Workwise',)
('Campusjäger by Workwise',)
('Campusjäger by Workwise',)
('Campusjäger by Workwise',)
('Campusjäger by Workwise',)
('Campusjäger by Workwise',)
('Campusjäger by Workwise',)
('Campusjäger by Workwise',)
('Vesterling AG',)


In [6]:
import sys
!{sys.executable} -m pip install sqlalchemy

Collecting sqlalchemy
  Downloading SQLAlchemy-2.0.5.post1-cp310-cp310-macosx_10_9_x86_64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting typing-extensions>=4.2.0
  Using cached typing_extensions-4.5.0-py3-none-any.whl (27 kB)
Collecting greenlet!=0.4.17
  Downloading greenlet-2.0.2-cp310-cp310-macosx_11_0_x86_64.whl (242 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.1/242.1 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: typing-extensions, greenlet, sqlalchemy
Successfully installed greenlet-2.0.2 sqlalchemy-2.0.5.post1 typing-extensions-4.5.0


In [60]:
list_link[0]

'https://www.xing.com/jobs/dortmund-data-engineer-98044745'

In [50]:
list_link[1]

'Junior Software Developer / Data Engineer (m/w/d)\nDortmund\nMotionMiners GmbH\n€53,500 – €69,500\nFull-time\nYesterday\nSave job'

In [51]:
list_link[2]

'Data Engineer - Data Warehouse / Datenpflege / Home Office (m/w/d)\nBochum\nCampusjäger by Workwise\n4.7\nFull-time\nEngineer - Data Warehouse / Datenpflege / Home Office (m/w/d) klingt vielversprechend?\n20 hours ago\nSave job'

In [62]:
list_link[2].split('\n')[-1]

'https://www.xing.com/jobs/bochum-data-engineer-data-warehouse-datenpflege-home-office-98097563'

In [53]:
list_link[0].split('\n')

['Data Engineer (m/w/d)',
 'Dortmund',
 'BRUDERKOPF GmbH & Co. KG',
 'Full-time',
 'Für ihn suchen wir einen Data Engineer (m/w/d) . Wahlweise ist die Arbeit am Standort unseres Mandanten oder remote möglich.',
 '2 days ago',
 'Save job']

In [88]:
df.link

0     https://www.xing.com/jobs/muenster-senior-data...
1     https://www.xing.com/jobs/dortmund-senior-data...
2     https://www.xing.com/jobs/paderborn-senior-dat...
3     https://www.xing.com/jobs/gelsenkirchen-big-da...
4     https://www.xing.com/jobs/bochum-big-data-deve...
5     https://www.xing.com/jobs/bielefeld-big-data-d...
6     https://www.xing.com/jobs/muenster-big-data-de...
7     https://www.xing.com/jobs/dortmund-big-data-de...
8     https://www.xing.com/jobs/kassel-senior-data-e...
9     https://www.xing.com/jobs/kassel-it-data-engin...
10    https://www.xing.com/jobs/dortmund-architect-d...
11    https://www.xing.com/jobs/muenster-cloud-archi...
12    https://www.xing.com/jobs/paderborn-cloud-arch...
13    https://www.xing.com/jobs/dortmund-cloud-archi...
14    https://www.xing.com/jobs/paderborn-system-eng...
15    https://www.xing.com/jobs/paderborn-sas-admini...
16    https://www.xing.com/jobs/dortmund-sas-adminis...
17    https://www.xing.com/jobs/paderborn-linux-

In [102]:
# Sample 4-field records, printed one tuple per line.
insert_values = [
    (1, 'James', 12000, 'D1'),
    (2, 'Cellim', 20000, 'D1'),
    (3, 'Abdullah', 18000, 'D3'),
]

for record in insert_values:
    print(record)

(1, 'James', 12000, 'D1')
(2, 'Cellim', 20000, 'D1')
(3, 'Abdullah', 18000, 'D3')


In [107]:
# Print each field value of the first DataFrame row on its own line.
first_row = df.iloc[0]
for i in first_row:
    print(i)

Data Engineer (m/w/d)
7 hours ago
Brunel
Osnabrück
Ihre Aufgabe Als Data Engineer entwickeln Sie Data Management Lösungen im Anwendungsspezifischen Bereich unseres Kunden.
https://www.xing.com/jobs/osnabrueck-data-engineer-98150533
xing
2023-03-08-09:00:09
Data Engineer


In [106]:
df.iloc[0]

job_title                                   Data Engineer (m/w/d)
publish                                               7 hours ago
company                                                    Brunel
city                                                    Osnabrück
description     Ihre Aufgabe Als Data Engineer entwickeln Sie ...
link            https://www.xing.com/jobs/osnabrueck-data-engi...
website                                                      xing
date                                          2023-03-08-09:00:09
search_title                                        Data Engineer
Name: 0, dtype: object