# "Web Scraping with Selenium to Find a Job" 

We will go through 3 main tasks to implement our project:

Task 1: Importing libraries.

Task 2: Define functions.

Task 3: Web scraping with Selenium.

# Task 1 : Importing libraries

In [2]:
import time
from datetime import datetime

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import psycopg2
from sqlalchemy import create_engine
import pandas as pd
import numpy as np

import warnings
warnings.simplefilter(action='ignore')

# Task 2 : Define Functions

In [3]:
# Sleep function
def sleep(x):
    """Pause execution for ``x`` seconds by delegating to ``time.sleep``."""
    return time.sleep(x)

# Implicit-wait helper
def wait(x):
    """Pass ``x`` to ``implicitly_wait`` on the module-level ``driver``.

    The call itself does not pause; it only configures the driver.
    Use ``sleep`` when a fixed pause is required.
    """
    timeout = x
    driver.implicitly_wait(timeout)

# Click Function
def click_bann_byID(ID):
    """Click the element whose HTML id is ``ID`` and give the page time to settle.

    The callers in this notebook pass the id of a consent/accept button.
    The element lookup raises if no element with that id exists.
    """
    chain = ActionChains(driver)
    chain.click(driver.find_element(By.ID, ID)).perform()
    # Re-arm the implicit wait, then pause briefly before the next step.
    wait(10)
    sleep(0.5)


# Find Element Function
def find_element(H):
    """Return the ``.text`` of every element found by class name ``H``.

    Despite the singular name, all matching elements are collected; the
    result is a list of strings in the order the driver returns them.
    """
    return [node.text for node in driver.find_elements(By.CLASS_NAME, H)]


# Find Elements Function
def find_elements_HPCO(H,P,C,O):
    """Collect the texts of four element groups identified by class name.

    ``H``, ``P``, ``C`` and ``O`` are the class names for header, publish
    time, company and location ("Ort"). Returns four lists of strings in
    that order.
    """
    # Locate all four groups first, then read their texts.
    groups = [
        driver.find_elements(By.CLASS_NAME, class_name)
        for class_name in (H, P, C, O)
    ]
    titles, published, companies, places = (
        [node.text for node in group] for group in groups
    )
    return titles, published, companies, places

# Scroll Down Function
def scroll_down(x):
    """Scroll the current page down ``x`` rounds.

    Each round presses PAGE_DOWN three times, PAGE_UP once and PAGE_DOWN
    again, pausing between key presses.

    Fixes over the previous version:
    * it used a name ``actions`` that was never defined at module level
      (only as a local inside ``click_bann_byID``), so calling it raised
      ``NameError``; the action chain is now built here from ``driver``;
    * it used ``key_down`` without a matching release; ``send_keys``
      performs a full press-and-release of the key.
    """
    def press(key):
        # A fresh chain per key press avoids replaying earlier actions.
        ActionChains(driver).send_keys(key).perform()

    rounds_done = 0
    while rounds_done < x:
        rounds_done += 1
        for _ in range(3):
            press(Keys.PAGE_DOWN)
            sleep(1.5)
        # Short upward nudge followed by another page down.
        press(Keys.PAGE_UP)
        sleep(0.10)
        press(Keys.PAGE_DOWN)
        wait(10)
        sleep(2.5)

# Task 3 : Web Scraping with Selenium

## 01 - STEPSTONE

In [18]:
# Title   : Web scraping with Selenium
# Purpose : scrape StepStone search results for one job title.
# Steps:
#   1 - Create the driver
#   2 - Open the search page and accept the consent banner
#   3 - Read title / publish time / company / city / description / link
#   4 - Repeat for every further result page and concatenate the frames
#   5 - Add bookkeeping columns and save the DataFrame as CSV
#   6 - Quit the driver (always, even when a step fails)

print('---------------------- StepStone Job Searching Selenium Project ----------------------')
start = datetime.now()
# Link Descriptions
link_original_stepstone = 'https://www.stepstone.de/jobs/data-analyst/in-rietberg?radius=50&page=2'

website_name = 'stepstone'
job_name = 'Data Engineer'
#job_name = 'Data Analyst'
#job_name = 'Data Scientist'
ort_ = 'Rietberg'
radius = 100
page_number = 1
# Number of result pages to visit. It is fixed here; reading it from the
# result list (class 'resultlist-1jx3vjx') was tried and disabled.
number_of_page = 6

# Class names of the result-card fields: header, publish time, company, city.
H, P, C, O = ('res-29pyh9', 'res-rf8k2x', 'res-hbyqhf', 'res-1wf9en7')

job_link = job_name.replace(' ', '-').lower()
ort_link = ort_.lower()


def stepstone_search_url(page):
    """Return the search URL for result page ``page``.

    Every page carries the same ``sort=2&action=sort_publish`` parameters.
    Previously only page 1 had them, so pages 2+ came back in a different
    order and the concatenated result could repeat or miss postings.
    """
    return (
        f'https://www.stepstone.de/jobs/{job_link}/in-{ort_link}'
        f'?radius={radius}&page={page}&sort=2&action=sort_publish'
    )


def scrape_stepstone_page():
    """Read the currently loaded result page into a dict of column arrays.

    The columns may have different lengths; the caller builds the frame
    with ``orient='index'`` and transposes it so short columns are padded.
    """
    list_header, list_publish, list_company, list_ort = find_elements_HPCO(H, P, C, O)
    description = driver.find_elements(By.CLASS_NAME, 'res-17md5or')
    list_description = [des.text for des in description]
    list_link01 = driver.find_elements(By.CLASS_NAME, 'res-1dwe62q')
    list_link = [anchor.get_attribute('href') for anchor in list_link01]
    return dict(
        job_title=np.array(list_header),
        publish=np.array(list_publish),
        # The first match of the company class is skipped, as before.
        company=np.array(list_company[1:]),
        city=np.array(list_ort),
        description=np.array(list_description),
        link=np.array(list_link),
    )


#  1 - Create Driver
Path = '/Users/macbook/Desktop/projects/Github_Repositories/Portfolio Projects/02 - Web_Scraping_Job_Search/chromedriver'
# NOTE(review): the driver path is passed positionally; confirm this is
# still accepted by the installed Selenium version.
driver = webdriver.Chrome(Path)
print('Create Driver')

try:
    #  2 - Go to Website
    link = stepstone_search_url(page_number)
    driver.get(link)
    wait(5)
    sleep(2)
    print('Go to Website')

    # 2.1 - Accept the consent banner
    ID = 'ccmgt_explicit_accept'
    click_bann_byID(ID)
    print('Banned')

    # 3 - Take infos from the first page
    d = scrape_stepstone_page()
    print('h pc o')
    print('Header', len(d['job_title']), 'Publish', len(d['publish']), 'Company', len(d['company']), 'Ort', len(d['city']), 'Desc', len(d['description']), 'Link', len(d['link']))
    df = pd.DataFrame.from_dict(d, orient='index').T

    # 4 - Repeat the process for every further result page
    for page_number in range(page_number + 1, number_of_page + 1):
        link = stepstone_search_url(page_number)
        driver.get(link)
        wait(5)
        sleep(1.5)

        d = scrape_stepstone_page()
        df2 = pd.DataFrame.from_dict(d, orient='index').T

        df = pd.concat([df, df2], axis=0, ignore_index=True)
        print(f'Page Number : {page_number}, DataFrame Shape : {df2.shape}')

    # 5 - Save data as csv
    print(f'DataFrame End : {df.shape}')
    df['website'] = website_name
    # One clock reading feeds both the column and the file name.
    now = datetime.today()
    df['date'] = now.strftime('%Y-%m-%d %H:%M:%S')
    job_name2 = job_name.replace(' ', '_')
    df['search_title'] = job_name2

    path = '/Users/macbook/Desktop/projects/Github_Repositories/Portfolio Projects/02 - Web_Scraping_Job_Search/data'
    job_name3 = job_name.replace(' ', '-')
    time_ = now.strftime('%Y-%m-%d')
    df.to_csv(f'{path}/{job_name3}-{website_name}-{time_}.csv', index=False)

    end = datetime.now()
    print('Code Runned No Problem')
    print(f'Time = {end - start}')
    sleep(5)
finally:
    # 6 - Quit: always release the browser, even if a step above failed.
    driver.quit()


---------------------- StepStone Job Searching Selenium Project ----------------------
Create Driver
Go to Website
Banned
h pc o
Header 25 Publish 25 Company 24 Ort 25 Desc 25 Link 25
Page Number : 2, DataFrame Shape : (25, 6)
Page Number : 3, DataFrame Shape : (25, 6)
Page Number : 4, DataFrame Shape : (25, 6)
Page Number : 5, DataFrame Shape : (25, 6)
Page Number : 6, DataFrame Shape : (25, 6)
DataFrame End : (150, 6)
Code Runned No Problem
Time = 0:01:04.508044


# Search links with sort and filter options

In [None]:
'''
https://www.stepstone.de/jobs/data-engineer/in-rietberg?radius=100

https://www.stepstone.de/jobs/data-engineer/in-rietberg?radius=100&page=1&sort=2&action=sort_publish

https://www.stepstone.de/jobs/data-engineer/in-rietberg?radius=100&page=2&sort=2&action=sort_publish

https://www.stepstone.de/jobs/data-engineer/in-rietberg?radius=100&page=2&sort=1&action=facet_selected%3bage%3bage_7&ag=age_7 # last 1 week

https://www.stepstone.de/jobs/data-engineer/in-rietberg?radius=100&page=1&sort=1&action=facet_selected%3bage%3bage_1&ag=age_1 # last 24 hours


https://www.xing.com/jobs/search?keywords=Data%20Engineer&location=Rietberg&radius=100&sort=relevance 

https://www.xing.com/jobs/search?keywords=Data%20Engineer&location=Rietberg&radius=100&sort=date # newest first
'''

# Xing 

In [4]:
# Scrape the first Xing result page for one job title and append it to a CSV.
from urllib.parse import quote, urlencode

print('---------------------- Xing Job Searching Selenium Project ----------------------')
start = datetime.now()
# Link Descriptions
link_original_xing = 'https://www.xing.com/jobs/search?keywords=Data%20Engineer&location=Rietberg&page=1&radius=100'

website_name = 'xing'
job_name = 'Data Engineer'
#job_name = 'Data Analyst'
#job_name = 'Data Scientist'
ort_ = 'Rietberg'
# The request always used radius=100 (the URL was hard-coded) although this
# variable said 50. The variable now drives the URL and holds the value that
# was actually being sent.
radius = 100
page_number = 1

#  1 - Create Driver
Path = '/Users/macbook/Desktop/projects/Github_Repositories/Portfolio Projects/02 - Web_Scraping_Job_Search/chromedriver'
driver = webdriver.Chrome(Path)

try:
    #  2 - Go to Website
    # Build the query from the settings above instead of a fixed string, so
    # changing job_name / ort_ / radius changes the search. quote_via=quote
    # encodes spaces as %20, matching the URL used before.
    query = urlencode(
        {
            'keywords': job_name,
            'location': ort_,
            'page': page_number,
            'radius': radius,
            'sort': 'date',
        },
        quote_via=quote,
    )
    link = f'https://www.xing.com/jobs/search?{query}'

    driver.get(link)
    wait(10)
    sleep(2)

    #  3 - Accept the consent banner
    ID = 'consent-accept-button'
    click_bann_byID(ID)

    # 4 - Take infos from the page
    # 4.1 - Class names: header (title and company), description, location, whole card
    H = 'utils-line-clamp-lineClamp2-dfe26aab'
    D = 'list-item-job-teaser-list-item-highlight-bb8ddbb6'
    L = 'list-item-job-teaser-list-item-location-a5b28738'
    ALL = 'list-item-job-teaser-list-item-listItem-f04c772e'

    list_header = find_element(H)
    list_description = find_element(D)
    list_ort = find_element(L)
    list_all = find_element(ALL)

    # 4.2 - Publish time and the line before it, taken from the card text.
    # Cards with fewer than three lines yield None instead of raising
    # IndexError.
    list_publish = []
    list_full_time = []
    for card_text in list_all:
        card_lines = card_text.split('\n')
        has_both = len(card_lines) >= 3
        list_publish.append(card_lines[-2] if has_both else None)
        list_full_time.append(card_lines[-3] if has_both else None)

    # The header list alternates title, company, title, company, ...
    # Slicing cannot run past the end when the list has an odd length.
    list_title = list_header[0::2]
    list_company = list_header[1::2]

    # 4.3 - Get Links (the card element itself carries the href)
    Link = ALL
    header = driver.find_elements(By.CLASS_NAME, Link)
    list_link = [card.get_attribute('href') for card in header]

    # 4.4 - DataFrame df (orient='index' + transpose pads unequal columns)
    d = dict(job_title=np.array(list_title), publish=np.array(list_publish), company=np.array(list_company), city=np.array(list_ort) , description=np.array(list_description), link=np.array(list_link))
    df = pd.DataFrame.from_dict(d, orient='index')
    df = df.T
    df['website'] = website_name
    time_now = datetime.today().strftime('%Y-%m-%d %H:%M:%S')
    df['date'] = time_now
    df['search_title'] = job_name

    # 5.1 - Save Data as csv (appended without a header row)
    print(f'DataFrame End : {df.shape}')
    path = '/Users/macbook/Desktop/projects/Github_Repositories/Portfolio Projects/02 - Web_Scraping_Job_Search/data'
    df.to_csv(f'{path}/{job_name}.csv', mode='a', index=False, header=False)

    list_of_list = [list_header, list_description, list_ort, list_publish, list_link]
    print([len(i) for i in list_of_list])

    sleep(2)
finally:
    # Always release the browser, even if a step above failed.
    driver.quit()
print('Finish', time_now)

df.head()

---------------------- Xing Job Searching Selenium Project ----------------------
DataFrame End : (20, 9)
[40, 20, 20, 20, 20]
Finish 2023-03-10 09:04:55


Unnamed: 0,job_title,publish,company,city,description,link,website,date,search_title
0,Senior Software Engineer (m/w/d),Vor 7 Stunden,inserve GmbH,Osnabrück,Engineering und Data Streaming Dein Spielfeld?,https://www.xing.com/jobs/osnabrueck-senior-so...,xing,2023-03-10 09:04:55,Data Engineer
1,"Lead Engineer, Powertrain Verification",Vor 8 Stunden,Vestas Wind Systems A/S,Dortmund,Responsibilities As Lead Engineer - Powertrain...,https://www.xing.com/jobs/dortmund-lead-engine...,xing,2023-03-10 09:04:55,Data Engineer
2,Data Engineer Sports (all genders),Vor 15 Stunden,adesso SE,Dortmund,"Pipelines, Data Ingest und Date Processing.",https://www.xing.com/jobs/dortmund-data-engine...,xing,2023-03-10 09:04:55,Data Engineer
3,Data Architect (all genders),Vor 15 Stunden,adesso SE,Dortmund,Das Competence Center Data Engineering ist ver...,https://www.xing.com/jobs/dortmund-data-archit...,xing,2023-03-10 09:04:55,Data Engineer
4,Data Engineer (all genders),Vor 15 Stunden,adesso SE,Münster,Als Data Engineer bist du Teil eines innovativ...,https://www.xing.com/jobs/muenster-data-engine...,xing,2023-03-10 09:04:55,Data Engineer


# Connect to the PostgreSQL Database

In [7]:
# Store the scraped DataFrame ``df`` in PostgreSQL and run a sample query.
import psycopg2
from sqlalchemy import create_engine

# Connection settings are defined once so the psycopg2 connection and the
# SQLAlchemy engine cannot drift apart.
# NOTE(review): the credentials are hard-coded in the notebook; consider
# loading them from the environment or a config file kept out of version
# control.
db_name = 'JOB'
db_user = 'postgres'
db_password = '1984'

conn = psycopg2.connect(
    database=db_name,
    user=db_user,
    password=db_password,
    host='127.0.0.1',
    port='5432',
)

try:
    # Autocommit: every statement is committed as it runs, so no explicit
    # commit() is needed afterwards.
    conn.autocommit = True

    with conn.cursor() as cursor:
        sql = '''CREATE TABLE IF NOT EXISTS dataeng(job_title varchar(300) NOT NULL,\
publish varchar(30),\
company varchar(300),\
city varchar(300),\
description varchar(300),\
link varchar(300),\
website varchar(30),\
date timestamp,\
search_title varchar(20));'''

        cursor.execute(sql)

        # connection string: driver://username:password@server/database
        engine = create_engine(f'postgresql+psycopg2://{db_user}:{db_password}@localhost/{db_name}')

        #  Note:  if_exists can be append, replace, fail.
        df.to_sql('dataeng', engine, if_exists='append', index = False)

        sql2 = '''select company from dataeng Where publish LIKE '%hours%' '''
        cursor.execute(sql2)
        for row in cursor.fetchall():
            print(row)
finally:
    # Close the connection even when one of the statements above fails.
    conn.close()

('Brunel',)
('Campusjäger by Workwise',)
('Campusjäger by Workwise',)
('Campusjäger by Workwise',)
('Campusjäger by Workwise',)
('Campusjäger by Workwise',)
('Campusjäger by Workwise',)
('Campusjäger by Workwise',)
('Campusjäger by Workwise',)
('Campusjäger by Workwise',)
('Campusjäger by Workwise',)
('Vesterling AG',)


In [1]:
# IPython automagic (same as %pwd): show the notebook's current working directory.
pwd

'/Users/macbook/Desktop/projects/Github_Repositories/Trainings/web_scoraing_portfolio_deneme'