In [1]:
import time
import logging
import numpy as np
import pandas as pd
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import psycopg2
from sqlalchemy import create_engine

import warnings
warnings.simplefilter(action='ignore')

# Sleep function 
def sleep(x):
    time.sleep(x)

# Wait for a certain measure of time before throwing an exception
def wait(x):
    driver.implicitly_wait(x)

# Click Function
def click_bann_byID(ID):
    actions = ActionChains(driver)
    akzeptieren = driver.find_element(By.ID, ID)
    actions.click(akzeptieren).perform()
    wait(10)
    sleep(0.5)


# Find Element Function
def find_element(E):
    elements = driver.find_elements(By.CLASS_NAME, E)
    list_elements = [element.text for element in elements]
    return list_elements


# Find Elements Function
def find_elements_HPCO(H,P,C,O):
    header = driver.find_elements(By.CLASS_NAME, H)
    publish = driver.find_elements(By.CLASS_NAME, P)
    company = driver.find_elements(By.CLASS_NAME, C)
    ort = driver.find_elements(By.CLASS_NAME, O) 

    list_header = [title.text for title in header]
    list_publish = [pub.text for pub in publish]
    list_company = [comp.text for comp in company]
    list_ort = [o.text for o in ort]
    return list_header, list_publish, list_company, list_ort

def log(message):
    timestamp_format = '%Y-%h-%d-%H:%M:%S' # Year-Monthname-Day-Hour-Minute-Second
    now = datetime.now() # get current timestamp
    timestamp = now.strftime(timestamp_format)
    with open("log/logfile.txt","a") as f:
        f.write(timestamp + ',' + message + '\n')

logging.basicConfig(level=logging.DEBUG, filename='log/logs.log', filemode='a',
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    datefmt="%Y-%m-%d %H:%M:%S",)

logging.debug('debug')
logging.info('info')
logging.warning('warning')
logging.error('error', exc_info=True)
logging.critical('critical')


In [2]:
print('---------------------- StepStone Job Searching Selenium Project ----------------------')
start=datetime.now()
log('start_stepstone')
# Link Descriptions
link_original_stepstone = 'https://www.stepstone.de/jobs/data-analyst/in-rietberg?radius=50&page=2'

website_name = 'stepstone'
job_name = 'Data Engineer'
ort_ = 'Rietberg'
radius = 100
page_number = 1


#  1 - Create Driver
Path = '/Users/macbook/Desktop/projects/Github_Repositories/Trainings/web_scpraing_portfolio_deneme/chromedriver'
driver = webdriver.Chrome(Path)
print('Create Driver')


#  2 - Go to Website
job_link = job_name.replace(' ', '-').lower()
ort_link = ort_.lower()
link = f'https://www.stepstone.de/jobs/{job_link}/in-{ort_link}?radius={radius}&page={page_number}&sort=2&action=sort_publish'


driver.get(link)
wait(5)
sleep(2)
print('Go to Website')
#  3 - ActionChain Object created
# 3.1 - Click Banned Accept
ID = 'ccmgt_explicit_accept'
click_bann_byID(ID)
print('Banned')


# 4 -  Take Infos from Page
# 4.1 - Headers, Publish_Time ,Company, City
H, P, C, O = ('res-29pyh9', 'res-rf8k2x', 'res-hbyqhf', 'res-1wf9en7')
list_header, list_publish, list_company, list_ort = find_elements_HPCO(H,P,C,O)


# 4.2 - Description and Page number of results
description = driver.find_elements(By.CLASS_NAME, 'res-17md5or')


# 4.3 - Get Links 'res-1dwe62q'
list_link01  = driver.find_elements(By.CLASS_NAME, 'res-1dwe62q')
list_link = [link.get_attribute('href') for link in list_link01]

# 4.4 - Get Texts for each finding
list_description = [des.text for des in description]
print('Header',len(list_header), 'Publish',len(list_publish), 'Company',len(list_company[1:]), 'Ort',len(list_ort), 'Desc', len(list_description), 'Link',len(list_link))


# 4.6 - DataFrame df
d = dict(job_title=np.array(list_header), publish=np.array(list_publish), company=np.array(list_company[1:]), city=np.array(list_ort) , description=np.array(list_description), link=np.array(list_link))
df01 = pd.DataFrame.from_dict(d, orient='index')
df01 = df01.T


print(f'DataFrame End : {df01.shape}')
df01['website'] = website_name
time_ = datetime.today().strftime('%Y-%m-%d %H:%M:%S')
df01['date'] = time_
df01['search_title'] = job_name


# 5.1 - Quit
end =datetime.now() 
print('Code Runned No Problem')
log('end_stepstone')
print(f'Time = {end - start}')
sleep(0.5)
driver.quit()


# 6.1 Dataframe first 5 Rows
df01.head(2)

---------------------- StepStone Job Searching Selenium Project ----------------------
Create Driver
Go to Website
Banned
Header 25 Publish 25 Company 24 Ort 25 Desc 25 Link 25
DataFrame End : (25, 6)
Code Runned No Problem
Time = 0:00:20.414201


Unnamed: 0,job_title,publish,company,city,description,link,website,date,search_title
0,Cloud Architect / Cloud Data Engineer / DevOps...,vor 9 Stunden,REMONDIS IT Services GmbH & Co. KG,"Münster, Baden",saracus ist eines der führenden unabhängigen B...,https://www.stepstone.de/stellenangebote--Clou...,stepstone,2023-03-18 12:46:25,Data Engineer
1,Data Engineer (m/w/d),vor 2 Tagen,Riverty,Lünen,In Deiner Position als Data Engineer hilfst Du...,https://www.stepstone.de/stellenangebote--Data...,stepstone,2023-03-18 12:46:25,Data Engineer


In [3]:
print('---------------------- Xing Job Searching Selenium Project ----------------------')
start=datetime.now()
log('start_xing')  


# Link Descriptions
link_original_xing = 'https://www.xing.com/jobs/search?keywords=Data%20Engineer&location=Rietberg&page=1&radius=100'
website_name = 'xing'
job_name = 'Data Engineer'
ort_ = 'Rietberg'
radius = 50
page_number = 1


#  1 - Create Driver
Path = '/Users/macbook/Desktop/projects/Github_Repositories/Trainings/web_scpraing_portfolio_deneme/chromedriver'
driver = webdriver.Chrome(Path)


#  2 - Go to Website
job_link = job_name.replace(' ', '-').lower()
ort_link = ort_.lower()
link = f'https://www.xing.com/jobs/search?keywords=Data%20Engineer&location=Rietberg&page=1&radius=100&sort=date'
driver.get(link)
wait(10)
sleep(2)


#  3 - ActionChain Object created
# 3.1 - Click Banned Accept
ID = 'consent-accept-button'
click_bann_byID(ID)


# 4 -  Take Infos from Page
# 4.1 - Headers, Publish_Time ,Company, City
H = 'utils-line-clamp-lineClamp2-dfe26aab'
D = 'list-item-job-teaser-list-item-highlight-bb8ddbb6'
L = 'list-item-job-teaser-list-item-location-a5b28738'
ALL = 'list-item-job-teaser-list-item-listItem-f04c772e'


# 4 -  Take Infos from Page
# 4.1 - Headers, Publish_Time ,Company, City
list_header = find_element(H)
list_description = find_element(D)
list_ort = find_element(L)
list_all = find_element(ALL)


list_publish = []
list_full_time = [] 
for i in list_all:
    date = i.split('\n')[-2]
    time_ = i.split('\n')[-3]
    list_publish.append(date)
    list_full_time.append(time_)


list_title =[]
list_company = []
n = 0
while n < len(list_header):
    list_title.append(list_header[n])
    list_company.append(list_header[n+1])
    n += 2


# 4.3 - Get Links
Link = 'list-item-job-teaser-list-item-listItem-f04c772e'
header = driver.find_elements(By.CLASS_NAME, Link)
list_link = [link.get_attribute('href') for link in header]


# 4.4 - DataFrame df
d = dict(job_title=np.array(list_title), publish=np.array(list_publish), company=np.array(list_company), city=np.array(list_ort) , description=np.array(list_description), link=np.array(list_link))
df02 = pd.DataFrame.from_dict(d, orient='index')
df02 = df02.T
df02['website'] = website_name
time_now = datetime.today().strftime('%Y-%m-%d %H:%M:%S')
df02['date'] = time_now
df02['search_title'] = job_name


list_of_list = [list_header, list_description, list_ort, list_publish, list_link]
print([len(i) for i in list_of_list])


# 5.1 Quit Driver
sleep(1)
driver.quit()
print('Finish', time_now)
log('end_xing') 


# 6.1 Dataframe firts 5 Rows
df02.head(2)

---------------------- Xing Job Searching Selenium Project ----------------------
[40, 20, 20, 20, 20]
Finish 2023-03-18 12:46:53


Unnamed: 0,job_title,publish,company,city,description,link,website,date,search_title
0,Data Engineer (m/w/d),Vor 10 Stunden,Brunel,Osnabrück,Sie haben ein Interesse an Projekten und Proze...,https://www.xing.com/jobs/osnabrueck-data-engi...,xing,2023-03-18 12:46:53,Data Engineer
1,Data Engineer (m/w/d),Vor 11 Stunden,BRUDERKOPF GmbH & Co. KG,Dortmund,Für ihn suchen wir einen Data Engineer (m/w/d)...,https://www.xing.com/jobs/dortmund-data-engine...,xing,2023-03-18 12:46:53,Data Engineer


In [4]:

print('--------------------------------------- Connect to Database for alldata ---------------------------------------')
log('connected_database')
conn = psycopg2.connect(database="JOB",
			user='postgres', password=1984,
			host='127.0.0.1', port='5432'
)

conn.autocommit = True
cursor = conn.cursor()


sql = '''CREATE TABLE IF NOT EXISTS alldata(id serial PRIMARY KEY,
job_title text ,\
publish varchar(30),\
company text,\
city varchar(300),\
description text,\
link text,\
website varchar(30),\
date timestamp,\
search_title varchar(30));'''


cursor.execute(sql)


# connection string: driver://username:password@server/database
engine = create_engine('postgresql+psycopg2://postgres:1984@localhost/JOB')


#  Note:  if_exists can be append, replace, fail.  
df01.to_sql('alldata', engine, if_exists='append', index = False)
df02.to_sql('alldata', engine, if_exists='append', index = False)


sql2 = '''SELECT company FROM alldata WHERE publish LIKE '%Stunde%' ORDER BY publish LIMIT 5'''
cursor.execute(sql2)
for i in cursor.fetchall():
	print(i)

# Commit 
conn.commit()
log('finish_all') 


# Save Dataframes to Csv
path = '/Users/macbook/Desktop/projects/Github_Repositories/Trainings/web_scpraing_portfolio_deneme/data'
today_ = datetime.today().strftime('%Y-%m-%d')
#df01.to_csv(f'{path}/{job_name}.csv', header=True) >> First time
df01.to_csv(f'{path}/{job_name}_{today_}.csv', mode='a', header=False)
df02.to_csv(f'{path}/{job_name}_{today_}.csv', mode='a', header=False)


print(f'Today {today_} mycode Runned.')

--------------------------------------- Connect to Database ---------------------------------------
('Brunel',)
('Porta Möbel GmbH & Co KG',)
('BRUDERKOPF GmbH & Co. KG',)
('Computer Futures, ein Geschäftszweig von SThree',)
('Computer Futures',)
Today 2023-03-18 mycode Runned.


In [None]:
print ('------------------- BEGIN PROSECURE -------------------')


#01. Add new datas to dailyhours
sql03 = '''INSERT INTO dailyhours \
SELECT * FROM alldata \
WHERE Date_Trunc('day', date) = CURRENT_DATE \
      AND lower(publish) LIKE '%stunde%''' 


# 02. Links to go (Data Engineer, Cloud Engineer)
sql04 = '''INSERT INTO linkstogo \
SELECT id, CURRENT_DATE, job_title, company,link \
FROM \
	(Select *, \
		Row_Number() Over (Partition BY job_title,company ) as RN \
		From dailyhours) t1 \
Where RN < 2 AND (lower(job_title) LIKE '%data engineer%' or lower(job_title) LIKE '%cloud%') \
             AND Date_Trunc('day', date) = CURRENT_DATE \
Order by 4,2 \ '''


cursor.execute(sql03)
cursor.execute(sql04)
conn.commit()


In [207]:
print('--------------------------------------- Connect to Database for jobdescription ---------------------------------------')
log('connected_database02')

engine = create_engine('postgresql+psycopg2://postgres:1984@localhost/JOB')

sql = '''SELECT link, id FROM linkstogo '''
cursor.execute(sql)
listLink = []
for i in cursor.fetchall():
	print(i)
	listLink.append(i)

print(len(listLink))

print('--------------------------------------- Go to Links ---------------------------------------')

Path = '/Users/macbook/Desktop/projects/Github_Repositories/Trainings/web_scpraing_portfolio_deneme/chromedriver'
listDetails = []
listids = []
df03 = pd.DataFrame(columns=['job_id', 'job_desc', 'job_mitbring', 'job_bieten'])

n = 0
while n < len(listLink):
	if listLink[n][0].find('stepstone') == 12:
                driver = webdriver.Chrome(Path)	
                driver.get(listLink[n][0])
                print(f'Go to {n+1} Link')
                wait(5)
                sleep(1)
                #-------------
                #  3 - ActionChain Object created
                # 3.1 - Click Banned Accept
                ID = 'ccmgt_explicit_accept'
                click_bann_byID(ID)

                A =  'at-section-text-description-content'
                M = 'at-section-text-profile-content'
                B = 'at-section-text-weoffer-content'
                list_aufgaben = find_element(A)
                list_mitbring = find_element(M)
                list_bieten = find_element(B)
                
                d = dict(job_id={str(listLink[n][1])}, job_desc={list_aufgaben[0]}, job_mitbring={list_mitbring[0]}, job_bieten={list_bieten[0]})
                df04 = pd.DataFrame.from_dict(d, orient='index')
                df04 = df04.T

                df03 = pd.concat([df03,df04], axis=0, ignore_index=True)
		
                driver.quit()
                n+=1
	else:
		driver = webdriver.Chrome(Path)	
		driver.get(listLink[n][0])
		print(f'Go to {n+1} Link')
		wait(5)
		sleep(1)

		# 3.1 - Click Banned Accept
		ID = 'consent-accept-button'
		click_bann_byID(ID)

		E = 'html-description-html-description-header-c7005820'
		list_uber = find_element(E)
		for i in list_uber:
			listDetails.append(i)
			listids.append(str(listLink[n][1]))
		
		driver.quit()
		n += 1


# 4.4 - DataFrame df
d = dict(job_id=np.array(listids), job_desc=np.array(listDetails), job_mitbring={}, job_bieten={})
df05 = pd.DataFrame.from_dict(d, orient='index')
df05 = df05.T
df06 = pd.concat([df03,df05], axis=0, ignore_index=True)

# Send to Database 
df06.to_sql('jobdescription', engine, if_exists='append', index = False)

print(f'Web scraping finished')

# Close Database Connection
conn.close()

df06.head(10)


--------------------------------------- Connect to Database ---------------------------------------
('https://www.xing.com/jobs/bochum-data-engineer-data-warehouse-datenpflege-home-office-98364182', 253)
('https://www.xing.com/jobs/bochum-data-engineer-python-planerai-kundenservice-home-office-98396103', 341)
('https://www.xing.com/jobs/muenster-data-engineer-98342219', 256)
('https://www.stepstone.de/stellenangebote--Data-Engineer-m-w-d-Home-Office-bundesweit-Media-Plan-GmbH--9269823-inline.html', 91)
('https://www.xing.com/jobs/porta-westfalica-senior-data-engineer-88848148', 299)
('https://www.stepstone.de/stellenangebote--Cloud-Engineer-m-w-d-Muenster-Web-Computing-GmbH--9338001-inline.html', 328)
('https://www.stepstone.de/stellenangebote--Cloud-Engineer-m-w-d-Guetersloh-Hamburg-Rostock-Leipzig-Arvato-Systems-Digital-GmbH--9343526-inline.html', 373)
('https://www.stepstone.de/stellenangebote--System-Engineer-Public-Cloud-Basis-m-w-d-Karlsruhe-Muenster-Atruvia-AG--9305315-inline.ht

In [213]:
df06

Unnamed: 0,job_id,job_mitbring,job_bieten,job_desc
0,91,Du bringst fundierte praktische Erfahrung im o...,"Kein langweiliger 9 to 5 job, sondern die Mögl...","Verantwortung für die Gestaltung, Implementier..."
1,328,Du hast eine abgeschlossene Ausbildung oder ei...,Fortbildungen: Wir leben das Motto “Stillstand...,Aufbau und Betrieb\nder CI-Infrastruktur für d...
2,373,Du verfügst über ein erfolgreich abgeschlossen...,Spannende Projekte und Herausforderungen | Fla...,"Als international agierender IT-Spezialist, su..."
3,375,Du verfügst über ein abgeschlossenes Studium d...,"Neben einem attraktiven Tarifgehalt, 30 Tagen ...",Du konzipierst und entwickelst die Hyperscaler...
4,404,Erfolgreich abgeschlossenes Studium mit einem ...,Intensive und zielgerichtete Einarbeitungsphas...,"Analyse, Konzeption, Entwicklung und Optimieru..."
5,253,,,Über WEKO Wohnen GmbH:\nWir sind ein familieng...
6,341,,,Über PlanerAI GmbH:\nWenn Softwarelösung und N...
7,256,,,Über uns\nMit über 100 Standorten in der DACH-...
8,430,,,BRUDERKOPF ist der vertrauensvolle und unabhän...
9,433,,,BRUDERKOPF ist der vertrauensvolle und unabhän...
