In [5]:
import logging
import os
import time
from datetime import datetime
from urllib.parse import quote

import numpy as np
import openpyxl
import pandas as pd
import psycopg2
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from sqlalchemy import create_engine

import warnings
warnings.simplefilter(action='ignore')

# Sleep function 
def sleep(x):
    """Pause execution for ``x`` seconds (thin wrapper over ``time.sleep``)."""
    seconds = x
    time.sleep(seconds)

# Set the driver's implicit wait: element lookups may keep retrying for up to x seconds before raising
def wait(x):
    """Set the implicit-wait timeout of the module-level ``driver`` to ``x`` seconds.

    This does not pause by itself; it forwards ``x`` to
    ``driver.implicitly_wait``. ``driver`` must already exist as a global
    when this is called.
    """
    timeout_seconds = x
    driver.implicitly_wait(timeout_seconds)

# Click Function
def click_bann_byID(ID):
    """Click the element whose HTML id is ``ID`` through an action chain.

    Used by the scraping cells to accept the cookie-consent banner.
    Relies on the module-level ``driver``. After the click the implicit
    wait is set to 10 seconds and execution pauses for half a second.
    """
    # The chain is created first, then the element is looked up and clicked.
    ActionChains(driver).click(driver.find_element(By.ID, ID)).perform()
    wait(10)
    sleep(0.5)


# Find Element Function
def find_element(H):
    """Return the ``.text`` of every element whose CSS class name is ``H``.

    Despite the singular name, all matches found by the module-level
    ``driver`` are returned, as a list of strings in lookup order.
    """
    return [node.text for node in driver.find_elements(By.CLASS_NAME, H)]


# Find Elements Function
def find_elements_HPCO(H,P,C,O):
    """Collect element texts for four CSS class names in one call.

    Parameters
    ----------
    H, P, C, O : str
        Class names; the caller passes those for header, publish time,
        company and location ("Ort") respectively.

    Returns
    -------
    tuple of four lists of str
        Texts for ``H``, ``P``, ``C`` and ``O``, in that order. The lists
        are independent and may differ in length.
    """
    # All four lookups are issued before any element text is read.
    matches = [driver.find_elements(By.CLASS_NAME, css_class)
               for css_class in (H, P, C, O)]
    list_header, list_publish, list_company, list_ort = (
        [element.text for element in group] for group in matches
    )
    return list_header, list_publish, list_company, list_ort

def log(message):
    """Append a timestamped line to ``logfile.txt`` in the working directory.

    Each line has the form ``<timestamp>,<message>``, with the timestamp
    rendered as Year-MonthAbbreviation-Day-Hour:Minute:Second.

    Parameters
    ----------
    message : str
        Text written after the timestamp. It is concatenated as-is, so it
        must already be a string.
    """
    # %b is the documented spelling of the abbreviated month name; the
    # previous %h is a platform-specific alias for the same field.
    timestamp_format = '%Y-%b-%d-%H:%M:%S'
    timestamp = datetime.now().strftime(timestamp_format)
    # Explicit encoding so the file does not depend on the OS default.
    with open("logfile.txt", "a", encoding="utf-8") as f:
        f.write(timestamp + ',' + message + '\n')

# Root logger: append every record from DEBUG upwards to logs.log.
logging.basicConfig(
    filename='logs.log',
    filemode='a',
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt="%Y-%m-%d %H:%M:%S",
)

# Emit one sample record per severity so the log file shows each level.
root_logger = logging.getLogger()
root_logger.debug('debug')
root_logger.info('info')
root_logger.warning('warning')
# exc_info=True requests exception details; no exception is being handled
# at this point, so there is no real traceback to attach.
root_logger.error('error', exc_info=True)
root_logger.critical('critical')


In [3]:
# Scrape StepStone result pages 2-6 for one search and write the rows to
# '<path>/<job_name>_first.csv' (page 2 creates the file, later pages append).
print('---------------------- StepStone Job Searching Selenium Project ----------------------')
start = datetime.now()
log('start_stepstone')


# Link Descriptions
# Example result URL, kept for reference only; the loop builds its own.
link_original_stepstone = 'https://www.stepstone.de/jobs/data-analyst/in-rietberg?radius=50&page=2'

website_name = 'stepstone'
job_name = 'Data Engineer'
ort_ = 'Rietberg'
radius = 100

# Values that do not change between pages are computed once, up front.
job_link = job_name.replace(' ', '-').lower()
ort_link = ort_.lower()
path = '/Users/macbook/Desktop/projects/Github_Repositories/Trainings/web_scpraing_portfolio_deneme'
# CSS class names passed to find_elements_HPCO: header, publish time, company, location.
H, P, C, O = ('res-29pyh9', 'res-rf8k2x', 'res-hbyqhf', 'res-1wf9en7')


#  1 - Create Driver
# NOTE(review): newer Selenium 4 releases expect the driver path through a
# Service object rather than positionally — confirm against the installed version.
Path = '/Users/macbook/Desktop/projects/Github_Repositories/Trainings/web_scpraing_portfolio_deneme/chromedriver'
driver = webdriver.Chrome(Path)
print('Create Driver')

# try/finally guarantees the browser is closed even when a page fails to scrape.
try:
    for page_number in range(2, 7):
        #  2 - Go to Website
        link = f'https://www.stepstone.de/jobs/{job_link}/in-{ort_link}?radius={radius}&page={page_number}&sort=2&action=sort_publish'

        driver.get(link)
        wait(5)
        sleep(1)
        print('Go to Website')

        #  3 - Accept the cookie banner (only on the first page visited)
        if page_number == 2:
            ID = 'ccmgt_explicit_accept'
            click_bann_byID(ID)
            print('Banned')
        else:
            print(f'Page Numer = {page_number}')

        # 4 -  Take Infos from Page
        # 4.1 - Headers, Publish_Time, Company, City
        list_header, list_publish, list_company, list_ort = find_elements_HPCO(H, P, C, O)

        # 4.2 - Description elements
        description = driver.find_elements(By.CLASS_NAME, 'res-17md5or')

        # 4.3 - Get Links 'res-1dwe62q'
        list_link01 = driver.find_elements(By.CLASS_NAME, 'res-1dwe62q')
        list_link = [anchor.get_attribute('href') for anchor in list_link01]

        # 4.4 - Get Texts for each finding
        list_description = [des.text for des in description]
        print('Header',len(list_header), 'Publish',len(list_publish), 'Company',len(list_company[1:]), 'Ort',len(list_ort), 'Desc', len(list_description), 'Link',len(list_link))

        # 4.6 - DataFrame df
        # from_dict(orient='index') followed by .T pads shorter columns
        # instead of failing on unequal lengths.
        # NOTE(review): the first company match is dropped ([1:]), presumably
        # because it is not a job card — TODO confirm; if that assumption is
        # wrong, every company is shifted by one row against its title.
        d = dict(job_title=np.array(list_header), publish=np.array(list_publish), company=np.array(list_company[1:]), city=np.array(list_ort) , description=np.array(list_description), link=np.array(list_link))
        df01 = pd.DataFrame.from_dict(d, orient='index').T

        # 5.1 - Save Data as csv
        print(f'DataFrame End : {df01.shape}')
        df01['website'] = website_name
        time_ = datetime.today().strftime('%Y-%m-%d %H:%M:%S')
        df01['date'] = time_
        df01['search_title'] = job_name

        if page_number == 2:
            # First page: create/overwrite the file and write the header row.
            df01.to_csv(f'{path}/{job_name}_first.csv', header=True)
        else:
            df01.to_csv(f'{path}/{job_name}_first.csv', mode='a', header=False)

    # 6 - Quit
    end = datetime.now()
    print('Code Runned No Problem')
    log('end_stepstone')
    print(f'Time = {end - start}')
    sleep(0.5)
finally:
    driver.quit()

# Preview of the last scraped page.
df01.head(2)

---------------------- StepStone Job Searching Selenium Project ----------------------
Create Driver
Go to Website
Banned
Header 25 Publish 25 Company 24 Ort 25 Desc 25 Link 25
DataFrame End : (25, 6)
Go to Website
Page Numer = 3
Header 25 Publish 25 Company 24 Ort 25 Desc 25 Link 25
DataFrame End : (25, 6)
Go to Website
Page Numer = 4
Header 25 Publish 25 Company 24 Ort 25 Desc 25 Link 25
DataFrame End : (25, 6)
Go to Website
Page Numer = 5
Header 25 Publish 25 Company 24 Ort 25 Desc 25 Link 25
DataFrame End : (25, 6)
Go to Website
Page Numer = 6
Header 25 Publish 25 Company 24 Ort 25 Desc 25 Link 25
DataFrame End : (25, 6)
Code Runned No Problem
Time = 0:01:24.841145


Unnamed: 0,job_title,publish,company,city,description,link,website,date,search_title
0,Project Engineer / Software Developer (f/w/d) ...,vor 2 Tagen,Otto (GmbH & Co KG),"Bremen, Augsburg, Bad Vilbel, Berlin, Dessau, ...","Our modular software toolkit, which is based o...",https://www.stepstone.de/stellenangebote--Proj...,stepstone,2023-03-14 20:30:39,Data Engineer
1,"Software Engineer | Java, Spring, GCP | Team M...",vor 2 Tagen,Otto (GmbH & Co KG),"Hamburg, Home-Office","GCP Cloud (z. B. pubsub, cloudrun, cloud datas...",https://www.stepstone.de/stellenangebote--Soft...,stepstone,2023-03-14 20:30:39,Data Engineer


In [7]:
    print('---------------------- Xing Job Searching Selenium Project ----------------------')
# Scrape Xing result pages 2-6 for one search and append the rows to
# '<path>/<job_name>_first.csv' (no header row is written here).
start = datetime.now()
log('start_xing')

# Link Descriptions
# Example search URL, kept for reference only; the loop builds its own.
link_original_xing = 'https://www.xing.com/jobs/search?keywords=Data%20Engineer&location=Rietberg&page=1&radius=100'

website_name = 'xing'
job_name = 'Data Engineer'
ort_ = 'Rietberg'
radius = 100

# Output folder, defined here so the cell no longer depends on a `path`
# variable left in the kernel by the StepStone cell.
path = '/Users/macbook/Desktop/projects/Github_Repositories/Trainings/web_scpraing_portfolio_deneme'

# URL-encode the search terms once so the query follows job_name / ort_
# instead of hard-coded values ('Data Engineer' -> 'Data%20Engineer').
keywords_param = quote(job_name)
location_param = quote(ort_)

# CSS class names on the result page:
#   H   - header elements (title and company alternate in the match list)
#   D   - description teaser
#   L   - location
#   ALL - the whole result card (its text carries the publish info, its href the link)
H = 'utils-line-clamp-lineClamp2-dfe26aab'
D = 'list-item-job-teaser-list-item-highlight-bb8ddbb6'
L = 'list-item-job-teaser-list-item-location-a5b28738'
ALL = 'list-item-job-teaser-list-item-listItem-f04c772e'

# NOTE(review): the first nested level below is indented 8 spaces on purpose.
# The cell's opening print line carries a 4-space indent, and IPython appears
# to strip that prefix from every line that has it — confirm before re-indenting.

#  1 - Create Driver
Path = '/Users/macbook/Desktop/projects/Github_Repositories/Trainings/web_scpraing_portfolio_deneme/chromedriver'
driver = webdriver.Chrome(Path)

# try/finally guarantees the browser is closed even when a page fails to scrape.
try:
        for page_number in range(2, 7):
            #  2 - Go to Website
            link = f'https://www.xing.com/jobs/search?keywords={keywords_param}&location={location_param}&page={page_number}&radius={radius}&sort=date'

            driver.get(link)
            wait(5)
            sleep(1)

            #  3 - Accept the cookie banner (only on the first page visited)
            if page_number == 2:
                ID = 'consent-accept-button'
                click_bann_byID(ID)
            else:
                print(f'Page Numebr = {page_number}')

            # 4 -  Take Infos from Page
            list_header = find_element(H)
            list_description = find_element(D)
            list_ort = find_element(L)
            list_all = find_element(ALL)

            # 4.1 - Publish info: second-to-last line of each card's text; the
            # line before it is collected separately. Cards with too few
            # lines yield None instead of raising IndexError.
            list_publish = []
            list_full_time = []
            for card_text in list_all:
                card_lines = card_text.split('\n')
                list_publish.append(card_lines[-2] if len(card_lines) >= 2 else None)
                list_full_time.append(card_lines[-3] if len(card_lines) >= 3 else None)

            # 4.2 - Header matches alternate title, company, title, company, ...
            # Slicing tolerates an odd-length list (the old index loop raised).
            list_title = list_header[0::2]
            list_company = list_header[1::2]

            # 4.3 - Get Links (href of every result card)
            cards = driver.find_elements(By.CLASS_NAME, ALL)
            list_link = [card.get_attribute('href') for card in cards]

            # 4.4 - DataFrame df
            # from_dict(orient='index') followed by .T pads shorter columns
            # instead of failing on unequal lengths.
            d = dict(job_title=np.array(list_title), publish=np.array(list_publish), company=np.array(list_company), city=np.array(list_ort) , description=np.array(list_description), link=np.array(list_link))
            df02 = pd.DataFrame.from_dict(d, orient='index').T
            df02['website'] = website_name
            time_now = datetime.today().strftime('%Y-%m-%d %H:%M:%S')
            df02['date'] = time_now
            df02['search_title'] = job_name

            # Sanity print: number of matches per field on this page.
            list_of_list = [list_header, list_description, list_ort, list_publish, list_link]
            print([len(items) for items in list_of_list])

            # Appends without a header row; the target file is expected to
            # have been created (with header) by the StepStone cell.
            df02.to_csv(f'{path}/{job_name}_first.csv', mode='a', header=False)

        # Quit
        end = datetime.now()
        print('Code Runned No Problem')
        log('end_Xing')
        print(f'Time = {end - start}')
        sleep(0.5)
finally:
        driver.quit()

# Preview of the last scraped page.
df02.head(2)

---------------------- Xing Job Searching Selenium Project ----------------------
[40, 20, 20, 20, 20]
Page Numebr = 3
[40, 20, 20, 20, 20]
Page Numebr = 4
[40, 20, 20, 20, 20]
Page Numebr = 5
[40, 20, 20, 20, 20]
Page Numebr = 6
[40, 20, 20, 20, 20]
Code Runned No Problem
Time = 0:00:53.874477


Unnamed: 0,job_title,publish,company,city,description,link,website,date,search_title
0,Data Engineer DWH / BI (m/w/d),Vor 25 Tagen,FALKE KGaA,Schmallenberg,Verstärken Sie das Team unseres Hauptsitzes am...,https://www.xing.com/jobs/schmallenberg-data-e...,xing,2023-03-14 20:48:07,Data Engineer
1,Teamlead Data Engineer (m/w/d) | Dortmund,Vor 26 Tagen,ADVERGY GmbH,Dortmund,Aufgaben Leitung eines wachsenden Data Enginee...,https://www.xing.com/jobs/dortmund-teamlead-da...,xing,2023-03-14 20:48:07,Data Engineer


In [28]:
# Load the combined job listings; the first spreadsheet column becomes the index.
# NOTE(review): the scraping cells write '<job_name>_first.csv', while this reads
# an .xlsx with the same stem — presumably converted outside the notebook; confirm.
excel_file = "Data Engineer_first.xlsx"
df = pd.read_excel(excel_file, index_col=[0])
df.head()

Unnamed: 0,job_title,publish,company,city,description,link,website,date,search_title
0,Production Quality Supervisor (m/w/d),vor 15 Stunden,Polizeipräsidium Gelsenkirchen,Bochum,Molex CVS Bochum GmbH * Bochum * Feste Anstell...,https://www.stepstone.de/stellenangebote--Prod...,stepstone,2023-03-14 20:29:42,Data Engineer
1,Sachbearbeiter/in Intel-Officer-Sentinel (w/m/...,vor 18 Stunden,Beckhoff Automation GmbH & Co. KG,Gelsenkirchen,eine abgeschlossene einschlägige Hochschulausb...,https://www.stepstone.de/stellenangebote--Sach...,stepstone,2023-03-14 20:29:42,Data Engineer
2,Data Scientist (m/w/d) Application / (Data Ana...,vor 1 Tag,Beckhoff Automation GmbH & Co. KG,Verl,Kunden- und Applikationssupport für eine Auswa...,https://www.stepstone.de/stellenangebote--Data...,stepstone,2023-03-14 20:29:42,Data Engineer
3,Data Scientist (m/w/d) Application / Support (...,vor 1 Tag,Manufactum GmbH,Verl,erfolgreich abgeschlossenes Studium der Fachri...,https://www.stepstone.de/stellenangebote--Data...,stepstone,2023-03-14 20:29:42,Data Engineer
4,Process- & Data Architect (d/m/w),vor 1 Tag,Ratbacher GmbH,Waltrop bei Dortmund,Für unsere Zentrale in Waltrop (bei Dortmund) ...,https://www.stepstone.de/stellenangebote--Proc...,stepstone,2023-03-14 20:29:42,Data Engineer


In [29]:

# Load `df` into the PostgreSQL table `alldata` and print a small sample query.
print('--------------------------------------- Connect to Database ---------------------------------------')

# Connection settings are read from the environment when present; the
# fallbacks are the values this cell previously hard-coded.
# TODO: remove the password fallback once JOB_DB_PASSWORD is set where this runs.
db_settings = {
    'database': os.environ.get('JOB_DB_NAME', 'JOB'),
    'user': os.environ.get('JOB_DB_USER', 'postgres'),
    'password': os.environ.get('JOB_DB_PASSWORD', '1984'),
    'host': os.environ.get('JOB_DB_HOST', '127.0.0.1'),
    'port': os.environ.get('JOB_DB_PORT', '5432'),
}

# The backslashes join the column definitions into a single line of SQL.
sql = '''CREATE TABLE IF NOT EXISTS alldata(id serial PRIMARY KEY,
job_title text ,\
publish varchar(30),\
company text,\
city varchar(300),\
description text,\
link text,\
website varchar(30),\
date timestamp,\
search_title varchar(30));'''

sql2 = '''SELECT company FROM alldata WHERE publish LIKE '%Stunde%' ORDER BY publish LIMIT 5  '''

conn = psycopg2.connect(**db_settings)
engine = None
try:
    # With autocommit on, each statement is committed immediately, so no
    # explicit commit() is needed further down.
    conn.autocommit = True
    cursor = conn.cursor()

    log('connected_database')

    cursor.execute(sql)

    # connection string: driver://username:password@server:port/database
    # Built from the same settings as the psycopg2 connection so both target
    # one server (the URL previously named 'localhost' on the default port).
    # User, password and database are URL-quoted so special characters
    # cannot break the URL.
    engine = create_engine(
        'postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}'.format(
            user=quote(db_settings['user'], safe=''),
            password=quote(db_settings['password'], safe=''),
            host=db_settings['host'],
            port=db_settings['port'],
            database=quote(db_settings['database'], safe=''),
        )
    )

    #  Note:  if_exists can be append, replace, fail.
    # 'append' means re-running this cell inserts the same rows again.
    df.to_sql('alldata', engine, if_exists='append', index = False)

    cursor.execute(sql2)
    for row in cursor.fetchall():
        print(row)

    log('finish_all')
finally:
    # Release both database handles even if one of the steps above failed.
    if engine is not None:
        engine.dispose()
    conn.close()

--------------------------------------- Connect to Database ---------------------------------------
('DATAGROUP',)
('FROMMER LEGAL',)
('Polizeipräsidium Gelsenkirchen',)
('DATAGROUP',)
('Beckhoff Automation GmbH & Co. KG',)


In [12]:
# Summarise `df`: index range, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 225 entries, 0 to 19
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   job_title     218 non-null    object        
 1   publish       225 non-null    object        
 2   company       221 non-null    object        
 3   city          225 non-null    object        
 4   description   217 non-null    object        
 5   link          225 non-null    object        
 6   website       225 non-null    object        
 7   date          225 non-null    datetime64[ns]
 8   search_title  225 non-null    object        
dtypes: datetime64[ns](1), object(8)
memory usage: 17.6+ KB


In [14]:
# Number of rows per search term.
df["search_title"].value_counts()

Data Engineer    225
Name: search_title, dtype: int64

In [15]:
# Number of rows contributed by each source website.
df["website"].value_counts()

stepstone    125
xing         100
Name: website, dtype: int64

In [17]:
# Count how often each link occurs within the first 50 rows (duplicates show counts above 1).
df.head(50)["link"].value_counts()

https://www.stepstone.de/stellenangebote--Production-Quality-Supervisor-m-w-d-Bochum-Molex-CVS-Bochum-GmbH--9329451-inline.html                                                                                                                1
https://www.stepstone.de/stellenangebote--Absolvent-Career-Starter-als-Software-Developer-m-w-d-Berlin-Bremen-Duesseldorf-Frankfurt-Guetersloh-Hamburg-Karlsruhe-Koeln-Leipzig-und-weitere-Reply--9287869-inline.html                          1
https://www.stepstone.de/stellenangebote--System-Engineer-Microsoft-Teams-m-w-d-Karlsruhe-Muenster-Atruvia-AG--9179364-inline.html                                                                                                             1
https://www.stepstone.de/stellenangebote--Senior-IT-Cloud-Architekt-Banking-m-w-d-Berlin-Frankfurt-a-M-Karlsruhe-Koeln-Muenchen-Muenster-Nuernberg-Passau-Stuttgart-msg-systems-ag--8998163-inline.html                                        1
https://www.stepstone.de/stellenange