In [82]:
import requests
from bs4 import BeautifulSoup
from time import sleep
import pprint
from tqdm import tqdm_notebook as tqdm # this is a fancy progress bar!
import pandas as pd

## Construct URL

In [61]:
#Creating the URL
main_link = 'https://www.kariera.gr/jobs'
query = '?title='
job_roles = ['data+analyst', 'data+scientist', 'data+engineer']

page_ext = '&page='

link = main_link + query + '+'.join(job_roles) + page_ext +'0'
link

'https://www.kariera.gr/jobs?title=data+analyst+data+scientist+data+engineer&page=0'

In [62]:
response = requests.get(link, timeout= 15)

In [63]:
response.status_code

200

## Print required fields

In [64]:
soup = BeautifulSoup(response.content, 'html.parser')

In [35]:
job_ad = soup.find_all('section', class_ = 'ZVuAW2ak')[0]

In [87]:
# job id
job_ad.find('a', class_ = 'h4 tGc1KEdv').get('href').replace('?sponsored','').split('/')[-1]

'21637'

In [36]:
# job title

job_ad.find('a', class_ = 'h4 tGc1KEdv').text


'Διαχείριση Αιτημάτων Leads/Digital Campaigns WIND & NOVA'

In [41]:
# Comapny
job_ad.find('a', attrs={'data-testid': 'job-card-company-name-link'}).text


'BFS Group'

In [43]:
# location
job_ad.find('span', attrs={'data-testid': 'location'}).text

'Άγιος Δημήτριος Αττικής'

In [45]:
# job occupation
job_ad.find('span', attrs={'data-testid': 'occupationType'}).text


'Μερική απασχόληση'

In [90]:
# Create a function to gather all data
def parse_page(soup):
    jobs = soup.find_all('section', class_ = 'ZVuAW2ak')

    data = []
    for job in tqdm(jobs):
        sleep(0.6)
        try:
            job_id = int(job.find('a', class_ = 'h4 tGc1KEdv').get('href').replace('?sponsored','').split('/')[-1])
            title = job.find('a', class_ = 'h4 tGc1KEdv').text
            if job.find('a', attrs={'data-testid': 'job-card-company-name-link'}):
                company = job.find('a', attrs={'data-testid': 'job-card-company-name-link'}).text
            else: 
                company = None
            location = job.find('span', attrs={'data-testid': 'location'}).text
            occupation = job.find('span', attrs={'data-testid': 'occupationType'}).text
            data.append([job_id, title, company, location, occupation])
        except:
            print('Error parsing job')
    return data

In [91]:
data = parse_page(soup)
data

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for job in tqdm(jobs):


  0%|          | 0/21 [00:00<?, ?it/s]

[[21851,
  'Software Quality Assurance Engineer',
  'ATOS GREECE',
  'Αθήνα',
  'Πλήρης απασχόληση'],
 [21852,
  'Backend Software Development Engineer',
  'ATOS GREECE',
  'Αθήνα',
  'Πλήρης απασχόληση'],
 [23402,
  'Software Engineers στη Θέρμη Θεσσαλονίκης',
  'Lidl Ελλάς ',
  'Θέρμη',
  'Πλήρης απασχόληση'],
 [21998,
  'Senior Cloud Engineer',
  'Executive Level',
  'Αθήνα',
  'Πλήρης απασχόληση'],
 [24951,
  'CREDIT CONTROLLER/ΥΠΕΥΘΥΝΟΣ ΠΙΣΤΩΤΙΚΟΥ ΕΛΕΓΧΟΥ',
  None,
  'Νέα Φιλαδέλφεια Αττικής',
  'Πλήρης απασχόληση'],
 [24972,
  'Content Creator',
  'Hiring Solutions | kariera.gr',
  'Αθήνα',
  'Πλήρης απασχόληση'],
 [24973,
  'Ecommerce Content Editor',
  'Hiring Solutions | kariera.gr',
  'Αθήνα',
  'Πλήρης απασχόληση'],
 [24978,
  'Project Manager (Content)',
  'Hiring Solutions | kariera.gr',
  'Αθήνα',
  'Πλήρης απασχόληση'],
 [24979,
  'Content Manager',
  'Hiring Solutions | kariera.gr',
  'Αθήνα',
  'Πλήρης απασχόληση'],
 [24960,
  'CRM Sales Support Officer',
  'PwC Greece

## Retrieve total pages

In [92]:
total_pages = int(soup.find('ul', class_="ant-pagination").find_all('li', class_ = 'ant-pagination-item')[-1].text)
total_pages

22

In [94]:
# Retrieve data from all pages
all_data = []
for page in range(total_pages):
    link = main_link + query + '+'.join(job_roles) + page_ext + str(page)
    response = requests.get(link, timeout= 15)
    if response.status_code == 200:
        print(f"Iterating page {page+1}")
        print(link)
        soup = BeautifulSoup(response.content, 'html.parser')
        page_data = parse_page(soup)
        
        all_data.extend(page_data)
        
len(all_data)

Iterating page 1
https://www.kariera.gr/jobs?title=data+analyst+data+scientist+data+engineer&page=0


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for job in tqdm(jobs):


  0%|          | 0/24 [00:00<?, ?it/s]

Iterating page 2
https://www.kariera.gr/jobs?title=data+analyst+data+scientist+data+engineer&page=1


  0%|          | 0/24 [00:00<?, ?it/s]

Iterating page 3
https://www.kariera.gr/jobs?title=data+analyst+data+scientist+data+engineer&page=2


  0%|          | 0/24 [00:00<?, ?it/s]

Iterating page 4
https://www.kariera.gr/jobs?title=data+analyst+data+scientist+data+engineer&page=3


  0%|          | 0/24 [00:00<?, ?it/s]

Iterating page 5
https://www.kariera.gr/jobs?title=data+analyst+data+scientist+data+engineer&page=4


  0%|          | 0/24 [00:00<?, ?it/s]

Iterating page 6
https://www.kariera.gr/jobs?title=data+analyst+data+scientist+data+engineer&page=5


  0%|          | 0/24 [00:00<?, ?it/s]

Iterating page 7
https://www.kariera.gr/jobs?title=data+analyst+data+scientist+data+engineer&page=6


  0%|          | 0/24 [00:00<?, ?it/s]

Iterating page 8
https://www.kariera.gr/jobs?title=data+analyst+data+scientist+data+engineer&page=7


  0%|          | 0/24 [00:00<?, ?it/s]

Iterating page 9
https://www.kariera.gr/jobs?title=data+analyst+data+scientist+data+engineer&page=8


  0%|          | 0/24 [00:00<?, ?it/s]

Iterating page 10
https://www.kariera.gr/jobs?title=data+analyst+data+scientist+data+engineer&page=9


  0%|          | 0/24 [00:00<?, ?it/s]

Iterating page 11
https://www.kariera.gr/jobs?title=data+analyst+data+scientist+data+engineer&page=10


  0%|          | 0/24 [00:00<?, ?it/s]

Iterating page 12
https://www.kariera.gr/jobs?title=data+analyst+data+scientist+data+engineer&page=11


  0%|          | 0/24 [00:00<?, ?it/s]

Iterating page 13
https://www.kariera.gr/jobs?title=data+analyst+data+scientist+data+engineer&page=12


  0%|          | 0/24 [00:00<?, ?it/s]

Iterating page 14
https://www.kariera.gr/jobs?title=data+analyst+data+scientist+data+engineer&page=13


  0%|          | 0/24 [00:00<?, ?it/s]

Iterating page 15
https://www.kariera.gr/jobs?title=data+analyst+data+scientist+data+engineer&page=14


  0%|          | 0/24 [00:00<?, ?it/s]

Iterating page 16
https://www.kariera.gr/jobs?title=data+analyst+data+scientist+data+engineer&page=15


  0%|          | 0/24 [00:00<?, ?it/s]

Iterating page 17
https://www.kariera.gr/jobs?title=data+analyst+data+scientist+data+engineer&page=16


  0%|          | 0/24 [00:00<?, ?it/s]

Iterating page 18
https://www.kariera.gr/jobs?title=data+analyst+data+scientist+data+engineer&page=17


  0%|          | 0/24 [00:00<?, ?it/s]

Iterating page 19
https://www.kariera.gr/jobs?title=data+analyst+data+scientist+data+engineer&page=18


  0%|          | 0/24 [00:00<?, ?it/s]

Iterating page 20
https://www.kariera.gr/jobs?title=data+analyst+data+scientist+data+engineer&page=19


  0%|          | 0/24 [00:00<?, ?it/s]

Iterating page 21
https://www.kariera.gr/jobs?title=data+analyst+data+scientist+data+engineer&page=20


  0%|          | 0/24 [00:00<?, ?it/s]

Iterating page 22
https://www.kariera.gr/jobs?title=data+analyst+data+scientist+data+engineer&page=21


  0%|          | 0/21 [00:00<?, ?it/s]

525

## Parse to dataframe

In [116]:
jobs_df = pd.DataFrame(all_data, columns=["job_id", "title", "company", "location", "occupation"])
jobs_df

Unnamed: 0,job_id,title,company,location,occupation
0,21851,Software Quality Assurance Engineer,ATOS GREECE,Αθήνα,Πλήρης απασχόληση
1,21852,Backend Software Development Engineer,ATOS GREECE,Αθήνα,Πλήρης απασχόληση
2,23402,Software Engineers στη Θέρμη Θεσσαλονίκης,Lidl Ελλάς,Θέρμη,Πλήρης απασχόληση
3,21998,Senior Cloud Engineer,Executive Level,Αθήνα,Πλήρης απασχόληση
4,21047,Software Engineer (Python) – Data Scientist,METIS CYBERSPACE TECHNOLOGY SA,Αθήνα,Πλήρης απασχόληση
...,...,...,...,...,...
520,22238,Junior Controller / Thessaloniki,Sani/Ikos Group,Θεσσαλονίκη,Πλήρης απασχόληση
521,22044,Product Manager,WIND HELLAS TELECOMMUNICATIONS,Αθήνα,Πλήρης απασχόληση
522,21970,Marketing Assistant,VARIO A.E.,Ταύρος Αττικής,Πλήρης απασχόληση
523,21807,Production Finance Controller,OPTIMAL HR GROUP,Θεσσαλονίκη,Πλήρης απασχόληση


In [117]:
#remove duplicates because in each page we had the same promoted jobs
jobs_df = jobs_df.drop_duplicates('job_id', keep='first')
jobs_df

Unnamed: 0,job_id,title,company,location,occupation
0,21851,Software Quality Assurance Engineer,ATOS GREECE,Αθήνα,Πλήρης απασχόληση
1,21852,Backend Software Development Engineer,ATOS GREECE,Αθήνα,Πλήρης απασχόληση
2,23402,Software Engineers στη Θέρμη Θεσσαλονίκης,Lidl Ελλάς,Θέρμη,Πλήρης απασχόληση
3,21998,Senior Cloud Engineer,Executive Level,Αθήνα,Πλήρης απασχόληση
4,21047,Software Engineer (Python) – Data Scientist,METIS CYBERSPACE TECHNOLOGY SA,Αθήνα,Πλήρης απασχόληση
...,...,...,...,...,...
520,22238,Junior Controller / Thessaloniki,Sani/Ikos Group,Θεσσαλονίκη,Πλήρης απασχόληση
521,22044,Product Manager,WIND HELLAS TELECOMMUNICATIONS,Αθήνα,Πλήρης απασχόληση
522,21970,Marketing Assistant,VARIO A.E.,Ταύρος Αττικής,Πλήρης απασχόληση
523,21807,Production Finance Controller,OPTIMAL HR GROUP,Θεσσαλονίκη,Πλήρης απασχόληση


## Retrieve content

In [121]:
# URL of a specific job ad
job_ad_url = f"https://www.kariera.gr/jobs/{jobs_df.loc[0, ['job_id']].values[0]}"
job_ad_url

'https://www.kariera.gr/jobs/21851'

In [122]:
response = requests.get(job_ad_url, timeout= 15)
response.status_code

200

In [123]:
soup = BeautifulSoup(response.content, 'html.parser')

In [146]:
description_div = soup.find('div', attrs={'data-testid': 'html-renderer-job-main-body-text'})
descripition = ''
# <div class="__2DY4wJ3z hi8OBmAZ" data-testid="html-renderer-job-main-body-text"><div>
for item in description_div.find_all():
    print(item.text)
    
descripition    




Description

Atos is a global leader in digital transformation with 105,000 employees and annual revenue of over € 11 billion. European number one in cybersecurity, cloud and high performance computing, the Group provides tailored end-to-end solutions for all industries in 71 countries. A pioneer in decarbonization services and products, Atos is committed to a secure and decarbonized digital for its clients. Atos operates under the brands Atos and Atos|Syntel. Atos is a SE (Societas Europaea), listed on the Next 20 Paris stock index.
The purpose of Atos is to help design the future of the information space. Its expertise and services support the development of knowledge, education and research in a multicultural approach and contribute to the development of scientific and technological excellence. Across the world, the Group enables its customers and employees, and members of societies at large to live, work and develop sustainably, in a safe and secure information space. www.atos.n

''

In [137]:
def get_description (job_id):
    sleep(0.6)
    job_ad_url = f"https://www.kariera.gr/jobs/{job_id}"
    response = requests.get(job_ad_url, timeout= 15)
    print(f"Retrieving content for {job_ad_url}")
    if response.status_code == 200:
        
        soup = BeautifulSoup(response.content, 'html.parser')
        
        description_div = soup.find('div', attrs={'data-testid': 'html-renderer-job-main-body-text'})
        descripition = ''
        # <div class="__2DY4wJ3z hi8OBmAZ" data-testid="html-renderer-job-main-body-text"><div>
        for item in description_div.find_all():
            descripition+=item.text

        return descripition   
    else:
        print(f"failed")

In [144]:
# add the content to our previous dataframe
jobs_df['content'] = jobs_df.apply(lambda x: get_description(x.job_id), axis=1) 


Retrieving content for https://www.kariera.gr/jobs/21851
Retrieving content for https://www.kariera.gr/jobs/21852
Retrieving content for https://www.kariera.gr/jobs/23402
Retrieving content for https://www.kariera.gr/jobs/21998
Retrieving content for https://www.kariera.gr/jobs/21047
Retrieving content for https://www.kariera.gr/jobs/24349
Retrieving content for https://www.kariera.gr/jobs/24345
Retrieving content for https://www.kariera.gr/jobs/24625
Retrieving content for https://www.kariera.gr/jobs/20169
Retrieving content for https://www.kariera.gr/jobs/25237
Retrieving content for https://www.kariera.gr/jobs/24017
Retrieving content for https://www.kariera.gr/jobs/21092
Retrieving content for https://www.kariera.gr/jobs/25247
Retrieving content for https://www.kariera.gr/jobs/23390
Retrieving content for https://www.kariera.gr/jobs/20689
Retrieving content for https://www.kariera.gr/jobs/20173
Retrieving content for https://www.kariera.gr/jobs/20163
Retrieving content for https://

Retrieving content for https://www.kariera.gr/jobs/20864
Retrieving content for https://www.kariera.gr/jobs/20762
Retrieving content for https://www.kariera.gr/jobs/20681
Retrieving content for https://www.kariera.gr/jobs/20594
Retrieving content for https://www.kariera.gr/jobs/20568
Retrieving content for https://www.kariera.gr/jobs/20580
Retrieving content for https://www.kariera.gr/jobs/20531
Retrieving content for https://www.kariera.gr/jobs/20453
Retrieving content for https://www.kariera.gr/jobs/20371
Retrieving content for https://www.kariera.gr/jobs/20252
Retrieving content for https://www.kariera.gr/jobs/22767
Retrieving content for https://www.kariera.gr/jobs/25348
Retrieving content for https://www.kariera.gr/jobs/25040
Retrieving content for https://www.kariera.gr/jobs/24946
Retrieving content for https://www.kariera.gr/jobs/24078
Retrieving content for https://www.kariera.gr/jobs/24749
Retrieving content for https://www.kariera.gr/jobs/24661
Retrieving content for https://

Retrieving content for https://www.kariera.gr/jobs/24557
Retrieving content for https://www.kariera.gr/jobs/24101
Retrieving content for https://www.kariera.gr/jobs/23881
Retrieving content for https://www.kariera.gr/jobs/21551
Retrieving content for https://www.kariera.gr/jobs/22626
Retrieving content for https://www.kariera.gr/jobs/23148
Retrieving content for https://www.kariera.gr/jobs/21346
Retrieving content for https://www.kariera.gr/jobs/24949
Retrieving content for https://www.kariera.gr/jobs/24921
Retrieving content for https://www.kariera.gr/jobs/22759
Retrieving content for https://www.kariera.gr/jobs/21167
Retrieving content for https://www.kariera.gr/jobs/20381
Retrieving content for https://www.kariera.gr/jobs/22333
Retrieving content for https://www.kariera.gr/jobs/21389
Retrieving content for https://www.kariera.gr/jobs/20292
Retrieving content for https://www.kariera.gr/jobs/23841
Retrieving content for https://www.kariera.gr/jobs/25560
Retrieving content for https://

Retrieving content for https://www.kariera.gr/jobs/23536
Retrieving content for https://www.kariera.gr/jobs/23535
Retrieving content for https://www.kariera.gr/jobs/23190
Retrieving content for https://www.kariera.gr/jobs/22695
Retrieving content for https://www.kariera.gr/jobs/22238
Retrieving content for https://www.kariera.gr/jobs/22044
Retrieving content for https://www.kariera.gr/jobs/21970
Retrieving content for https://www.kariera.gr/jobs/21807
Retrieving content for https://www.kariera.gr/jobs/20247


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  jobs_df['content'] = jobs_df.apply(lambda x: get_description(x.job_id), axis=1)


In [145]:
jobs_df

Unnamed: 0,job_id,title,company,location,occupation,content
0,21851,Software Quality Assurance Engineer,ATOS GREECE,Αθήνα,Πλήρης απασχόληση,\n\n\nDescription\n\nAtos is a global leader i...
1,21852,Backend Software Development Engineer,ATOS GREECE,Αθήνα,Πλήρης απασχόληση,\n\n\nDescription\n\nAtos is a global leader i...
2,23402,Software Engineers στη Θέρμη Θεσσαλονίκης,Lidl Ελλάς,Θέρμη,Πλήρης απασχόληση,\n\nΠοιοι είμαστε\n\n\nΠάνω από 3.500 ΙΤ profe...
3,21998,Senior Cloud Engineer,Executive Level,Αθήνα,Πλήρης απασχόληση,Senior Cloud EngineerSenior Cloud Engineer Th...
4,21047,Software Engineer (Python) – Data Scientist,METIS CYBERSPACE TECHNOLOGY SA,Αθήνα,Πλήρης απασχόληση,Job DescriptionJob DescriptionAs a Python Soft...
...,...,...,...,...,...,...
520,22238,Junior Controller / Thessaloniki,Sani/Ikos Group,Θεσσαλονίκη,Πλήρης απασχόληση,Who do you want to become? Who do you want to...
521,22044,Product Manager,WIND HELLAS TELECOMMUNICATIONS,Αθήνα,Πλήρης απασχόληση,"At WIND, we believe in a world of connection, ..."
522,21970,Marketing Assistant,VARIO A.E.,Ταύρος Αττικής,Πλήρης απασχόληση,\n\n\nDescription\n\nΗ VARIO Α.Ε. ιδρύθηκε το ...
523,21807,Production Finance Controller,OPTIMAL HR GROUP,Θεσσαλονίκη,Πλήρης απασχόληση,Production Finance ControllerProduction Financ...
