In [1]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd
import urllib.parse
import re, json

In [3]:
from scrapfly import ScrapflyClient, ScrapeConfig
from urllib.parse import urlencode
# import asyncio 

scrapfly = ScrapflyClient(key="scp-live-10421fbda886456f885ee18e9e914588")

result = scrapfly.scrape(ScrapeConfig(
    url="https://www.indeed.com/jobs?q=python&l=Texas",
    asp=True,
))
print(result.selector.xpath('//h1').get())

AttributeError: 'str' object has no attribute 'text'

In [38]:


def parse_search_page(html: str):
    data = re.findall(r'window.mosaic.providerData\["mosaic-provider-jobcards"\]=(\{.+?\});', html)
    data = json.loads(data[0])
    return {
        "results": data["metaData"]["mosaicProviderJobCardsModel"]["results"],
        "meta": data["metaData"]["mosaicProviderJobCardsModel"]["tierSummaries"],
    }

async def scrape_search(query: str, location: str, max_results: int = 50):
    def make_page_url(offset):
        parameters = {"q": query, "l": location, "filter": 0, "start": offset}
        return "https://www.indeed.com/jobs?" + urlencode(parameters)

    print(f"scraping first page of search: {query=}, {location=}")
    result_first_page = await scrapfly.async_scrape(ScrapeConfig(make_page_url(0), asp=True))
    data_first_page = parse_search_page(result_first_page.content)

    results = data_first_page["results"]
    total_results = sum(category["jobCount"] for category in data_first_page["meta"])
    # there's a page limit on indeed.com of 1000 results per search
    if total_results > max_results:
        total_results = max_results
    print(f"scraping remaining {total_results - 10 / 10} pages")
    other_pages = [
        ScrapeConfig(make_page_url(offset), asp=True) 
        for offset in range(10, total_results + 10, 10)
    ]
    async for result in scrapfly.concurrent_scrape(other_pages):
        results.extend(parse_search_page(result.content))
    return results

In [33]:
import re
import json
from typing import List
from scrapfly import ScrapeConfig, ScrapflyClient

scrapfly = ScrapflyClient(key="scp-live-10421fbda886456f885ee18e9e914588")


def parse_job_page(html):
    """parse job data from job listing page"""
    data = re.findall(r"_initialData=(\{.+?\});", html)
    data = json.loads(data[0])
    return data["jobInfoWrapperModel"]["jobInfoModel"]


async def scrape_jobs(job_keys: List[str]):
    """scrape job details from job page for given job keys"""
    urls = [f"https://www.indeed.com/m/basecamp/viewjob?viewtype=embedded&jk={job_key}" for job_key in job_keys]
    to_scrape = [ScrapeConfig(url=url, asp=True) for url in urls]
    scraped = []
    async for result in scrapfly.concurrent_scrape(to_scrape):
        scraped.append(parse_job_page(result.content))
    return scraped

In [None]:
parse_job_page('https://www.indeed.com/jobs?q=python&l=Texas')

In [36]:
import re
import json
import httpx
import asyncio
from typing import List


def parse_job_page(html):
    """parse job data from job listing page"""
    data = re.findall(r"_initialData=(\{.+?\});", html)
    data = json.loads(data[0])
    return data["jobInfoWrapperModel"]["jobInfoModel"]


async def scrape_jobs(client: httpx.AsyncClient, job_keys: List[str]):
    """scrape job details from job page for given job keys"""
    urls = [f"https://www.indeed.com/m/basecamp/viewjob?viewtype=embedded&jk={job_key}" for job_key in job_keys]
    scraped = []
    for response in await asyncio.gather(*[client.get(url=url) for url in urls]):
        scraped.append(parse_job_page(response.text))
    return scraped

In [37]:
async def main():
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Connection": "keep-alive",
        "Accept-Language": "en-US,en;q=0.9,lt;q=0.8,et;q=0.7,de;q=0.6",
    }
    async with httpx.AsyncClient(headers=HEADERS) as client:
        job_data = await scrape_jobs(client, ["a82cf0bd2092efa3"])
        print(job_data[0]['sanitizedJobDescription']['content'])
        print(job_data)

asyncio.run(main())

  m = tuple(map(os.fspath, m))


RuntimeError: asyncio.run() cannot be called from a running event loop

In [8]:
from scrapfly import ScrapflyClient, ScrapeConfig

scrapfly = ScrapflyClient(key="scp-live-10421fbda886456f885ee18e9e914588")

searched_position = 'python'
location = 'indonesia'

result = scrapfly.scrape(ScrapeConfig(
    url = "https://www.indeed.com/jobs?q={}&l={}".format(searched_position,location),
    asp=True,
))
print(result.selector.xpath('//h1').get())
# result


<h1 class="css-novqjp e1tiznh50">python jobs</h1>


In [6]:

def convert_columns_data_type(df, cols, datatype):
    for col in cols:
        df[col] = df[col].astype(datatype)
        
def scrape_job_details(page_source):
    content = BeautifulSoup(page_source, 'lxml')
    # print(content)
    jobs_list = []    
    for post in content.select('.job_seen_beacon'):

        data = {
            "job_title": post.find('span', id=lambda x: x and x.startswith('jobTitle')).text,
            "company": post.find('span', class_='css-92r8pb').text,
            "location": post.find('div', class_='css-1p0sjhy').text,
            "posted_date":  post.find('span', attrs={'data-testid': 'myJobsStateDate'}).text,
            "job_description": post.find('div', class_='css-9446fg').text,
            "job_type": post.find('span', class_='css-12bzcbs').text,
            "scrapped_date": pd.to_datetime('today').strftime('%Y-%m-%d')
        }

        try: data["rating"] = post.find('span', attrs={'data-testid': 'holistic-rating'}).text
        except: data["rating"] = 0
        jobs_list.append(data)

    df = pd.DataFrame(jobs_list)
    if 'rating' in df.columns:
        convert_columns_data_type(df, ['rating'], np.float32) # Convert rating to float

    return df

new_results = []
job_search = input('Enter searched position: ')
job_query = urllib.parse.quote(job_search)
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

for i in range(10):
    url = 'https://id.indeed.com/jobs?q={}&start={}&l=&from=searchOnHP'.format(job_query,10 + i*10)
    response = requests.get(url, headers=headers)
    result = scrape_job_details(response.content)
    new_results.append(result)
combined_data = pd.concat(new_results, ignore_index=True)

combined_data.to_csv('indeed.csv')


<!DOCTYPE html>
<html dir="ltr" lang="in">
<head>
<link href="/images/favicon.ico" rel="shortcut icon"/>
<title>Lowongan Frontend Developer bulan 13 Mei 2024 | Indeed.com</title>
<meta content="text/html; charset=utf-8" http-equiv="content-type"/>
<meta content="Ada perusahaan yang sedang membuka kesempatan lowongan kerja Frontend Developer, Front End Developer, Web Developer, Developer dan banyak lagi melalui Indeed.com." name="description"/>
<meta content="origin-when-cross-origin" name="referrer"/>
<meta content="noindex" name="robots"/>
<link href="jobs?q=frontend+developer&amp;l=&amp;from=searchOnHP&amp;jlid=dd616958bd9ddc12&amp;forceLocation=-1&amp;rbsalmin=0&amp;rbsalmax=0&amp;start=0" rel="prev"/>
<link href="/jobs?q=frontend+developer&amp;l=&amp;from=searchOnHP&amp;jlid=dd616958bd9ddc12&amp;forceLocation=-1&amp;rbsalmin=0&amp;rbsalmax=0&amp;start=20" rel="next"/>
<link href="https://id.indeed.com/q-frontend-developer-lowongan.html" rel="canonical"/>
<link href="android-app://c

AttributeError: 'NoneType' object has no attribute 'text'