<a href="https://colab.research.google.com/github/az-my/hana-cara/blob/main/Jobstreet_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Install all web scraping and web crawling libraries**

In [None]:
!pip install beautifulsoup4 requests selenium scrapy playwright MechanicalSoup pandas pyquery urllib3

In [None]:
!pip install PyGithub


# **Get Total Records**

In [17]:
import requests
import pandas as pd
from datetime import datetime, timedelta

# Function to fetch job data from a given URL
def fetch_job_data(url, timeout=30):
    """Fetch a URL and return its decoded JSON body.

    Args:
        url: Fully built request URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The parsed JSON payload, or ``None`` when the request fails, the
        status code is not 200, or the body is not valid JSON. A message is
        printed in each failure case.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to fetch data. Status code: {response.status_code}")
            return None
        return response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose JSON
        # decode error is not a RequestException subclass.
        print(f"Error occurred: {e}")
        return None

# Function to format the listing date
def format_date(date_string):
    """Convert an ISO-style ``YYYY-MM-DDTHH:MM:SSZ`` timestamp for display.

    Args:
        date_string: Timestamp text in the form ``%Y-%m-%dT%H:%M:%SZ``.

    Returns:
        The timestamp rendered as ``DD/MM/YYYY HH:MM:SS``, or ``'N/A'`` when
        the input does not match the expected format or is not a string
        (for example ``None`` from a JSON null).
    """
    try:
        date_object = datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%SZ")
    except (ValueError, TypeError):
        # TypeError: strptime rejects non-string input such as None.
        return 'N/A'
    return date_object.strftime("%d/%m/%Y %H:%M:%S")

# Function to calculate time difference from the listing date to the current date
def calculate_time_difference(date_string, now=None):
    """Describe how long ago a listing was posted.

    Args:
        date_string: Listing time as ``DD/MM/YYYY HH:MM:SS``. In this file it
            is produced by ``format_date`` from a ``Z``-suffixed timestamp,
            so it is treated as UTC.
        now: Optional naive ``datetime`` used as the reference time. Defaults
            to the current UTC time, so the result does not depend on the
            host's local timezone.

    Returns:
        ``"<n> hours"`` for less than a day, ``"<n> days"`` for less than a
        week, otherwise ``"<n> weeks"``; ``'N/A'`` when ``date_string``
        cannot be parsed.
    """
    # Local import: the file-level import only brings in datetime/timedelta.
    from datetime import timezone

    try:
        listing_date = datetime.strptime(date_string, "%d/%m/%Y %H:%M:%S")
    except (ValueError, TypeError):
        # TypeError: strptime rejects non-string input such as None.
        return 'N/A'

    if now is None:
        # Naive UTC "now" so it can be subtracted from the naive listing date.
        now = datetime.now(timezone.utc).replace(tzinfo=None)

    # Clamp so a listing slightly in the future (clock skew) never yields a
    # negative duration.
    elapsed = max(now - listing_date, timedelta(0))

    if elapsed < timedelta(days=1):
        return f"{int(elapsed.total_seconds() / 3600)} hours"
    if elapsed < timedelta(days=7):
        return f"{elapsed.days} days"
    return f"{elapsed.days // 7} weeks"

# Function to extract job details from each entry in the fetched data
def extract_job_details(entry):
    """Build a flat record for one job entry from the search response.

    Args:
        entry: Dict for a single job, read via the keys ``id``, ``title``,
            ``advertiser`` (with nested ``description``), ``listingDate``,
            ``locationWhereValue`` and ``salary``.

    Returns:
        A dict with the keys ``Job_Title``, ``Company``, ``Location``,
        ``Posted_On``, ``Time_Elapsed``, ``Salary`` and ``Apply_Here``.
        Missing fields fall back to ``'N/A'``.
    """
    job_id = entry.get('id', 'N/A')

    # `or 'N/A'` also covers an explicit null listing date.
    formatted_date = format_date(entry.get('listingDate') or 'N/A')
    time_difference = calculate_time_difference(formatted_date)

    # Tolerate a missing, null, or non-dict advertiser instead of raising.
    advertiser = entry.get('advertiser')
    if not isinstance(advertiser, dict):
        advertiser = {}

    return {
        "Job_Title": entry.get('title', 'N/A'),
        "Company": advertiser.get('description', 'N/A'),
        "Location": entry.get('locationWhereValue', 'N/A'),
        "Posted_On": formatted_date,
        "Time_Elapsed": time_difference,
        "Salary": entry.get('salary', 'N/A'),
        "Apply_Here": f"https://www.jobstreet.co.id/id/job/{job_id}"
    }

# Function to fetch job data from multiple pages and extract details
def fetch_all_job_data(base_url, params, total_pages):
    """Fetch every result page and collect the extracted job records.

    Args:
        base_url: Endpoint URL ending in ``?``; the query string is appended
            to it directly.
        params: Query parameters. Values are joined as-is (no URL encoding is
            applied here), so they must already be encoded. The dict is not
            modified.
        total_pages: Number of pages to request, starting from page 1.

    Returns:
        A list of job-detail dicts, one per entry that has an advertiser
        description. Pages that fail to load are skipped.
    """
    all_data = []
    for page in range(1, total_pages + 1):
        # Work on a per-page copy so the caller's dict is left untouched.
        page_params = {**params, 'page': str(page)}
        url = base_url + "&".join(f"{key}={value}" for key, value in page_params.items())
        print(f"Fetching data for page {page}: {url}")

        parsed_json = fetch_job_data(url)
        if not parsed_json:
            continue

        # `or []` guards against an explicit null "data" field.
        for entry in parsed_json.get('data') or []:
            advertiser = entry.get('advertiser')
            # Only keep entries with an advertiser description; a null
            # advertiser is skipped rather than raising.
            if isinstance(advertiser, dict) and 'description' in advertiser:
                all_data.append(extract_job_details(entry))
    return all_data

# Function to display a snippet of job data and save it to a JSON file
def display_and_save_job_data(all_data, preview_rows=10):
    """Show a preview of the job records and write all of them to JSON.

    Args:
        all_data: List of job-detail dicts.
        preview_rows: How many rows to show from the start and from the end
            of the table when it is too long to show in full.

    Side effects:
        Renders a left-aligned styled table via ``display`` (available in
        notebook environments) and writes every record to ``JobData.json``
        in the working directory.
    """
    df = pd.DataFrame(all_data)
    df['Number'] = range(1, len(df) + 1)  # 1-based numbering column

    # Only stitch head and tail together when they cannot overlap; otherwise
    # rows would be duplicated and the resulting non-unique index breaks
    # Styler rendering.
    if len(df) > 2 * preview_rows:
        preview = pd.concat([df.head(preview_rows), df.tail(preview_rows)])
    else:
        preview = df

    styled_df = preview.style.set_properties(**{'text-align': 'left'})
    styled_df.set_table_styles([{'selector': 'th', 'props': [('text-align', 'left')]}])
    display(styled_df)

    # Save the full DataFrame (not just the preview) as one JSON file.
    df.to_json('JobData.json', orient='records', indent=2)

# Function to fetch metadata including total job count and page size
def fetch_metadata(url, timeout=30):
    """Fetch the search metadata: total job count and page size.

    Args:
        url: Fully built search URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        A ``(total_job_count, page_size)`` tuple read from the response's
        ``solMetadata`` object, with 0 for any missing value. Returns
        ``(0, 0)`` when the request fails, the status code is not 200, or
        the body is not valid JSON.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to fetch metadata. Status code: {response.status_code}")
            return 0, 0
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose JSON
        # decode error is not a RequestException subclass.
        print(f"Error occurred: {e}")
        return 0, 0

    # `or {}` also covers an explicit null "solMetadata" field.
    metadata = data.get('solMetadata') or {}
    total_job_count = metadata.get('totalJobCount', 0)
    page_size = metadata.get('pageSize', 0)
    return total_job_count, page_size

# Function to calculate the number of pages based on total job count and page size
def calculate_total_pages(total_jobs, page_size):
    """Return how many pages are needed to hold ``total_jobs`` entries.

    Args:
        total_jobs: Total number of entries.
        page_size: Number of entries per page.

    Returns:
        The ceiling of ``total_jobs / page_size``, or 0 when ``page_size`` is
        not positive (for example the ``(0, 0)`` pair reported after a failed
        metadata request), instead of raising ``ZeroDivisionError``.
    """
    if page_size <= 0:
        return 0
    # Integer ceiling division.
    return (total_jobs + page_size - 1) // page_size

# Function to get total job count, page size, and number of pages
def get_job_information(base_url, params):
    """Look up the job count, page size, and resulting page count.

    Args:
        base_url: Endpoint URL to which the query string is appended.
        params: Query parameters, joined as ``key=value`` pairs with ``&``.

    Returns:
        A ``(total_jobs, page_size, total_pages)`` tuple.
    """
    query_string = "&".join(f"{name}={val}" for name, val in params.items())
    job_count, per_page = fetch_metadata(base_url + query_string)
    return job_count, per_page, calculate_total_pages(job_count, per_page)

# Main function orchestrating the process
def main():
    """Run the whole pipeline: count pages, fetch every page, show and save.

    The search is for the keywords ``%22data+analyst%22`` (URL-encoded
    ``"data analyst"``) on the ``ID-Main`` site, sorted by listed date.
    """
    search_endpoint = "https://www.jobstreet.co.id/api/chalice-search/v4/search?"
    query = dict(
        siteKey="ID-Main",
        keywords="%22data+analyst%22",
        sortmode="ListedDate",
        include="seodata",
        locale="id-ID",
    )

    # Only the page count is needed to drive the crawl.
    _, _, page_count = get_job_information(search_endpoint, query)

    jobs = fetch_all_job_data(search_endpoint, query, page_count)
    display_and_save_job_data(jobs)

# Run the pipeline only when this module is executed as the main program,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    main()


Fetching data for page 1: https://www.jobstreet.co.id/api/chalice-search/v4/search?siteKey=ID-Main&keywords=%22data+analyst%22&sortmode=ListedDate&include=seodata&locale=id-ID&page=1
Fetching data for page 2: https://www.jobstreet.co.id/api/chalice-search/v4/search?siteKey=ID-Main&keywords=%22data+analyst%22&sortmode=ListedDate&include=seodata&locale=id-ID&page=2
Fetching data for page 3: https://www.jobstreet.co.id/api/chalice-search/v4/search?siteKey=ID-Main&keywords=%22data+analyst%22&sortmode=ListedDate&include=seodata&locale=id-ID&page=3
Fetching data for page 4: https://www.jobstreet.co.id/api/chalice-search/v4/search?siteKey=ID-Main&keywords=%22data+analyst%22&sortmode=ListedDate&include=seodata&locale=id-ID&page=4
Fetching data for page 5: https://www.jobstreet.co.id/api/chalice-search/v4/search?siteKey=ID-Main&keywords=%22data+analyst%22&sortmode=ListedDate&include=seodata&locale=id-ID&page=5
Fetching data for page 6: https://www.jobstreet.co.id/api/chalice-search/v4/search?si

Unnamed: 0,Job_Title,Company,Location,Posted_On,Time_Elapsed,Salary,Apply_Here,Number
0,Merchandise Planning Associate Manager,PT MAP AKTIF ADIPERKASA,Jakarta Raya,20/11/2023 04:44:51,11 hours,,https://www.jobstreet.co.id/id/job/71639391,1
1,Manager Warehouse and Support (Warehouse Analyst),PT. GAWE BECIK NADHAH ANUGRAH,Yogyakarta DI Yogyakarta,19/11/2023 09:27:26,1 days,Rp 6.000.000 – Rp 9.000.000 per month,https://www.jobstreet.co.id/id/job/71624382,2
2,Data Analyst,PT. Automobil Jaya Abadi,Sidoarjo Jawa Timur,18/11/2023 09:27:48,2 days,,https://www.jobstreet.co.id/id/job/71618799,3
3,Data Analyst,PT Cheil Worldwide Indonesia,Jakarta Raya,17/11/2023 09:52:27,3 days,,https://www.jobstreet.co.id/id/job/71608423,4
4,Data Analyst and Administration Marketing,Pengiklan Privat,Sidoarjo Jawa Timur,17/11/2023 08:57:23,3 days,"Rp 4,500,000 – Rp 6,300,000 per month",https://www.jobstreet.co.id/id/job/71605736,5
5,System Analyst,PT Persada Inti Utama (Jakarta),Bekasi Jawa Barat,17/11/2023 06:57:47,3 days,,https://www.jobstreet.co.id/id/job/71601660,6
6,Data Analyst,PT Adi Data Informatika,Jakarta Raya,17/11/2023 03:35:12,3 days,"Rp 7,000,000 – Rp 14,000,000 per month",https://www.jobstreet.co.id/id/job/71593804,7
7,Data Analyst - CS2,Orang Tua Group,Jakarta Barat Jakarta Raya,17/11/2023 02:14:00,3 days,,https://www.jobstreet.co.id/id/job/71590249,8
8,Data Analyst (Enseval Putera Megatrading Tbk),Company Listings ID,Jakarta Raya,16/11/2023 21:28:22,3 days,,https://www.jobstreet.co.id/id/job/71583421,9
9,Junior Data Analyst (JULO),Company Listings ID,Jakarta Raya,16/11/2023 21:28:06,3 days,,https://www.jobstreet.co.id/id/job/71583379,10
