# Retrieving data from NoFluffJobs
The first part of the exercise consists of retrieving data from the NoFluffJobs homepage. Complete the exercise by following these steps:

1. Write a function that takes two parameters (job name and page number), and returns the HTML code of that page,
2. Write a function that takes one parameter (website code) and returns the information saying whether there are more offers on the page (True or False),
3. Write a function that takes one parameter (job name), and then in a loop, starting from page 1:
    - Retrieves the code of the given page,
    - Checks if there are still ads on the page,
    - If there are, it saves the HTML code to disk and goes to the next page,
    - If there are not, it terminates the operation,
Remember to use previously written functions in step 3.

At this stage, we do not process the data yet, we retrieve it as it is available.

Run the script for the following jobs:
- data analyst,
- data scientist,
- data engineer.

NOTE:
For the website to generate its entire content after opening, it needs to be clicked. In other words, the process of loading the website should look as follows:
    - open the job offers page,
    - click any object on the page (e.g. accept cookies).

# File names
We will adopt the following file naming convention:

'{job_name}_{page_number}.html'

For example: data analyst_1.html is going to mean the list of data analyst job offers from page one. The files should be saved in the /data/raw directory.

# Hints:
Remember to add a time interval between every page transition, e.g. 5 seconds,
As a url to be opened by the browser, you can use the following template:
'https://nofluffjobs.com/pl/jobs?criteria={job_name}&page={page_number}'

to retrieve the HTML content of the page you can use: browser.page_source,
Because we do not know how many pages we are going to have for each job, you can use a while loop,
if you want to stop executing the loop you can use the break keyword,

# LIBRARIES

In [1]:
import os
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, ElementClickInterceptedException
from urllib.parse import quote

def accept_cookies(browser):
    """
    Click the cookie-consent button once it becomes clickable.

    Waits up to 10 seconds for the element with id
    "onetrust-accept-btn-handler" and clicks it. This is best-effort: any
    exception raised while waiting or clicking is printed and swallowed so
    that a missing banner never aborts the caller.

    Parameters:
    - browser: Selenium WebDriver with the target page already opened.
    """
    try:
        wait = WebDriverWait(browser, 10)
        accept_button_ready = EC.element_to_be_clickable(
            (By.ID, "onetrust-accept-btn-handler")
        )
        wait.until(accept_button_ready).click()
    except Exception as e:
        # Deliberately broad: the click only exists to trigger page rendering.
        print("No cookies button found or other error:", e)



def more_offers_exist(html):
    """
    Tell whether the page still offers a way to load further job offers.

    The check is a plain substring search for the Polish label
    "Pokaż kolejne oferty".

    Parameters:
    - html (str): HTML content of the page.
    Returns:
    - bool: True if the label is present, False otherwise.
    """
    see_more_label = "Pokaż kolejne oferty"  # wording used on the PL site
    return see_more_label in html



def no_jobs_found(html):
    """
    Tell whether the page shows the "no search results" message.

    Parameters:
    - html (str): HTML content of the page.
    Returns:
    - bool: True if the Polish message "Brak wyników wyszukiwania" occurs
      in the HTML, False otherwise.
    """
    # PL wording; the original note lists 'Nebyly nalezeny žádné výsledky'
    # as the equivalent text for CZ.
    empty_result_message = "Brak wyników wyszukiwania"
    return empty_result_message in html



def break_if_no_jobs(job_name, html):
    """
    Report (and announce on stdout) that a search returned no offers.

    Parameters:
    - job_name (str): Job title, used only in the printed message.
    - html (str): HTML content of the result page.
    Returns:
    - bool: True if no_jobs_found(html) detects the empty-result message
      (a notice is printed first), False otherwise.
    """
    if not no_jobs_found(html):
        return False

    print(f"No results found for {job_name}. Ending the script and continue with next job.")
    return True


def click_see_more_button(browser):
    """
    Click the "Pokaż kolejne oferty" button if the page has one.

    Parameters:
    - browser: Selenium WebDriver showing the offers page.
    Returns:
    - bool: True if the button was found and clicked and the follow-up wait
      finished; False if no button was found, or if the click was
      intercepted or the wait timed out.
    """
    matches = browser.find_elements(By.XPATH, '//button[contains(text(), "Pokaż kolejne oferty")]')

    # No button on the page: nothing more to load.
    if not matches:
        print("Ending the script. No more jobs found.")
        return False

    try:
        matches[0].click()
        # Wait (up to 10 s) until no element with this class is present.
        # NOTE(review): 'intercepting-element-class' looks like a placeholder
        # class name - confirm it matches a real overlay on the site.
        WebDriverWait(browser, 10).until_not(
            EC.presence_of_element_located((By.CLASS_NAME, 'intercepting-element-class'))
        )
    except (TimeoutException, ElementClickInterceptedException):
        print("Timed out or intercepted while waiting for 'See more offers' button. No more offers or other issue.")
        return False

    return True




def save_html_to_file(job_name, page_number, html,
                      directory='...\\SESSION 6 WORKSHOP\\Phyton_Workshop\\data\\raw_test'):
    """
    Write a page's HTML to '{job_name}_{page_number}.html' inside *directory*.

    Parameters:
    - job_name (str): Job title; first part of the file name.
    - page_number (int): Page counter; second part of the file name.
    - html (str): HTML content to store (written as UTF-8).
    - directory (str): Target folder. Defaults to the workshop's raw-data
      folder, so existing three-argument calls behave as before. The folder
      and any missing parents are created on demand.
    """
    # exist_ok=True avoids the check-then-create race of os.path.exists().
    os.makedirs(directory, exist_ok=True)

    file_path = os.path.join(directory, f'{job_name}_{page_number}.html')

    # An existing file with the same name is overwritten.
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(html)





def retrieve_data(job_name):
    """
    Open the offer list for *job_name*, expand it, and save the final HTML.

    A Chrome browser is started, the search URL is opened and the cookie
    banner is accepted. The "see more offers" button is then clicked
    repeatedly (with a 2-second pause after each click) until
    click_see_more_button() returns False. The page counter starts at 1 and
    is incremented after every successful click.

    The HTML is saved once, when the loop ends or an exception interrupts it,
    under the name built from the job name and the final counter value. The
    browser is always closed, even if saving the page fails.

    Parameters:
    - job_name (str): Job title to search for; URL-quoted before use.
    """
    browser = webdriver.Chrome()
    page_number = 1

    try:
        # Encode the job_name to handle spaces and special characters in the URL
        url = f'https://nofluffjobs.com/pl/jobs?criteria=jobPosition=\'{quote(job_name)}\'&page={page_number}'
        browser.get(url)

        # The site renders its full content only after an interaction.
        accept_cookies(browser)

        while click_see_more_button(browser):
            page_number += 1
            print(f"Job: {job_name}, Last Page Downloaded: {page_number}")

            # Pause between page transitions.
            time.sleep(2)

        print(f"No more offers for {job_name} on page {page_number}. Ending the script.")
    finally:
        try:
            # Best-effort save of whatever is currently loaded.
            save_html_to_file(job_name, page_number, browser.page_source)
        finally:
            # Nested so that a failure while reading or saving the page
            # cannot leave the Chrome process running.
            browser.quit()


# Run the retrieval (and the saving done inside it) for every job in a list
def save_html_for_jobs(jobs):
    """
    Run retrieve_data() for each job title, one after another.

    Parameters:
    - jobs (iterable of str): Job titles to process, in the given order.
    """
    for job_title in jobs:
        print(f"Processing job: {job_title}")
        retrieve_data(job_title)


# Job titles required by the exercise; each one is retrieved in turn.
jobs_to_run = [
    'data analyst',
    'data scientist',
    'data engineer',
]
save_html_for_jobs(jobs_to_run)

In [None]:
import shutil
import os

def move_notebook_to_notebooks(notebook_path, notebooks_folder):
    """
    Move a notebook file into *notebooks_folder*, keeping its file name.

    Parameters:
    - notebook_path (str): Current path of the notebook file.
    - notebooks_folder (str): Destination folder; created (with any missing
      parents) if it does not exist yet.
    """
    notebook_filename = os.path.basename(notebook_path)

    # shutil.move does not create the destination's parent folder, so make
    # sure it exists. An empty folder means "current directory" and needs
    # no creation.
    if notebooks_folder:
        os.makedirs(notebooks_folder, exist_ok=True)

    # Move the notebook file to the "notebooks" subfolder
    shutil.move(notebook_path, os.path.join(notebooks_folder, notebook_filename))
    print(f"Notebook '{notebook_filename}' moved to the 'notebooks' subfolder.")

# Example usage: relocate the 1.0_data_retrieving notebook using fixed paths.
notebook_path = "...\\SESSION 6 WORKSHOP\\WEBscraping\\1.0_data_retrieving.ipynb"
notebooks_folder = "...\\SESSION 6 WORKSHOP\\Phyton_Workshop\\notebooks"

move_notebook_to_notebooks(
    notebook_path=notebook_path,
    notebooks_folder=notebooks_folder,
)