# Retrieving information about the website
Based on the files from the /data/raw directory (generated in exercise 1) extract the following information about an offer:
- location - both city and country. For remote work, set Remote as the city and N/A as the country,
- salary - both lower and upper limits and currency. If there is no pay range, write the same value in both fields (lower limit = upper limit),
- name of position,
- company,
- technology.

Write the results of a single offer into a dictionary with the following structure:

{
    'name': 'name of the position',
    'company': 'name of the employer',
    'technology': 'name of the used technology',
    'job': 'information regarding name of the search e.g. data analyst ',
    'location': {'city': 'city of employment', 'country': 'country of employment'},
    'salary': {'low': 'lower limit', 'high': 'higher limit', 'currency': 'salary currency'} 
}

Put single items into a list.

A list of such dictionaries can be read using another Pandas method - json_normalize (documentation link). It is shown during the workshop, because json is a commonly used construct for communication between modules.

Save the results as DataFrame to data\interim\job_offers.csv using the ; separator, UTF-8 encoding, and without index (index=False).

# Complete the exercise following the steps:

1. Write a function that takes the HTML code of a page and returns a list with pieces of HTML code that contain information about a single ad,
2. Write a function that will take the HTML code containing information about one ad and return a dictionary with the information (described above), 
3. Assemble this into a working script that finds all files in the data\raw directory,
    For each file:
        - Divides it into sections corresponding to the company,
        - Extracts the necessary information from it as a dictionary,
        - Adds the dictionary to the previously created list,
        - Loads the list with dictionaries using Pandas into the dataset,
        - Saves the dataset in the data\interim\ directory with the current date.


# File names
We will adopt the following file naming convention:

'job_offers_{current date}.csv'

Where the {current date} parameter should use the yyyy_mm_dd format (year month day).

# Hints:
To get the current date you can use the code: datetime.today().strftime('%Y_%m_%d'). Remember to import the appropriate module!
You can split the data parsing for a single offer into several smaller helper functions. For example, one can retrieve the salary, another - parse the location data. This will make the code easier to maintain.
To test the performance of your functions, you can manually pull HTML code from a file and pass it as a parameter. This way you don't need the whole script to test how its parts work.

In [None]:
import os
from bs4 import BeautifulSoup
import urllib.parse

In [None]:
# Helper that pulls the page URL out of a saved HTML file; the "job name" is later derived from that URL
def extract_url_from_html(html_content):
    """
    Return the 'href' of the first <link rel="alternate"> tag in the page.

    Parameters:
    - html_content (str): HTML content to be parsed.

    Returns:
    - The value of the tag's 'href' attribute, or None when no such tag exists.
    """
    link_tag = BeautifulSoup(html_content, 'html.parser').find('link', rel='alternate', href=True)

    # Only index into the tag when the lookup actually matched something
    return link_tag['href'] if link_tag else None


# Directory holding the saved HTML result pages
html_file_path = '...\\SESSION 6 WORKSHOP\\Phyton_Workshop\\data\\raw'


# Report, for every saved page, the URL it came from and the job name encoded in that URL
for filename in os.listdir(html_file_path):
    # Anything that is not a saved HTML page is ignored
    if not filename.endswith('.html'):
        continue

    file_path = os.path.join(html_file_path, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()

    url = extract_url_from_html(html_content)

    try:
        parsed_url = urllib.parse.urlparse(url)
        criteria = urllib.parse.parse_qs(parsed_url.query).get('criteria', [])
        if criteria and criteria[0]:
            # Take the second '='-separated piece of the criteria value and drop its quoting
            job = urllib.parse.unquote(criteria[0]).split('=')[1].replace('%27', '').replace('%20', ' ').strip("'")
        else:
            job = 'Unknown'
        print(f"File: {filename}\nURL: {url}\nExtracted Job name: {job}\n")
    except Exception as e:
        # Best-effort report: a page with an unexpected URL shape must not stop the loop
        print(f"Error parsing URL for {filename}: {e}\n")



In [None]:
def extract_ad_data(html_content):
    """
    Split a results page into its individual ad sections.

    Parameters:
    - html_content (str): HTML content to be parsed.

    Returns:
    - list: Every 'a' element carrying the 'posting-list-item' class, in the
      order BeautifulSoup finds them.
    """
    return BeautifulSoup(html_content, 'html.parser').find_all('a', class_='posting-list-item')


def _extract_job_name(html_content):
    """Return the searched job name encoded in the page URL, or 'Unknown'."""
    url = extract_url_from_html(html_content)
    if not url:
        return 'Unknown'

    query = urllib.parse.urlparse(url).query
    criteria = urllib.parse.parse_qs(query).get('criteria', [])
    if not criteria or not criteria[0]:
        return 'Unknown'

    pieces = urllib.parse.unquote(criteria[0]).split('=')
    if len(pieces) < 2:
        # No "key=value" pair inside the criteria: indexing [1] would raise IndexError
        return 'Unknown'
    return pieces[1].replace('%27', '').replace('%20', ' ').strip("'")


def _split_amount_and_currency(text):
    """Split '15000 PLN' or '15000PLN' into ('15000', 'PLN') at the last digit."""
    text = text.strip()
    cut = len(text)
    while cut > 0 and not text[cut - 1].isdigit():
        cut -= 1
    if cut == 0:
        # No digit at all: keep the text as the amount, as a whitespace split would
        return text, ''
    return text[:cut].strip(), text[cut:].strip()


def _parse_salary(salary_text):
    """Return (low, high, currency) strings for a salary label, 'N/A' where unknown."""
    bounds = salary_text.split('–')
    if len(bounds) == 2:
        high, currency = _split_amount_and_currency(bounds[1])
        return bounds[0].strip(), high, currency
    if len(bounds) == 1 and any(ch.isdigit() for ch in salary_text):
        # A single amount instead of a range: both limits get the same value
        amount, currency = _split_amount_and_currency(salary_text)
        return amount, amount, currency
    return 'N/A', 'N/A', 'N/A'


def _parse_location(location_text):
    """Return (city, country, remotely) derived from a 'City, Country' label."""
    parts = location_text.split(',')
    city = parts[0].strip() or 'N/A'
    country = parts[1].strip() if len(parts) > 1 else 'N/A'
    remotely = 'Zdalnie' in location_text

    # 'Zdalnie' is the label used for remote offers; report them as city 'Remote'
    if 'zdalnie' in city.lower():
        city, country, remotely = 'Remote', 'N/A', True
    return city, country, remotely


def parse_ad_section(html_content, ad_sections):
    """
    Parse ad sections to extract relevant information.

    Parameters:
    - html_content (str): HTML of the whole page the ads come from; only used
      to look up the job name encoded in the page URL.
    - ad_sections (list): List of ad sections (parsed tags) to be parsed.

    Returns:
    - list: One dictionary per ad with the keys 'title', 'company',
      'technology', 'job', 'location' (city, country, remotely) and
      'salary' (low, high, currency). Missing values are reported as 'N/A'.
    """
    if not ad_sections:
        return []

    # The job name is a property of the page, not of a single ad, so it is
    # resolved once instead of re-parsing the whole page for every ad.
    job = _extract_job_name(html_content)

    ad_data = []
    for ad in ad_sections:
        # Job title
        title_tag = ad.find('h3', class_='posting-title__position')
        title = title_tag.text.strip() if title_tag is not None else 'N/A'

        # Company name
        company_tag = ad.find('h4', class_='company-name')
        company = company_tag.text.strip() if company_tag is not None else 'N/A'

        # Technologies: the tiles container may be absent, and separator or
        # empty spans must not end up in the joined value
        tiles = ad.find('nfj-posting-item-tiles')
        technology_elements = tiles.find_all('span', class_='tw-text-gray-60') if tiles is not None else []
        technologies = [
            text
            for text in (elem.text.strip() for elem in technology_elements)
            if text and text != '•'
        ]
        technology = ', '.join(technologies) if technologies else 'N/A'

        # Salary: non-breaking spaces are removed before splitting
        salary_tag = ad.find("span", {"data-cy": "salary ranges on the job offer listing"})
        salary = salary_tag.text.strip().replace('\xa0', '') if salary_tag is not None else 'N/A'
        salary_low, salary_high, currency = _parse_salary(salary)

        # Location
        location_tag = ad.find('div', class_='tw-flex tw-items-center ng-star-inserted')
        location = location_tag.text.strip() if location_tag is not None else 'N/A'
        city, country, remotely = _parse_location(location)

        ad_data.append({
            'title': title,
            'company': company,
            'technology': technology,
            'job': job,
            'location': {'city': city, 'country': country, 'remotely': remotely},
            'salary': {'low': salary_low, 'high': salary_high, 'currency': currency},
        })

    return ad_data


In [None]:
# Example usage: run the two extraction steps on one saved page.
# NOTE: this rebinds html_file_path, which an earlier cell used for the raw-data directory.
html_file_path = '...\\SESSION 6 WORKSHOP\\Phyton_Workshop\\data\\raw\\data engineer_5.html'

# Read the HTML content from the file
with open(html_file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()

# Extract ad sections using the provided HTML content
ads = extract_ad_data(html_content)

# Print the number of ad sections found
print("Number of ad sections:", len(ads))

# Parse the ad sections to extract relevant information
# (the full page HTML is passed as well because the job name is read from the page URL)
parsed_ads = parse_ad_section(html_content, ads)  # Pass html_content to parse_ad_section

# Print the parsed ad information
print(parsed_ads)

In [None]:
def process_files(directory_path):
    """
    Collect the parsed ads of every '.html' file found in a directory.

    Parameters:
    - directory_path (str): Path to the directory containing HTML files.

    Returns:
    - list: Parsed ad dictionaries of all files, concatenated in the order
      the files are listed by os.listdir.
    """
    collected = []

    # Files without the '.html' extension are skipped
    html_names = (name for name in os.listdir(directory_path) if name.endswith('.html'))

    for name in html_names:
        with open(os.path.join(directory_path, name), 'r', encoding='utf-8') as handle:
            page_source = handle.read()

        print(f'Processing file: {name}')

        # Split the page into ad sections, then turn each section into a dictionary
        file_ads = parse_ad_section(page_source, extract_ad_data(page_source))
        collected += file_ads

        print(f'Number of ads in {name}: {len(file_ads)}')

    return collected

# Specify the directory where your HTML files are stored
html_directory_path = '...\\SESSION 6 WORKSHOP\\Phyton_Workshop\\data\\raw'

# Call the process_files function and store the result in a variable
# (result_ads_list is read again by the cell that saves the CSV)
result_ads_list = process_files(html_directory_path)

# Print the resulting list of parsed ad information, numbering the ads from 1
print("Final List of Parsed Ads:")
for idx, ad_info in enumerate(result_ads_list, 1):
    print(f"Ad {idx}:", ad_info)

In [None]:
import os
from datetime import datetime
import pandas as pd

# Function to save data to CSV file with the current date
def save_to_csv(data, output_directory, sep=';', encoding='utf-8'):
    """
    Write the parsed offers to 'job_offers_{yyyy_mm_dd}.csv' in output_directory.

    Parameters:
    - data: List of offer dictionaries. Nested dictionaries (e.g. 'location',
      'salary') are flattened into dotted columns such as 'location.city'.
      Any other input is handed to pd.DataFrame unchanged.
    - output_directory (str): Target directory; created when it does not exist.
    - sep (str): Column separator of the CSV file.
    - encoding (str): Text encoding of the CSV file.

    Returns:
    - str: Path of the file that was written.
    """
    current_date = datetime.today().strftime('%Y_%m_%d')
    output_file_path = os.path.join(output_directory, f'job_offers_{current_date}.csv')

    # to_csv does not create missing directories on its own
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)

    if isinstance(data, list) and all(isinstance(row, dict) for row in data):
        # Flatten nested dicts; pd.DataFrame would store them as stringified dicts
        df = pd.json_normalize(data)
    else:
        df = pd.DataFrame(data)

    df.to_csv(output_file_path, sep=sep, encoding=encoding, index=False)

    print(f'Dataset saved to {output_file_path}')
    return output_file_path

# Specify the input directory where your HTML files are processed
# NOTE(review): input_directory is not referenced anywhere in this cell; the data
# written below is result_ads_list, which was produced by an earlier cell.
input_directory = '...\\Phyton_Workshop\\data\\raw_test'

# Specify the output directory where you want to save the CSV file
output_directory = '...\\SESSION 6 WORKSHOP\\Phyton_Workshop\\data\\interim_test'

# Call the save_to_csv function with the correct parameters
save_to_csv(result_ads_list, output_directory)

In [None]:
import shutil
import os

def move_notebook_to_notebooks(notebook_path, notebooks_folder):
    """
    Move a notebook file into another folder, keeping its file name.

    Parameters:
    - notebook_path (str): Current path of the notebook file.
    - notebooks_folder (str): Folder the notebook is moved into.
    """
    # The last path component is the file name to keep at the destination
    _, notebook_filename = os.path.split(notebook_path)
    destination = os.path.join(notebooks_folder, notebook_filename)

    shutil.move(notebook_path, destination)
    print(f"Notebook '{notebook_filename}' moved to the 'notebooks' subfolder.")

# Example usage with specific paths
# notebook_path is the file to relocate; notebooks_folder is the folder it is moved into.
notebook_path = "...\\SESSION 6 WORKSHOP\\WEBscraping\\2.scraping data.ipynb"
notebooks_folder = "...\\SESSION 6 WORKSHOP\\Phyton_Workshop\\notebooks"

# This moves (not copies) the file, so it is no longer at notebook_path afterwards
move_notebook_to_notebooks(notebook_path, notebooks_folder)