In [1]:
# Importing Selenium and Pandas

In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
import pandas as pd

In [None]:
# Chrome command-line switches used for the headless browser session
chrome_options = Options()
for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
    chrome_options.add_argument(flag)

# Attach to the Selenium server that runs in a separate container;
# every later cell drives the browser through this `driver` object.
driver = webdriver.Remote(command_executor="http://172.17.0.2:4444", options=chrome_options)

In [4]:
# Check if it works with https://mate.academy
# Smoke test for the remote session: navigate the browser to the landing page.
url = "https://mate.academy"
driver.get(url)

In [5]:
# Print the title of the page the driver is currently on, to confirm the navigation worked
print(driver.title)

Безкоштовні IT курси до працевлаштування. Навчись програмувати онлайн | Mate academy


In [None]:
# Get courses' names and links to pages.
# `course_data` is created before any scraping so that later cells always find
# a list, even when an error below interrupts the extraction (and so a re-run
# never silently keeps results from a previous execution).
course_data = []

# Seconds to wait for the dropdown links to show up in the DOM
DROPDOWN_TIMEOUT_S = 10

# Open the courses dropdown in the page header
try:
    courses_button = driver.find_element(By.XPATH, "//button[@data-qa='header-courses-dropdown-button']")
    courses_button.click()
except Exception as e:
    print("Error clicking courses button:", e)

# Scrape course titles and hrefs
try:
    # Wait for the links instead of reading the DOM immediately after the click.
    # The class is matched by prefix: the "__8OXQk" suffix of the original class
    # name looks build-generated (NOTE(review): confirm), so an exact match is brittle.
    courses = WebDriverWait(driver, DROPDOWN_TIMEOUT_S).until(
        EC.presence_of_all_elements_located(
            (By.XPATH, "//ul[contains(@class, 'DropdownProfessionsList_list__')]//a")
        )
    )

    for course in courses:
        title = course.get_attribute("title")
        href = course.get_attribute("href")
        # Keep only links that carry both a title and a target
        if title and href:
            course_data.append({"title": title, "href": href})

    # Print the extracted data
    for item in course_data:
        print(item)

except Exception as e:
    # Includes the timeout raised when no link appears within DROPDOWN_TIMEOUT_S
    print("Error extracting courses:", e)

In [None]:
# On course's page get description and type
import time  # NOTE(review): not referenced in this cell; kept in case other cells rely on it

# Schedule phrase searched for on the course page -> label stored in course["type"].
# Order matters: it fixes the order of labels in the combined "Full-time, Flex" value.
SCHEDULE_LABELS = {
    "Будні з 9 до 18": "Full-time",
    "У вільний час": "Flex",
}

for course in course_data:
    driver.get(course["href"])
    print(driver.title) #checking where we are

    # Retrieving description
    try:
        description_block = driver.find_element(By.CSS_SELECTOR, "div[data-qa='profession-salary-block']")
        description = description_block.find_element(By.TAG_NAME, "pre").text
    except Exception:
        # `except Exception` (not a bare `except`) so that interrupting the
        # kernel during a long scrape is not swallowed as "not found".
        description = "Description not found"
    course["description"] = description

    # Retrieving type
    try:
        type_blocks = driver.find_elements(By.XPATH, "//div[contains(text(), 'Будні з 9 до 18') or contains(text(), 'У вільний час')]")
        types = [block.text for block in type_blocks]
    except Exception:
        types = []

    # The XPath above matches divs that *contain* a phrase, so the check here is
    # a substring test as well; an exact comparison would miss a div whose text
    # has anything around the phrase.
    labels = [
        label
        for phrase, label in SCHEDULE_LABELS.items()
        if any(phrase in text for text in types)
    ]
    course["type"] = ", ".join(labels) if labels else "Unknown"


In [None]:
# Print every course record (one dict per course) to inspect the scraped fields
for course in course_data:
    print(course)

In [None]:
# Save as DataFrame
# Each dict in course_data becomes one row; the dict keys become the columns.
df = pd.DataFrame(course_data)
df

In [None]:
# Split the type column
def split_type(row):
    """Expand one course row into one row per schedule type.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys 'title', 'href', 'description' and 'type';
        'type' is a string whose values are separated by ', '.

    Returns
    -------
    pandas.DataFrame
        Columns 'title', 'href', 'description', 'type' (in that order), with
        one row per value found in row['type']; the other three fields are
        repeated unchanged on every row.
    """
    parts = row['type'].split(', ')
    columns = {
        field: [row[field]] * len(parts)
        for field in ('title', 'href', 'description')
    }
    columns['type'] = parts
    return pd.DataFrame(columns)

In [None]:
# Turn rows whose type lists several schedules (e.g. "Full-time, Flex") into
# one row per schedule by running split_type on every row of df.
# The per-row frames are collected in an explicit list: this does not depend on
# how DataFrame.apply packs DataFrame return values, and it lets the empty case
# be handled instead of raising.
expanded_parts = [split_type(row) for _, row in df.iterrows()]
if expanded_parts:
    df_expanded = pd.concat(expanded_parts, ignore_index=True)
else:
    # df has no rows: keep the columns split_type would produce so the
    # following cells still run.
    df_expanded = pd.DataFrame(columns=['title', 'href', 'description', 'type'])
df_expanded

In [None]:
# Drop href
# errors='ignore' keeps this cell re-runnable: it reassigns its own input, so on
# a second execution the column is already gone and a plain drop would raise KeyError.
df_expanded = df_expanded.drop(columns='href', errors='ignore')

In [None]:
# Display the expanded table as the cell output
df_expanded

In [None]:
# Write the expanded table to a CSV file in the working directory, without the index column.
# NOTE(review): no visible cell closes the remote browser session (driver.quit());
# confirm whether it should be closed once scraping is finished.
df_expanded.to_csv('WebScrappingMate.csv', index=False)