# Get list exam topics to excel file

In [20]:
from bs4 import BeautifulSoup
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd
import re
from datetime import datetime

def fetch_page(session, url):
    try:
        response = session.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        discussion_rows = soup.find_all('div', class_='row discussion-row')
        titles_links = [(row.find('a', class_='discussion-link').text.strip()
                , row.find('a', class_='discussion-link')['href']
                , row.find('p', class_='creation-info').find_all('i')[1].get_text(strip=True))
                for row in discussion_rows]

        return titles_links
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return []

def scrape_examtopics(exam, num_pages):
    base_url = f"https://www.examtopics.com/discussions/{exam}/"
    all_titles_links = []
    with requests.Session() as session:
        retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
        session.mount('http://', HTTPAdapter(max_retries=retries))
        session.mount('https://', HTTPAdapter(max_retries=retries))

        urls = [f"{base_url}/{page_num}/" for page_num in range(1, num_pages + 1)]

        with ThreadPoolExecutor(max_workers=10) as executor:
            future_to_url = {executor.submit(fetch_page, session, url): url for url in urls}

            for future in as_completed(future_to_url):
                titles_links = future.result()
                all_titles_links.extend(titles_links)

    return all_titles_links, exam


def save_to_excel(data, exam):
    # Create a DataFrame
    df = pd.DataFrame(data, columns=["title", "link", "load_date"])
    # Extract 'exam' and 'topic_question' from 'title'
    df["exam"] = df["title"].str.split("topic").str[0].str.strip()
    df["topic_question"] = df["title"].str.split("topic").str[1]
    # Extract 'topic' and 'question' from 'topic_question'
    df["topic"] = df["topic_question"].str.split("question").str[0].astype(int)
    df["question"] = df["topic_question"].str.split("question").str[1].apply(lambda x: int(re.search(r"(\d+)", x).group(1)))
    # # Add 'load_date' column with the current date
    # df["load_date"] = datetime.now().date()
    # Modify 'link' column to prepend the base URL
    df["link"] = "https://www.examtopics.com" + df["link"]
    # Select the desired columns
    df = df[["load_date", "exam", "topic", "question", "link"]]
    # Sort the DataFrame by 'exam', 'topic', and 'question'
    df = df.sort_values(by=["exam", "topic", "question"])

    path_file = f"exam_topics_{exam}_{datetime.now().strftime("%Y%m%d")}.xlsx"
    df.to_excel(path_file, index=False)
    return print(f"Data saved to {path_file}")

In [7]:
# crawl the exam topics for AWS use 'amazon' as exam, for Databricks use 'databricks' as exam, for GCP use 'google' as exam, for Azure use 'microsoft' as exam,
data, exam = scrape_examtopics(exam="amazon", num_pages=557)
save_to_excel(data, exam)

Data saved to exam_topics_amazon_20250430.xlsx


In [21]:
# crawl the exam topics for AWS use 'amazon' as exam, for Databricks use 'databricks' as exam, for GCP use 'google' as exam, for Azure use 'microsoft' as exam,
data, exam = scrape_examtopics(exam="databricks", num_pages=39)
save_to_excel(data, exam)

Data saved to exam_topics_databricks_20250619.xlsx


In [22]:
# crawl the exam topics for AWS use 'amazon' as exam, for Databricks use 'databricks' as exam, for GCP use 'google' as exam, for Azure use 'microsoft' as exam,
data, exam = scrape_examtopics(exam="microsoft", num_pages=1383)
save_to_excel(data, exam)

Data saved to exam_topics_microsoft_20250619.xlsx


# Crawl detail exam questions and discussions

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
from datetime import datetime

# Function to extract the question and answer choices from the HTML content
def extract_question_and_answers(excel_file, cert_name):
    # Read the Excel file and get the column with URLs
    df = pd.read_excel(excel_file)
    # filter exam
    df = df[df['exam'] == cert_name]
    # order by question desc and from top 400 to top 200
    #df = df.sort_values(by=['question'], ascending=False)
    #df = df.iloc[0:1]
    # Open a text file to write the final output
    cert_name = cert_name.lower().strip().replace(" ", "_").replace(":", "_")
    output_file = f"{cert_name}_{datetime.now().strftime("%Y%m%d")}.txt"
    with open(output_file, "w", encoding="utf-8") as file:
        for url in df['link']:
            try:
                with requests.Session() as session:
                    # Set up retries
                    session.headers.update({'User-Agent': 'Mozilla/5.0'})
                    response = session.get(url)
                    response.raise_for_status()

                    # Parse the HTML content
                    soup = BeautifulSoup(response.content, 'html.parser')

                    # Find the div with the "new-comment-box" class
                    div = soup.find("div", class_="new-comment-box")

                    # Extract the value of the data-title attribute
                    data_title = div["data-title"] if div and "data-title" in div.attrs else ""

                    # Find the upload date in the discussion-meta-data div
                    meta_div = soup.find("div", class_="discussion-meta-data")
                    i_tags = meta_div.find_all("i")
                    upload_date = i_tags[-1].text.strip() if i_tags else ""
                    data_title = f"{data_title} - {upload_date}"

                    # Extract the question text
                    p_tag = soup.find('p', class_='card-text')
                    text_parts = []
                    if p_tag:
                        for element in p_tag.contents:
                            if isinstance(element, str):
                                text_parts.append(element)
                            elif element.name == 'br':
                                text_parts.append('\n')
                            elif element.name == 'img':
                                text_parts.append(element['src'])
                    question_str = ''.join(text_parts)

                    # Extract all answer choices
                    answer_choices = [
                        li.text.strip().replace('\n', '').replace('\r', '')
                        for li in soup.find_all('li', class_='multi-choice-item')
                    ]
                    answer_choices_str = '\n'.join(answer_choices)

                    # Combine the data_title, question_str, and answer_choices_str
                    final_str = f"{data_title}\n{question_str}\n{answer_choices_str}"

                    # Process the text to remove extra spaces
                    lines = final_str.splitlines()
                    formatted_lines = [" ".join(line.split()) for line in lines if line.strip()]
                    formatted_text = "\n".join(formatted_lines)

                    # Find all badges with "Highly Voted"
                    highly_voted_badges = soup.find_all("span", class_="badge badge-primary")
                    comment_str = ""
                    for badge in highly_voted_badges:
                        if "Highly Voted" in badge.text:
                            comment_container = badge.find_parent("div", class_="comment-container")
                            if comment_container:
                                comment_content = comment_container.find("div", class_="comment-content")
                                if comment_content:
                                    comment_str += (
                                        "Highly Voted comment found!\n"
                                        + comment_content.text.strip()
                                        + "\n******************************\n"
                                    )

                    # Combine the formatted text and comment string
                    final_output = f"{formatted_text}\n\n{comment_str}"

                    # Write the final output to the text file
                    file.write(final_output + "\n\n======================================================================================\n\n")
            except Exception as e:
                print(f"Error processing URL {url}: {e}")

In [2]:
extract_question_and_answers(excel_file="exam_topics_databricks_20250430.xlsx", cert_name="Exam Certified Machine Learning Professional")

In [10]:
extract_question_and_answers(excel_file="exam_topics_databricks_20250430.xlsx", cert_name="Exam Certified Data Engineer Associate")

In [11]:
extract_question_and_answers(excel_file="exam_topics_databricks_20250430.xlsx", cert_name="Exam Certified Data Engineer Professional")

Error processing URL https://www.examtopics.com/discussions/databricks/view/144256-exam-certified-data-engineer-professional-topic-1-question/: 503 Server Error: Service Unavailable for url: https://www.examtopics.com/discussions/databricks/view/144256-exam-certified-data-engineer-professional-topic-1-question/
Error processing URL https://www.examtopics.com/discussions/databricks/view/141553-exam-certified-data-engineer-professional-topic-1-question/: 503 Server Error: Service Unavailable for url: https://www.examtopics.com/discussions/databricks/view/141553-exam-certified-data-engineer-professional-topic-1-question/
Error processing URL https://www.examtopics.com/discussions/databricks/view/149836-exam-certified-data-engineer-professional-topic-1-question/: 503 Server Error: Service Unavailable for url: https://www.examtopics.com/discussions/databricks/view/149836-exam-certified-data-engineer-professional-topic-1-question/
Error processing URL https://www.examtopics.com/discussions/d

In [3]:
extract_question_and_answers(excel_file="exam_topics_databricks_20250430.xlsx", cert_name="Exam Certified Associate Developer for Apache Spark")

In [2]:
extract_question_and_answers(excel_file="exam_topics_google_20250430.xlsx", cert_name="Exam Professional Data Engineer")

Error processing URL https://www.examtopics.com/discussions/google/view/130220-exam-professional-data-engineer-topic-1-question-270/: 503 Server Error: Service Unavailable for url: https://www.examtopics.com/discussions/google/view/130220-exam-professional-data-engineer-topic-1-question-270/
Error processing URL https://www.examtopics.com/discussions/google/view/130262-exam-professional-data-engineer-topic-1-question-277/: 503 Server Error: Service Unavailable for url: https://www.examtopics.com/discussions/google/view/130262-exam-professional-data-engineer-topic-1-question-277/
Error processing URL https://www.examtopics.com/discussions/google/view/130263-exam-professional-data-engineer-topic-1-question-278/: 503 Server Error: Service Unavailable for url: https://www.examtopics.com/discussions/google/view/130263-exam-professional-data-engineer-topic-1-question-278/
Error processing URL https://www.examtopics.com/discussions/google/view/129911-exam-professional-data-engineer-topic-1-qu

In [3]:
extract_question_and_answers(excel_file="exam_topics_google_20250430.xlsx", cert_name="Exam Associate Data Practitioner")

In [None]:
extract_question_and_answers(excel_file="exam_topics_amazon_20250430.xlsx", cert_name="Exam AWS Certified Solutions Architect - Associate SAA-C03")

In [2]:
extract_question_and_answers(excel_file="exam_topics_amazon_20250430.xlsx", cert_name="Exam AWS Certified Developer - Associate DVA-C02")

In [10]:
extract_question_and_answers(excel_file="microsoft_examtopics_20250324 (1).xlsx", cert_name="Exam AI-900 ")

In [10]:
extract_question_and_answers(excel_file="microsoft_examtopics_20250324 (1).xlsx", cert_name="Exam PL-300 ")

In [2]:
extract_question_and_answers(excel_file="microsoft_examtopics_20250324 (1).xlsx", cert_name="Exam DP-900 ")