In [3]:
# Imports
from bs4 import BeautifulSoup as bs
import requests
import re
import time
import random
import langchain
import langchain_community
import langchain_core
import langchain_openai
import streamlit as st
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [4]:
# Atlassian Community scraping targets.
# Listing page sorted by most recent (per the query string). Note that
# pull_forum_posts builds its own paginated URL instead of reading this value.
url = "https://community.atlassian.com/?sort=recent"
# Class of the listing element that wraps each post's title link.
post_title_class = "atl-post-list__tile__title"
# Class of the <div> that holds a post's body text on its detail page.
post_body_class = "lia-message-body-content"

In [5]:
def fetch_html(url, timeout=10):
    """Download a page with HTTP GET and return its HTML text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on an unresponsive
            host, which would hang the whole scrape.

    Returns:
        The response body as text when the server answers with status 200;
        otherwise ``None`` (after printing an error message).

    Raises:
        requests.RequestException: On connection problems or when the
            timeout elapses.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code == 200:
        return response.text

    print('Error fetching HTML!')
    return None

In [6]:
def pull_forum_posts(pages: int = 1):
    """Collect the title and URL of posts on the Atlassian community listing.

    Args:
        pages: Number of listing pages to walk, starting at page 1.

    Returns:
        A list of ``{'title': str, 'url': str}`` dicts in listing order.
        Listing pages that could not be downloaded, and tiles that contain
        no link, are skipped instead of aborting the whole pull.

    Raises:
        TypeError: If ``pages`` is not an integer.
    """
    if not isinstance(pages, int):
        raise TypeError('The "pages" parameter must be an integer.')

    posts = []

    for page_number in range(1, pages + 1):
        page_url = f'https://community.atlassian.com/?sort=recent&page={page_number}'
        html = fetch_html(page_url)

        if html is None:
            # fetch_html returns None on a non-200 response; passing None to
            # BeautifulSoup would raise, so skip this page instead.
            print(f'Skipping listing page {page_number}.')
        else:
            soup = bs(html, 'lxml')
            for tile in soup.find_all(class_=post_title_class):
                link = tile.find('a')
                # A tile without an anchor (or without an href) has nothing
                # usable to record.
                if link is None or not link.get('href'):
                    continue
                posts.append({
                    'title': link.get_text().strip(),
                    'url': 'https://community.atlassian.com' + link['href'],
                })

        # Pause between listing requests.
        time.sleep(1)

    return posts


In [28]:
def pull_post_body(posts: list):
    """Fetch the body text for each post in ``posts``.

    Args:
        posts: List of dicts, each providing at least ``'title'`` and
            ``'url'`` keys (the shape returned by ``pull_forum_posts``).

    Returns:
        A new list of ``{'title', 'url', 'body'}`` dicts in input order.
        ``'body'`` is ``None`` when the page could not be downloaded or no
        body element was found, so one bad post no longer aborts the run.

    Raises:
        TypeError: If ``posts`` is not a list.
    """
    if not isinstance(posts, list):
        raise TypeError('The "posts" parameter must be a list.')

    full_post_data = []

    for post in posts:
        body = None

        html = fetch_html(post['url'])
        if html is not None:
            body_div = bs(html, 'lxml').find('div', class_=post_body_class)
            # find() returns None when the class is absent from the page.
            if body_div is not None:
                body = body_div.get_text().strip()

        if body is None:
            print(f"No body found for {post['url']}")

        full_post_data.append({
            'title': post['title'],
            'url': post['url'],
            'body': body,
        })

        # Random 1-3 second pause between post requests.
        time.sleep(random.randint(1, 3))

    return full_post_data

In [8]:
def full_pull(pages: int = 1):
    """Run both scraping stages and print how long each one took.

    Args:
        pages: Number of listing pages, forwarded to ``pull_forum_posts``.

    Returns:
        The list produced by ``pull_post_body`` for the collected posts.
    """
    began = time.time()

    # Stage 1: titles and URLs from the listing pages.
    posts = pull_forum_posts(pages)
    listing_elapsed = time.time() - began
    print(
        f'Pulled post titles and URLs. [{listing_elapsed:.2f} seconds] \n'
        'Proceeding to pull post bodies...'
    )

    # Stage 2: body text for every collected post.
    full_posts = pull_post_body(posts)
    bodies_elapsed = time.time() - began - listing_elapsed
    print(
        f'Pulled post bodies. [{bodies_elapsed:.2f} seconds]\n'
        f'[{time.time() - began:.2f} seconds]'
    )

    return full_posts

In [32]:
# Run the full scrape (titles, URLs and bodies) for a single listing page.
one_page = full_pull(1)

Pulled post titles and URLs. [2.12 seconds] 
Proceeding to pull post bodies...
Pulled post bodies. [40.91 seconds]
[43.03 seconds]


In [33]:
# Display the scraped posts.
one_page

[{'title': 'English trainer',
  'url': 'https://community.atlassian.com/t5/Teamwork-Lab-discussions/English-trainer/td-p/2760558',
  'body': "I'm happy to join this community"},
 {'title': 'use "-s recursive -X ours" for pull request',
  'url': 'https://community.atlassian.com/t5/Bitbucket-questions/use-quot-s-recursive-X-ours-quot-for-pull-request/qaq-p/2760552',
  'body': 'My Pull Request to a protected branch shows "You will need to resolve conflicts to be able to merge", but I locally I checked and it can be merged cleanly if I use `git merge\xa0-s recursive -X ours`.Can I use this strategy to resolve the conflict?'},
 {'title': 'Resource management: How to allocate issues on multiple users or teams in Jira?',
  'url': 'https://community.atlassian.com/t5/Jira-questions/Resource-management-How-to-allocate-issues-on-multiple-users-or/qaq-p/2760551',
  'body': "Hi,a very common use case is that companies want to do resource management on issues in Jira, and plan Initiatives or Epics m

In [62]:
# ServiceNow ITSM community: listing page and the selectors read by fetch_posts.
# NOTE: this rebinds the notebook-level `url` assigned earlier for Atlassian.
url = "https://www.servicenow.com/community/itsm/ct-p/it-service-management"

# Class name of each post tile on the listing page.
sn_post_title_class = "custom-message-tile"
# Tiles whose class attribute contains this value are skipped.
sn_post_title_avoid_class = "custom-thread-featured-flag"
# Class name of the element holding a post's body on its detail page.
sn_post_body_class = "lia-message-body"
# XPath of the button clicked repeatedly while collecting posts
# (presumably a "load more" control).
sn_load_button_xpath = '//*[@id="custom-loader-button"]'
# XPath of the cookie-consent button, clicked once after the page opens.
sn_accept_cookies_button = '//*[@id="truste-consent-button"]'

def fetch_posts(n, max_stalled_rounds=5):
    """Scrape up to ``n`` posts (title, URL, body) from the ServiceNow listing.

    Opens a Chrome session on the module-level ``url``, accepts the cookie
    banner, clicks the load button until enough post tiles are present, then
    visits each collected post to read its body.

    Args:
        n: Number of posts wanted.
        max_stalled_rounds: How many consecutive clicks may yield no new
            tiles before the load loop gives up. Guards against spinning
            forever once the listing has no more posts to offer.

    Returns:
        At most ``n`` dicts with ``'title'`` and ``'url'`` keys. ``'body'`` is
        present only for posts whose page was read successfully.
    """
    posts = []
    driver = webdriver.Chrome()
    # try/finally: the button lookups below raise when an element is missing,
    # and the browser must still be closed in that case.
    try:
        action = ActionChains(driver)
        driver.get(url)
        time.sleep(2)  # let the page render before locating the buttons
        load_button = driver.find_element(By.XPATH, sn_load_button_xpath)
        accept_cookies_button = driver.find_element(By.XPATH, sn_accept_cookies_button)
        action.click(accept_cookies_button).perform()

        stalled_rounds = 0
        while len(posts) < n:
            try:
                action.click(load_button).perform()
                tiles = [
                    tile
                    for tile in driver.find_elements(By.CLASS_NAME, sn_post_title_class)
                    if sn_post_title_avoid_class not in tile.get_attribute('class')
                ]
                # Tiles before len(posts) were recorded in earlier rounds.
                new_tiles = tiles[len(posts):]
                if not new_tiles:
                    stalled_rounds += 1
                    if stalled_rounds >= max_stalled_rounds:
                        print(f'No new posts after {stalled_rounds} clicks; stopping at {len(posts)} posts.')
                        break
                    time.sleep(1)  # give slow-loading content a chance to appear
                    continue
                stalled_rounds = 0
                for tile in new_tiles:
                    link = tile.find_element(By.TAG_NAME, 'a')
                    posts.append({
                        'title': link.get_attribute('title'),
                        'url': link.get_attribute('href'),
                    })
            except Exception as e:
                print(f'Error: {e}')
                break

        for post in posts[:n]:
            try:
                driver.get(post['url'])
                time.sleep(1)
                post['body'] = driver.find_element(By.CLASS_NAME, sn_post_body_class).text
            except Exception as e:
                # Best effort: the post is kept without a 'body' key.
                print(f'Error: {e}')
    finally:
        driver.quit()
    return posts[:n]

In [60]:
# Scrape six posts from the ServiceNow community (opens a Chrome window).
sn_posts = fetch_posts(6)

In [61]:
# Display the scraped ServiceNow posts.
sn_posts

[{'title': 'Implementing Service Level Objectives (SLOs) within SOW: A Customer Use Case',
  'url': 'https://www.servicenow.com/community/itsm-blog/implementing-service-level-objectives-slos-within-sow-a-customer/ba-p/2958632',
  'body': "In today's digitally dynamic environment, ensuring that services are always available and reliable is crucial, especially for high-traffic applications. One of our key customers, a leading app in the US food and beverage sector, faces this challenge daily. They handle between 700,000 and 800,000 transactions every day, so service reliability is critical. Here’s how they improved their operations by managing Service Level Objectives (SLOs) within a Service Operations Workspace (SOW).\n  Customer Overview\nThe customer, a major player in the food and beverage industry, boasts one of the most visited apps in the sector. Despite their success, they face significant challenges in managing service reliability due to a fragmented monitoring and management se

In [22]:
class ScraperObjects:
    """Attribute bag: every keyword argument becomes an instance attribute."""

    def __init__(self, **kwargs):
        for name in kwargs:
            setattr(self, name, kwargs[name])

    def __repr__(self):
        # One "name=value" entry per line, wrapped in "ClassName(" ... ")".
        fields = [f'{name}={val}' for name, val in vars(self).items()]
        body = ',\n'.join(fields)
        return f'{type(self).__name__}(\n{body}\n)'

In [23]:
# Build a sample configuration object to try out the class.
test = ScraperObjects(url='somewebsite.com', post_title_class='post-title')

In [24]:
# Show the multi-line representation produced by ScraperObjects.__repr__.
print(test)

ScraperObjects(
url=somewebsite.com,
post_title_class=post-title
)


In [None]:
# Example Function

def setup_driver():
    """Start a new Chrome WebDriver session and return it."""
    return webdriver.Chrome()

def dynamic_scraper(scraper_objects, max_stalled_rounds=3):
    """Drive a browser through the page described by ``scraper_objects``.

    Attributes read from ``scraper_objects``:
        url: Page to open.
        initial_click_1, initial_click_2, ...: XPaths clicked once, in
            order. Processing stops at the first missing number or the
            first click that fails.
        repeated_click_selector: XPath of the "load more" / "next page"
            control. When the attribute is absent nothing is scraped.
        number_of_posts: How many posts to collect (the earlier spelling
            ``numbers_of_posts`` is still accepted).

    Args:
        scraper_objects: Configuration object carrying the attributes above.
        max_stalled_rounds: Consecutive scrape rounds that may add no new
            post before the loop stops, so it cannot spin forever.

    Returns:
        At most ``number_of_posts`` dicts with ``'title'`` and ``'url'``.
        ``'body'`` is present when the post page was read successfully.
    """
    target = _requested_post_count(scraper_objects)
    posts = []
    driver = setup_driver()
    # try/finally so the browser is always closed, including on errors.
    try:
        driver.get(scraper_objects.url)
        _run_initial_clicks(driver, scraper_objects)

        # Perform repeated click and data scraping
        if hasattr(scraper_objects, 'repeated_click_selector'):
            seen_urls = set()
            stalled_rounds = 0
            while len(posts) < target:
                try:
                    # De-duplicate by URL so both "load more" pages (tiles
                    # accumulate) and "next page" pages (tiles are replaced)
                    # are handled.
                    added = 0
                    for post in scrape_data(driver):
                        if post['url'] not in seen_urls:
                            seen_urls.add(post['url'])
                            posts.append(post)
                            added += 1

                    if len(posts) >= target:
                        break

                    stalled_rounds = 0 if added else stalled_rounds + 1
                    if stalled_rounds >= max_stalled_rounds:
                        print(f"No new posts after {stalled_rounds} rounds; stopping at {len(posts)} posts.")
                        break

                    # Click the "load more" or "next page" button
                    next_button = WebDriverWait(driver, 10).until(
                        EC.element_to_be_clickable((By.XPATH, scraper_objects.repeated_click_selector))
                    )
                    next_button.click()
                    print("Clicked repeated click element")
                    time.sleep(1)  # give the newly requested content time to render
                except Exception as e:
                    print(f"Exception occurred: {e}")
                    break

        posts = posts[:target]
        _attach_bodies(driver, posts)
    finally:
        driver.quit()

    return posts


def _requested_post_count(scraper_objects):
    """Return the requested number of posts, or 0 when none is configured."""
    count = getattr(scraper_objects, 'number_of_posts', None)
    if count is None:
        # Spelling used by an earlier draft of this function.
        count = getattr(scraper_objects, 'numbers_of_posts', 0)
    return count


def _run_initial_clicks(driver, scraper_objects):
    """Click ``initial_click_1``, ``initial_click_2``, ... once each, in order."""
    for i in range(1, 100):  # Arbitrarily large number, assuming user won't have more than 100 initial clicks
        attribute_name = f"initial_click_{i}"
        if not hasattr(scraper_objects, attribute_name):
            break
        click_selector = getattr(scraper_objects, attribute_name)
        try:
            element = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, click_selector))
            )
            element.click()
            print(f"Clicked element {attribute_name} with selector {click_selector}")
        except Exception as e:
            print(f"Error clicking {attribute_name}: {e}")
            break


def _attach_bodies(driver, posts):
    """Visit each post URL and store the body text under ``'body'``."""
    for post in posts:
        try:
            driver.get(post['url'])
            time.sleep(1)
            post['body'] = driver.find_element(By.CLASS_NAME, sn_post_body_class).text
        except Exception as e:
            # Best effort: the post is kept without a 'body' key.
            print(f'Error: {e}')


def scrape_data(driver):
    """Return title/URL dicts for the post tiles currently loaded in ``driver``.

    Works on the caller's browser session: it does not open, navigate or
    close anything, so the caller can keep clicking "load more" afterwards.
    Tiles are located with the module-level ``sn_post_title_class``; tiles
    whose class attribute contains ``sn_post_title_avoid_class`` are skipped.

    Args:
        driver: Selenium WebDriver positioned on a listing page.

    Returns:
        A list of ``{'title': ..., 'url': ...}`` dicts in page order.
    """
    posts = []
    for tile in driver.find_elements(By.CLASS_NAME, sn_post_title_class):
        if sn_post_title_avoid_class in (tile.get_attribute('class') or ''):
            continue
        link = tile.find_element(By.TAG_NAME, 'a')
        posts.append({
            'title': link.get_attribute('title'),
            'url': link.get_attribute('href'),
        })
    return posts

In [None]:
# Example Streamlit Code

class ScraperObjects:
    """Attribute bag: every keyword argument becomes an instance attribute."""

    def __init__(self, **kwargs):
        for name in kwargs:
            setattr(self, name, kwargs[name])

    def __repr__(self):
        # Single-line form: ClassName(name=value, name=value, ...)
        pairs = [f"{name}={val}" for name, val in vars(self).items()]
        return type(self).__name__ + "(" + ", ".join(pairs) + ")"

def main():
    """Streamlit page: collect scraper settings, then run ``dynamic_scraper``.

    The inputs are bundled into a ``ScraperObjects`` instance with the
    attributes ``url``, ``repeated_click_selector``, ``number_of_posts`` and
    one ``initial_click_<i>`` per requested initial click.
    """
    st.title("Dynamic Web Scraper Tool")

    # Collect user inputs
    url = st.text_input("URL")
    num_initial_clicks = st.number_input("Number of Initial Clicks", min_value=1, step=1)

    initial_clicks = []
    for i in range(1, int(num_initial_clicks) + 1):
        initial_click = st.text_input(f"Single Click {i} Button XPath:", key=f"initial_click_{i}")
        initial_clicks.append((f"initial_click_{i}", initial_click))

    repeated_click_selector = st.text_input("Repeated Click Button XPath:")
    # Asked for separately: the post target is unrelated to how many initial
    # clicks were configured.
    number_of_posts = st.number_input("Number of Posts", min_value=1, step=1)

    if st.button("Create Scraper Object"):
        # Create a dictionary with user inputs
        user_input = {
            "url": url,
            "repeated_click_selector": repeated_click_selector,
            "number_of_posts": int(number_of_posts),
        }
        user_input.update(initial_clicks)

        # Create an instance of ScraperObjects with the user input
        scraper_objects = ScraperObjects(**user_input)

        # Display the created object
        st.write(scraper_objects)

        # Call the dynamic scraper function and show what it collected
        posts = dynamic_scraper(scraper_objects)
        st.write(posts)

if __name__ == "__main__":
    main()
