In [1]:
# Imports
from bs4 import BeautifulSoup as bs
import requests
import re
import time
import random
import langchain
import langchain_community
import langchain_core
import langchain_openai

In [2]:
url = 'https://community.atlassian.com/?sort=recent'
post_title_class = 'atl-post-list__tile__title'
post_body_class = 'lia-message-body-content'

In [3]:
def fetch_html(url):
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    else:
        print('Error fetching HTML!')
        return None

In [4]:
def pull_forum_posts(pages: int = 1):

    global post_title_class

    if not isinstance(pages, int):
        raise TypeError('The "pages" parameter must be an integer.')

    posts = []

    for i in range(1, pages + 1):

        url = f'https://community.atlassian.com/?sort=recent&page={i}'
        soup = bs(fetch_html(url), 'lxml')

        for post in soup.find_all(class_=post_title_class):

            post_data = {}

            post_data['title'] = post.find('a').get_text().strip()
            post_data['url'] = 'https://community.atlassian.com' + post.find('a')['href']

            posts.append(post_data)

        time.sleep(1)

    return posts


In [5]:
def pull_post_body(posts: list):

    global post_body_class

    if not isinstance(posts, list):
        raise TypeError('The "posts" parameter must be a list.')

    full_post_data = []

    for post in posts:

        post_data = {}

        soup = bs(fetch_html(post['url']), 'lxml')

        post_data['title'] = post['title']
        post_data['url'] = post['url']
        post_data['body'] = soup.find('div', class_=post_body_class).get_text().strip()

        full_post_data.append(post_data)

        time.sleep(random.randint(1, 3))

    return full_post_data

In [6]:
def full_pull(pages: int = 1):
    start_time = time.time()
    posts = pull_forum_posts(pages)
    post_pull_time = time.time() - start_time
    print(f'Pulled post titles and URLs. [{post_pull_time:.2f} seconds] \nProceeding to pull post bodies...')
    full_posts = pull_post_body(posts)
    body_pull_time = time.time() - start_time - post_pull_time
    print(f'Pulled post bodies. [{body_pull_time:.2f} seconds]\n[{time.time() - start_time:.2f} seconds]')

    return full_posts

In [9]:
one_page = full_pull(1)

Pulled post titles and URLs. [2.11 seconds] 
Proceeding to pull post bodies...
Pulled post bodies. [50.28 seconds]
 OP TIME: 52.39 seconds


In [10]:
one_page

[{'title': 'Can you create a Bitbucket project from the API using an HTTP access token?',
  'url': 'https://community.atlassian.com/t5/Bitbucket-questions/Can-you-create-a-Bitbucket-project-from-the-API-using-an-HTTP/qaq-p/2757616',
  'body': "I'm trying to use the atlassian-python-api to manage may Bitbucket projects and repositories and I'm running into an issue where it says I don't have permission. I created my HTTP access token with project admin and repository admin. I'm trying to avoid using my password, but it appears the tokens may only be good for within a project?Has anyone done this before, and if so can you provide some suggestions on what I may be doing wrong?Thanks!"},
 {'title': 'Creating a simpler card from an existing card',
  'url': 'https://community.atlassian.com/t5/Trello-questions/Creating-a-simpler-card-from-an-existing-card/qaq-p/2757614',
  'body': 'when a card from another board is automated to copy into a list on a different board, can you create an automati

In [7]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains

url = 'https://www.servicenow.com/community/itsm/ct-p/it-service-management'

sn_post_title_class = 'custom-message-tile custom-thread-unread'
sn_post_body_class = 'lia-message-body'
sn_load_button_xpath = '//*[@id="custom-loader-button"]'

def fetch_posts(n):
    global sn_post_title_class, sn_load_button_xpath, url
    posts = []
    driver = webdriver.Chrome()
    driver.get(url)
    time.sleep(6)
    load_button = driver.find_element(By.XPATH, sn_load_button_xpath)
    while len(posts) < n:
        try:
            ActionChains(driver).move_to_element(load_button).click(load_button).perform()
            time.sleep(2)
            post_titles = driver.find_elements(By.CLASS_NAME, sn_post_title_class)
            for post in post_titles[len(posts):]:
                post_data = {}
                post_data['title'] = post.text
                post_data['url'] = 'https://www.servicenow.com' + post.get_attribute('href')
                posts.append(post_data)
        except Exception as e:
            print(f'Error: {e}')
            break
    driver.quit()
    return posts

# # Function to load more posts
# def load_more_posts(driver, load_button_xpath, post_xpath, n):
#     posts = []
#     while len(posts) < n:
#         try:
#             load_more_button = driver.find_element(By.XPATH, load_button_xpath)
#             ActionChains(driver).move_to_element(load_more_button).click(load_more_button).perform()
#             time.sleep(2)  # Wait for new posts to load
#             posts = driver.find_elements(By.XPATH, post_xpath)
#         except Exception as e:
#             print(f'Error: {e}')
#             break
#     return posts[:n]

# # XPath for the load more button and posts
# load_button_xpath = '//*[@id="load-more-button"]'  # Update this with the correct XPath
# post_xpath = '//*[@class="post"]'  # Update this with the correct XPath for posts

# # Get n posts
# n = 20
# posts = load_more_posts(driver, load_button_xpath, post_xpath, n)

# # Process the posts (e.g., print their content)
# for post in posts:
#     print(post.text)

# # Close the WebDriver
# driver.quit()

In [8]:
sn_posts = fetch_posts(10)

KeyboardInterrupt: 