## Code to scrape ALL threads of each forumID and store them in a TSV file

### Importing the necessary libraries first

In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import time

### Initialising some global variables first

In [None]:
# Shared accumulators filled in by the scraping cells below:
#   tid      - thread links of the form threads.jspa?threadID=xxx
#   messages - contents of all the scraped messages
#   subjects - subjects of all the scraped messages
tid, messages, subjects = [], [], []

### Function to scrape all the thread IDs present within the forumID and store them in the variable tid
#### Input: the forum URL

In [None]:
# function to connect to the given forum ID, retrieve all the threads present within that forum and store it in tid

def connect(url):
    """Collect the thread links listed on one forum page.

    Downloads ``url``, finds every ``<a>`` tag whose id is ``jive-thread-0``
    and appends the part of its ``href`` before the first ``&`` (a value of
    the form ``threads.jspa?threadID=xxx``) to the module-level list ``tid``.

    Args:
        url: address of the forum listing page to scrape.

    Returns:
        The module-level ``tid`` list, which also holds the links gathered
        by earlier calls.

    Raises:
        requests.RequestException: on connection failure, timeout, or an
            HTTP error status (via ``raise_for_status``).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
    }
    # A timeout keeps one stalled connection from hanging the whole scrape.
    res = requests.get(url, headers=headers, timeout=30)
    res.raise_for_status()

    soup = BeautifulSoup(res.text, 'html.parser')

    # all threadIDs have the following id
    for anchor in soup.find_all("a", {"id": "jive-thread-0"}):
        # Read the attribute itself rather than splitting the serialized
        # tag: that breaks on anchors without an href and leaks trailing
        # markup into the result when the href contains no "&".
        link = anchor.get("href")
        if not link:
            continue
        # href of form threads.jspa?threadID=xxx
        tid.append(link.split("&")[0])
    return tid

### Fetch all the messages inside each thread and store them in the final TSV file, with the product name as the label

#### Input: The thread URL and the target file name

In [None]:
# for each thread ID, fetches the messages present within thread

def fetch(url, fname):
    """Scrape one thread page and rewrite the TSV file with all data so far.

    Downloads ``url`` and appends the stripped text of every
    ``span.jive-subject`` to the module-level ``subjects`` list and of every
    ``div.jive-message-body`` to the module-level ``messages`` list. It then
    rewrites ``<fname>.tsv`` with a header row followed by one row per
    (subject, message) pair accumulated across all calls.

    Args:
        url: address of the thread page to scrape.
        fname: output file name without extension; also written as the
            ``label`` column of every row.

    Raises:
        requests.RequestException: on connection failure, timeout, or an
            HTTP error status (via ``raise_for_status``).
        OSError: if the output file cannot be written.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
    }
    # A timeout keeps one stalled connection from hanging the whole scrape.
    res = requests.get(url, headers=headers, timeout=30)
    res.raise_for_status()

    soup = BeautifulSoup(res.text, 'html.parser')

    # all messages have the following subject class
    subjects.extend(
        span.text.strip()
        for span in soup.find_all("span", {"class": "jive-subject"})
    )

    # all messages have the following body class
    messages.extend(
        div.text.strip()
        for div in soup.find_all("div", {"class": "jive-message-body"})
    )

    # Write the file once per call instead of once per message: the content
    # is rebuilt from the full accumulators either way, so rewriting it
    # inside the message loop only repeated the same work.
    # newline='' is what the csv module requires so that row terminators
    # and newlines embedded in quoted fields are written untranslated.
    with open(fname + '.tsv', 'w', encoding='utf-8', newline='') as tsvfile:
        writer = csv.writer(tsvfile, delimiter='\t')
        writer.writerow(["label", "description"])
        writer.writerows(
            [fname, subject + "\n" + body]
            for subject, body in zip(subjects, messages)
        )

### Code to fetch all the threadIDs for given ForumID

#### id is the target forumID, limit is the number of threads needed, pname is the forum name

In [None]:
# code snippet to fetch all the threadIDs for given service

# NOTE: `id` shadows the Python builtin of the same name; the name is kept
# because the notebook text refers to it.
id = 186                    # target forumID
limit = 25                 # total number of threads
pname = "Amazon Elastic Beanstalk"  # target product name

# The `start` offset advances by this much per request
# (presumably the forum lists 25 threads per page - confirm).
page_size = 25

i = 0

# Strict comparison: with `<=` a limit of 25 also requested the page at
# start=25, i.e. one page more than asked for.
while i < limit:
    url = "https://forums.aws.amazon.com/forum.jspa?forumID="+str(id)+"&start="+str(i)
    # connect() appends to and returns the shared thread list
    tid = connect(url)
    i = i + page_size
    print("Fetched " + str(i) + " threads")

### ! ~ Code to fetch the messages from the thread and store in a TSV file  ~ !
#### Catches any exception raised while fetching a thread and pauses for 5 seconds so that the loop keeps running

In [None]:
# code snippet to fetch all the message for the corresponding threadID and store in tsv

# Walk through every collected thread link and scrape its messages.
# A failed thread is retried after a 5 second pause; after `max_attempts`
# failures it is skipped so that one bad thread cannot stall the run.
# A while loop is used because reassigning the index inside
# `for i in range(...)` has no effect on the next iteration, so the
# earlier retry attempt silently moved on to the next thread.

i = 0
max_attempts = 3
attempts = 0

while i < len(tid):
    print("Processing thread #" + str(i))
    try:
        fetch("https://forums.aws.amazon.com/" + tid[i], pname)
    except Exception as exc:
        # `Exception` rather than a bare except, so Ctrl+C still stops the run
        attempts += 1
        if attempts < max_attempts:
            print("Thread #" + str(i) + " failed (" + str(exc) + "), retrying")
            time.sleep(5)
            continue
        print("Thread #" + str(i) + " failed (" + str(exc) + "), skipping")
    attempts = 0
    i += 1

In [None]:
# Show the final value of the counter `i` left behind by the previous cell.
print(i)