# Code to scrape the threads of each forum ID and store them in a TSV file

### Importing the necessary libraries first

In [1]:
import requests
from bs4 import BeautifulSoup
import csv

### Initialising some global variables first

In [2]:
# Module-level accumulators shared by the functions below; each call appends to them.
tid = []               # thread links collected by connect(), e.g. threads.jspa?threadID=xxx
messages = []          # stripped text of every message body collected by fetch()
subjects = []          # stripped text of every message subject collected by fetch()
forumIDs = []          # rows of ForumID.csv loaded by loadcsv(); row[0] is used as the output label/file name, row[1] as the forum query string

### Function to load the scraped CSV file containing the forum IDs and store its rows in the forumIDs list

In [3]:
# Function to load the scraped CSV file with the details of the various products and store its rows in forumIDs

def loadcsv():
    """Append every row of ``ForumID.csv`` to the global ``forumIDs`` list.

    The file is looked up in the current working directory.  Each row is
    stored exactly as ``csv.reader`` yields it (a list of strings).  Rows are
    appended, so calling this function twice duplicates the entries.

    Returns:
        None.  The result is the mutation of ``forumIDs``.

    Raises:
        FileNotFoundError: if ``ForumID.csv`` does not exist.
    """
    # newline='' hands newline handling over to the csv module, so quoted
    # fields that contain line breaks are parsed as a single field.
    with open('ForumID.csv', 'r', newline='') as csvFile:
        forumIDs.extend(csv.reader(csvFile))

### Function to scrape all the thread IDs listed under a forum ID and store them in the variable tid
#### Input: the forum URL

In [4]:
def connect(url, timeout=30):
    """Collect the thread links found on one forum listing page.

    Downloads ``url``, finds every ``<a id="jive-thread-0">`` element and
    appends the part of its ``href`` before the first ``&`` to the global
    ``tid`` list.

    Args:
        url: address of the forum listing page to download.
        timeout: seconds to wait for the server before giving up, so a
            stalled connection cannot block the scrape forever.

    Returns:
        The global ``tid`` list (the same object, including links gathered
        by earlier calls).

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not answer within ``timeout``.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
    }
    res = requests.get(url, headers=headers, timeout=timeout)
    res.raise_for_status()

    soup = BeautifulSoup(res.text, 'html.parser')

    # All thread links carry this id; href=True skips anchors that have no
    # target instead of failing on them.
    for anchor in soup.find_all("a", {"id": "jive-thread-0"}, href=True):
        # href is of the form threads.jspa?threadID=xxx&...; read the
        # attribute directly (rather than splitting the serialised tag) and
        # keep only the part before the first "&".
        tid.append(anchor["href"].split("&")[0])
    return tid

### Fetch all the messages inside each thread and store them in the final TSV file, labelled with the name of the product

#### Input: The thread URL and the target file name

In [5]:
def fetch(url, fname, timeout=30):
    """Download one thread page and rewrite ``<fname>.tsv`` with all messages so far.

    The subjects and message bodies found on the page are appended to the
    global ``subjects`` and ``messages`` lists.  The TSV file is then
    rewritten from scratch with a header row followed by one row per
    accumulated (subject, body) pair, labelled with ``fname``.

    Args:
        url: address of the thread page to download.
        fname: product name; used both as the row label and as the output
            file name (``fname + '.tsv'``).
        timeout: seconds to wait for the server before giving up.

    Returns:
        None.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not answer within ``timeout``.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
    }
    res = requests.get(url, headers=headers, timeout=timeout)
    res.raise_for_status()

    soup = BeautifulSoup(res.text, 'html.parser')

    # all messages have the following subject class
    subjects.extend(
        tag.text.strip()
        for tag in soup.find_all("span", {"class": "jive-subject"})
    )

    # all messages have the following body class
    messages.extend(
        tag.text.strip()
        for tag in soup.find_all("div", {"class": "jive-message-body"})
    )

    # Write the file once per page, not once per message.
    # newline='' stops the csv module's own line endings from being
    # translated a second time (which yields blank rows on Windows).
    # NOTE(review): zip() pairs subjects and bodies by position and silently
    # drops the surplus if a page yields different counts - confirm the
    # markup always has exactly one subject per body.
    with open(fname + '.tsv', 'w', encoding='utf-8', newline='') as tsvfile:
        writer = csv.writer(tsvfile, delimiter='\t')
        writer.writerow(["label", "description"])
        writer.writerows(
            [fname, subject + "\n" + body]
            for subject, body in zip(subjects, messages)
        )

### Following is the code to execute the above functions

In [None]:
# Run this cell to scrape one forum automatically.

loadcsv()

# id is the 1-based row of the target forum in ForumID.csv
# limit is the number of threads you want to extract from that forum

id = 73
limit = 14

# Start from empty accumulators so re-running the cell (e.g. with another id)
# does not mix in threads and messages from a previous run.
tid.clear()
subjects.clear()
messages.clear()

# The listing is paged through the "start" offset in steps of 25. Using
# range(0, limit, 25) requests only the pages needed to reach `limit`
# threads (the old `i <= limit` fetched one page too many whenever limit
# was a multiple of 25).
for i in range(0, limit, 25):
    url = "https://forums.aws.amazon.com/forum.jspa?"+forumIDs[id-1][1]+"&start="+str(i)
    tid = connect(url)

# A page can list more threads than requested, so cap at `limit`.
for thread in tid[:limit]:
    fetch("https://forums.aws.amazon.com/" + thread, forumIDs[id-1][0])