* Download dataset from the URL: https://www.dropbox.com/s/dcds423fl7fscow/threadDataSet.zip?dl=0
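        You can also use the command: `wget -O threadDataSet.zip "https://www.dropbox.com/s/dcds423fl7fscow/threadDataSet.zip?dl=0"` (the quotes keep the shell from interpreting `?`, and `-O` saves the file under the name used in the next step)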
* Please unzip using the command: `unzip threadDataSet.zip`

## Step 1: Make sure that the unzip happened correctly and the folder structure is as described below:

## Step 2: Fix the XML parsing issue caused by unescaped '&' characters
- If left unfixed, a bare '&' makes the XML parser fail

In [11]:
import pandas as pd
import numpy as np
import os

In [12]:
import re

# Match a bare '&' that does not start an XML entity. All five predefined
# entities (amp, lt, gt, quot, apos) and numeric character references
# (&#38; / &#x26;) are already valid XML and must be left alone; escaping them
# again would turn e.g. '&quot;' into the literal text '&amp;quot;'.
regex = re.compile(r"&(?!(?:amp|lt|gt|quot|apos);|#[0-9]+;|#[xX][0-9a-fA-F]+;)")

thread_directory = "./threadDataSet/threads as original xml/"
for root_dir, _subdirs, file_names in os.walk(thread_directory):
    for file_name in file_names:
        # Only touch files whose name actually ends in '.xml'.
        if not file_name.endswith('.xml'):
            continue
        xml_path = os.path.join(root_dir, file_name)
        # errors="ignore" drops undecodable bytes instead of aborting the run.
        with open(xml_path, 'r+', encoding='utf8', errors="ignore") as content:
            text = content.read()
            fixed = regex.sub("&amp;", text)
            # Rewrite in place only when something changed; the substitution
            # is idempotent, so re-running this cell is safe.
            if fixed != text:
                content.seek(0)
                content.write(fixed)
                content.truncate()

## Step 3: Creating Dataframe from Forum threads
* Each thread, with all its posts, is stored in one XML file
* Each thread is a separate file
* Below method iterates over each file and creates the forum dataframe
* Every row in the dataframe is a post or comment in a thread

In [13]:
import xml.etree.ElementTree as et

def _element_text(parent, tag):
    """Return the stripped text of child element `tag`, or "" if it is missing or empty."""
    node = parent.find(tag)
    # Compare against None explicitly: an Element without children is falsy.
    if node is None or node.text is None:
        return ""
    return node.text.strip()


def createIssueDF(fileName):
    """Parse one thread XML file into a DataFrame with one row per post.

    The root's direct children are scanned: `ThreadID` and `Title` supply
    thread-level values, and every `InitPost` / `Post` element becomes a row.

    Parameters
    ----------
    fileName : str
        Path to the thread XML file.

    Returns
    -------
    pandas.DataFrame
        Columns `userID`, `Date`, `postNum`, `text`, `Filename`, `ThreadID`,
        `Title`. `postNum` numbers the posts from 1 in document order.
        `Filename` is the file's base name without the `.xml` suffix.
        Missing or empty `UserID`, `Date` and content elements yield "".

    Raises
    ------
    xml.etree.ElementTree.ParseError
        If the file is not well-formed XML.
    """
    xroot = et.parse(fileName).getroot()

    # Remove the suffix explicitly. str.rstrip('.xml') strips any trailing
    # run of the characters '.', 'x', 'm', 'l', so 'mail.xml' became 'mai'.
    baseName = os.path.basename(fileName)
    if baseName.endswith('.xml'):
        baseName = baseName[:-len('.xml')]

    threadID = ""
    title = ""
    rows = []
    columns = ["userID", "Date", "postNum", "text"]
    for child in xroot:
        # Exact comparison: `tag in ('ThreadID')` was a substring test
        # against a plain string, not tuple membership.
        if child.tag == 'ThreadID':
            threadID = child.text
        elif child.tag == 'Title':
            title = child.text
        elif child.tag in ('InitPost', 'Post'):
            # Prefer <icontent>; fall back to <rcontent> when it is absent.
            contentTag = "icontent" if child.find("icontent") is not None else "rcontent"
            rows.append({
                "userID": _element_text(child, "UserID"),
                "Date": _element_text(child, "Date"),
                "postNum": len(rows) + 1,
                "text": _element_text(child, contentTag),
            })

    returnDF = pd.DataFrame(rows, columns=columns)
    returnDF['Filename'] = baseName
    returnDF['ThreadID'] = threadID
    returnDF['Title'] = title

    return returnDF

In [16]:
def parse_threads_and_summary(thread_directory, summary_directory):
    """Build one DataFrame of posts for a directory of threads and attach summaries.

    Every `.xml` file directly inside `thread_directory` is parsed with
    `createIssueDF`. Every `.txt` file directly inside `summary_directory` is
    read, its blank lines dropped, and the remaining lines concatenated into a
    single string. That string is stored in the `summary` column of all rows
    whose `Filename` equals the summary file's name without `.txt`.

    Parameters
    ----------
    thread_directory : str
        Directory holding the thread XML files.
    summary_directory : str
        Directory holding the summary text files.

    Returns
    -------
    pandas.DataFrame
        Columns `Filename`, `ThreadID`, `Title`, `userID`, `Date`, `postNum`,
        `text`, `summary`. Threads without a matching summary file get "".
    """
    columns = ['Filename', 'ThreadID', 'Title', 'userID', 'Date', 'postNum', 'text']

    # Collect the per-thread frames and concatenate once. DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    frames = []
    for file in os.listdir(thread_directory):
        filename = os.fsdecode(file)
        if filename.endswith(".xml"):
            frames.append(createIssueDF(os.path.join(thread_directory, filename)))
    if frames:
        df = pd.concat(frames).reindex(columns=columns)
    else:
        df = pd.DataFrame(columns=columns)

    summaries = {}
    for file in os.listdir(summary_directory):
        filename = os.fsdecode(file)
        if filename.endswith(".txt"):
            with open(os.path.join(summary_directory, filename), 'r', encoding='utf8', errors="ignore") as f:
                lineList = [line.rstrip('\n') for line in f if line.strip() != '']
            # Slice the suffix off: rstrip('.txt') strips a character set and
            # would mangle names such as 'next.txt'.
            # NOTE(review): lines are joined with no separator, as before, so
            # words at line breaks run together -- confirm this is intended.
            summaries[filename[:-len(".txt")]] = ''.join(lineList)

    # One vectorised lookup replaces the chained assignment
    # df['summary'][mask] = ..., which may write to a temporary copy.
    df['summary'] = df['Filename'].map(summaries).fillna('')
    return df

## Step 4: Preparing dataset of Gold Directory

In [17]:
# Gold split: thread XML files plus the matching files from "human summaries".
# Replace path with Annotator_Two to use a different summary
summary_directory = "./threadDataSet/human summaries/gold_Annotator_One"
thread_directory = "./threadDataSet/threads as original xml/gold"

goldDF = parse_threads_and_summary(
    thread_directory=thread_directory,
    summary_directory=summary_directory,
)

## Step 5: Preparing dataset of Batch_one Directory

In [18]:
# Batch one: thread XML files plus the first annotator's summary files.
summary_directory = "./threadDataSet/human summaries/batch_one_Annotator_One"
thread_directory = "./threadDataSet/threads as original xml/batch_one"

batchOneDF = parse_threads_and_summary(
    thread_directory=thread_directory,
    summary_directory=summary_directory,
)

## Step 6: Preparing dataset of Batch_two Directory

In [20]:
# Batch two: thread XML files plus the first annotator's summary files.
summary_directory = "./threadDataSet/human summaries/batch_two_Annotator_One"
thread_directory = "./threadDataSet/threads as original xml/batch_two"

batchTwoDF = parse_threads_and_summary(
    thread_directory=thread_directory,
    summary_directory=summary_directory,
)

## Step 7: Combining all datasets

In [21]:
# Stack the three splits in the order gold, batch one, batch two.
# pd.concat replaces the chained DataFrame.append calls (removed in pandas 2.0)
# and, like them, keeps each frame's original index.
finalDF = pd.concat([goldDF, batchOneDF, batchTwoDF])

## Final Checks: Are there any threads with fewer than 2 posts?

- No, so there are no threads that we want to exclude
- If yes, it makes sense to exclude these threads since there is nothing to summarize

In [22]:
# Per-thread count of non-null values in every column; the 'Date' count is
# used below as the number of posts in the thread.
cumDF = finalDF.groupby('ThreadID').count()
cumDF = cumDF.reset_index()
# Displayed result: IDs of threads with fewer than 2 posts.
cumDF.loc[cumDF['Date'] < 2, 'ThreadID']

Series([], Name: ThreadID, dtype: object)

## Saving the dataset

In [24]:
# Write the combined dataset as a pipe-separated file, leaving out the index.
finalDF.to_csv(
    path_or_buf='travel_threads.csv',
    sep='|',
    index=False,
)