# Store XML data in a pickle file

#### Data directory

In [10]:
# Input locations. Only data_dir is a directory; the other "*_dir" names
# are paths to single files that later cells pass to pd.read_csv.
data_dir = "../testing_data/posts"    # directory of XML posts
labels_dir = '../testing_data/labels.tsv'    # read tab-separated as: msgID, label, fine_grained
ranks_dir = '../testing_data/author_rankings.tsv'    # read tab-separated as: authorID, rank
affiliation_dir = '../testing_data/author_rankings_summary.tsv'    # read into a single 'rank' column, split later

In [11]:
# Path of the pickle file written by the final cell. The folder name was
# misspelled ("testin_data"); every other path lives under ../testing_data.
result_dir = "../testing_data/test.pkl"    # Output pickle file

In [2]:
from xml.dom import minidom as md
from bs4 import BeautifulSoup
import pandas as pd
import os
# This version parses the timestamps of the posts as well.

### Saving XML file in a dataframe 

In [3]:
# Ref: https://www.mkyong.com/python/python-read-xml-file-dom-example/
def XMLtoDF(xml_file):
    """Parse one XML post into a DataFrame with at most one row.

    Parameters
    ----------
    xml_file : str or file object
        Path of the XML file (or an open file object); passed straight to
        ``xml.dom.minidom.parse``.

    Returns
    -------
    pandas.DataFrame
        Columns 'msgID', 'msg', 'time', 'authorID'. Contains one row when
        the document has a <body> element and no rows otherwise.
        'msg' is the first child node of the first <body> (a DOM node, not
        a string; callers read its ``.data``). 'msgID' and 'authorID' are
        the raw ``href`` attributes of the first <message> and <author>
        elements. 'time' is the text of the first <post_time> element.
    """
    dfcols = ['msgID', 'msg', 'time', 'authorID']
    doc = md.parse(xml_file)
    bodies = doc.getElementsByTagName("body")
    if len(bodies) == 0:
        # No <body>: return an empty, correctly shaped frame rather than
        # falling off the end of the function and returning None.
        return pd.DataFrame(columns=dfcols)

    msg = bodies[0]
    if msg.firstChild is None:    # Handle "empty" body of xml
        # Placeholder text so later code never sees None or zero-length text
        msg.appendChild(doc.createTextNode('_'))
    msgID = doc.getElementsByTagName("message")[0].getAttribute("href")
    authorID = doc.getElementsByTagName("author")[0].getAttribute("href")
    time = doc.getElementsByTagName("post_time")[0]

    # Build the row in one constructor call; DataFrame.append was removed
    # in pandas 2.0.
    row = [msgID, msg.firstChild, time.firstChild.data, authorID]
    return pd.DataFrame([row], columns=dfcols)

In [4]:
#  To parse all the files, sample <- 0
#  A positive value N parses only the first N files (in sorted name order).
sample = 0

# os.listdir() returns entries in arbitrary order; sorting keeps the row
# order (and the subset chosen by a positive `sample`) reproducible.
frames = []
for n_parsed, file in enumerate(sorted(os.listdir(data_dir)), start=1):
    frame = XMLtoDF(os.path.join(data_dir, file))
    # Skip posts that produced no row (None or an empty frame)
    if frame is not None and not frame.empty:
        frames.append(frame)
    if n_parsed == sample: break

# Concatenate once instead of appending inside the loop: DataFrame.append
# was removed in pandas 2.0 and re-copies the whole frame on every call.
xmlDF = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [5]:
# Preview the first rows; at this point 'msg' still holds raw DOM nodes
xmlDF.head()

Unnamed: 0,msgID,msg,time,authorID
0,/messages/id/139570,"<DOM Text node ""'Hello my i'..."">",2015-06-08T01:43:07+00:00,/users/id/6940
1,/messages/id/139571,"<DOM Text node ""'Up for a c'..."">",2015-06-08T01:43:30+00:00,/users/id/6940
2,/messages/id/139572,"<DOM Text node ""'Salt and v'..."">",2015-06-08T03:14:54+00:00,/users/id/5111
3,/messages/id/139573,"<DOM Text node ""'Right now '..."">",2015-06-08T03:23:09+00:00,/users/id/5111
4,/messages/id/139574,"<DOM Text node """"I'm exhaus""..."">",2015-06-08T03:31:12+00:00,/users/id/5111


### Data munging

In [6]:
#
# Cleaning data
#
def _strip_prefix(value, prefix):
    """Return `value` without a leading `prefix`; unchanged if it is absent."""
    return value[len(prefix):] if value.startswith(prefix) else value

# Parse the text held by each DOM node as HTML
xmlDF['msg'] = xmlDF['msg'].map(lambda x: BeautifulSoup(x.data, 'html.parser'))
# str.lstrip() strips a *set of characters*, not a prefix, so it would also
# eat leading characters of an ID such as 'a12'. Remove the exact prefix.
xmlDF['msgID'] = xmlDF['msgID'].map(lambda x: _strip_prefix(x, '/messages/id/'))
xmlDF['authorID'] = xmlDF['authorID'].map(lambda x: _strip_prefix(x, '/users/id/'))
xmlDF.head()

Unnamed: 0,msgID,msg,time,authorID
0,139570,Hello my is jas any up for a chat,2015-06-08T01:43:07+00:00,6940
1,139571,Up for a chat,2015-06-08T01:43:30+00:00,6940
2,139572,"Salt and vinegar (which I really don't like, b...",2015-06-08T03:14:54+00:00,5111
3,139573,"Right now I am exhausted, it's been a huge day...",2015-06-08T03:23:09+00:00,5111
4,139574,I'm exhausted already and it's only 1:30pm! Th...,2015-06-08T03:31:12+00:00,5111


### Adding other pieces of information to the dataframe

In [7]:
# Attach the labels to the posts
label_columns = ['msgID', 'label', 'fine_grained']
labels = pd.read_csv(labels_dir, sep='\t', header=None, names=label_columns)
labels = labels.astype({'msgID': str})    # string keys, same type as xmlDF's msgID
print("Number of labeled posts: ", len(labels))
print("Number of posts: ", len(xmlDF))
# Left join: every post is kept, unlabeled posts get NaN label columns
data = pd.merge(xmlDF, labels, how='left', on='msgID')
data.head()

Number of labeled posts:  400
Number of posts:  88


Unnamed: 0,msgID,msg,time,authorID,label,fine_grained
0,139570,Hello my is jas any up for a chat,2015-06-08T01:43:07+00:00,6940,,
1,139571,Up for a chat,2015-06-08T01:43:30+00:00,6940,,
2,139572,"Salt and vinegar (which I really don't like, b...",2015-06-08T03:14:54+00:00,5111,,
3,139573,"Right now I am exhausted, it's been a huge day...",2015-06-08T03:23:09+00:00,5111,,
4,139574,I'm exhausted already and it's only 1:30pm! Th...,2015-06-08T03:31:12+00:00,5111,,


In [8]:
# Author information: one (authorID, rank) pair per row
ranks = pd.read_csv(ranks_dir, sep='\t', header=None, names=['authorID', 'rank'])
ranks = ranks.astype({'authorID': str})    # string keys, same type as the main dataframe

# Rank summary: each line is read into a single 'rank' column
affiliation = pd.read_csv(affiliation_dir, header=None, names=['rank'])

# The last character of each line is the affiliation flag; everything
# before it (minus trailing whitespace) is the rank name.
summary_lines = affiliation['rank']
affiliation['affiliation'] = summary_lines.map(lambda line: line[-1])
affiliation['rank'] = summary_lines.map(lambda line: line[:-1].rstrip())
ranks = pd.merge(ranks, affiliation, how='outer', on='rank')
ranks.head()

Unnamed: 0,authorID,rank,affiliation
0,52,Community Manager,1
1,124,Community Manager,1
2,1192,Community Manager,1
3,7149,Community Manager,1
4,7544,Community Manager,1


In [9]:
# Left join so the result keeps exactly the posts already in `data`.
# An outer join would also add one row per ranked author who has no post
# here, i.e. rows whose msgID/msg/time are all NaN. This matches the
# how='left' used when the labels were merged in.
data = data.merge(ranks, on='authorID', how='left')
data.head()

Unnamed: 0,msgID,msg,time,authorID,label,fine_grained,rank,affiliation
0,139570,Hello my is jas any up for a chat,2015-06-08T01:43:07+00:00,6940,,,Rookie scribe,0.0
1,139571,Up for a chat,2015-06-08T01:43:30+00:00,6940,,,Rookie scribe,0.0
2,139572,"Salt and vinegar (which I really don't like, b...",2015-06-08T03:14:54+00:00,5111,,,Builder,
3,139573,"Right now I am exhausted, it's been a huge day...",2015-06-08T03:23:09+00:00,5111,,,Builder,
4,139574,I'm exhausted already and it's only 1:30pm! Th...,2015-06-08T03:31:12+00:00,5111,,,Builder,


### Saving the pickle file

In [94]:
import sys
# NOTE(review): presumably raised because pickling the BeautifulSoup objects
# stored in the 'msg' column recurses deeply — confirm before changing.
sys.setrecursionlimit(100000)
data.to_pickle(result_dir)    # write the merged dataframe to result_dir