In [1]:
import sys
import os
import re
try:
    import ujson as json  # UltraJSON if available
except:
    import json
from dateutil import parser as dateparser

from operator import itemgetter
from xml.etree import cElementTree as etree
from collections import defaultdict

from data import DATA_DIR


In [2]:
# The input XML file and both output files are resolved relative to DATA_DIR.
filename = os.path.join(DATA_DIR, "posts.xml")
filename_filtered = os.path.join(DATA_DIR, "filtered.tsv")
filename_filtered_meta = os.path.join(DATA_DIR, "filtered-meta.json")

# Echo the resolved paths so the run log shows what is read and written.
print("Reading from xml %s" % filename)
print("Filtered: %s" % filename_filtered)
print("Meta: %s" % filename_filtered_meta)

Reading from xml data/posts.xml
Filtered: data/filtered.tsv
Meta: data/filtered-meta.json


In [3]:
# Lookup tables filled in while the XML is parsed.
q_creation = dict()  # question Id -> creation datetime of the question
q_accepted = dict()  # question Id -> Id of its accepted answer

# question Id -> [(answer Id, IsAccepted, TimeToAnswer, Score), ...]
meta = defaultdict(list)

# Regular expressions for picking apart the HTML of a post body.
# DOTALL lets '.' cross line breaks, so multi-line elements are matched.
code_match = re.compile(
    '<pre>(.*?)</pre>', flags=re.MULTILINE | re.DOTALL)  # code snippets
link_match = re.compile(
    '<a href="http://.*?".*?>(.*?)</a>',
    flags=re.MULTILINE | re.DOTALL)  # anchor text of http:// links
img_match = re.compile(
    '<img(.*?)/>', flags=re.MULTILINE | re.DOTALL)  # self-closed images
tag_match = re.compile(
    '<[^>]*>', flags=re.MULTILINE | re.DOTALL)  # any tag


In [4]:
# Running totals that parsexml() updates while it walks the file.
years = defaultdict(int)  # creation year -> number of questions seen
num_questions, num_answers = 0, 0

if sys.version_info[0] < 3:
    # Python 2's built-in map() materialises a full list, which would run out
    # of memory on a large file. Swapping in the lazy itertools.imap gives
    # the same streaming behaviour as map() on Python 3.
    from itertools import imap as map


In [5]:
def parsexml(filename):
    """Lazily parse ``filename`` and yield one tuple per question or answer.

    The file is streamed with ``iterparse`` so it never has to fit in memory.
    Every ``<row>`` element with ``PostTypeId`` 1 (question) or 2 (answer) is
    turned into a tuple::

        (Id, ParentId, IsAccepted, TimeToAnswer, Score,
         Text, NumTextTokens, NumCodeLines, LinkCount, NumImages)

    * Questions get ``ParentId == -1``, ``TimeToAnswer == 0`` and
      ``IsAccepted == 0``.
    * Answers get the Id of their question as ``ParentId``, the number of
      whole seconds between question and answer as ``TimeToAnswer``, and
      ``IsAccepted == 1`` only when they are the question's accepted answer.
    * Answers whose question has not been seen earlier in the file are
      skipped (but still counted in ``num_answers``), as are rows with any
      other ``PostTypeId``.

    The last five fields come from ``filter_html`` applied to the ``Body``
    attribute; ``Text`` is UTF-8 encoded.

    Side effects: updates the module-level ``num_questions``, ``num_answers``,
    ``years``, ``q_creation``, ``q_accepted`` and ``meta``, and prints a
    progress line every 100000 elements.
    """
    global num_questions, num_answers

    counter = 0

    # iterparse yields (event, element) pairs; only the element is needed.
    it = map(itemgetter(1),
             iter(etree.iterparse(filename, events=('start',))))

    root = next(it)  # get posts element

    for elem in it:
        if counter % 100000 == 0:
            print("Processed %i <row/> elements" % counter)

        counter += 1

        if elem.tag != 'row':
            continue

        creation_date = dateparser.parse(elem.get('CreationDate'))

        Id = int(elem.get('Id'))
        PostTypeId = int(elem.get('PostTypeId'))
        Score = int(elem.get('Score'))

        if PostTypeId == 1:
            # Question: remember when it was asked and which answer (if any)
            # was accepted, so later answer rows can be related to it.
            num_questions += 1
            years[creation_date.year] += 1

            ParentId = -1
            TimeToAnswer = 0
            q_creation[Id] = creation_date
            accepted = elem.get('AcceptedAnswerId')
            if accepted:
                q_accepted[Id] = int(accepted)
            IsAccepted = 0

        elif PostTypeId == 2:
            num_answers += 1

            ParentId = int(elem.get('ParentId'))
            if ParentId not in q_creation:
                # question was too far in the past
                continue

            # timedelta.seconds only holds the remainder within one day, so
            # the whole days must be added back in; otherwise every answer
            # given more than 24h after the question gets a wrapped value.
            delta = creation_date - q_creation[ParentId]
            TimeToAnswer = delta.days * 86400 + delta.seconds

            if ParentId in q_accepted:
                IsAccepted = int(q_accepted[ParentId] == Id)
            else:
                IsAccepted = 0

            meta[ParentId].append((Id, IsAccepted, TimeToAnswer, Score))

        else:
            # neither a question nor an answer
            continue

        Text, NumTextTokens, NumCodeLines, LinkCount, NumImages = filter_html(
            elem.get('Body'))

        # NOTE(review): on Python 3 the encoded Text is a bytes object, so a
        # consumer calling str() on it gets the b'...' repr - confirm that
        # downstream readers of the TSV expect this before changing it.
        values = (Id, ParentId,
                  IsAccepted,
                  TimeToAnswer, Score,
                  Text.encode("utf-8"),
                  NumTextTokens, NumCodeLines, LinkCount, NumImages)

        yield values

        root.clear()  # preserve memory

In [6]:
def filter_html(s):
    """Strip markup from a post body and derive simple counts from it.

    Parameters
    ----------
    s : str or None
        HTML body of a post. ``None`` is treated like an empty body.

    Returns
    -------
    tuple
        ``(text, num_text_tokens, num_code_lines, link_count, num_images)``

        * ``text`` - the body with ``<pre>`` blocks, all tags, newlines and
          link texts that are themselves ``http://`` URLs removed, and runs
          of spaces collapsed to one.
        * ``num_text_tokens`` - number of space characters in ``text``.
        * ``num_code_lines`` - number of newlines inside ``<pre>`` blocks.
        * ``link_count`` - links matched by ``link_match`` in the whole body
          minus those found inside ``<pre>`` blocks.
        * ``num_images`` - number of ``img_match`` hits in the body.
    """
    if s is None:
        # A missing body would otherwise make the regex calls raise TypeError.
        s = ""

    num_images = len(img_match.findall(s))

    # Count code lines, and the links that live inside code (source code
    # sometimes contains links, which we don't want to count as real links).
    code_blocks = code_match.findall(s)
    num_code_lines = sum(block.count('\n') for block in code_blocks)
    link_count_in_code = sum(
        len(link_match.findall(block)) for block in code_blocks)

    # A single substitution removes every code block; there is no need to
    # repeat it once per match.
    code_free_s = code_match.sub("", s)

    links = link_match.findall(s)
    link_count = len(links) - link_count_in_code

    link_free_s = re.sub(
        " +", " ", tag_match.sub('', code_free_s)).replace("\n", "")

    # Drop link texts that are bare URLs; they are not real words.
    for link in links:
        if link.lower().startswith("http://"):
            link_free_s = link_free_s.replace(link, '')

    num_text_tokens = link_free_s.count(" ")

    return link_free_s, num_text_tokens, num_code_lines, link_count, num_images

In [None]:
# Stream every parsed post into the TSV file, one tab-separated line each.
with open(filename_filtered, "w") as f:
    for values in parsexml(filename):
        line = "\t".join(str(field) for field in values)
        f.write("%s\n" % line)

# meta is only complete once parsexml() has been exhausted, so it is
# written after the TSV.
with open(filename_filtered_meta, "w") as f:
    json.dump(meta, f)

# Summary of what was seen during the run.
print("years:", years)
print("#qestions: %i" % num_questions)
print("#answers: %i" % num_answers)

Processed 0 <row/> elements
Processed 100000 <row/> elements
Processed 200000 <row/> elements
Processed 300000 <row/> elements
Processed 400000 <row/> elements
Processed 500000 <row/> elements
Processed 600000 <row/> elements
Processed 700000 <row/> elements
Processed 800000 <row/> elements
Processed 900000 <row/> elements
Processed 1000000 <row/> elements
Processed 1100000 <row/> elements
Processed 1200000 <row/> elements
Processed 1300000 <row/> elements
Processed 1400000 <row/> elements
Processed 1500000 <row/> elements
Processed 1600000 <row/> elements
Processed 1700000 <row/> elements
Processed 1800000 <row/> elements
Processed 1900000 <row/> elements
Processed 2000000 <row/> elements
Processed 2100000 <row/> elements
Processed 2200000 <row/> elements
Processed 2300000 <row/> elements
Processed 2400000 <row/> elements
Processed 2500000 <row/> elements
Processed 2600000 <row/> elements
Processed 2700000 <row/> elements
Processed 2800000 <row/> elements
Processed 2900000 <row/> elem