In [48]:
import os
import re
import pandas as pd
import dateutil
import logging
import hashlib
import string
import datetime
import time
import json

import bs4 
import dateutil.parser

In [49]:
DATA_DIR='/Users/altaf/Datasets/dnmarchives/agora-forums'
OUTPUT_DIR = "/Users/altaf/Projects/dnm_evolution/data/agora/posts"

MAX_TOPICS = 30000
TRACE_TOPICS = 1000

In [50]:
FORMAT = '%(asctime)-15s %(levelname)-6s %(message)s'
DATE_FORMAT = '%b %d %H:%M:%S'
formatter = logging.Formatter(fmt=FORMAT, datefmt=DATE_FORMAT)
handler = logging.StreamHandler()
handler.setFormatter(formatter)
logger = logging.getLogger(__name__)
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)

In [51]:
def str_clean(s):
    return(s.encode('ascii',errors='ignore').strip())

In [52]:
class Topic(object):
    def __init__(self):
        self.pages = list()

    def write_post(self, post_wrapper, topic_id, page_id, post_id, title, dest):
        post_tag = format("%d_%d_%d" % (topic_id, page_id, post_id))
        filename = os.path.join(dest, format("post_%s.txt" % post_tag))
        post = post_wrapper.select("div.postarea div.post div.inner")
        if not post:
            return
        
        poster = post_wrapper.select("div.poster h4 a")
        if not poster:
            return
        
        text = str_clean(post[0].get_text())
        author = str_clean(poster[0].get_text())
        
        if (len(text) < 100 or len(author) < 1):
            return
        with open(filename, 'w') as outfile:
            outfile.write(title + ". " + text)
        authors[post_tag] = author

    def add_page(self, filename, topic_id, page_id):    
        if page_id in self.pages:
            return
        
        soup = bs4.BeautifulSoup(open(filename), 'html.parser')
        forumposts = soup.find('div', id='forumposts')
        
        if not forumposts:
            return
        
        title = str_clean(forumposts.select("div h3")[0].contents[-1])
        match = re.match(r'^Topic\:(.*?)\(Read \d+ times\)$', title)
        
        if not match:
            return
        
        title = str_clean(match.group(1))
        
        post_id = 0
        for post_wrapper in forumposts.find_all('div', class_='post_wrapper'):
            self.write_post(post_wrapper, topic_id, page_id, post_id, title, OUTPUT_DIR)
            post_id += 1
                
        self.pages.append(page_id)           

In [53]:
class App(object):
    
    def __init__(self):
        self.topics = dict()

    def dbg_trace(self):
        num_topics = len(self.topics)

        if num_topics and not (num_topics % TRACE_TOPICS):
            logger.debug("Topics %d" % num_topics)

    def run(self):
        folders = os.listdir(DATA_DIR)
        folder_dates = sorted([dateutil.parser.parse(f) for f in folders if re.match(r'^\d{4}-\d{2}-\d{2}$', f)], reverse=True)
        
        for folder_date in folder_dates:
            if len(self.topics) >= MAX_TOPICS:
                break;
                
            folder = "%4s-%02d-%02d" % (folder_date.year, folder_date.month, folder_date.day)

            index_php = os.path.join(DATA_DIR, folder, 'index.php')
            if not os.path.exists(index_php):
                continue

            for filename in os.listdir(index_php):
                self.dbg_trace()
                
                if len(self.topics) >= MAX_TOPICS:
                    break;
                    
                match = re.match(r'^topic\,(\d+)\.(\d+)\.html$', filename)
                if not match:
                    continue

                topic_id = int(match.group(1))
                page_id = int(match.group(2))

                if topic_id in self.topics:
                    topic = self.topics[topic_id]
                else:
                    topic = Topic()
                    self.topics[topic_id] = topic
                
                topic.add_page(os.path.join(index_php, filename), topic_id, page_id)            

In [54]:
authors = dict()
app = App()

In [55]:
if __name__=='__main__':
    print("START: %s" % time.ctime())
    %time app.run() 
    authors_filename = os.path.join(OUTPUT_DIR, "authors.json")
    with open(authors_filename, 'w') as outfile:
        json.dump(authors, outfile)
    print("FINISH: %s" % time.ctime())

START: Fri Apr  8 00:37:24 2016


Apr 08 00:39:34 DEBUG  Topics 1000
Apr 08 00:39:34 DEBUG  Topics 1000
Apr 08 00:41:24 DEBUG  Topics 2000
Apr 08 00:41:24 DEBUG  Topics 2000
Apr 08 00:42:18 DEBUG  Topics 3000
Apr 08 00:42:18 DEBUG  Topics 3000
Apr 08 00:43:25 DEBUG  Topics 4000
Apr 08 00:43:25 DEBUG  Topics 4000
Apr 08 00:43:25 DEBUG  Topics 4000
Apr 08 00:43:25 DEBUG  Topics 4000
Apr 08 00:45:32 DEBUG  Topics 5000
Apr 08 00:45:32 DEBUG  Topics 5000
Apr 08 00:45:32 DEBUG  Topics 5000
Apr 08 00:45:32 DEBUG  Topics 5000
Apr 08 00:46:56 DEBUG  Topics 6000
Apr 08 00:46:56 DEBUG  Topics 6000
Apr 08 00:50:27 DEBUG  Topics 7000
Apr 08 00:50:27 DEBUG  Topics 7000
Apr 08 00:50:27 DEBUG  Topics 7000
Apr 08 00:50:27 DEBUG  Topics 7000
Apr 08 00:50:27 DEBUG  Topics 7000
Apr 08 00:50:27 DEBUG  Topics 7000
Apr 08 00:50:27 DEBUG  Topics 7000
Apr 08 00:50:27 DEBUG  Topics 7000
Apr 08 00:50:27 DEBUG  Topics 7000
Apr 08 00:50:27 DEBUG  Topics 7000
Apr 08 00:51:28 DEBUG  Topics 8000
Apr 08 00:51:28 DEBUG  Topics 8000
Apr 08 00:51:28 DEBU

KeyboardInterrupt: 

In [None]:
for topic_id, topic in app.topics.iteritems():
    print(topic.title)
    print(topic.pages)
    print(topic.posters)

In [5]:
filename = '/Users/altaf/Datasets/dnmarchives/agora-forums/2015-07-06/index.php/topic,100166.0.html'
soup = bs4.BeautifulSoup(open(filename), 'html.parser', from_encoding='utf-8')
forumposts = soup.find('div', id='forumposts')
for post_wrapper in forumposts.find_all('div', class_='post_wrapper'):
    poster = post_wrapper.find('div', class_="poster")

#post = div.find('div', class_="post_wrapper")
#poster = post.find('div', class_="poster")

In [None]:
post_wrapper = forumposts.find_all('div', class_='post_wrapper')
p = post_wrapper[0].select("div.postarea div.post div.inner")
str_clean(p[0].get_text())

In [45]:
app_10000 = app
authors_10000 = authors

len(authors)

132473

In [56]:
authors_filename = os.path.join(OUTPUT_DIR, "authors.json")
with open(authors_filename, 'w') as outfile:
    json.dump(authors, outfile)
