In [1]:
import os
import re
import pandas as pd
import dateutil
import logging
import hashlib
import string
import datetime
import time

import bs4 
import dateutil.parser

In [2]:
DATA_DIR='/Users/altaf/Datasets/dnmarchives/agora-forums'
MAX_TOPICS = 40000
TRACE_TOPICS = 500

In [3]:
FORMAT = '%(asctime)-15s %(levelname)-6s %(message)s'
DATE_FORMAT = '%b %d %H:%M:%S'
formatter = logging.Formatter(fmt=FORMAT, datefmt=DATE_FORMAT)
handler = logging.StreamHandler()
handler.setFormatter(formatter)
logger = logging.getLogger(__name__)
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)

In [7]:
def str_clean(s):
    return(s.encode('ascii',errors='ignore').strip())
    
def file_checksum(path, block_size=32*128, hex=False):
    '''
    Block size directly depends on the block size of your filesystem
    to avoid performances issues
    Here I have blocks of 4096 octets (Default NTFS)
    '''
    md5 = hashlib.md5()
    with open(path,'rb') as f:
        for chunk in iter(lambda: f.read(block_size), b''):
             md5.update(chunk)
    if hex:
        return md5.hexdigest()
    return md5.digest()

In [8]:
class Topic(object):
    def __init__(self):
        self.title = None
        self.pages = list()
        self.posters = dict()

    def add_page(self, filename, page_id):    
        if page_id in self.pages:
            return
        
        soup = bs4.BeautifulSoup(open(filename), 'html.parser')
        forumposts = soup.find('div', id='forumposts')
        
        if not forumposts:
            return
        
        title = str_clean(forumposts.select("div h3")[0].contents[-1])
        match = re.match(r'^Topic\:(.*?)\(Read \d+ times\)$', title)
        
        if not match:
            return
        
        self.title = str_clean(match.group(1))
        
        for posts in forumposts.find_all('div', class_='post_wrapper'):
            poster = posts.find('div', class_="poster")
            h4 = poster.find('h4')
            poster_tag = h4.find('a')
            if not poster_tag:
                poster_tag = h4
               
            poster_id = str_clean(poster_tag.get_text())
            if not poster_id in self.posters:
                self.posters[poster_id] = poster_id
                
        self.pages.append(page_id)           

In [9]:
class App(object):
    
    def __init__(self):
        self.topics = dict()

    def dbg_trace(self):
        num_topics = len(self.topics)

        if num_topics and not (num_topics % TRACE_TOPICS):
            logger.debug("Topics %d" % num_topics)
        
    def run(self):
        folders = os.listdir(DATA_DIR)
        folder_dates = sorted([dateutil.parser.parse(f) for f in folders if re.match(r'^\d{4}-\d{2}-\d{2}$', f)], reverse=True)
        
        for folder_date in folder_dates:
            if len(self.topics) >= MAX_TOPICS:
                break;
                
            folder = "%4s-%02d-%02d" % (folder_date.year, folder_date.month, folder_date.day)
            
            #if not re.match(r'^\d{4}-\d{2}-\d{2}$', folder):
            #    continue

            index_php = os.path.join(DATA_DIR, folder, 'index.php')
            if not os.path.exists(index_php):
                continue

            for filename in os.listdir(index_php):
                self.dbg_trace()
                
                if len(self.topics) >= MAX_TOPICS:
                    break;
                    
                match = re.match(r'^topic\,(\d+)\.(\d+)\.html$', filename)
                if not match:
                    continue

                topic_id = int(match.group(1))
                page_id = int(match.group(2))

                if topic_id in self.topics:
                    topic = self.topics[topic_id]
                else:
                    topic = Topic()
                    self.topics[topic_id] = topic
                
                topic.add_page(os.path.join(index_php, filename), page_id)            

In [10]:
if __name__=='__main__':
    logger.debug("START")
    app = App()
    %time app.run()    
    logger.debug("FINISH")

Apr 07 03:20:25 DEBUG  START
Apr 07 03:21:26 DEBUG  Topics 500
Apr 07 03:22:12 DEBUG  Topics 1000
Apr 07 03:23:01 DEBUG  Topics 1500
Apr 07 03:23:41 DEBUG  Topics 2000
Apr 07 03:24:10 DEBUG  Topics 2500
Apr 07 03:24:24 DEBUG  Topics 3000
Apr 07 03:24:46 DEBUG  Topics 3500
Apr 07 03:25:17 DEBUG  Topics 4000
Apr 07 03:25:17 DEBUG  Topics 4000
Apr 07 03:25:48 DEBUG  Topics 4500
Apr 07 03:25:48 DEBUG  Topics 4500
Apr 07 03:25:48 DEBUG  Topics 4500
Apr 07 03:25:48 DEBUG  Topics 4500
Apr 07 03:25:48 DEBUG  Topics 4500
Apr 07 03:27:04 DEBUG  Topics 5000
Apr 07 03:27:04 DEBUG  Topics 5000
Apr 07 03:27:44 DEBUG  Topics 5500
Apr 07 03:28:19 DEBUG  Topics 6000
Apr 07 03:30:38 DEBUG  Topics 6500
Apr 07 03:30:38 DEBUG  Topics 6500
Apr 07 03:30:38 DEBUG  Topics 6500
Apr 07 03:30:38 DEBUG  Topics 6500
Apr 07 03:30:38 DEBUG  Topics 6500
Apr 07 03:30:38 DEBUG  Topics 6500
Apr 07 03:30:38 DEBUG  Topics 6500
Apr 07 03:30:38 DEBUG  Topics 6500
Apr 07 03:30:38 DEBUG  Topics 6500
Apr 07 03:30:38 DEBUG  Topi

CPU times: user 4h 7min 31s, sys: 2min 1s, total: 4h 9min 33s
Wall time: 4h 11min 18s


In [None]:
for topic_id, topic in app.topics.iteritems():
    print(topic.title)
    print(topic.pages)
    print(topic.posters)

In [254]:
filename = '/Users/altaf/Datasets/dnmarchives/agora-forums/2015-07-06/index.php/topic,100166.0.html'
soup = bs4.BeautifulSoup(open(filename), 'html.parser', from_encoding='utf-8')
forumposts = soup.find('div', id='forumposts')
for posts in div.find_all('div', class_='post_wrapper'):
    poster = posts.find('div', class_="poster")

post = div.find('div', class_="post_wrapper")
poster = post.find('div', class_="poster")

In [12]:
topic_ids = app.topics.keys()

In [43]:
import json

OUTPUT_DIR = "/Users/altaf/Projects/dnm_evolution/data/agora/topics"

block_size = 10000
topic_id_start = 0
topic_id_end = topic_id_start + block_size

for topic_id in topic_ids: # [topic_id_start:topic_id_end]:
    topic = app.topics[topic_id]
    data = {
        "title": topic.title,
        "pages": topic.pages,
        "posters": topic.posters.keys()
    }
    filename = os.path.join(OUTPUT_DIR, "topic_%d.json" % topic_id)
    with open(filename, 'w') as outfile:
        json.dump(data, outfile)