In [15]:
import urllib.request
from itertools import chain
from json import JSONDecodeError, load
from os import path, makedirs
from re import compile
from sys import argv

from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
from pandas import DataFrame, concat, read_csv
from requests import get
from requests.exceptions import ConnectionError, ReadTimeout
from transliterate import translit

In [2]:
# Base URL of the 2ch imageboard; code in this file appends board names and
# resource paths to it, and also fetches it directly to read the board list.
DVACH = 'https://2ch.hk/'


def cut(data):
    """Normalise punctuation spacing in a comment and strip markup from it.

    The text first goes through ``punctuate_sent`` and then ``punctuate_word``.
    Afterwards every match of the markup pattern is deleted: ``<...>`` tags,
    ``>>`` followed by digits, the literal ``(OP)``, numeric character
    references such as ``&#47;``, ``&quot;``, ``&gt;``, and everything from
    ``http:``/``https:`` up to the end of the line.

    Args:
        data: Raw comment text.

    Returns:
        str: The cleaned comment text.
    """
    spaced = punctuate_word(punctuate_sent(data))
    markup = compile(r'<.*?>|>>\d*|\(OP\)|&#(\d*);|&quot;|&gt;|(http|https):.*')
    return markup.sub('', spaced)


def punctuate_sent(data):
    """Insert a space after sentence-ending punctuation that follows a letter.

    A ``.``, ``!`` or ``?`` that comes directly after a Latin or Cyrillic
    letter is kept as-is and followed by a single space, so that glued
    sentences such as ``'Привет!Как'`` become ``'Привет! Как'``.

    Args:
        data: Text to normalise.

    Returns:
        str: ``data`` with a space added after each matched punctuation mark.
    """
    # 'ё'/'Ё' lie outside the а-я / А-Я code point ranges, so list them explicitly.
    r = compile(r'([a-zA-Zа-яА-ЯёЁ])([.!?])')
    # Re-emit the captured mark (\2) instead of always writing a period.
    return r.sub(r'\1\2 ', data)


def punctuate_word(data):
    """Insert a space after every comma that directly follows a letter.

    Args:
        data: Text to normalise.

    Returns:
        str: ``data`` where each ``<letter>,`` is rewritten as ``<letter>, ``.
    """
    # 'ё'/'Ё' lie outside the а-я / А-Я code point ranges, so list them explicitly.
    r = compile(r'([a-zA-Zа-яА-ЯёЁ])([,])')
    return r.sub(r'\1, ', data)


def load_threads(board='b'):
    """Fetch the thread numbers listed in a board's catalog.

    Args:
        board: Board identifier inserted into the catalog URL (default ``'b'``).

    Returns:
        list: The ``num`` value of every entry under ``threads`` in the
        catalog JSON. An empty list is returned (after printing a message)
        when the catalog cannot be downloaded or parsed, so callers can
        slice or iterate the result unconditionally.
    """
    try:
        # The timeout keeps an unresponsive server from hanging the notebook.
        dvach_page = get('https://2ch.hk/{}/catalog.json'.format(board), timeout=5).json()
        return [i['num'] for i in dvach_page['threads']]
    except Exception:
        # Best-effort like the original, but KeyboardInterrupt/SystemExit now
        # propagate instead of being swallowed by a bare ``except``.
        print('Нет такой доски.')
        return []


def load_comments(threads, board='b'):
    """Download the given threads and split their comments into sentences.

    Each thread is fetched as JSON, every post's ``comment`` is cleaned with
    ``cut``, and cleaned comments longer than two characters are split with
    ``sent_tokenize``. A thread that fails to download or parse is reported
    and skipped; sentences collected before the failure are kept.

    Args:
        threads: Iterable of thread numbers (strings or integers).
        board: Board identifier used in the thread URL (default ``'b'``).

    Returns:
        tuple: ``(sentences, board)`` where ``sentences`` is a flat list of
        sentence strings in download order.
    """
    print('Загружаем комментарии с ' + board)
    sentences = []

    for every_thread in threads:
        try:
            # format() accepts integer thread numbers too; '+' concatenation
            # raised TypeError for them, which was then silently swallowed.
            url = '{}{}/res/{}.json'.format(DVACH, board, every_thread)
            thread = get(url, timeout=5).json()
            for post in thread['threads'][0]['posts']:
                text = cut(post['comment'])  # clean once, reuse for filter and split
                if len(text) > 2:
                    sentences.extend(sent_tokenize(text))
        except Exception as error:
            # Stay best-effort, but surface the reason instead of hiding it and
            # let KeyboardInterrupt stop the loop.
            print('Не удалось обработать тред {}: {}'.format(every_thread, error))
    print('Данные загружены, переходим к сериализации')
    return sentences, board


def serialize_comments(comments, board='b'):
    """Write comments to ``../pickle/<board>.csv``, merging with an existing file.

    If a CSV for the board already exists, its rows are appended to the new
    ones, duplicate rows are dropped, and the rows are renumbered so that
    ``comment_id`` stays unique. Otherwise a new file is created. The target
    directory is created when missing.

    Args:
        comments: Sequence of comment strings stored in the ``comment`` column.
        board: Board identifier used as the file name (default ``'b'``).

    Returns:
        None
    """
    subdir = '../pickle'
    file_path = path.join(subdir, board + '.csv')

    df = DataFrame()
    df['comment'] = comments
    try:
        # DataFrame.from_csv was removed in pandas 1.0; read_csv with
        # index_col=0 reads the file back with its first column as the index.
        old_df = read_csv(file_path, index_col=0)
    except FileNotFoundError:
        print('Сериализованных данных доски ' + board + ' не найдено, сделан новый файл.')
    else:
        # Both frames are numbered from 0, so renumber after de-duplication
        # to avoid repeated comment_id values in the output.
        df = concat((df, old_df), ignore_index=True).drop_duplicates().reset_index(drop=True)
        print('Данные смерджены с уже существующими.')
    df.index.names = ['comment_id']
    makedirs(subdir, exist_ok=True)
    df.to_csv(file_path)
    print('Сериализованы данные доски ' + board)

In [3]:
# Download the 2ch front page; the context manager closes the connection even
# when reading or decoding fails.
with urllib.request.urlopen(DVACH) as fp:
    mybytes = fp.read()
    mystr = mybytes.decode('utf8')

soup = BeautifulSoup(mystr, 'lxml')

# Collect one entry per <option> of the first <optgroup> inside the
# 'board-list-mob' div. [1:-1] drops the first and last character of the
# option text -- presumably the slashes around a board name; TODO confirm.
boardlist = [
    str(option.text)[1:-1]
    for option in soup.find('div', 'board-list-mob').optgroup.findChildren('option')
]

In [4]:
def get_data(name, posts=None):
    """Collect one field from every post of a thread.

    Args:
        name: Post key to extract. ``'comment'`` values are cleaned with
            ``cut``; ``'files'`` values are replaced by their length; any
            other key is returned unchanged.
        posts: Optional list of post dicts. When omitted, the posts of the
            module-level ``thread`` variable are used, which is what existing
            single-argument calls rely on.

    Returns:
        list: One value per post, in post order.
    """
    if posts is None:
        # Backward-compatible default: read the thread currently held in the
        # global ``thread`` variable.
        posts = thread['threads'][0]['posts']
    if name == 'comment':
        return [cut(post[name]) for post in posts]
    if name == 'files':
        # ``or ()`` counts a null ``files`` value as zero attachments instead
        # of raising TypeError from len(None).
        return [len(post[name] or ()) for post in posts]
    return [post[name] for post in posts]

In [5]:
def make_name(board, subject):
    """Build the CSV file name for a thread from its board and subject.

    ``|`` and ``/`` in the subject become ``.``, the text is lower-cased,
    spaces become ``_``, and the result is passed through
    ``translit(..., 'ru', reversed=True)`` (presumably Cyrillic to Latin).

    Args:
        board: Board identifier used as the file name prefix.
        subject: Thread subject text.

    Returns:
        str: ``'<board>_<converted subject>.csv'``.
    """
    for separator in ('|', '/'):
        subject = subject.replace(separator, '.')
    slug = subject.lower().replace(' ', '_')
    latin_slug = translit(slug, 'ru', reversed=True)
    return '{}_{}.csv'.format(board, latin_slug)

In [6]:
threads = []

# Upper bound on how many catalog threads are downloaded per board.
THREADS_PER_BOARD = 30
# Post fields stored per thread, in output column order.
POST_FIELDS = ('comment', 'date', 'email', 'num', 'op', 'subject', 'parent', 'files')

for board in boardlist:
    # Guard against a None result from load_threads so one unavailable board
    # does not abort the whole crawl with a TypeError on the slice.
    for every_thread in (load_threads(board) or [])[:THREADS_PER_BOARD]:
        try:
            # get_data() reads the global ``thread``, so the name must stay.
            thread = get('https://2ch.hk/{0}/res/{1}.json'.format(board, every_thread), timeout=5).json()
            thread_subject = thread['threads'][0]['posts'][0]['subject']
        except (ConnectionError, ReadTimeout, ValueError, KeyError, IndexError):
            # ValueError covers every JSON decode error flavour (json and
            # simplejson); KeyError/IndexError cover a payload with no subject.
            continue
        # Keep only subjects that contain '#' and no '&#' sequence.
        if '#' not in thread_subject or '&#' in thread_subject:
            continue
        record = {'board': board, 'thread_subject': thread_subject}
        record.update((field, get_data(field)) for field in POST_FIELDS)
        threads.append(record)

In [7]:
# Write every collected thread to its own CSV file. The output directory is
# created first because to_csv does not create missing parent directories.
makedirs('2ch-dataset', exist_ok=True)
for i in threads:
    # The file name is built from the board and the subject of the first post.
    DataFrame.from_dict(i).to_csv(path.join('2ch-dataset', make_name(i['board'], i['subject'][0])))

In [20]:
# Merge two exported threads into one frame, dropping duplicate rows and
# renumbering the index. DataFrame.from_csv was removed in pandas 1.0;
# read_csv with index_col=0 reads the first column back as the index.
k = concat([
    read_csv('2ch-dataset/fiz_convict_conditioning_#17.csv', index_col=0),
    read_csv('2ch-dataset/fiz_convict_conditioning_#18.csv', index_col=0),
]).drop_duplicates().reset_index(drop=True)