In [1]:
import sys
if '..' not in sys.path:
    sys.path.insert(0, '..')
if '../../personalized-rinna/' not in sys.path:
    sys.path.insert(0, '../../personalized-rinna/')

import logging
import json
import re

import MeCab
import pandas as pd
from gensim import corpora
from tqdm import tqdm_notebook as tqdm

from persona.data.preprocessing.normalizing import normalize_neologd

In [2]:
%%time
with open('../jiren2_2018-09-14.txt') as f:
    data = [line.split('\t') for line in f]

CPU times: user 40.7 s, sys: 5.1 s, total: 45.8 s
Wall time: 45.7 s


In [3]:
mt = MeCab.Tagger(r'-Owakati -d /usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd')
# Prevent utf-8 codec error
mt.parse('')

def tokenize(text):
    # remove white spaces
    regex = re.compile(r"\s+")
    text = regex.sub("", text)
    text = normalize_neologd(text)
    # remove trailing new line
    text = mt.parse(text).strip()
    return text

In [4]:
user_desc = pd.read_csv('../../twitter-user-selection/selected_user_id_desc_noun_score.csv', sep='\t')

In [5]:
user_desc = user_desc.set_index('id')
id_to_desc = user_desc['description']

In [6]:
def data_transform(line):
    instance = {}
    user_id, context, query, response = line
    # '<context_end>' indicate the boundary between context and query
    # remove possible leading white space
    instance['title'] = ' '.join([tokenize(context), '<context_end>', tokenize(query)]).strip()
    desc = id_to_desc[int(line[0])]
    instance['content'] = tokenize(desc)
    # add dummy scores/votes to be compatible with existing code
    instance['comment'] = [[tokenize(response), 0]]
    # use utf8
    return json.dumps(instance, ensure_ascii=False)

In [7]:
with open('../data/twitter_train.data', 'w') as f:
    for line in tqdm(data):
        f.write(data_transform(line) + '\n')

HBox(children=(IntProgress(value=0, max=19143269), HTML(value='')))




In [8]:
def sentence_iterator(path):
    with open(path) as f:
        for line in f:
            instance = json.loads(line)
            yield instance['title'].split(' ')
            yield instance['comment'][0][0].split(' ')

In [9]:
logging.basicConfig(format='(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

In [10]:
dictionary = corpora.Dictionary(sentence_iterator('../data/twitter_train.data'),
                                prune_at=1e7)

(asctime)s : INFO : adding document #20370000 to Dictionary(1037514 unique tokens: ['!', '<context_end>', 'です', 'どー', 'も']...)
(asctime)s : INFO : adding document #20380000 to Dictionary(1037764 unique tokens: ['!', '<context_end>', 'です', 'どー', 'も']...)
(asctime)s : INFO : adding document #20390000 to Dictionary(1038049 unique tokens: ['!', '<context_end>', 'です', 'どー', 'も']...)
(asctime)s : INFO : adding document #20400000 to Dictionary(1038390 unique tokens: ['!', '<context_end>', 'です', 'どー', 'も']...)
(asctime)s : INFO : adding document #20410000 to Dictionary(1038674 unique tokens: ['!', '<context_end>', 'です', 'どー', 'も']...)
(asctime)s : INFO : adding document #20420000 to Dictionary(1038985 unique tokens: ['!', '<context_end>', 'です', 'どー', 'も']...)
(asctime)s : INFO : adding document #20430000 to Dictionary(1039266 unique tokens: ['!', '<context_end>', 'です', 'どー', 'も']...)
(asctime)s : INFO : adding document #20440000 to Dictionary(1039572 unique tokens: ['!', '<context_end>', 'です',

In [14]:
sorted_id_freq = sorted(dictionary.dfs.items(), key=lambda x: x[1], reverse=True)

In [None]:
#dictionary.filter_extremes(no_below=0, no_above=1, keep_n=30_000)

In [16]:
with open('../data/twitter_vocab.txt', 'w') as f:
    for id, freq in sorted_id_freq:
        word = dictionary[id]
        f.write(f'{word}\t{freq}\n')

In [15]:
sorted_id_freq

[(1, 19143269),
 (45, 8448911),
 (15, 7828239),
 (128, 7691705),
 (32, 7401678),
 (0, 6696600),
 (14, 6451347),
 (88, 6218300),
 (114, 5742384),
 (13, 5442680),
 (97, 5393681),
 (83, 5233932),
 (44, 5227078),
 (138, 5209517),
 (236, 4807844),
 (56, 4794684),
 (46, 4759169),
 (20, 4710430),
 (48, 4706197),
 (80, 4546421),
 (94, 4516313),
 (2, 4280713),
 (201, 4004613),
 (4, 3834488),
 (7, 3763419),
 (136, 3481151),
 (9, 3462962),
 (315, 3215924),
 (38, 2931019),
 (67, 2904989),
 (25, 2638113),
 (96, 2166344),
 (122, 2124609),
 (290, 2047271),
 (171, 1952244),
 (109, 1869845),
 (158, 1829532),
 (166, 1797569),
 (106, 1625800),
 (435, 1578355),
 (160, 1570191),
 (231, 1566557),
 (412, 1510902),
 (473, 1422156),
 (125, 1367020),
 (47, 1317128),
 (277, 1260820),
 (104, 1161820),
 (143, 1111883),
 (144, 1110374),
 (297, 1105878),
 (140, 1096688),
 (188, 1055813),
 (216, 1019199),
 (81, 1015967),
 (452, 1001252),
 (427, 981442),
 (105, 978132),
 (54, 972796),
 (5, 922698),
 (244, 903135),
 (2