In [38]:
import os
import re
import json
import codecs
import gc
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from collections import Counter
import scipy
from sklearn.preprocessing import normalize, StandardScaler
from sklearn.neighbors import DistanceMetric
from sklearn.metrics.pairwise import pairwise_distances as pdist

from scripts.utils import *
from scripts.readability_functions import *

In [6]:
# Vocabulary size: passed as max_features to the CountVectorizer further down.
k = 2000

# Canon corpus. The raw directory is read as UTF-8 text; the "pos" directory is
# only listed, its file names supply the fbusta ids of the canon books.
dir_in_canon_raw = 'txt_cut'
dir_in_canon_pos = 'parsed_json_cut'

# Ridero snapshot. The raw directory is read as UTF-8 text; the other two
# directories are not referenced by the cells visible in this notebook.
dir_in_ridero_raw = 'snapshot50/ridero_books'
dir_in_ridero_pos = 'snapshot50/ridero_parsed_json'
dir_in_ridero_json = 'snapshot50/ridero_json'

In [9]:
# Catalog of the canon collection; the get_* helpers below receive it together
# with a fbusta id to resolve the book id, author id and book name.
catalog = pd.read_csv('collection_catalog_full_final.csv', sep=';')

# Lower-cased stop 3-grams, one per line. The encoding is spelled out so the
# result does not depend on the platform default (the corpus texts in this
# notebook are read as UTF-8 as well).
with open('stop3grams.txt', 'r', encoding='utf-8') as f:
    stop3grams = {s.lower() for s in f.read().splitlines()}

# Collect per-book metadata for every file in the parsed canon directory.
# Only the file name is needed (its part before the first dot is the fbusta
# id), so the files themselves are not opened.
authors = []
book_names = []
book_fbusta_ids = []
book_ids = []
for filename in sorted(os.listdir(dir_in_canon_pos)):
    fbusta_id = int(filename.split('.')[0])
    book_fbusta_ids.append(fbusta_id)
    book_ids.append(get_book_id(fbusta_id, catalog))
    authors.append(get_author_id(fbusta_id, catalog))
    book_names.append(get_book_name(fbusta_id, catalog))

book_fbusta_ids = np.array(book_fbusta_ids)
book_names = np.array(book_names)
authors = np.array(authors)
book_ids = np.array(book_ids)

# Author ids that occur exactly once, i.e. authors with a single canon book.
groups = pd.Series(authors).value_counts()
singles = np.array(groups[groups == 1].index)
singles

array([ 14,  36,  31,   7, 107])

In [11]:
%%time
data = []
lengths = []
for filename in sorted(os.listdir(dir_in_canon_raw)):
    fbusta_id = int(filename.split('.')[0])
    if get_author_id(fbusta_id, catalog) in singles:
        continue
    with codecs.open(os.path.join(dir_in_canon_raw, filename), encoding='utf-8') as book:
        content = book.read()
        data.append(content)
        lengths.append(len(re.findall(u"(?u)\\b\\w+\\b", content)))

CPU times: user 2.62 s, sys: 85.4 ms, total: 2.7 s
Wall time: 2.79 s


In [14]:
# Canon texts and their word counts, in the order they were read.
# dtype=object stores references to the strings; the default would build a
# fixed-width unicode array in which every book is padded to the longest one.
data_train = np.array(data, dtype=object)
len_train = np.array(lengths)

# `data` leaves out the books of authors listed in `singles`, so the author
# labels are filtered by the same rule to stay aligned with `data_train`.
# NOTE(review): this assumes dir_in_canon_pos and dir_in_canon_raw contain the
# same books in the same sorted order — confirm.
authors_train = authors[~np.isin(authors, singles)]
assert len(authors_train) == len(data_train), (
    'author labels do not line up with the loaded canon texts'
)

# Every loaded book is used for training; nothing is held out.
train_idx = np.arange(len(data_train))
test_idx = []

In [17]:
%%time
cv_mfw = CountVectorizer(max_features=k, token_pattern=u"(?u)\\b\\w+\\b").fit(data_train)
td = pd.DataFrame(cv_mfw.transform(data_train).todense())
td.columns = sorted(cv_mfw.vocabulary_) 

CPU times: user 10.4 s, sys: 69.4 ms, total: 10.4 s
Wall time: 9.87 s


In [18]:
# Relative frequencies: each book's word counts divided by its total word count.
freqs = td.div(len_train, axis=0)

# Standardise every word column; the fitted scaler is kept so that other
# corpora can be projected with the canon statistics.
sclr = StandardScaler()
freqs_sc = sclr.fit(freqs).transform(freqs)

# Pairwise distances between canon books on the standardised frequencies:
# cosine distance and Manhattan distance (the latter is presumably Burrows'
# Delta up to a constant factor — confirm).
cosine_delta, b_delta = (
    pd.DataFrame(pdist(freqs_sc, metric=metric_name))
    for metric_name in ('cosine', 'manhattan')
)

In [59]:
# Ridero catalog: a ';'-separated table whose three columns are renamed to
# filename / author / bookname.
# NOTE(review): read_csv treats the first line as a header by default; if the
# file has no header row, its first record is dropped here — confirm.
ridero_columns = ['filename', 'author', 'bookname']
ridero_catalog = pd.read_csv('ridero_catalog', sep=';')
ridero_catalog.columns = ridero_columns

In [68]:
# Spot check: author recorded for the book whose file name is 'addikt'.
# .item() requires exactly one matching row.
is_addikt = ridero_catalog['filename'] == 'addikt'
ridero_catalog.loc[is_addikt, 'author'].item()

'Алиса Чалис'

In [74]:
# Ridero delta: load every Ridero book together with its catalog metadata.
ridero_data = []
ridero_lengths = []
ridero_authors = []
ridero_filenames = []
ridero_booknames = []

word_re = re.compile(u"(?u)\\b\\w+\\b")

# Index the catalog once instead of scanning it twice per book. File names
# that occur more than once are left out, so a book still needs exactly one
# catalog row, as it did with the earlier per-book `.item()` lookups.
ridero_meta = (
    ridero_catalog
    .drop_duplicates(subset='filename', keep=False)
    .set_index('filename')[['author', 'bookname']]
    .to_dict('index')
)

for filename in sorted(os.listdir(dir_in_ridero_raw)):
    book_path = os.path.join(dir_in_ridero_raw, filename)
    if not os.path.isfile(book_path):
        continue
    # Fail before touching any of the result lists so they never end up with
    # different lengths.
    if filename not in ridero_meta:
        raise ValueError(
            'expected exactly one ridero_catalog row for file %r' % filename
        )
    meta = ridero_meta[filename]
    with codecs.open(book_path, encoding='utf-8') as book:
        content = book.read()
    ridero_filenames.append(filename)
    ridero_authors.append(meta['author'])
    # Replace non-breaking spaces in the title with plain spaces.
    ridero_booknames.append(meta['bookname'].replace('\xa0', ' '))
    ridero_data.append(content)
    # Word count of the book: number of \b\w+\b matches.
    ridero_lengths.append(len(word_re.findall(content)))

In [80]:
# Spot check: surname in the first catalog row whose author_id is 111.
is_author_111 = catalog['author_id'] == 111
catalog.loc[is_author_111, 'author_surname'].iloc[0]

'Рубина'

In [76]:
# Turn the per-book lists into numpy arrays so they can be indexed by position.
# The book texts use dtype=object: the default would build a fixed-width
# unicode array in which every text is padded to the length of the longest one.
ridero_data = np.array(ridero_data, dtype=object)
ridero_lengths = np.array(ridero_lengths)
ridero_authors = np.array(ridero_authors)
ridero_filenames = np.array(ridero_filenames)
ridero_booknames = np.array(ridero_booknames)

In [34]:
# Count the canon vocabulary in the Ridero books with the already fitted
# vectorizer; columns are labelled with the vocabulary terms in sorted order.
ridero_counts = cv_mfw.transform(ridero_data).todense()
rtd = pd.DataFrame(ridero_counts)
rtd.columns = sorted(cv_mfw.vocabulary_)

# Relative frequencies per book, then standardised with the scaler that was
# fitted on the canon frequencies.
freqs_ridero = rtd.div(ridero_lengths, axis=0)
freqs_ridero_sc = sclr.transform(freqs_ridero)

In [82]:
# Cosine distance between every Ridero book (rows) and every canon book
# (columns), computed on the standardised word frequencies.
dists = scipy.spatial.distance.cdist(
    freqs_ridero_sc,
    freqs_sc,
    metric='cosine',
)

In [93]:
# For every Ridero book, list the canon books with the smallest cosine distance.
top_n = 5

# Columns of `dists` follow the rows of `freqs_sc`, i.e. only the canon books
# whose author is not in `singles`. The metadata arrays cover every canon book,
# so they are filtered by the same rule before being indexed by column number;
# indexing the unfiltered arrays would report the wrong book.
# NOTE(review): this assumes dir_in_canon_pos and dir_in_canon_raw contain the
# same books in the same sorted order — confirm.
canon_kept = ~np.isin(authors, singles)
canon_authors = authors[canon_kept]
canon_book_names = book_names[canon_kept]
canon_fbusta_ids = book_fbusta_ids[canon_kept]
assert len(canon_authors) == dists.shape[1], (
    'canon metadata does not line up with the columns of dists'
)

# Surname from the first catalog row of each author id, looked up once
# instead of rescanning the catalog for every hit.
surname_by_author = (
    catalog
    .drop_duplicates(subset='author_id')
    .set_index('author_id')['author_surname']
)

res = []
for i, row in enumerate(dists):
    # Column indices of the top_n nearest canon books for Ridero book i.
    for j in row.argsort()[:top_n]:
        res.append((
            ridero_filenames[i],
            ridero_booknames[i],
            ridero_authors[i],
            surname_by_author.loc[canon_authors[j]],
            canon_book_names[j],
            canon_fbusta_ids[j],
            row[j],
        ))

In [94]:
# Write the nearest-neighbour rows to a ';'-separated file without the index.
# The frame is built without column names, so the header is the default
# integer labels.
top5_table = pd.DataFrame(res)
top5_table.to_csv('snapshot50/cos_delta_top5.csv', sep=';', index=False)

In [51]:
# Column indices of the five smallest distances in the first row of `dists`.
closest = np.argsort(dists[0])[:5]

In [75]:
# Titles at the positions stored in `closest`.
# NOTE(review): with `dists` computed between Ridero rows and canon columns,
# `closest` holds canon column indices, yet it is used here to index the
# Ridero titles — confirm this is the intended lookup.
ridero_titles = np.array(ridero_booknames)
ridero_titles[closest]

array(['30 вещей, которые каждая девушка должна успеть сделать до 30 лет',
       'Боги Олимпа', 'Черникина и другие', '69 +/– 1 = Ad hoc',
       'Амур де труа'], dtype='<U64')

In [None]:
# Edge list with one (ridero label, canon label, cosine similarity,
# 'undirected') tuple for every Ridero/canon pair.
# NOTE(review): `ridero_names` and `whole` are not defined in the cells visible
# here — confirm where they come from before running this cell.
#
# Cosine similarity is 1 - cosine distance. `dists` already holds the cosine
# distance between each row of freqs_ridero_sc and each row of freqs_sc, so it
# is reused instead of recomputing every pair in a Python loop.
dop = [
    (ridero_names[i], whole[j], 1 - dists[i, j], 'undirected')
    for i in range(dists.shape[0])
    for j in range(dists.shape[1])
]