In [117]:
import csv
import codecs
import fuzzywuzzy
import re
from itertools import groupby
from fuzzywuzzy import process, fuzz
from collections import namedtuple
import requests

In [2]:
def read_csv(file):
    with codecs.open(file, encoding='utf-8') as f:
        data = list(csv.reader(f, delimiter=';', quotechar='"'))[1:]
        return data

In [17]:
catalog = read_csv('catalog.txt')
len(catalog)

453621

In [91]:
liter = read_csv('liter.csv')
len(liter)

543

In [97]:
curr_author = None
for l in liter:
    if l[0]:
        curr_author = l[0]
    else:
        l[0] = curr_author

In [105]:
def get_surname(s):
    return re.split(',? ', s)[0]

In [106]:
author_set = set(get_surname(t[0]) for t in liter)
catalog_with_authors = [x for x in catalog if x[0] in author_set]
catalog_by_author_surname = dict((surname, list(entries)) for surname, entries in groupby(catalog_with_authors, lambda x: x[0]))
len(catalog_by_author_surname['Замятин'])

85

In [111]:
def match_catalog_entry(liter_entry):
    surname = get_surname(liter_entry[0])
    book = liter_entry[3]
    cat_by_author = catalog_by_author_surname[surname]
    cat_book_names = [x[3] for x in cat_by_author]
    best_book, score = process.extractOne(book, cat_book_names, scorer=fuzz.token_sort_ratio)
    best_cat_entry = next(x for x in cat_by_author if x[3] == best_book)
    return (score, best_cat_entry, liter_entry)

matched_with_cat = list(map(match_catalog_entry, liter))

In [119]:
well_matched = [x for x in matched_with_cat if x[0] > 90]
len(well_matched)

446

In [139]:
ids = [cat[-1] for score, cat, lit in well_matched]
urls = ['http://flibusta.is/b/%s/fb2' % id for id in ids]
urls

['http://flibusta.is/b/388207/fb2',
 'http://flibusta.is/b/425691/fb2',
 'http://flibusta.is/b/168343/fb2',
 'http://flibusta.is/b/225045/fb2',
 'http://flibusta.is/b/329591/fb2',
 'http://flibusta.is/b/235106/fb2',
 'http://flibusta.is/b/10547/fb2',
 'http://flibusta.is/b/295292/fb2',
 'http://flibusta.is/b/169265/fb2',
 'http://flibusta.is/b/348954/fb2',
 'http://flibusta.is/b/77039/fb2',
 'http://flibusta.is/b/253788/fb2',
 'http://flibusta.is/b/395149/fb2',
 'http://flibusta.is/b/102645/fb2',
 'http://flibusta.is/b/96412/fb2',
 'http://flibusta.is/b/137565/fb2',
 'http://flibusta.is/b/74285/fb2',
 'http://flibusta.is/b/173116/fb2',
 'http://flibusta.is/b/395621/fb2',
 'http://flibusta.is/b/74105/fb2',
 'http://flibusta.is/b/69892/fb2',
 'http://flibusta.is/b/150978/fb2',
 'http://flibusta.is/b/97832/fb2',
 'http://flibusta.is/b/78600/fb2',
 'http://flibusta.is/b/401178/fb2',
 'http://flibusta.is/b/461562/fb2',
 'http://flibusta.is/b/388665/fb2',
 'http://flibusta.is/b/72596/fb2',
 

In [145]:
def download_file(url):
    r = requests.get(url, stream=True, proxies = {
      "http": "45.76.95.243:3128"
    })
    local_filename = re.match('.*filename="(.+)".*', r.headers['Content-Disposition'])[1]
    print(local_filename)
    with open('fb2/' + local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024*1024):
            if chunk: # filter out keep-alive new chunks
                f.write(chunk)
                #f.flush()
    return local_filename

for url in urls:
    try:
        download_file(url)
    except Exception as e:
        print(e)

'content-disposition'
'content-disposition'
Bulychev_Perpendikulyarnyy-mir.g38QIA.408502.fb2.zip
'content-disposition'
Vayner_Era-miloserdiya.tsDo2g.324990.fb2.zip
Vayner_Gorod-prinyal.3ofHjw.200525.fb2.zip
Vayner_Dilogiya_1_Petlya-i-kamen-v-zelenoy-trave.Qj7Bsw.201842.fb2.zip
Granin_Iskateli.bhevgw.498584.fb2.zip
Granin_Idu-na-grozu.rC7RHg.499588.fb2.zip
Granin_Kartina.pp67Hg.20477.fb2.zip
Granin_Eshche-zameten-sled.qk5PRw.149025.fb2.zip
Granin_Nash-dorogoy-Roman-Avdeevich.mi15NA.490942.fb2.zip
Iskander_Sozvezdie-Kozlotura.LfWWBg.156514.fb2.zip
Iskander_Sandro-iz-Chegema.1nMzkQ.273618.fb2.zip
Iskander_Morskoy-skorpion.UiuNBg.153215.fb2.zip
Iskander_Kroliki-i-udavy.zk4xWA.70498.fb2.zip
Iskander_Trinadcatyy-podvig-Gerakla.1mM4qw.388896.fb2.zip
Kabakov_Nevozvrashchenec.onH8QA.24693.fb2.zip
Kabakov_Posledniy-geroy.NUzPXw.24695.fb2.zip
Kabakov_Sochinitel.t8Swbg.24698.fb2.zip
Kaverin_Dva-kapitana.qh2Zmg.68345.fb2.zip
Kaverin_Otkrytaya-kniga.eUgdOw.68344.fb2.zip
Kaverin_Sem-par-nechistyh.AOp

Uspenskiy_Krasnaya-ruka-chernaya-prostynya-zelenye-palcy.b9fQxw.77731.fb2.zip
'content-disposition'
'content-disposition'
'content-disposition'
Pogorelskiy_Lafertovskaya-makovnica.WmDcMQ.43667.fb2.zip
Pogorelskiy_Lyubimye-knigi-Lva-Tolstogo-Detstvo-do-14-let-_1_Chernaya-kurica-ili-Podzemnye-zhiteli.13aYXg.110929.fb2.zip
Vodolazkin_Aviator.9JpD7Q.449077.fb2.zip
Vodolazkin_Lavr.fga7xw.310583.fb2.zip
Evgeniy_Vodolazkin_Solovev_i_Larionov.pdf
Yuzefovich_Syshchik-Putilin_3_Knyaz-vetra.N0bLlQ.76325.fb2.zip
Yuzefovich_Zhuravli-i-karliki.AnIC0Q.148480.fb2.zip
Shishkin_Vzyatie-Izmaila.rMm-kg.248856.fb2.zip
Shishkin_Venerin-volos.yH-w_A.248855.fb2.zip
Shishkin_Pismovnik.-AQ4xg.248853.fb2.zip
Matveeva_Pereval-Dyatlova.2PJxMA.36282.fb2.zip
Matveeva_Nebesa.MwQjQg.380537.fb2.zip
Fedin_Goroda-i-gody.OuJQdg.176408.fb2.zip
Fedin_Pervye-radosti_1_Pervye-radosti.oeQtrA.151379.fb2.zip
Fedin_Pervye-radosti_2_Neobyknovennoe-leto.gs-FgQ.115721.fb2.zip
Bazhov_Mednoy-gory-hozyayka.S2MSYw.143109.fb2.zip
Bazhov_

In [149]:
import os

books = [f for f in os.listdir('fb2') if f[-7:] == "fb2.zip"]
len(books)

393