In [7]:
import numpy as np
import requests
import grequests
from selenium import webdriver
import re
import math
from bs4 import BeautifulSoup

In [8]:
def get_soup_by_url(url, timeout=None):
    """Download a page and return it parsed with BeautifulSoup.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Passed straight to ``requests.get``. The default ``None`` keeps the
        previous behaviour (wait indefinitely); pass a number of seconds to
        avoid hanging forever on a stalled server.

    Returns
    -------
    BeautifulSoup
        The response text, stripped of surrounding whitespace and parsed
        with the built-in ``html.parser`` backend.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` on network failures or timeouts.
    """
    html = requests.get(url, timeout=timeout).text.strip()
    return BeautifulSoup(html, 'html.parser')

In [9]:
def only_digits(s):
    """Return the characters of ``s`` for which ``str.isdigit`` is true, in order."""
    digits = [ch for ch in s if ch.isdigit()]
    return ''.join(digits)

### Users parsing

In [10]:
import csv

In [11]:
# Paths of the two CSV tables used throughout the notebook
# (relative paths, so they resolve against the current working directory).
USERS_TABLE = "users.csv"  # one row per scraped user
FILMS_TABLE = "films.csv"  # one row per (show, user) entry

In [22]:
class FilmsParser:
    """Scrape myshows.me user profiles into two header-less CSV tables.

    The users table has the columns listed in ``uhead`` and the films table
    the columns listed in ``fhead``. On construction the existing users table
    is read so that already-stored usernames are skipped and new ids continue
    after the stored ones.

    Parameters
    ----------
    users, films : str
        Paths of the users and films CSV files.
    mode : str
        ``open`` mode used for both files; it must allow reading and writing
        (e.g. ``"a+"``, ``"r+"``, ``"w+"``).
    lock : context manager, optional
        If given, ``addUser`` and ``addFilm`` run while holding it, so one
        parser can be shared by several workers.
    echo : bool
        Print progress information while parsing.

    The instance is a context manager; leaving the ``with`` block closes
    both files.
    """
    uhead = ["id","username","episodes","hours","days"]
    fhead = ["fid","uid","mark","stat","views"]

    def __init__(self, users, films, mode="a+", lock=None, echo=False):
        # Define the attributes first so close()/__del__ stay safe even when
        # one of the open() calls below raises.
        self.ufile = None
        self.ffile = None
        try:
            self.ufile = open(users, mode, newline='', encoding='utf-8')
            self.ffile = open(films, mode, newline='', encoding='utf-8')
        except Exception:
            self.close()
            raise

        # Rewind the users table: "a+" starts positioned at the end.
        self.ufile.seek(0, 0)

        self.users = set()
        row_count = 0
        max_id = -1
        for row in csv.DictReader(self.ufile, fieldnames=self.uhead):
            self.users.add(row["username"])
            row_count += 1
            try:
                max_id = max(max_id, int(row["id"]))
            except (TypeError, ValueError):
                pass  # malformed id cell; row_count still accounts for the row

        # The next id must not collide with any stored one. Counting unique
        # usernames is not enough when the file contains duplicate usernames.
        self.uid = max(row_count, max_id + 1)
        self.uwriter = csv.DictWriter(self.ufile, fieldnames=self.uhead)
        self.fwriter = csv.DictWriter(self.ffile, fieldnames=self.fhead)
        self.lock = lock
        self.echo = echo

        # Move to the end so that writes append in "r+" mode as well.
        self.ufile.seek(0, 2)
        self.ffile.seek(0, 2)

        print("Readed %d uniq users" % len(self.users))

    def close(self):
        """Close both CSV files; safe to call repeatedly or on a half-built instance."""
        for name in ("ufile", "ffile"):
            handle = getattr(self, name, None)
            if handle is not None:
                handle.close()

    def __del__(self):
        self.close()

    def __enter__(self):
        return self

    def __exit__(self, typ, val, tb):
        self.close()

    def atomic(func):
        """Decorator: run the method while holding ``self.lock`` when one is set."""
        def enter_lock(self, *args, **kwargs):
            if self.lock is None:
                return func(self, *args, **kwargs)
            with self.lock:
                return func(self, *args, **kwargs)
        return enter_lock

    @atomic
    def addUser(self, username, episodes, hours, days):
        """Append a user row and return the id that was assigned to it.

        The id is taken and incremented inside the same critical section,
        so callers must use the returned value instead of reading
        ``self.uid`` around the call.
        """
        uid = self.uid
        self.users.add(username)
        self.uwriter.writerow(dict(zip(self.uhead, (uid, username, episodes, hours, days))))
        self.uid = uid + 1
        return uid

    @atomic
    def addFilm(self, fid, uid, mark, stat, views):
        """Append one row to the films table."""
        self.fwriter.writerow(dict(zip(self.fhead, (fid, uid, mark, stat, views))))

    def parseUserMarks(self, username, link, html=None):
        """Store one user's statistics and every show listed on the profile.

        Does nothing if ``username`` is already known. ``html`` may carry an
        already-downloaded profile page; otherwise ``link`` is fetched.
        Profiles with fewer than three ``.statusBlock div`` elements are
        stored with -1 in all three statistic columns.
        """
        if username in self.users:
            return

        if html is None:
            soup = get_soup_by_url(link)
        else:
            soup = BeautifulSoup(html, 'html.parser')

        stats = soup.select(".statusBlock div")
        if len(stats) < 3:
            # Private profile
            self.addUser(username, -1, -1, -1)
            return

        episodes, hours, days = (int(only_digits(block.get_text()))
                                 for block in stats[:3])

        # Use the id handed out by addUser: another worker may have bumped
        # self.uid between any separate read and this call.
        uid = self.addUser(username, episodes, hours, days)

        # "stat" is the position of the tab the show was listed under.
        for stat, table in enumerate(soup.select(".tabs_cont")):
            for row in table.find_all("tr"):
                cells = row.find_all("td")
                if len(cells) != 3:
                    continue  # Expand button row
                fid = int(cells[0].find("a").attrs['href'].split('/')[-2])
                stars = int(only_digits(cells[1].find('span').attrs['class'][1]))
                views = int(cells[2].select("._done")[0].string)

                self.addFilm(fid, uid, stars, stat, views)

        if self.echo:
            print("Added %s, Users now: %d" % (username, self.uid))

    def parseUsers(self, link, session=None):
        """Parse one page of the user listing and store every new user on it.

        Parameters
        ----------
        link : str
            URL of the listing page.
        session : requests.Session, optional
            Session used for all requests. When omitted, a temporary session
            is created and closed before returning.

        Returns
        -------
        bool
            True if the page listed at least one user.
        """
        own_session = session is None
        if own_session:
            session = requests.Session()
        try:
            html = session.get(link).text.strip()
            soup = BeautifulSoup(html, 'html.parser')

            users = soup.select("main .col4")
            usernames = list()
            links = list()
            for user in users:
                profile_link = user.find('a').attrs['href']
                username = user.select('.userBlockName')[0].string
                if self.echo:
                    print(username, profile_link)
                if username not in self.users:
                    usernames.append(username)
                    links.append(profile_link)

            # At most 30 profiles are requested per page; zip() below stops at
            # the shortest sequence, so names and links stay aligned.
            rs = (grequests.get(u, timeout=2, session=session) for u in links[:30])
            responses = grequests.map(rs)
            for resp, username, profile_link in zip(responses, usernames, links):
                if resp is None:
                    if self.echo:
                        print("Response for %s is None" % profile_link)
                    continue
                self.parseUserMarks(username, profile_link, resp.text.strip())

            return len(users) > 0
        finally:
            if own_session:
                session.close()
    

```python
mode="w+" - создаёт чистый файл (существующий файл перезаписывается)
mode="r+" - открывает существующий файл (ошибка, если файла нет)
```

In [23]:
# Smoke run: parse a single listing page with echo enabled, so every
# user found on the page is printed together with the profile link.
with FilmsParser(USERS_TABLE,FILMS_TABLE,mode="r+",echo=True) as fp:
    fp.parseUsers("https://myshows.me/community/users/")

Readed 550319 uniq users
Thiarnan https://myshows.me/Thiarnan
Dinobot https://myshows.me/Dinobot
Jumper_D https://myshows.me/Jumper_D
Elia4014 https://myshows.me/Elia4014
fcdd_yura https://myshows.me/fcdd_yura
jjj2657724 https://myshows.me/jjj2657724
n_kisl96 https://myshows.me/n_kisl96
vk382607 https://myshows.me/vk382607
bu_michelle https://myshows.me/bu_michelle
sborisenko96 https://myshows.me/sborisenko96
id165174749 https://myshows.me/id165174749
MINNESOTKA https://myshows.me/MINNESOTKA
Linato https://myshows.me/Linato
vk322408 https://myshows.me/vk322408
chekun https://myshows.me/chekun
OksanaSavich https://myshows.me/OksanaSavich
Ksenchez https://myshows.me/Ksenchez
id13018570 https://myshows.me/id13018570
solz0r https://myshows.me/solz0r
Бреган_Дэрт https://myshows.me/Бреган_Дэрт
Mariee https://myshows.me/Mariee
Roman007007 https://myshows.me/Roman007007
Светка_мо https://myshows.me/Светка_мо
LidiaMartin https://myshows.me/LidiaMartin
nicholas https://myshows.me/nicholas
advoka

In [24]:
import pandas as pd

In [25]:
# Column names are supplied explicitly from FilmsParser (passed via `names=`).
# The users frame is indexed by its first column ("id").
udf = pd.read_csv(USERS_TABLE, names=FilmsParser.uhead, index_col=0)
fdf = pd.read_csv(FILMS_TABLE, names=FilmsParser.fhead)

In [26]:
# First rows of the users table.
udf.head()

Unnamed: 0_level_0,username,episodes,hours,days
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,philoctetes,4069,2718,113
1,id1700103,1722,1310,55
2,FallenFromMoon,8155,4536,189
3,FaidEyren,4883,3257,136
4,Requnael,5293,3269,136


In [27]:
# Last rows of the films table, i.e. the most recently appended entries.
fdf.tail()

Unnamed: 0,fid,uid,mark,stat,views
10441300,54382,550342,0,3,32
10441301,24054,550342,0,3,16
10441302,6528,550342,0,3,10
10441303,22610,550342,0,3,200
10441304,16813,550342,0,3,69


### Однопоточная реализация
~ 3 Users / sec

In [None]:
from IPython.display import clear_output
from time import time
#from tqdm import tqdm as tqdm

# Sequential crawl: walk the listing pages one by one until the parser's
# user counter reaches `users_target` or a page comes back without users.
with FilmsParser(USERS_TABLE,FILMS_TABLE,mode="r+") as fp:
    it = 700  # listing page to start from
    users_target = 150000
    ups = 0 #users per sec
    while fp.uid < users_target:
        print("Page: %d, Users: %d, U/S=%.4g" % (it, fp.uid, ups))
        lv = fp.uid
        lt = time()
        has_users = fp.parseUsers("https://myshows.me/community/users/?page={}".format(it))
        it += 1
        clear_output(wait=True)
        elapsed = time() - lt
        # Guard against a zero interval on coarse clocks.
        ups = (fp.uid - lv) / elapsed if elapsed > 0 else 0
        #one iter adds ~30 users
        if not has_users:
            # parseUsers reports an empty listing page: there is nothing
            # further to crawl, and looping on would never reach the target.
            print("Page %d lists no users, stopping" % (it - 1))
            break

### Добавим параллельности и всё такое
~ 15$\pm$5 Users / sec

In [28]:
import multiprocessing
from time import time
from IPython.display import clear_output

from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
# One lock guards both the parser's CSV writes and the shared counters below.
# It is not re-entrant, so parseUsers must never be called while holding it.
lock = multiprocessing.Semaphore()

with FilmsParser(USERS_TABLE,FILMS_TABLE,mode="a+",lock=lock) as fp, \
    requests.Session() as ss, \
    ThreadPoolExecutor(max_workers=16) as executor:

    n_workers = 15        # number of concurrent crawl loops
    uid_limit = 550000    # workers stop once the user counter passes this
    page = 21040          # next listing page to hand out
    old_uid = fp.uid      # counter value at the last rate measurement
    old_uid2 = fp.uid     # counter value at the last status line
    old_t = time()
    ups = 0
    def threadFunc():
        """Worker loop: take the next page number, parse it, report progress."""
        global page, old_uid, old_uid2, old_t, ups
        while True:
            with lock:
                c_page = page
                page += 1
            # session=None: each call uses its own session (`ss` is not shared
            # between the workers).
            has_users = fp.parseUsers("https://myshows.me/community/users/?page={}".format(c_page), session=None)
            with lock:
                delta = fp.uid-old_uid2
                old_uid2 = fp.uid
                print("Page: %d, Users: %d(+%d), U/S=%.4g" % (page, fp.uid, delta, ups))
                clear_output(wait=True)
                new_t = time()
                # Refresh the users-per-second estimate at most every 10 s.
                if new_t - old_t > 10.0:
                    ups = (fp.uid-old_uid)/(new_t-old_t)
                    old_uid = fp.uid
                    old_t = time()
                c_uid = fp.uid
            # Stop on reaching the limit, or when the listing has run out of
            # users (otherwise the worker would request empty pages forever).
            if c_uid > uid_limit or not has_users: break

    futures = [executor.submit(threadFunc) for _ in range(n_workers)]
    # Future.exception() waits for the worker to finish; without this check
    # an exception raised inside a worker would vanish silently.
    for future in futures:
        error = future.exception()
        if error is not None:
            print("Worker failed: %r" % (error,))

Page: 21055, Users: 550432(+5), U/S=7.957


### Делаем только с оценками (удаляем нули)

In [59]:
import os
from tqdm import tqdm_notebook as tqdm
# Copy the films table to "<name>_nozeros<ext>", dropping rows whose mark is "0".
films_without_zero = "%s_nozeros%s" % os.path.splitext(FILMS_TABLE)
with open(FILMS_TABLE, "r", newline='', encoding='utf-8') as ft, \
        open(films_without_zero, "w", newline='', encoding='utf-8') as fnzt:
    freader = csv.DictReader(ft, fieldnames=FilmsParser.fhead)
    fwriter = csv.DictWriter(fnzt, fieldnames=FilmsParser.fhead)
    bar = tqdm(freader, unit="F", miniters=100000)
    all_cnt = 0    # rows read
    filtered = 0   # rows kept
    for row in bar:
        all_cnt += 1
        if row["mark"] == "0":
            continue  # zero mark: not copied
        fwriter.writerow(row)
        filtered += 1
        bar.set_postfix(Filtered="%d" % filtered, refresh=False)
        

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Для статистики посчитаем количество только публичных пользователей

In [58]:
import os
from tqdm import tqdm_notebook as tqdm
# Copy the users table to "<name>_public<ext>", dropping rows whose
# episodes value is "-1" (the value written for private profiles).
only_public_users = "%s_public%s" % os.path.splitext(USERS_TABLE)
with open(USERS_TABLE, "r", newline='', encoding='utf-8') as ut, \
        open(only_public_users, "w", newline='', encoding='utf-8') as upbt:
    ureader = csv.DictReader(ut, fieldnames=FilmsParser.uhead)
    uwriter = csv.DictWriter(upbt, fieldnames=FilmsParser.uhead)
    bar = tqdm(ureader, unit="U", miniters=100000)
    all_cnt = 0    # rows read
    filtered = 0   # rows kept
    for row in bar:
        all_cnt += 1
        if row["episodes"] == "-1":
            continue  # private profile: not copied
        uwriter.writerow(row)
        filtered += 1
        bar.set_postfix(Filtered="%d" % filtered, refresh=False)
        

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

<div align="right">Alex Sobolev, 2018</div>