In [1]:
import numpy as np
import requests
from selenium import webdriver
import re
import math
from bs4 import BeautifulSoup

In [2]:
def get_soup_by_url(url):
    html = requests.get(url).text.strip()
    return BeautifulSoup(html, 'html.parser')

In [3]:
def only_digits(s):
    return ''.join(c for c in s if c.isdigit())

### Users parsing

In [4]:
import csv

In [5]:
USERS_TABLE = "users.csv"
FILMS_TABLE = "films.csv"

In [12]:
class FilmsParser:
    uhead = ["id","username","episodes","hours","days"]
    fhead = ["fid","uid","mark","stat","views"]
    
    def __init__(self, users, films, mode="r+"):
        self.ufile = open(users,mode)
        self.ffile = open(films,mode)
        
        self.users = set()
        ureader = csv.DictReader(self.ufile, fieldnames=self.uhead)
        for row in ureader:
            self.users.add(row["username"])
        
        self.uid = len(self.users)
        self.uwriter = csv.DictWriter(self.ufile, fieldnames=self.uhead)
        self.fwriter = csv.DictWriter(self.ffile, fieldnames=self.fhead)
        
        print("Readed %d uniq users" % self.uid)
    
    def __del__(self):
        self.ufile.close()
        self.ffile.close()
        
    def addUser(self, username, episodes, hours, days):
        self.users.add(username)
        self.uwriter.writerow(dict(zip(self.uhead, (self.uid,username,episodes,hours,days))))
        self.uid = self.uid + 1
        
    def addFilm(self, fid, uid, mark, stat, views):
        self.fwriter.writerow(dict(zip(self.fhead, (fid,uid,mark,stat,views))))
        
    def parseUserMarks(self, username, link):
        if username in self.users: return
        
        soup = get_soup_by_url(link)
        stats = soup.select(".statusBlock div")
        if stats is None or len(stats) < 3: 
            #Private profile
            self.addUser(username, -1, -1, -1)
            return
        
        episodes, hours, days = (int(only_digits(stat.get_text())) \
                                 for stat in soup.select(".statusBlock div")[:3])
        
        uid = self.uid
        self.addUser(username, episodes, hours, days)
        
        for stat,table in enumerate(soup.select(".tabs_cont")):
            for row in table.findAll("tr"):
                cells = row.findAll("td")
                if len(cells) != 3: continue # Expand button row
                fid = int(cells[0].find("a").attrs['href'].split('/')[-2])
                stars = int(only_digits(cells[1].find('span').attrs['class'][1]))
                views = int(cells[2].select("._done")[0].string)
                
                self.addFilm(fid,uid,stars,stat,views)
                
        print("Added %s, Users now: %d" % (username, self.uid))
        
    def parseUsers(self, link):
        soup = get_soup_by_url(link)
        users = soup.select("main .col4")
        for user in users:
            link = user.find('a').attrs['href']
            username = user.select('.userBlockName')[0].string
            print(username,link)
            self.parseUserMarks(username, link)
        return users is not None and len(users) > 0
    

```python
mode="w+" - создаёт чистый файл
mode="r+" - открывает если есть
```

In [7]:
fp = FilmsParser(USERS_TABLE,FILMS_TABLE,mode="r+")
fp.parseUsers("https://myshows.me/community/users/")
del fp

Readed 56 uniq users
tatabsu https://myshows.me/tatabsu
iwafy https://myshows.me/iwafy
Added iwafy, Users now: 57
dotraw https://myshows.me/dotraw
Added dotraw, Users now: 58
waker https://myshows.me/waker
Added waker, Users now: 59
nara_takero https://myshows.me/nara_takero
Added nara_takero, Users now: 60
DaryaRifma https://myshows.me/DaryaRifma
mega755 https://myshows.me/mega755
Added mega755, Users now: 61
BonnieBoyle https://myshows.me/BonnieBoyle
Added BonnieBoyle, Users now: 62
ladabelova https://myshows.me/ladabelova
Daedalus https://myshows.me/Daedalus
Added Daedalus, Users now: 63
twee_ballen https://myshows.me/twee_ballen
Added twee_ballen, Users now: 64
Daneeka https://myshows.me/Daneeka
Added Daneeka, Users now: 65
id66950277 https://myshows.me/id66950277
Added id66950277, Users now: 66
FRFD https://myshows.me/FRFD
ALBERTv https://myshows.me/ALBERTv
Added ALBERTv, Users now: 67
armitaj https://myshows.me/armitaj
Added armitaj, Users now: 68
8orodina https://myshows.me/8oro

In [8]:
import pandas as pd

In [9]:
udf = pd.read_csv(USERS_TABLE, names=FilmsParser.uhead, index_col=0)
fdf = pd.read_csv(FILMS_TABLE, names=FilmsParser.fhead)

In [10]:
udf.head()

Unnamed: 0_level_0,username,episodes,hours,days
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,tatabsu,13751,10075,420
1,Viktoria777,2875,2535,106
2,Панда_Panda,7759,4973,207
3,KseniaMimi,10818,5462,228
4,leXzZz,11622,7456,311


In [11]:
fdf.tail()

Unnamed: 0,fid,uid,mark,stat,views
6695,55513.0,29,0,3,10.0
6696,635.0,29,0,3,22.0
6697,53268.0,29,0,3,15.0
6698,49400.0,29,0,3,22.0
6699,50805.0,29,0,3,10.0


### Добавьте параллельность и всё такое

In [13]:
fp = FilmsParser(USERS_TABLE,FILMS_TABLE,mode="r+")
while fp.uid < 100:
    fp.parseUsers("https://myshows.me/community/users/")
    #one iter adds ~30 users
del fp

Readed 79 uniq users
aymani2013 https://myshows.me/aymani2013
Added aymani2013, Users now: 80
Legenda https://myshows.me/Legenda
Added Legenda, Users now: 81
Unbeautiful https://myshows.me/Unbeautiful
Added Unbeautiful, Users now: 82
marwik https://myshows.me/marwik
Added marwik, Users now: 83
vk312168 https://myshows.me/vk312168
Added vk312168, Users now: 84
бритт https://myshows.me/бритт
Added бритт, Users now: 85
rhymer144 https://myshows.me/rhymer144
Added rhymer144, Users now: 86
shurfiada https://myshows.me/shurfiada
Added shurfiada, Users now: 87
manjasha https://myshows.me/manjasha
Added manjasha, Users now: 88
andjoe23 https://myshows.me/andjoe23
Lunatic3k https://myshows.me/Lunatic3k
Added Lunatic3k, Users now: 89
vk719181 https://myshows.me/vk719181
Added vk719181, Users now: 90
Kristen_Lauren https://myshows.me/Kristen_Lauren
Added Kristen_Lauren, Users now: 91
Midi https://myshows.me/Midi
flegmali https://myshows.me/flegmali
Added flegmali, Users now: 92
АндрейСноу https:/

<div align="right">Alex Sobolev, 2018</div>