In [1]:
import requests as r
import random
from bs4 import BeautifulSoup
import lxml
import pandas as pd
from time import sleep
import re
import json

#### Funkcje, klasy:

In [2]:
# pobieranie zawartości strony i przygotowanie zupy

class Page:
    """Download one URL (with pauses and retries) and parse it with BeautifulSoup.

    Parameters
    ----------
    url : str
        Address to download.
    failGettingTimer : int or float
        Seconds to wait before each retry after a failed attempt.
    walkTimer : int or float
        Seconds to wait before the first attempt.
    noRepetitions : int
        Number of retries allowed after the first attempt.
    timeout : int or float
        Timeout handed to ``requests.get``.
    headers : dict or None
        HTTP headers to send; ``None`` selects the default User-Agent header.
    parser : str
        Parser name handed to ``BeautifulSoup``.
    allowRedirects : bool
        Handed to ``requests.get`` as ``allow_redirects``.
    """

    def __init__(self, url, failGettingTimer=3, walkTimer=1,
                 noRepetitions=3, timeout=10, headers=None,
                 parser='lxml', allowRedirects=False):
        # A dict literal in the signature would be one object shared by every
        # instance, so the default is built here, once per instance.
        if headers is None:
            headers = {'User-Agent': 'Urzad Statystyczny w Krakowie'}

        self.url = url
        self.walkTimer = walkTimer
        self.failGettingTimer = failGettingTimer
        self.parser = parser
        self.headers = headers
        self.timeout = timeout
        self.noRepetitions = noRepetitions
        self.allowRedirects = allowRedirects
        self.response = None

    def getPage(self):
        """Download ``self.url`` and store the result in ``self.response``.

        One attempt is made after ``walkTimer`` seconds, followed by up to
        ``noRepetitions`` retries (each preceded by ``failGettingTimer``
        seconds) for as long as the attempt raises a requests exception or
        answers with a status other than 200.

        Raises
        ------
        requests.exceptions.RequestException
            When every attempt raised, i.e. no response was obtained at all.
        """
        sleep(self.walkTimer)

        response = None
        lastError = None
        attempts = max(self.noRepetitions, 0) + 1

        for attempt in range(attempts):
            if attempt > 0:
                sleep(self.failGettingTimer)

            try:
                response = r.get(url=self.url, headers=self.headers,
                                 timeout=self.timeout, allow_redirects=self.allowRedirects)
            except r.exceptions.RequestException as error:
                # Timeouts and connection errors are retried exactly like a
                # bad status code instead of aborting on the first failure.
                lastError = error
                continue

            if response.status_code == 200:
                break

        if response is None:
            # Not a single attempt produced a response: surface the last error.
            raise lastError

        self.response = response

    def makeSoup(self):
        """Return the parsed page, downloading it first if necessary.

        Returns
        -------
        BeautifulSoup or None
            ``None`` when building the soup raises.
        """
        if self.response is None:
            self.getPage()

        try:
            soup = BeautifulSoup(self.response.text, self.parser)
        except Exception:
            # Best effort: callers receive None rather than an exception.
            soup = None

        return soup


In [3]:
# wyciąganie informacji ze strony lekarza 

def getDetails(soup):
    """Collect the headline information from a physician's profile page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed profile page.

    Returns
    -------
    dict
        Keys: ``title``, ``name``, ``online``, ``rating``, ``opinionCount``,
        ``lastCommentDate`` and ``specializations`` (a dict mapping
        ``specialization1``, ``specialization2``, ... to the titles found).

    Raises
    ------
    AttributeError
        When the header holds no element with ``itemprop="name"``.
    """
    # The header fragment is re-parsed on its own. The parser is named
    # explicitly so bs4 does not have to guess one (and warn about it);
    # 'lxml' is what bs4 prefers when it is installed, and this file imports it.
    headerMarkup = str(soup.find('div', attrs={'class': 'unified-doctor-header-info__name'}))
    physicianDetails = BeautifulSoup(headerMarkup, 'lxml')
    nameTag = physicianDetails.find(attrs={'itemprop': 'name'})

    # title, if present: the element right before the name
    try:
        title = nameTag.find_previous_sibling().get_text(strip=True)
    except Exception:
        title = None

    # first and last name; deliberately unguarded, a page without it is unusable
    name = nameTag.get_text(strip=True)

    # specializations, numbered from 1 in page order
    physicianSpecializations = [spec['title']
                                for spec in soup.select('[data-test-id = "doctor-specializations"] > a ')]
    specializations = {'specialization' + str(position): specTitle
                       for position, specTitle in enumerate(physicianSpecializations, start=1)}

    # date of the first opinion in the opinions list, as an ISO date string
    try:
        opinionsList = soup.find('div', attrs={'data-id': 'doctor-opinions-list'})
        lastCommentDate = opinionsList.find('time', attrs={'itemprop': 'datePublished'})['datetime']
        lastCommentDate = pd.to_datetime(lastCommentDate).date().isoformat()
    except Exception:
        lastCommentDate = None

    # rating widget: score comes from an attribute, the count from its text
    ratings = soup.find('u', class_="rating rating-lg")

    try:
        rating = ratings['data-score']
        opinionCount = ratings.get_text(strip=True)
    except Exception:
        rating = None
        # NOTE(review): the fallback is the int 0 while the success path keeps
        # the widget text as a string; confirm downstream code handles both.
        opinionCount = 0

    # number of elements flagged as online-only (a count, not a boolean)
    online = len(soup.find_all(attrs={'data-online-only': "true"}))

    details = {
        'title': title,
        'name': name,
        'online': online,
        'rating': rating,
        'opinionCount': opinionCount,
        'lastCommentDate': lastCommentDate,
        'specializations': specializations,
    }

    return details

In [4]:
# wyciąganie adresu ze strony lekarza

def addressKey(attributeName, dictObject):
    """Translate the value stored under ``attributeName`` into an output column name.

    ``dictObject[attributeName]`` is looked up in a fixed rename table;
    values missing from the table map to ``'otherAdressAttribute'``.
    """
    renames = dict(
        postalCode='postalCode',
        addressLocality='locality',
        addressRegion='region',
        addressCountry='country',
        streetAddress='street',
    )
    rawName = dictObject[attributeName]
    if rawName in renames:
        return renames[rawName]
    return 'otherAdressAttribute'

def getAddresses(soup):
    """Collect every address listed on a physician's profile page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed profile page.

    Returns
    -------
    dict
        Maps ``address1``, ``address2``, ... to a dict of address fields. The
        field names come from ``addressKey``; ``name``, ``text``,
        ``latitude`` and ``longitude`` are added when the page provides them.
    """
    addresses = {}

    # only address items that actually contain an address-info block
    addressList = [address for address in soup.find_all("div", {'data-id': "doctor-address-item"})
                   if address.select('div[data-test-id="address-info"]')]

    for position, address in enumerate(addressList, start=1):

        entry = {}
        addresses['address' + str(position)] = entry

        details = address.find('div', {'data-test-id': "address-info"}).select('span[itemprop]')

        for addressDetail in details:
            fieldName = addressKey('itemprop', addressDetail)

            try:
                fieldValue = addressDetail['content']
            except Exception:
                # No content attribute: fall back to the visible street text,
                # without a trailing comma.
                try:
                    fieldValue = addressDetail.select_one('[data-test-id=address-info-street]').get_text()
                    # indexing an empty string raises, which leaves None below
                    if fieldValue[-1] == ',':
                        fieldValue = fieldValue[:-1]
                except Exception:
                    fieldValue = None

            entry[fieldName] = fieldValue

        # facility name and the full address text; both are optional and
        # 'text' is only read when the name was found
        try:
            entry['name'] = address.find(attrs={'itemprop': 'name'})['content']
            entry['text'] = address.find(attrs={'data-test-id': 'address-info'}).get_text()
        except Exception:
            pass

        # coordinates are optional; longitude is only read once latitude was found
        try:
            entry['latitude'] = address.find(attrs={'itemprop': 'latitude'})['content']
            entry['longitude'] = address.find(attrs={'itemprop': 'longitude'})['content']
        except Exception:
            pass

    return addresses

In [5]:
# zbieranie linków do stron lekarzy ze strony katalogu

def getPhysiciansLinkList(catalogPage):
    """Return the profile URLs of the catalog entries that carry the verification badge.

    An entry is kept when it contains an element whose title includes the
    badge text; its ``data-doctor-url`` attribute is the value collected.
    """
    badgeSelector = '[title*="Ten lekarz potwierdził wiarygodność danych na swoim profilu"]'

    physiciansLinkList = []
    for entry in catalogPage.select('[data-doctor-url]'):
        profileURL = entry['data-doctor-url']
        badges = entry.select(badgeSelector)
        if len(badges) > 0:
            physiciansLinkList.append(profileURL)

    return physiciansLinkList

In [6]:
# pobiera listę relatywnych linków dla poszczególnych specjalizacji

def getSpecRelativeLinksList(specSoup, bsExpression='''h3 > div > a:not([href*="covid"]):is(.text-muted)'''):
    """Return the ``href`` of every element of ``specSoup`` matched by ``bsExpression``.

    The default selector skips links whose href contains "covid".
    """
    specRelativeLinksList = []
    for anchor in specSoup.select(bsExpression):
        specRelativeLinksList.append(anchor['href'])
    return specRelativeLinksList

In [7]:
# nawiguje po stronach z lekarzami danej specjalizacji

def nextPage(physiciansPage):
    """Return the href of the "next page" pagination link, or None.

    None is returned when the link is missing, has no ``href``, or the
    lookup fails in any other ordinary way. Interrupts such as
    ``KeyboardInterrupt`` are no longer swallowed.
    """
    try:
        value = physiciansPage.select_one('[data-test-id=pagination-next]')['href']
    except Exception:
        value = None

    return value

### Właściwy program
#### Zebranie linków do indywidualnych stron lekarzy

In [8]:
# list of specializations: download the index page of all specializations
allSpecPageURL = 'https://www.znanylekarz.pl/specjalizacje-lekarskie'

allSpecSoup = Page(allSpecPageURL).makeSoup()

# the hrefs returned here are prefixed with the site root to form the URLs crawled below
specializationsLinks = ['https://www.znanylekarz.pl' + relativeLink for relativeLink in getSpecRelativeLinksList(allSpecSoup)]

In [14]:
%%time

physiciansLinkList = []

# Mirrors physiciansLinkList for constant-time duplicate checks; the list
# itself keeps the order in which links were first seen.
seenPhysicianLinks = set()


for specializationPageURL in specializationsLinks:

    catalogPageURL = specializationPageURL
    # 1-based number of the catalog page within the current specialization;
    # not read by the loop, only useful when inspecting state after a stop
    pageNum = 0

    while True:

        pageNum += 1

        catalogPage = Page(catalogPageURL)
        catalogPage.getPage()

        # a non-200 answer (e.g. being redirected back to the start) ends this specialization
        if catalogPage.response.status_code != 200:
            break

        catalogPageSoup = catalogPage.makeSoup()

        # collect links to physician pages, skipping ones already seen
        for link in getPhysiciansLinkList(catalogPageSoup):
            if link not in seenPhysicianLinks:
                seenPhysicianLinks.add(link)
                physiciansLinkList.append(link)

        # address of the next catalog page
        catalogPageURL = nextPage(catalogPageSoup)

        # no next page: this specialization is done
        if catalogPageURL is None:
            break


# save the list of physician links
df = pd.DataFrame(physiciansLinkList)
df.to_csv('listaLekarzy.csv', encoding='utf-8', index=False)


ReadTimeout: HTTPSConnectionPool(host='www.znanylekarz.pl', port=443): Read timed out. (read timeout=10)

Jeśli program straci łączność, poniższe komórki pozwalają wznowić go od miejsca, w którym skończył.

In [None]:
# let's see how many links have been collected so far
len(physiciansLinkList)

In [23]:
# specializations still to crawl, starting with the one that was in progress
# when the previous loop stopped
specializationContinuation = specializationsLinks[specializationsLinks.index(specializationPageURL):]

# Resume from the exact catalog page that was being fetched. catalogPageURL is
# None when the previous loop ended normally; in that case the entry is left
# untouched so the next loop is not handed None as a URL.
if catalogPageURL is not None:
    specializationContinuation[0] = catalogPageURL

In [80]:
# the same crawl loop again, over the remaining specializations

# Seeded from the links gathered so far, for constant-time duplicate checks;
# physiciansLinkList keeps the order in which links were first seen.
seenPhysicianLinks = set(physiciansLinkList)

for specializationPageURL in specializationContinuation:

    catalogPageURL = specializationPageURL
    # 1-based number of the catalog page within the current specialization;
    # not read by the loop, only useful when inspecting state after a stop
    pageNum = 0

    while True:

        pageNum += 1

        catalogPage = Page(catalogPageURL)
        catalogPage.getPage()

        # a non-200 answer (e.g. being redirected back to the start) ends this specialization
        if catalogPage.response.status_code != 200:
            break

        catalogPageSoup = catalogPage.makeSoup()

        # collect links to physician pages, skipping ones already seen
        for link in getPhysiciansLinkList(catalogPageSoup):
            if link not in seenPhysicianLinks:
                seenPhysicianLinks.add(link)
                physiciansLinkList.append(link)

        # address of the next catalog page
        catalogPageURL = nextPage(catalogPageSoup)

        # no next page: this specialization is done
        if catalogPageURL is None:
            break


# save the list of physician links
df = pd.DataFrame(physiciansLinkList)
df.to_csv('listaLekarzy.csv', encoding='utf-8', index=False)

In [81]:
# peek at the last ten collected links
physiciansLinkList[-10:]

['https://www.znanylekarz.pl/katarzyna-mijalska/weterynarz/lask',
 'https://www.znanylekarz.pl/magdalena-traczynska/weterynarz/elblag',
 'https://www.znanylekarz.pl/magdalena-staszewska-2/weterynarz/lidzbark-warminski',
 'https://www.znanylekarz.pl/agnieszka-lewandowska-2/weterynarz/olsztyn',
 'https://www.znanylekarz.pl/karol-stapor/weterynarz/krakow',
 'https://www.znanylekarz.pl/agnieszka-durydiwka-kaczmarek/weterynarz/wroclaw',
 'https://www.znanylekarz.pl/artur-matuszak/weterynarz/szamotuly',
 'https://www.znanylekarz.pl/janusz-lesnik/weterynarz/raciborz',
 'https://www.znanylekarz.pl/joanna-pietras-2/weterynarz/krakow',
 'https://www.znanylekarz.pl/marek-mrowczynski/weterynarz/zielona-gora']

In [82]:
# total number of collected physician links
len(physiciansLinkList)

53544

#### Zbieranie danych ze stron lekarzy

In [8]:
# load the previously saved links; '0' is the header of the single column
# that the unnamed DataFrame was written with
df = pd.read_csv('listaLekarzy.csv')
physiciansLinkList = list(df['0'])

In [9]:
# empty frames for physicians and addresses; they fix the leading column order
physicianTable = pd.DataFrame(columns = ['physicianID', 'title', 'name', 'online', 'rating', 'opinionCount', 'lastCommentDate',
       'specializations', 'link'])

addressTable = pd.DataFrame(columns = ['physicianID', 'postalCode', 'locality', 'region', 'country', 'street', 'latitude',
       'longitude'])

# Per-physician frames are gathered in lists and concatenated once after the
# loop; concatenating inside the loop copies the whole table on every pass.
physicianFrames = [physicianTable]
addressFrames = [addressTable]

physicianIDCounter = 0
physiciansFailed = []


# loop over the physician pages
for physicianURL in physiciansLinkList:

    # download and parse the page; a URL that cannot be fetched is kept for a retry
    try:
        physicianSoup = Page(physicianURL).makeSoup()
    except Exception:
        physiciansFailed.append(physicianURL)
        continue

    physicianIDCounter += 1

    # data frame with the details
    # NOTE(review): a page that cannot be parsed contributes no rows and is
    # not added to physiciansFailed; confirm that this is intended.
    try:
        details = pd.DataFrame(getDetails(physicianSoup))
        details['physicianID'] = physicianIDCounter
        details['link'] = physicianURL
    except Exception:
        details = pd.DataFrame()

    if not details.empty:
        physicianFrames.append(details)

    # data frame with the addresses, one row per address
    try:
        address = pd.DataFrame(getAddresses(physicianSoup)).T
        address['physicianID'] = physicianIDCounter
    except Exception:
        address = pd.DataFrame()

    if not address.empty:
        addressFrames.append(address)


# build the main tables in one go
physicianTable = pd.concat(physicianFrames)
addressTable = pd.concat(addressFrames)


# clean the free-text column; it only exists if some address provided it
if 'text' in addressTable.columns:
    addressTable['text'] = addressTable['text'].str.replace('\n', ' ')
    addressTable['text'] = addressTable['text'].str.replace('\t', ' ')


# save both tables
physicianTable.to_csv('physicians.csv', encoding='utf-8', index=False)
addressTable.to_csv('addresses.csv', encoding='utf-8', index=False)

# save the physicians that could not be fetched
pd.DataFrame(physiciansFailed).to_csv('physiciansFailed.csv', encoding='utf-8', index=False)

In [10]:
# the same again for the physicians that were not fetched the first time

physicianTable2 = pd.DataFrame(columns = ['physicianID', 'title', 'name', 'online', 'rating', 'opinionCount', 'lastCommentDate',
       'specializations', 'link'])

addressTable2 = pd.DataFrame(columns = ['physicianID', 'postalCode', 'locality', 'region', 'country', 'street', 'latitude',
       'longitude'])

# Per-physician frames are gathered in lists and concatenated once after the
# loop; concatenating inside the loop copies the whole table on every pass.
physicianFrames2 = [physicianTable2]
addressFrames2 = [addressTable2]

# IDs for this pass start above 100000
physicianIDCounter2 = 100000
physiciansFailed2 = []


# loop over the physician pages
for physicianURL in physiciansFailed:

    # download and parse the page; a URL that cannot be fetched is recorded again
    try:
        physicianSoup = Page(physicianURL).makeSoup()
    except Exception:
        physiciansFailed2.append(physicianURL)
        continue

    physicianIDCounter2 += 1

    # data frame with the details
    # NOTE(review): a page that cannot be parsed contributes no rows and is
    # not added to physiciansFailed2; confirm that this is intended.
    try:
        details = pd.DataFrame(getDetails(physicianSoup))
        details['physicianID'] = physicianIDCounter2
        details['link'] = physicianURL
    except Exception:
        details = pd.DataFrame()

    if not details.empty:
        physicianFrames2.append(details)

    # data frame with the addresses, one row per address
    try:
        address = pd.DataFrame(getAddresses(physicianSoup)).T
        address['physicianID'] = physicianIDCounter2
    except Exception:
        address = pd.DataFrame()

    if not address.empty:
        addressFrames2.append(address)


# build the main tables in one go
physicianTable2 = pd.concat(physicianFrames2)
addressTable2 = pd.concat(addressFrames2)


# clean the free-text column; it only exists if some address provided it
if 'text' in addressTable2.columns:
    addressTable2['text'] = addressTable2['text'].str.replace('\n', ' ')
    addressTable2['text'] = addressTable2['text'].str.replace('\t', ' ')


# save both tables
physicianTable2.to_csv('physicians2.csv', encoding='utf-8', index=False)
addressTable2.to_csv('addresses2.csv', encoding='utf-8', index=False)

# save the physicians that could not be fetched
pd.DataFrame(physiciansFailed2).to_csv('physiciansFailed2.csv', encoding='utf-8', index=False)

#### I to by było na tyle