### Obtain the list of dog races (breeds): manually copy the `<option>` tags from the browser's element inspector and save them to a text file

In [52]:
# Input: the raw <option> tags pasted from the page; output: one race per line.
readfile = '..\\data\\races\\races_tags.txt'
writefile = '..\\data\\races\\dog_races.txt'

# Load the markup and cut it at every closing tag. The piece that follows the
# final '</option>' is trailing residue, so it is dropped.
with open(readfile, 'r') as rf:
    data = rf.read().split('</option>')[:-1]

# Each remaining piece looks like '<option ...>Race name': the race is the text
# right after the first '>'.
races = [chunk.split('>', 2)[1] for chunk in data]

# Persist the races, newline-separated, with no trailing newline.
with open(writefile, 'w') as wf:
    print(*races, sep='\n', end='', file=wf)

### Scrape Dog Entries

In [1]:
import requests
import bs4
import re
import json

In [2]:
from datetime import datetime
import locale
# Switch the LC_TIME category to Spanish so that the '%B' directive used by
# datetime.strptime in make_dbdict matches Spanish month names.
# NOTE(review): 'esn' looks like a Windows locale alias (the recorded output of
# this cell is 'Spanish_Spain.1252'); presumably this call fails on other
# platforms -- confirm before running elsewhere.
locale.setlocale(locale.LC_TIME, 'esn')

'Spanish_Spain.1252'

In [3]:
# Functions to retrieve information from each dog entry
def get_photo(entry_soup):
    '''
    Return url (string) of photo for entry.

    Looks up the first <div class="ficha-mascota-foto"> in ``entry_soup`` and
    returns the ``src`` attribute of its first <img>.

    An empty string is returned when the entry has no usable photo, i.e. when
    the photo div, the <img> tag or its ``src`` attribute is missing, or when
    ``src`` is the site's "no photo" placeholder image. Callers can therefore
    test the result for truthiness instead of having to catch lookup errors.
    '''
    div_photo = entry_soup.find_all('div', attrs={'class':'ficha-mascota-foto'})
    if not div_photo:
        # Page without a photo block (e.g. an error page): treat as "no photo".
        return ''

    img = div_photo[0].find('img')
    if img is None:
        return ''

    # .get() yields None instead of raising when the attribute is absent.
    url_photo = img.get('src') or ''
    if url_photo == "/img/common/usuarios-nofoto.gif":
        # Placeholder shown for entries without an uploaded picture.
        return ''
    return url_photo

def get_entry(entry_soup):
    '''
    Return list of fields from entry.

    The fields are the stripped texts of every <b> tag inside the first
    <div class="ficha-mascota-info"> of ``entry_soup``, in document order.
    An empty list is returned when the page has no such div.

    The missing-div case is handled explicitly rather than with a bare
    ``except``, so KeyboardInterrupt / SystemExit and genuine programming
    errors are no longer silently swallowed.
    '''
    info_divs = entry_soup.find_all('div', attrs={'class':'ficha-mascota-info'})
    if not info_divs:
        return []
    return [b.get_text().strip() for b in info_divs[0].find_all('b')]

def get_owner(entry_soup):
    '''
    Return the owner's name (string) for a dog entry.

    The name is the text of the first <h4> found inside the first
    <div id="info-user-int"> of ``entry_soup``. The text is returned as-is
    (no stripping). IndexError is raised if either element is missing.
    '''
    owner_box = entry_soup.find_all('div', attrs={'id':'info-user-int'})[0]
    heading = owner_box.find_all('h4')[0]
    return heading.get_text()
    

# Spanish month names, used when the process locale cannot parse them.
_SPANISH_MONTHS = {
    'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4, 'mayo': 5, 'junio': 6,
    'julio': 7, 'agosto': 8, 'septiembre': 9, 'setiembre': 9, 'octubre': 10,
    'noviembre': 11, 'diciembre': 12,
}


def _parse_spanish_date(text):
    '''Parse a "<day> de <month> de <year>" date string into a datetime.'''
    try:
        # Works whenever LC_TIME is a Spanish locale (the notebook's original
        # behaviour).
        return datetime.strptime(text, '%d de %B de %Y')
    except ValueError:
        pass

    # Locale-independent fallback: resolve the Spanish month name ourselves.
    match = re.fullmatch(r'\s*(\d{1,2})\s+de\s+(\w+)\s+de\s+(\d{4})\s*', text)
    month = _SPANISH_MONTHS.get(match.group(2).lower()) if match else None
    if month is None:
        raise ValueError('unrecognised birth date: %r' % (text,))
    return datetime(int(match.group(3)), month, int(match.group(1)))


def make_dbdict(url_photo, entry_field, entry_owner, ped):
    '''
    Build the database record (dict) for one dog entry.

    Parameters
    ----------
    url_photo : str
        Stored under 'photo'.
    entry_field : list of str
        Entry fields; the positions read are [0] name, [1] race,
        [2] birth date written as "<day> de <month> de <year>",
        [5] gender and [-1] city. The list is not modified.
    entry_owner : str
        Stored under 'username'; also used to build 'useremail' by
        appending '@traitydogs.com'.
    ped :
        Stored under 'pedigree' unchanged.

    Returns
    -------
    dict
        Record with keys 'name', 'gender', 'city', 'photo', 'pedigree',
        'race', 'born', 'username' and 'useremail'. 'born' is a
        {'__type': 'Date', 'iso': ...} mapping with the time fixed at
        midnight.

    Raises
    ------
    IndexError
        If ``entry_field`` has fewer than six items.
    ValueError
        If the birth date cannot be parsed.
    '''
    # Integrity control: any gender other than Hembra/Macho (compared after
    # capitalize()) is stored as an empty string. A local is used so the
    # caller's list is left untouched.
    gender = entry_field[5]
    if gender.capitalize() not in {'Hembra', 'Macho'}:
        gender = ''

    dtobj = _parse_spanish_date(entry_field[2])

    db_dict = {
        'name': entry_field[0],
        'gender': gender,
        'city': entry_field[-1],
        'photo': url_photo,
        'pedigree': ped,
        'race': entry_field[1],
        'born': {'__type': "Date", 'iso': dtobj.strftime('%Y-%m-%dT00:00:00.000Z')},
        'username': entry_owner,
        'useremail': entry_owner + '@traitydogs.com',
    }
#               'location': [], \
    return db_dict

In [5]:
url_base = 'http://www.perros.com'
entry_dicts = []

# Text that precedes the pedigree value inside each gallery item.
findstr = '<b>Pedigree:</b> '
# make_dbdict reads entry_field[5], so shorter field lists cannot be converted.
min_fields = 6

# Loop over the search gallery pages.
# Only pages 2-9 are scraped here; the full run noted by the original author
# is range(1, 1317+1).
for ipage in range(2,10):
    url_gallery = "http://www.perros.com/nuestros-perros/%d/?sexo=&raza=&pais=8&provincia=&localidad=" % ipage

    # The parser is named explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    gallery_soup = bs4.BeautifulSoup(requests.get(url_gallery).text, 'html.parser')

    # Collect the link and the pedigree flag of each dog from the same gallery
    # item, so the two lists always stay aligned. (Extracting the flags from
    # the whole page at once shifted them onto the wrong dogs as soon as one
    # item had no pedigree text.)
    gallery_links = []
    pedigree = []
    for div in gallery_soup.find_all('div', attrs={'class':'buscador-item'}):
        gallery_links.append(div.a['href'])
        item_str = str(div)
        start = item_str.find(findstr)
        if start == -1:
            print('Some dog at page %d does not have pedigree' % ipage)
            pedigree.append(False)  # no pedigree text: recorded as no pedigree
        else:
            start += len(findstr)
            # The flag is the two characters following the marker.
            pedigree.append(item_str[start:start+2] == 'Si')

    # For each Dog Entry
    for entry_link, ped in zip(gallery_links, pedigree):
        response = requests.get(url_base + entry_link)
        if response.status_code == 404:
            continue
        entry_soup = bs4.BeautifulSoup(response.text, 'html.parser')

        # Entries without a photo are not stored.
        url_photo = get_photo(entry_soup)
        if not url_photo:
            continue

        entry_field = get_entry(entry_soup)
        if len(entry_field) < min_fields:
            # get_entry returns [] when the info block is missing; skip the
            # entry instead of letting make_dbdict fail on the lookup.
            print('Skipping %s: incomplete entry' % entry_link)
            continue

        entry_owner = get_owner(entry_soup)
        db_dict = make_dbdict(url_base + url_photo, entry_field, entry_owner, ped)
        entry_dicts.append(db_dict)

    # Running total of stored entries after each gallery page.
    print(len(entry_dicts))


# Dump all records as a JSON object of the form { "results": [ ... ] }.
with open('Dog.json', 'w') as fdb:
    fdb.write('{ "results": ')
    fdb.write(json.dumps(entry_dicts, sort_keys=True, indent=4))
    fdb.write(' }')
print('Success!')


17
29
43
59
71
85
97
110
Success!


In [380]:

# Debug cell: show the <div id="info-user-int"> block(s) of whatever page
# `entry_soup` currently holds (it is assigned by the scraping loop).
print(entry_soup.find_all('div', attrs={'id':'info-user-int'}))

[<div id="info-user-int">
<a href="/usuarios/silf.html"><img align="middle" alt="Sin foto" height="58" src="/img/common/usuarios-nofoto.gif" width="58"/></a>
<h4><a href="/usuarios/silf.html">Silf</a></h4>
					Vilamarxant (Valencia) <br/>
<b>Sexo:</b> Mujer					<div class="foro-ver-usuario">
<a class="fotos" href="/usuarios/silf.html">0  Albums</a>   <a class="huella" href="/usuarios/silf.html">2 perros</a>
</div>
</div>]


In [395]:
# Scratch check of the owner-name lookup performed by get_owner().
# `user_soup` is not assigned at top level anywhere in the visible notebook
# (it is only a local inside get_owner), so it is rebuilt here from
# `entry_soup` to keep this cell runnable on a fresh kernel.
user_soup = entry_soup.find_all('div', attrs={'id':'info-user-int'})
temp = user_soup[0].find_all('h4')
temp[0].get_text()

'Silf'