In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import random
import os
import re
import time
import matplotlib.pyplot as plt
import seaborn as sns
from fake_useragent import UserAgent
import pymongo

%matplotlib inline

# All downloaded archives and extracted files live in ./data, relative to the notebook.
data_dir = os.path.join(os.curdir, "data")

### Download the Stanford Dogs dataset to start

In [3]:
# Create the data directory if it is missing. exist_ok=True makes the cell safe to
# re-run and avoids the race between checking for the directory and creating it.
os.makedirs(data_dir, exist_ok=True)

In [5]:
# Download the Stanford Dogs image archive into ./data/ (saved as ./data/images.tar).
!wget -P ./data/ http://vision.stanford.edu/aditya86/ImageNetDogs/images.tar 

--2017-09-01 23:00:16--  http://vision.stanford.edu/aditya86/ImageNetDogs/images.tar
Resolving vision.stanford.edu (vision.stanford.edu)... 171.64.68.10
Connecting to vision.stanford.edu (vision.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 793579520 (757M) [application/x-tar]
Saving to: ‘./data/images.tar’


2017-09-01 23:00:50 (22.3 MB/s) - ‘./data/images.tar’ saved [793579520/793579520]



In [6]:
# Download the annotation archive into ./data (saved as ./data/annotation.tar).
!wget -P ./data http://vision.stanford.edu/aditya86/ImageNetDogs/annotation.tar

--2017-09-01 23:01:02--  http://vision.stanford.edu/aditya86/ImageNetDogs/annotation.tar
Resolving vision.stanford.edu (vision.stanford.edu)... 171.64.68.10
Connecting to vision.stanford.edu (vision.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 21852160 (21M) [application/x-tar]
Saving to: ‘./data/annotation.tar’


2017-09-01 23:01:03 (22.5 MB/s) - ‘./data/annotation.tar’ saved [21852160/21852160]



In [7]:
# Download the lists archive into ./data (saved as ./data/lists.tar).
!wget -P ./data http://vision.stanford.edu/aditya86/ImageNetDogs/lists.tar

--2017-09-01 23:01:20--  http://vision.stanford.edu/aditya86/ImageNetDogs/lists.tar
Resolving vision.stanford.edu (vision.stanford.edu)... 171.64.68.10
Connecting to vision.stanford.edu (vision.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 481280 (470K) [application/x-tar]
Saving to: ‘./data/lists.tar’


2017-09-01 23:01:20 (3.60 MB/s) - ‘./data/lists.tar’ saved [481280/481280]



In [9]:
# Extract the image archive into ./data/.
!tar -xf ./data/images.tar -C ./data/

In [10]:
# Extract the annotation archive into ./data/.
!tar -xf ./data/annotation.tar -C ./data/

In [11]:
# Extract the lists archive into ./data/.
!tar -xf ./data/lists.tar -C ./data/

### Gather names of dog breeds

In [45]:
# Path to the 'Images' folder under data_dir
# (presumably created by extracting images.tar -- confirm).
image_dir = os.path.join(data_dir, 'Images')

In [46]:
# Keep only the entries of image_dir that are themselves directories.
dog_dirs = [
    entry
    for entry in os.listdir(image_dir)
    if os.path.isdir(os.path.join(image_dir, entry))
]

In [47]:
# Each folder name is split at its first hyphen; the part after it is
# lower-cased and used as the breed name. The result is sorted alphabetically.
dog_breeds = [folder.split('-', 1)[1].lower() for folder in dog_dirs]
dog_breeds.sort()

In [48]:
# List every breed with its position; the hard-coded indices used for the
# per-site name overrides refer to positions in this ordering.
list(enumerate(dog_breeds))

[(0, 'affenpinscher'),
 (1, 'afghan_hound'),
 (2, 'african_hunting_dog'),
 (3, 'airedale'),
 (4, 'american_staffordshire_terrier'),
 (5, 'appenzeller'),
 (6, 'australian_terrier'),
 (7, 'basenji'),
 (8, 'basset'),
 (9, 'beagle'),
 (10, 'bedlington_terrier'),
 (11, 'bernese_mountain_dog'),
 (12, 'black-and-tan_coonhound'),
 (13, 'blenheim_spaniel'),
 (14, 'bloodhound'),
 (15, 'bluetick'),
 (16, 'border_collie'),
 (17, 'border_terrier'),
 (18, 'borzoi'),
 (19, 'boston_bull'),
 (20, 'bouvier_des_flandres'),
 (21, 'boxer'),
 (22, 'brabancon_griffon'),
 (23, 'briard'),
 (24, 'brittany_spaniel'),
 (25, 'bull_mastiff'),
 (26, 'cairn'),
 (27, 'cardigan'),
 (28, 'chesapeake_bay_retriever'),
 (29, 'chihuahua'),
 (30, 'chow'),
 (31, 'clumber'),
 (32, 'cocker_spaniel'),
 (33, 'collie'),
 (34, 'curly-coated_retriever'),
 (35, 'dandie_dinmont'),
 (36, 'dhole'),
 (37, 'dingo'),
 (38, 'doberman'),
 (39, 'english_foxhound'),
 (40, 'english_setter'),
 (41, 'english_springer'),
 (42, 'entlebucher'),
 (43

These breed names need to be adjusted to match the naming used by each website to be scraped, but the original names will be used as the keys in MongoDB.

### Scrape Dogtime

In [49]:
# Start from the dataset names with underscores turned into hyphens, then patch
# the entries whose Dogtime slug differs. Keys are positions in dog_breeds.
dogtime_breeds = [breed.replace("_", "-") for breed in dog_breeds]

# Positions whose slug only needs text appended.
dogtime_suffixes = {
    3: "-terrier",
    5: "-sennenhunde",
    8: "-hound",
    15: "-coonhound",
    26: "-terrier",
    27: "-welsh-corgi",
    30: "-chow",
    31: "-spaniel",
    35: "-terrier",
    38: "-pinscher",
    41: "-spaniel",
    42: "-mountain-dog",
    44: "-dog",
    69: "er",
    70: "-apso",
    86: "-welsh-corgi",
    89: "-coonhound",
}

# Positions whose slug is replaced outright.
dogtime_renames = {
    13: "cavalier-king-charles-spaniel",
    19: "boston-terrier",
    22: "brussels-griffon",
    24: "brittany",
    43: "american-eskimo-dog",
    47: "german-shorthaired-pointer",
    54: "belgian-sheepdog",
    61: "japanese-chin",
    71: "alaskan-malamute",
    72: "belgian-malinois",
    73: "maltese",
    74: "xoloitzcuintli",
    85: "pekingese",
    96: "scottish-terrier",
    105: "poodle",
    111: "toy-fox-terrier",
    113: "treeing-walker-coonhound",
    118: "fox-terrier",
}

for position, suffix in dogtime_suffixes.items():
    dogtime_breeds[position] += suffix
for position, slug in dogtime_renames.items():
    dogtime_breeds[position] = slug

# Position 25 keeps its words but loses the hyphens between them.
dogtime_breeds[25] = dogtime_breeds[25].replace('-', '')

In [50]:
# Inspect the Dogtime slugs, with their positions, after the manual overrides.
list(enumerate(dogtime_breeds))

[(0, 'affenpinscher'),
 (1, 'afghan-hound'),
 (2, 'african-hunting-dog'),
 (3, 'airedale-terrier'),
 (4, 'american-staffordshire-terrier'),
 (5, 'appenzeller-sennenhunde'),
 (6, 'australian-terrier'),
 (7, 'basenji'),
 (8, 'basset-hound'),
 (9, 'beagle'),
 (10, 'bedlington-terrier'),
 (11, 'bernese-mountain-dog'),
 (12, 'black-and-tan-coonhound'),
 (13, 'cavalier-king-charles-spaniel'),
 (14, 'bloodhound'),
 (15, 'bluetick-coonhound'),
 (16, 'border-collie'),
 (17, 'border-terrier'),
 (18, 'borzoi'),
 (19, 'boston-terrier'),
 (20, 'bouvier-des-flandres'),
 (21, 'boxer'),
 (22, 'brussels-griffon'),
 (23, 'briard'),
 (24, 'brittany'),
 (25, 'bullmastiff'),
 (26, 'cairn-terrier'),
 (27, 'cardigan-welsh-corgi'),
 (28, 'chesapeake-bay-retriever'),
 (29, 'chihuahua'),
 (30, 'chow-chow'),
 (31, 'clumber-spaniel'),
 (32, 'cocker-spaniel'),
 (33, 'collie'),
 (34, 'curly-coated-retriever'),
 (35, 'dandie-dinmont-terrier'),
 (36, 'dhole'),
 (37, 'dingo'),
 (38, 'doberman-pinscher'),
 (39, 'englis

In [51]:
# Remove the slugs that will not be scraped. Indices are popped from highest to
# lowest so that each removal leaves the remaining target positions unchanged.
dogtime_unused = [dogtime_breeds.pop(position)
                  for position in (110, 76, 63, 37, 36, 4, 2)]

In [52]:
# Show the Dogtime slugs that were removed.
dogtime_unused

['toy-poodle',
 'miniature-poodle',
 'kelpie',
 'dingo',
 'dhole',
 'american-staffordshire-terrier',
 'african-hunting-dog']

In [53]:
# Remove the same positions from the dataset breed names so both lists stay
# aligned. Indices are popped from highest to lowest so later targets do not shift.
unused_dog_breeds = [dog_breeds.pop(position)
                     for position in (110, 76, 63, 37, 36, 4, 2)]

In [54]:
# Show the dataset breed names that were removed.
unused_dog_breeds

['toy_poodle',
 'miniature_poodle',
 'kelpie',
 'dingo',
 'dhole',
 'american_staffordshire_terrier',
 'african_hunting_dog']

In [55]:
# Pair each dataset breed name with its Dogtime slug to check the lists line up.
list(zip(dog_breeds, dogtime_breeds))

[('affenpinscher', 'affenpinscher'),
 ('afghan_hound', 'afghan-hound'),
 ('airedale', 'airedale-terrier'),
 ('appenzeller', 'appenzeller-sennenhunde'),
 ('australian_terrier', 'australian-terrier'),
 ('basenji', 'basenji'),
 ('basset', 'basset-hound'),
 ('beagle', 'beagle'),
 ('bedlington_terrier', 'bedlington-terrier'),
 ('bernese_mountain_dog', 'bernese-mountain-dog'),
 ('black-and-tan_coonhound', 'black-and-tan-coonhound'),
 ('blenheim_spaniel', 'cavalier-king-charles-spaniel'),
 ('bloodhound', 'bloodhound'),
 ('bluetick', 'bluetick-coonhound'),
 ('border_collie', 'border-collie'),
 ('border_terrier', 'border-terrier'),
 ('borzoi', 'borzoi'),
 ('boston_bull', 'boston-terrier'),
 ('bouvier_des_flandres', 'bouvier-des-flandres'),
 ('boxer', 'boxer'),
 ('brabancon_griffon', 'brussels-griffon'),
 ('briard', 'briard'),
 ('brittany_spaniel', 'brittany'),
 ('bull_mastiff', 'bullmastiff'),
 ('cairn', 'cairn-terrier'),
 ('cardigan', 'cardigan-welsh-corgi'),
 ('chesapeake_bay_retriever', 'che

The `pop` cells above remove breeds that are unavailable on Dogtime, along with wild dogs, from both breed lists.

In [11]:
# Fetch and parse the Dogtime breed index page.
dogtime_breed_url = "http://dogtime.com/dog-breeds"

# A timeout keeps the cell from hanging on an unresponsive server, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
r = requests.get(dogtime_breed_url, timeout=30)
r.raise_for_status()

dog_soup = BeautifulSoup(r.text, "lxml")

In [12]:
# Look up the first 'group-list-item' element once, then read its link target
# and image source.
first_item = dog_soup.find_all(class_='group-list-item')[0]
link = first_item.find('a')['href']
image = first_item.find('img')['src']

In [13]:
# Display the link and image URL taken from the first list item.
link, image

('http://dogtime.com/dog-breeds/affenpinscher',
 'http://cdn3-www.dogtime.com/assets/uploads/2011/01/file_23096_affenpinscher-300x189.jpg')

In [13]:
# Every breed page lives directly under base_url, keyed by its Dogtime slug.
base_url = "http://dogtime.com/dog-breeds/"
breed_links = ["{}{}".format(base_url, slug) for slug in dogtime_breeds]

In [15]:
# Preview the first ten breed page URLs.
breed_links[:10]

['http://dogtime.com/dog-breeds/affenpinscher',
 'http://dogtime.com/dog-breeds/afghan-hound',
 'http://dogtime.com/dog-breeds/airedale-terrier',
 'http://dogtime.com/dog-breeds/appenzeller-sennenhunde',
 'http://dogtime.com/dog-breeds/australian-terrier',
 'http://dogtime.com/dog-breeds/basenji',
 'http://dogtime.com/dog-breeds/basset-hound',
 'http://dogtime.com/dog-breeds/beagle',
 'http://dogtime.com/dog-breeds/bedlington-terrier',
 'http://dogtime.com/dog-breeds/bernese-mountain-dog']

In [33]:
# Connect to the MongoDB server that stores the scraped pages. The address can be
# overridden with the DOGBREEDS_MONGO_URI environment variable so the notebook is
# not tied to a single host; the original address remains the default.
mongo_uri = os.environ.get("DOGBREEDS_MONGO_URI", "mongodb://54.67.76.74/dogbreeds")
client = pymongo.MongoClient(mongo_uri)

In [34]:
# Handle to the "dogbreeds" database on the connected server.
db = client["dogbreeds"]

In [14]:
# Show the last breed page URL.
breed_links[-1]

'http://dogtime.com/dog-breeds/yorkshire-terrier'

In [15]:
# Fetch and parse the last breed page as a sample.
# Note: this rebinds r and dog_soup, which earlier held the breed index page.
r = requests.get(breed_links[-1])
dog_soup = BeautifulSoup(r.text, "lxml")

In [25]:
# First element with class "category-article-main" on the sample page; the
# scraping loop stores the element found by this same lookup for every breed.
content = dog_soup.find(class_="category-article-main")

In [37]:
# Scrape every breed's Dogtime page and store its main article HTML in MongoDB.
# Breeds whose request fails are printed and collected in failed_breeds (by slug).
failed_breeds = []
for dog_breed, dogtime_breed in zip(dog_breeds, dogtime_breeds):
    dog_url = base_url + dogtime_breed
    try:
        # The timeout stops one stalled connection from hanging the whole run;
        # a timeout is a RequestException, so it is handled like any other failure.
        r = requests.get(dog_url, timeout=30)
        r.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(e)
        print(dog_breed+" failed!")
        failed_breeds.append(dogtime_breed)
        time.sleep(random.uniform(3, 5))
        continue
    dog_soup = BeautifulSoup(r.text, "lxml")
    dog_content = dog_soup.find(class_="category-article-main")
    # update_one() requires a filter AND an update document; passing the record
    # alone raises TypeError. upsert=True creates the breed's document when it
    # does not exist yet and lets a re-run overwrite it instead of duplicating it.
    db.dogbreeds.update_one(
        {'breed' : dog_breed},
        {"$set" : {'url' : dog_url,
                   'dogtime_content' : str(dog_content)}},
        upsert=True)
    # Random pause between requests.
    time.sleep(random.uniform(3, 5))
        

### Scrape dogbreedinfo

In [59]:
# Re-list the breeds with their current positions; the indices used for the
# dogbreedinfo overrides refer to this ordering.
list(enumerate(dog_breeds))

[(0, 'affenpinscher'),
 (1, 'afghan_hound'),
 (2, 'airedale'),
 (3, 'appenzeller'),
 (4, 'australian_terrier'),
 (5, 'basenji'),
 (6, 'basset'),
 (7, 'beagle'),
 (8, 'bedlington_terrier'),
 (9, 'bernese_mountain_dog'),
 (10, 'black-and-tan_coonhound'),
 (11, 'blenheim_spaniel'),
 (12, 'bloodhound'),
 (13, 'bluetick'),
 (14, 'border_collie'),
 (15, 'border_terrier'),
 (16, 'borzoi'),
 (17, 'boston_bull'),
 (18, 'bouvier_des_flandres'),
 (19, 'boxer'),
 (20, 'brabancon_griffon'),
 (21, 'briard'),
 (22, 'brittany_spaniel'),
 (23, 'bull_mastiff'),
 (24, 'cairn'),
 (25, 'cardigan'),
 (26, 'chesapeake_bay_retriever'),
 (27, 'chihuahua'),
 (28, 'chow'),
 (29, 'clumber'),
 (30, 'cocker_spaniel'),
 (31, 'collie'),
 (32, 'curly-coated_retriever'),
 (33, 'dandie_dinmont'),
 (34, 'doberman'),
 (35, 'english_foxhound'),
 (36, 'english_setter'),
 (37, 'english_springer'),
 (38, 'entlebucher'),
 (39, 'eskimo_dog'),
 (40, 'flat-coated_retriever'),
 (41, 'french_bulldog'),
 (42, 'german_shepherd'),
 (4

In [62]:
# Start from the dataset names with hyphens and underscores stripped, then patch
# the entries whose dogbreedinfo page name differs. Keys are positions in dog_breeds.
dogbreedinfo_breeds = [breed.replace("-", "").replace("_", "") for breed in dog_breeds]

# Positions whose name only needs text appended.
dogbreedinfo_suffixes = {
    6: 'hound',
    24: 'terrier',
    25: 'corgi',
    28: 'chow',
    29: 'spaniel',
    37: 'spaniel',
    64: 'er',
    65: 'apso',
    80: 'corgi',
    83: 'coonhound',
}

# Positions whose name is replaced outright.
dogbreedinfo_renames = {
    1: 'afghan',
    3: 'appenzell',
    9: 'bernesemountain',
    11: 'cavalierkingcharlesspaniel',
    17: 'bostonterrier',
    20: 'griffon',
    22: 'brittany',
    26: 'chesapeakebay',
    30: 'cockers',
    39: 'americaneskimo',
    49: 'greaterswissmountain',
    50: 'belgiangroenendael',
    62: 'labrador',
    66: 'alaskanmalamute',
    67: 'belgianmalinois',
    68: 'maltese',
    69: 'xoloitzcuintle',
    79: 'pekingese',
    90: 'scottishterrier',
    101: 'sussex',
    104: 'toyfoxterrier',
    106: 'treeingwalkercoonhound',
    109: 'westhighland',
    111: 'wirefoxterrier',
}

for position, suffix in dogbreedinfo_suffixes.items():
    dogbreedinfo_breeds[position] += suffix
for position, page_name in dogbreedinfo_renames.items():
    dogbreedinfo_breeds[position] = page_name

In [63]:
# Pair each dataset breed name with its dogbreedinfo page name to check the mapping.
list(zip(dog_breeds, dogbreedinfo_breeds))

[('affenpinscher', 'affenpinscher'),
 ('afghan_hound', 'afghan'),
 ('airedale', 'airedale'),
 ('appenzeller', 'appenzell'),
 ('australian_terrier', 'australianterrier'),
 ('basenji', 'basenji'),
 ('basset', 'bassethound'),
 ('beagle', 'beagle'),
 ('bedlington_terrier', 'bedlingtonterrier'),
 ('bernese_mountain_dog', 'bernesemountain'),
 ('black-and-tan_coonhound', 'blackandtancoonhound'),
 ('blenheim_spaniel', 'cavalierkingcharlesspaniel'),
 ('bloodhound', 'bloodhound'),
 ('bluetick', 'bluetick'),
 ('border_collie', 'bordercollie'),
 ('border_terrier', 'borderterrier'),
 ('borzoi', 'borzoi'),
 ('boston_bull', 'bostonterrier'),
 ('bouvier_des_flandres', 'bouvierdesflandres'),
 ('boxer', 'boxer'),
 ('brabancon_griffon', 'griffon'),
 ('briard', 'briard'),
 ('brittany_spaniel', 'brittany'),
 ('bull_mastiff', 'bullmastiff'),
 ('cairn', 'cairnterrier'),
 ('cardigan', 'cardigancorgi'),
 ('chesapeake_bay_retriever', 'chesapeakebay'),
 ('chihuahua', 'chihuahua'),
 ('chow', 'chowchow'),
 ('clumb

In [64]:
# Base URL for dogbreedinfo.com; breed pages are requested as "<name>.htm" under it.
dogbreedinfo_url = "https://www.dogbreedinfo.com/"


In [88]:
# Scrape every breed's dogbreedinfo page and add its main content to the breed's
# MongoDB document. Breeds whose request fails are printed and collected in
# failed_breeds (by dogbreedinfo page name).
failed_breeds = []
for dog_breed, dogbreedinfo_breed in zip(dog_breeds, dogbreedinfo_breeds):
    dog_url = dogbreedinfo_url + dogbreedinfo_breed + ".htm"
    try:
        # The timeout stops one stalled connection from hanging the whole run;
        # a timeout is a RequestException, so it is handled like any other failure.
        r = requests.get(dog_url, timeout=30)
        r.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(e)
        print(dog_breed+" failed!")
        failed_breeds.append(dogbreedinfo_breed)
        time.sleep(random.uniform(3, 5))
        continue
    dog_soup = BeautifulSoup(r.text, "lxml")
    dog_content = dog_soup.find(class_="mainArea")
    # upsert=True: without it, a breed that has no document yet would match
    # nothing and the scraped content would be discarded without any message.
    db.dogbreeds.update_one(
        {'breed' : dog_breed},
        {"$set" : {'dogbreedinfo_url' : dog_url,
                   'dogbreedinfo_content' : str(dog_content)}},
        upsert=True)
    # Random pause between requests.
    time.sleep(random.uniform(3, 5))

### Scrape Wikipedia

In [90]:
# Re-list the breeds with their positions; the indices used for the Wikipedia
# overrides refer to this ordering.
list(enumerate(dog_breeds))

[(0, 'affenpinscher'),
 (1, 'afghan_hound'),
 (2, 'airedale'),
 (3, 'appenzeller'),
 (4, 'australian_terrier'),
 (5, 'basenji'),
 (6, 'basset'),
 (7, 'beagle'),
 (8, 'bedlington_terrier'),
 (9, 'bernese_mountain_dog'),
 (10, 'black-and-tan_coonhound'),
 (11, 'blenheim_spaniel'),
 (12, 'bloodhound'),
 (13, 'bluetick'),
 (14, 'border_collie'),
 (15, 'border_terrier'),
 (16, 'borzoi'),
 (17, 'boston_bull'),
 (18, 'bouvier_des_flandres'),
 (19, 'boxer'),
 (20, 'brabancon_griffon'),
 (21, 'briard'),
 (22, 'brittany_spaniel'),
 (23, 'bull_mastiff'),
 (24, 'cairn'),
 (25, 'cardigan'),
 (26, 'chesapeake_bay_retriever'),
 (27, 'chihuahua'),
 (28, 'chow'),
 (29, 'clumber'),
 (30, 'cocker_spaniel'),
 (31, 'collie'),
 (32, 'curly-coated_retriever'),
 (33, 'dandie_dinmont'),
 (34, 'doberman'),
 (35, 'english_foxhound'),
 (36, 'english_setter'),
 (37, 'english_springer'),
 (38, 'entlebucher'),
 (39, 'eskimo_dog'),
 (40, 'flat-coated_retriever'),
 (41, 'french_bulldog'),
 (42, 'german_shepherd'),
 (4

In [95]:
# Start from the dataset names with hyphens turned into underscores and each word
# title-cased, then patch the entries whose Wikipedia page title differs.
# Keys are positions in dog_breeds.
wikipedia_breeds = [breed.replace("-", "_").title() for breed in dog_breeds]

# Positions whose title only needs text appended.
wikipedia_suffixes = {
    2: "_Terrier",
    3: "_Sennenhund",
    6: "_Hound",
    13: "_Coonhound",
    24: "_Terrier",
    25: "_Welsh_Corgi",
    28: "_Chow",
    29: "_Spaniel",
    33: "_Terrier",
    34: "_Pinscher",
    37: "_Spaniel",
    50: "_dog",
    64: "er",
    65: "_Apso",
    67: "_dog",
    69: "_Dog",
    78: "_(dog)",
    80: "_Welsh_Corgi",
    83: "_Coonhound",
    88: "_(dog)",
}

# Positions whose title is replaced outright.
wikipedia_renames = {
    11: "Cavalier_King_Charles_Spaniel",
    17: "Boston_Terrier",
    18: "Bouvier_des_Flandres",
    20: "Griffon_Bruxellois",
    23: "Bullmastiff",
    30: "English_Cocker_Spaniel",
    31: "Rough_Collie",
    38: "Entlebucher_Mountain_Dog",
    39: "American_Eskimo_Dog",
    40: "Flat-Coated_Retriever",
    43: "German_Shorthaired_Pointer",
    57: "Japanese_Chin",
    66: "Alaskan_Malamute",
    68: "Maltese_(dog)",
    79: "Pekingese",
    86: "St._Bernard_(dog)",
    90: "Scottish_Terrier",
    96: "Australian_Silky_Terrier",
    97: "Soft-Coated_Wheaten_Terrier",
    98: "Staffordshire_Bull_Terrier",
    99: "Poodle",
    104: "Toy_Fox_Terrier",
    106: "Treeing_Walker_Coonhound",
    111: "Wire_Fox_Terrier",
}

for position, suffix in wikipedia_suffixes.items():
    wikipedia_breeds[position] += suffix
for position, title in wikipedia_renames.items():
    wikipedia_breeds[position] = title

In [96]:
# Pair each dataset breed name with its Wikipedia page title to check the mapping.
list(zip(dog_breeds, wikipedia_breeds))

[('affenpinscher', 'Affenpinscher'),
 ('afghan_hound', 'Afghan_Hound'),
 ('airedale', 'Airedale_Terrier'),
 ('appenzeller', 'Appenzeller_Sennenhund'),
 ('australian_terrier', 'Australian_Terrier'),
 ('basenji', 'Basenji'),
 ('basset', 'Basset_Hound'),
 ('beagle', 'Beagle'),
 ('bedlington_terrier', 'Bedlington_Terrier'),
 ('bernese_mountain_dog', 'Bernese_Mountain_Dog'),
 ('black-and-tan_coonhound', 'Black_And_Tan_Coonhound'),
 ('blenheim_spaniel', 'Cavalier_King_Charles_Spaniel'),
 ('bloodhound', 'Bloodhound'),
 ('bluetick', 'Bluetick_Coonhound'),
 ('border_collie', 'Border_Collie'),
 ('border_terrier', 'Border_Terrier'),
 ('borzoi', 'Borzoi'),
 ('boston_bull', 'Boston_Terrier'),
 ('bouvier_des_flandres', 'Bouvier_des_Flandres'),
 ('boxer', 'Boxer'),
 ('brabancon_griffon', 'Griffon_Bruxellois'),
 ('briard', 'Briard'),
 ('brittany_spaniel', 'Brittany_Spaniel'),
 ('bull_mastiff', 'Bullmastiff'),
 ('cairn', 'Cairn_Terrier'),
 ('cardigan', 'Cardigan_Welsh_Corgi'),
 ('chesapeake_bay_retriev

In [103]:
# Request headers sent with every Wikipedia call: a client name and a contact address.
headers = dict([
    ('User-Agent', 'PoochrScoopr'),
    ('From', 'aawiegel@gmail.com'),
])

In [104]:
# Base URL of the Wikipedia REST endpoint that returns a page as HTML. The page
# title and "?redirect=true" are appended per breed.
wikipedia_api = "https://en.wikipedia.org/api/rest_v1/page/html/" # e.g. + "Afghan_Hound?redirect=true"

In [105]:
failed_breeds = []
for dog_breed, wikipedia_breed in zip(dog_breeds, wikipedia_breeds):
    dog_url = wikipedia_api + wikipedia_breed + "?redirect=true"
    try:
        r = requests.get(dog_url, headers=headers)
        r.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(e)
        print(dog_breed+" failed!")
        failed_breeds.append(dogbreedinfo_breed)
        time.sleep(random.uniform(1, 2))
        continue
    db.dogbreeds.update_one({'breed' : dog_breed},{"$set" : {'wikipedia_url' : dog_url,
                            'wikipedia_content' : str(r.text)}})
    time.sleep(random.uniform(1, 2)) 