In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import random
import os
import re
import time
import matplotlib.pyplot as plt
import seaborn as sns
from fake_useragent import UserAgent
import pymongo

%matplotlib inline

# Relative path of the folder that receives the downloaded dataset files.
data_dir = os.path.join(os.curdir, "data")

### Download the Stanford Dogs dataset to start

In [3]:
# Create the data folder when it is missing.  exist_ok=True replaces the
# separate existence check, so there is no gap between checking and creating
# and the cell can be re-run safely.
os.makedirs(data_dir, exist_ok=True)

In [5]:
# Download images.tar from the Stanford ImageNetDogs page into ./data/ (-P sets the target directory).
!wget -P ./data/ http://vision.stanford.edu/aditya86/ImageNetDogs/images.tar 

--2017-09-01 23:00:16--  http://vision.stanford.edu/aditya86/ImageNetDogs/images.tar
Resolving vision.stanford.edu (vision.stanford.edu)... 171.64.68.10
Connecting to vision.stanford.edu (vision.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 793579520 (757M) [application/x-tar]
Saving to: ‘./data/images.tar’


2017-09-01 23:00:50 (22.3 MB/s) - ‘./data/images.tar’ saved [793579520/793579520]



In [6]:
# Download annotation.tar from the same page into ./data.
!wget -P ./data http://vision.stanford.edu/aditya86/ImageNetDogs/annotation.tar

--2017-09-01 23:01:02--  http://vision.stanford.edu/aditya86/ImageNetDogs/annotation.tar
Resolving vision.stanford.edu (vision.stanford.edu)... 171.64.68.10
Connecting to vision.stanford.edu (vision.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 21852160 (21M) [application/x-tar]
Saving to: ‘./data/annotation.tar’


2017-09-01 23:01:03 (22.5 MB/s) - ‘./data/annotation.tar’ saved [21852160/21852160]



In [7]:
# Download lists.tar from the same page into ./data.
!wget -P ./data http://vision.stanford.edu/aditya86/ImageNetDogs/lists.tar

--2017-09-01 23:01:20--  http://vision.stanford.edu/aditya86/ImageNetDogs/lists.tar
Resolving vision.stanford.edu (vision.stanford.edu)... 171.64.68.10
Connecting to vision.stanford.edu (vision.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 481280 (470K) [application/x-tar]
Saving to: ‘./data/lists.tar’


2017-09-01 23:01:20 (3.60 MB/s) - ‘./data/lists.tar’ saved [481280/481280]



In [9]:
# Extract images.tar into ./data/ (-x extract, -f archive file, -C destination directory).
!tar -xf ./data/images.tar -C ./data/

In [10]:
# Extract annotation.tar into ./data/.
!tar -xf ./data/annotation.tar -C ./data/

In [11]:
# Extract lists.tar into ./data/.
!tar -xf ./data/lists.tar -C ./data/

### Gather names of dog breeds

In [45]:
# 'Images' folder under data_dir; its sub-directories are listed below to derive
# the breed names (presumably created by extracting images.tar -- confirm).
image_dir = os.path.join(data_dir, 'Images')

In [46]:
# Collect the names of all sub-directories of image_dir, skipping plain files.
dog_dirs = []
for entry in os.listdir(image_dir):
    if os.path.isdir(os.path.join(image_dir, entry)):
        dog_dirs.append(entry)

In [47]:
# Each directory name is split at its first hyphen; the part after it is
# lower-cased and used as the breed name.  The result is sorted alphabetically.
dog_breeds = [folder.split('-', 1)[1].lower() for folder in dog_dirs]
dog_breeds.sort()

In [48]:
list(enumerate(dog_breeds))

[(0, 'affenpinscher'),
 (1, 'afghan_hound'),
 (2, 'african_hunting_dog'),
 (3, 'airedale'),
 (4, 'american_staffordshire_terrier'),
 (5, 'appenzeller'),
 (6, 'australian_terrier'),
 (7, 'basenji'),
 (8, 'basset'),
 (9, 'beagle'),
 (10, 'bedlington_terrier'),
 (11, 'bernese_mountain_dog'),
 (12, 'black-and-tan_coonhound'),
 (13, 'blenheim_spaniel'),
 (14, 'bloodhound'),
 (15, 'bluetick'),
 (16, 'border_collie'),
 (17, 'border_terrier'),
 (18, 'borzoi'),
 (19, 'boston_bull'),
 (20, 'bouvier_des_flandres'),
 (21, 'boxer'),
 (22, 'brabancon_griffon'),
 (23, 'briard'),
 (24, 'brittany_spaniel'),
 (25, 'bull_mastiff'),
 (26, 'cairn'),
 (27, 'cardigan'),
 (28, 'chesapeake_bay_retriever'),
 (29, 'chihuahua'),
 (30, 'chow'),
 (31, 'clumber'),
 (32, 'cocker_spaniel'),
 (33, 'collie'),
 (34, 'curly-coated_retriever'),
 (35, 'dandie_dinmont'),
 (36, 'dhole'),
 (37, 'dingo'),
 (38, 'doberman'),
 (39, 'english_foxhound'),
 (40, 'english_setter'),
 (41, 'english_springer'),
 (42, 'entlebucher'),
 (43

These names need to be adjusted for each website to be scraped, but the original dataset names will be used as keys in MongoDB.

### Scrape Dogtime

In [49]:
# Build the Dogtime URL slugs: start from the dataset breed names with "_"
# replaced by "-", then patch individual entries by position.
# NOTE: the indices below address the sorted dog_breeds list as it is *before*
# the unused breeds are popped in the cells further down.  dog_breeds is
# mutated in place there, so re-running this cell after those pops would patch
# the wrong entries.
dogtime_breeds = [breed.replace("_", "-") for breed in dog_breeds]
dogtime_breeds[3] += "-terrier"
dogtime_breeds[4] = "american-pit-bull-terrier"
dogtime_breeds[5] += "-sennenhunde"
dogtime_breeds[8] += "-hound"
dogtime_breeds[13] = "cavalier-king-charles-spaniel"
dogtime_breeds[15] += "-coonhound"
dogtime_breeds[19] = "boston-terrier"
dogtime_breeds[22] = "brussels-griffon"
dogtime_breeds[24] = "brittany"
dogtime_breeds[25] = "".join(dogtime_breeds[25].split('-'))  # drop the hyphen: "bull-mastiff" -> "bullmastiff"
dogtime_breeds[26] += "-terrier"
dogtime_breeds[27] += "-welsh-corgi"
dogtime_breeds[30] += "-chow"
dogtime_breeds[31] += "-spaniel"
dogtime_breeds[35] += "-terrier"
dogtime_breeds[38] += "-pinscher"
dogtime_breeds[41] += "-spaniel"
dogtime_breeds[42] += "-mountain-dog"
dogtime_breeds[43] = "american-eskimo-dog"
dogtime_breeds[44] += "-dog"
dogtime_breeds[47] = "german-shorthaired-pointer"
dogtime_breeds[54] = "belgian-sheepdog"
dogtime_breeds[61] = "japanese-chin"
dogtime_breeds[69] += "er"
dogtime_breeds[70] += "-apso"
dogtime_breeds[71] = "alaskan-malamute"
dogtime_breeds[72] = "belgian-malinois"
dogtime_breeds[73] = "maltese"
dogtime_breeds[74] = "xoloitzcuintli"
dogtime_breeds[85] = "pekingese"
dogtime_breeds[86] += "-welsh-corgi"
dogtime_breeds[89] += "-coonhound"
dogtime_breeds[96] = "scottish-terrier"
dogtime_breeds[105] = "poodle"
dogtime_breeds[111] = "toy-fox-terrier"
dogtime_breeds[113] = "treeing-walker-coonhound"
dogtime_breeds[118] = "fox-terrier"

In [25]:
list(enumerate(dogtime_breeds))

[(0, 'affenpinscher'),
 (1, 'afghan-hound'),
 (2, 'american-pit-bull-terrier'),
 (3, 'airedale-terrier'),
 (4, 'american-staffordshire-terrier'),
 (5, 'appenzeller-sennenhunde'),
 (6, 'australian-terrier'),
 (7, 'basenji'),
 (8, 'basset-hound'),
 (9, 'beagle'),
 (10, 'bedlington-terrier'),
 (11, 'bernese-mountain-dog'),
 (12, 'black-and-tan-coonhound'),
 (13, 'cavalier-king-charles-spaniel'),
 (14, 'bloodhound'),
 (15, 'bluetick-coonhound'),
 (16, 'border-collie'),
 (17, 'border-terrier'),
 (18, 'borzoi'),
 (19, 'boston-terrier'),
 (20, 'bouvier-des-flandres'),
 (21, 'boxer'),
 (22, 'brussels-griffon'),
 (23, 'briard'),
 (24, 'brittany'),
 (25, 'bullmastiff'),
 (26, 'cairn-terrier'),
 (27, 'cardigan-welsh-corgi'),
 (28, 'chesapeake-bay-retriever'),
 (29, 'chihuahua'),
 (30, 'chow-chow'),
 (31, 'clumber-spaniel'),
 (32, 'cocker-spaniel'),
 (33, 'collie'),
 (34, 'curly-coated-retriever'),
 (35, 'dandie-dinmont-terrier'),
 (36, 'dhole'),
 (37, 'dingo'),
 (38, 'doberman-pinscher'),
 (39, '

In [50]:
# Remove the slugs that will not be scraped, highest index first so the
# positions still to be removed are not shifted by earlier pops.  The removed
# slugs are kept in dogtime_unused, in removal order.
dogtime_unused = [dogtime_breeds.pop(position)
                  for position in (110, 76, 63, 37, 36, 2)]

In [51]:
dogtime_unused

['toy-poodle',
 'miniature-poodle',
 'kelpie',
 'dingo',
 'dhole',
 'african-hunting-dog']

In [52]:
# Remove the same positions from the dataset breed names so both lists stay
# aligned; highest index first so the remaining positions are not shifted.
unused_dog_breeds = [dog_breeds.pop(position)
                     for position in (110, 76, 63, 37, 36, 2)]

In [53]:
unused_dog_breeds

['toy_poodle',
 'miniature_poodle',
 'kelpie',
 'dingo',
 'dhole',
 'african_hunting_dog']

In [54]:
list(zip(dog_breeds, dogtime_breeds))

[('affenpinscher', 'affenpinscher'),
 ('afghan_hound', 'afghan-hound'),
 ('airedale', 'airedale-terrier'),
 ('american_staffordshire_terrier', 'american-pit-bull-terrier'),
 ('appenzeller', 'appenzeller-sennenhunde'),
 ('australian_terrier', 'australian-terrier'),
 ('basenji', 'basenji'),
 ('basset', 'basset-hound'),
 ('beagle', 'beagle'),
 ('bedlington_terrier', 'bedlington-terrier'),
 ('bernese_mountain_dog', 'bernese-mountain-dog'),
 ('black-and-tan_coonhound', 'black-and-tan-coonhound'),
 ('blenheim_spaniel', 'cavalier-king-charles-spaniel'),
 ('bloodhound', 'bloodhound'),
 ('bluetick', 'bluetick-coonhound'),
 ('border_collie', 'border-collie'),
 ('border_terrier', 'border-terrier'),
 ('borzoi', 'borzoi'),
 ('boston_bull', 'boston-terrier'),
 ('bouvier_des_flandres', 'bouvier-des-flandres'),
 ('boxer', 'boxer'),
 ('brabancon_griffon', 'brussels-griffon'),
 ('briard', 'briard'),
 ('brittany_spaniel', 'brittany'),
 ('bull_mastiff', 'bullmastiff'),
 ('cairn', 'cairn-terrier'),
 ('card

The `pop` cells above remove the unavailable breeds and the wild dogs from both `dog_breeds` and `dogtime_breeds`.

In [31]:
dogtime_breed_url = "http://dogtime.com/dog-breeds"

# Fetch the breed index page.  The timeout keeps the cell from hanging forever
# on a stalled connection, and raise_for_status() stops an HTTP error page from
# being parsed as if it were the breed list.
r = requests.get(dogtime_breed_url, timeout=30)
r.raise_for_status()

dog_soup = BeautifulSoup(r.text, "lxml")

In [32]:
# Inspect the first element with class 'group-list-item': the href of its
# first <a> and the src of its first <img>.
first_item = dog_soup.find_all(class_='group-list-item')[0]
link = first_item.find('a')['href']
image = first_item.find('img')['src']

In [33]:
link, image

('http://dogtime.com/dog-breeds/affenpinscher',
 'http://cdn3-www.dogtime.com/assets/uploads/2011/01/file_23096_affenpinscher-300x189.jpg')

In [34]:
# One Dogtime page URL per slug, in the same order as dogtime_breeds.
base_url = "http://dogtime.com/dog-breeds/"
breed_links = []
for slug in dogtime_breeds:
    breed_links.append(base_url + slug)

In [35]:
breed_links[:10]

['http://dogtime.com/dog-breeds/affenpinscher',
 'http://dogtime.com/dog-breeds/afghan-hound',
 'http://dogtime.com/dog-breeds/american-pit-bull-terrier',
 'http://dogtime.com/dog-breeds/airedale-terrier',
 'http://dogtime.com/dog-breeds/appenzeller-sennenhunde',
 'http://dogtime.com/dog-breeds/australian-terrier',
 'http://dogtime.com/dog-breeds/basenji',
 'http://dogtime.com/dog-breeds/basset-hound',
 'http://dogtime.com/dog-breeds/beagle',
 'http://dogtime.com/dog-breeds/bedlington-terrier']

In [36]:
# Connect to the MongoDB server that stores the scraped breed documents.
client = pymongo.MongoClient(host="mongodb://54.67.76.74/dogbreeds")

In [37]:
# Handle on the "dogbreeds" database.
db = client["dogbreeds"]

In [39]:
breed_links[2]

'http://dogtime.com/dog-breeds/american-pit-bull-terrier'

In [64]:
dog_breeds[3]

'american_staffordshire_terrier'

In [15]:
# Fetch and parse the page behind the last entry of breed_links.
r = requests.get(breed_links[-1])
dog_soup = BeautifulSoup(r.text, "lxml")

In [25]:
# First element with CSS class "category-article-main" (None if the page has none).
content = dog_soup.find(class_="category-article-main")

In [65]:
# Fetch the Dogtime article for each (dataset breed name, Dogtime slug) pair and
# store its URL and HTML on the MongoDB document whose 'breed' equals the
# dataset name.  Only the pair at index 3 is processed here.  Slugs that could
# not be scraped are collected in failed_breeds.
failed_breeds = []
for dog_breed, dogtime_breed in zip([dog_breeds[3]], [dogtime_breeds[3]]):
    dog_url = base_url + dogtime_breed
    try:
        # Without a timeout a stalled connection would block the loop forever;
        # a timeout surfaces as a RequestException and is handled below.
        r = requests.get(dog_url, timeout=30)
        r.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(e)
        print(dog_breed+" failed!")
        failed_breeds.append(dogtime_breed)
        time.sleep(random.uniform(3, 5))
        continue
    dog_soup = BeautifulSoup(r.text, "lxml")
    dog_content = dog_soup.find(class_="category-article-main")
    if dog_content is None:
        # find() returns None when the element is absent; str(None) would
        # otherwise be stored as the literal text "None".
        print("no 'category-article-main' element at " + dog_url)
        print(dog_breed+" failed!")
        failed_breeds.append(dogtime_breed)
    else:
        db.dogbreeds.update_one({'breed' : dog_breed}, {"$set" : {'url' : dog_url,
                                'dogtime_content' : str(dog_content)}})
    # Random pause between consecutive requests.
    time.sleep(random.uniform(3, 5))
        

### Scrape dogbreedinfo

In [59]:
list(enumerate(dog_breeds))

[(0, 'affenpinscher'),
 (1, 'afghan_hound'),
 (2, 'airedale'),
 (3, 'american_staffordshire_terrier'),
 (4, 'appenzeller'),
 (5, 'australian_terrier'),
 (6, 'basenji'),
 (7, 'basset'),
 (8, 'beagle'),
 (9, 'bedlington_terrier'),
 (10, 'bernese_mountain_dog'),
 (11, 'black-and-tan_coonhound'),
 (12, 'blenheim_spaniel'),
 (13, 'bloodhound'),
 (14, 'bluetick'),
 (15, 'border_collie'),
 (16, 'border_terrier'),
 (17, 'borzoi'),
 (18, 'boston_bull'),
 (19, 'bouvier_des_flandres'),
 (20, 'boxer'),
 (21, 'brabancon_griffon'),
 (22, 'briard'),
 (23, 'brittany_spaniel'),
 (24, 'bull_mastiff'),
 (25, 'cairn'),
 (26, 'cardigan'),
 (27, 'chesapeake_bay_retriever'),
 (28, 'chihuahua'),
 (29, 'chow'),
 (30, 'clumber'),
 (31, 'cocker_spaniel'),
 (32, 'collie'),
 (33, 'curly-coated_retriever'),
 (34, 'dandie_dinmont'),
 (35, 'doberman'),
 (36, 'english_foxhound'),
 (37, 'english_setter'),
 (38, 'english_springer'),
 (39, 'entlebucher'),
 (40, 'eskimo_dog'),
 (41, 'flat-coated_retriever'),
 (42, 'french

In [66]:
# Build the dogbreedinfo page names: dataset breed names with "-" and "_"
# removed, then patched by position.
# NOTE: the indices address dog_breeds *after* the unused breeds were popped
# (index 2 is 'airedale' in the listing above), so this cell depends on those
# pops having run exactly once.
dogbreedinfo_breeds = [breed.replace("-", "").replace("_", "") for breed in dog_breeds]
dogbreedinfo_breeds[1] = 'afghan'
dogbreedinfo_breeds[3] = 'americanstaffordshire'
dogbreedinfo_breeds[4] = 'appenzell'
dogbreedinfo_breeds[7] += 'hound'
dogbreedinfo_breeds[10] = 'bernesemountain'
dogbreedinfo_breeds[12] = 'cavalierkingcharlesspaniel'
dogbreedinfo_breeds[18] = 'bostonterrier'
dogbreedinfo_breeds[21] = 'griffon'
dogbreedinfo_breeds[23] = 'brittany'
dogbreedinfo_breeds[25] += 'terrier'
dogbreedinfo_breeds[26] += 'corgi'
dogbreedinfo_breeds[27] = 'chesapeakebay'
dogbreedinfo_breeds[29] += 'chow'
dogbreedinfo_breeds[30] += 'spaniel'
dogbreedinfo_breeds[31] = 'cockers'
dogbreedinfo_breeds[38] += 'spaniel'
dogbreedinfo_breeds[40] = 'americaneskimo'
dogbreedinfo_breeds[50] = 'greaterswissmountain'
dogbreedinfo_breeds[51] = 'belgiangroenendael'
dogbreedinfo_breeds[63] = 'labrador'
dogbreedinfo_breeds[65] += 'er'
dogbreedinfo_breeds[66] += 'apso'
dogbreedinfo_breeds[67] = 'alaskanmalamute'
dogbreedinfo_breeds[68] = 'belgianmalinois'
dogbreedinfo_breeds[69] = 'maltese'
dogbreedinfo_breeds[70] = 'xoloitzcuintle'
dogbreedinfo_breeds[80] = 'pekingese'
dogbreedinfo_breeds[81] += 'corgi'
dogbreedinfo_breeds[84] += 'coonhound'
dogbreedinfo_breeds[91] = 'scottishterrier'
dogbreedinfo_breeds[102] = 'sussex'
dogbreedinfo_breeds[105] = 'toyfoxterrier'
dogbreedinfo_breeds[107] = 'treeingwalkercoonhound'
dogbreedinfo_breeds[110] = 'westhighland'
dogbreedinfo_breeds[112] = 'wirefoxterrier'

In [67]:
list(zip(dog_breeds, dogbreedinfo_breeds))

[('affenpinscher', 'affenpinscher'),
 ('afghan_hound', 'afghan'),
 ('airedale', 'airedale'),
 ('american_staffordshire_terrier', 'americanstaffordshire'),
 ('appenzeller', 'appenzell'),
 ('australian_terrier', 'australianterrier'),
 ('basenji', 'basenji'),
 ('basset', 'bassethound'),
 ('beagle', 'beagle'),
 ('bedlington_terrier', 'bedlingtonterrier'),
 ('bernese_mountain_dog', 'bernesemountain'),
 ('black-and-tan_coonhound', 'blackandtancoonhound'),
 ('blenheim_spaniel', 'cavalierkingcharlesspaniel'),
 ('bloodhound', 'bloodhound'),
 ('bluetick', 'bluetick'),
 ('border_collie', 'bordercollie'),
 ('border_terrier', 'borderterrier'),
 ('borzoi', 'borzoi'),
 ('boston_bull', 'bostonterrier'),
 ('bouvier_des_flandres', 'bouvierdesflandres'),
 ('boxer', 'boxer'),
 ('brabancon_griffon', 'griffon'),
 ('briard', 'briard'),
 ('brittany_spaniel', 'brittany'),
 ('bull_mastiff', 'bullmastiff'),
 ('cairn', 'cairnterrier'),
 ('cardigan', 'cardigancorgi'),
 ('chesapeake_bay_retriever', 'chesapeakebay')

In [68]:
# Site root; the scraping loop builds page URLs as <root> + <page name> + ".htm".
dogbreedinfo_url = "https://www.dogbreedinfo.com/"


In [69]:
# Fetch the dogbreedinfo page for each (dataset breed name, page name) pair and
# store its URL and HTML on the MongoDB document whose 'breed' equals the
# dataset name.  Only the pair at index 3 is processed here.  Page names that
# could not be scraped are collected in failed_breeds.
failed_breeds = []
for dog_breed, dogbreedinfo_breed in zip([dog_breeds[3]], [dogbreedinfo_breeds[3]]):
    dog_url = dogbreedinfo_url + dogbreedinfo_breed + ".htm"
    try:
        # Without a timeout a stalled connection would block the loop forever;
        # a timeout surfaces as a RequestException and is handled below.
        r = requests.get(dog_url, timeout=30)
        r.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(e)
        print(dog_breed+" failed!")
        failed_breeds.append(dogbreedinfo_breed)
        time.sleep(random.uniform(3, 5))
        continue
    dog_soup = BeautifulSoup(r.text, "lxml")
    dog_content = dog_soup.find(class_="mainArea")
    if dog_content is None:
        # find() returns None when the element is absent; str(None) would
        # otherwise be stored as the literal text "None".
        print("no 'mainArea' element at " + dog_url)
        print(dog_breed+" failed!")
        failed_breeds.append(dogbreedinfo_breed)
    else:
        db.dogbreeds.update_one({'breed' : dog_breed},{"$set" : {'dogbreedinfo_url' : dog_url,
                                'dogbreedinfo_content' : str(dog_content)}})
    # Random pause between consecutive requests.
    time.sleep(random.uniform(3, 5))

### Scrape wikipedia

In [70]:
list(enumerate(dog_breeds))

[(0, 'affenpinscher'),
 (1, 'afghan_hound'),
 (2, 'airedale'),
 (3, 'american_staffordshire_terrier'),
 (4, 'appenzeller'),
 (5, 'australian_terrier'),
 (6, 'basenji'),
 (7, 'basset'),
 (8, 'beagle'),
 (9, 'bedlington_terrier'),
 (10, 'bernese_mountain_dog'),
 (11, 'black-and-tan_coonhound'),
 (12, 'blenheim_spaniel'),
 (13, 'bloodhound'),
 (14, 'bluetick'),
 (15, 'border_collie'),
 (16, 'border_terrier'),
 (17, 'borzoi'),
 (18, 'boston_bull'),
 (19, 'bouvier_des_flandres'),
 (20, 'boxer'),
 (21, 'brabancon_griffon'),
 (22, 'briard'),
 (23, 'brittany_spaniel'),
 (24, 'bull_mastiff'),
 (25, 'cairn'),
 (26, 'cardigan'),
 (27, 'chesapeake_bay_retriever'),
 (28, 'chihuahua'),
 (29, 'chow'),
 (30, 'clumber'),
 (31, 'cocker_spaniel'),
 (32, 'collie'),
 (33, 'curly-coated_retriever'),
 (34, 'dandie_dinmont'),
 (35, 'doberman'),
 (36, 'english_foxhound'),
 (37, 'english_setter'),
 (38, 'english_springer'),
 (39, 'entlebucher'),
 (40, 'eskimo_dog'),
 (41, 'flat-coated_retriever'),
 (42, 'french

In [71]:
# Build the Wikipedia page titles: dataset breed names with "-" replaced by "_"
# and title-cased via str.title(), then patched by position.
# NOTE: the indices address dog_breeds *after* the unused breeds were popped
# (index 2 is 'airedale' in the listing above), so this cell depends on those
# pops having run exactly once.
wikipedia_breeds = [breed.replace("-", "_").title() for breed in dog_breeds]
wikipedia_breeds[2] += "_Terrier"
wikipedia_breeds[4] += "_Sennenhund"
wikipedia_breeds[7] += "_Hound"
wikipedia_breeds[12] = "Cavalier_King_Charles_Spaniel"
wikipedia_breeds[14] += "_Coonhound"
wikipedia_breeds[18] = "Boston_Terrier"
wikipedia_breeds[19] = "Bouvier_des_Flandres"
wikipedia_breeds[21] = "Griffon_Bruxellois"
wikipedia_breeds[24] = "Bullmastiff"
wikipedia_breeds[25] += "_Terrier"
wikipedia_breeds[26] += "_Welsh_Corgi"
wikipedia_breeds[29] += "_Chow"
wikipedia_breeds[30] += "_Spaniel"
wikipedia_breeds[31] = "English_Cocker_Spaniel"
wikipedia_breeds[32] = "Rough_Collie"
wikipedia_breeds[34] += "_Terrier"
wikipedia_breeds[35] += "_Pinscher"
wikipedia_breeds[38] += "_Spaniel"
wikipedia_breeds[39] = "Entlebucher_Mountain_Dog"
wikipedia_breeds[40] = "American_Eskimo_Dog"
wikipedia_breeds[41] = "Flat-Coated_Retriever"
wikipedia_breeds[44] = "German_Shorthaired_Pointer"
wikipedia_breeds[51] += "_dog"
wikipedia_breeds[58] = "Japanese_Chin"
wikipedia_breeds[65] += "er"
wikipedia_breeds[66] += "_Apso"
wikipedia_breeds[67] = "Alaskan_Malamute"
wikipedia_breeds[68] += "_dog"
wikipedia_breeds[69] = "Maltese_(dog)"
wikipedia_breeds[70] += "_Dog"
wikipedia_breeds[79] += "_(dog)"
wikipedia_breeds[80] = "Pekingese"
wikipedia_breeds[81] += "_Welsh_Corgi"
wikipedia_breeds[84] += "_Coonhound"
wikipedia_breeds[87] = "St._Bernard_(dog)"
wikipedia_breeds[89] += "_(dog)"
wikipedia_breeds[91] = "Scottish_Terrier"
wikipedia_breeds[97] = "Australian_Silky_Terrier"
wikipedia_breeds[98] = "Soft-Coated_Wheaten_Terrier"
wikipedia_breeds[99] = "Staffordshire_Bull_Terrier"
wikipedia_breeds[100] = "Poodle"
wikipedia_breeds[105] = "Toy_Fox_Terrier"
wikipedia_breeds[107] = "Treeing_Walker_Coonhound"
wikipedia_breeds[112] = "Wire_Fox_Terrier"

In [72]:
list(zip(dog_breeds, wikipedia_breeds))

[('affenpinscher', 'Affenpinscher'),
 ('afghan_hound', 'Afghan_Hound'),
 ('airedale', 'Airedale_Terrier'),
 ('american_staffordshire_terrier', 'American_Staffordshire_Terrier'),
 ('appenzeller', 'Appenzeller_Sennenhund'),
 ('australian_terrier', 'Australian_Terrier'),
 ('basenji', 'Basenji'),
 ('basset', 'Basset_Hound'),
 ('beagle', 'Beagle'),
 ('bedlington_terrier', 'Bedlington_Terrier'),
 ('bernese_mountain_dog', 'Bernese_Mountain_Dog'),
 ('black-and-tan_coonhound', 'Black_And_Tan_Coonhound'),
 ('blenheim_spaniel', 'Cavalier_King_Charles_Spaniel'),
 ('bloodhound', 'Bloodhound'),
 ('bluetick', 'Bluetick_Coonhound'),
 ('border_collie', 'Border_Collie'),
 ('border_terrier', 'Border_Terrier'),
 ('borzoi', 'Borzoi'),
 ('boston_bull', 'Boston_Terrier'),
 ('bouvier_des_flandres', 'Bouvier_des_Flandres'),
 ('boxer', 'Boxer'),
 ('brabancon_griffon', 'Griffon_Bruxellois'),
 ('briard', 'Briard'),
 ('brittany_spaniel', 'Brittany_Spaniel'),
 ('bull_mastiff', 'Bullmastiff'),
 ('cairn', 'Cairn_Terr

In [73]:
# Extra HTTP headers passed to the Wikipedia requests below: a custom client
# name (User-Agent) and a contact address (From).
headers = {
    'User-Agent' : 'PoochrScoopr',
    'From' : 'aawiegel@gmail.com'
}

In [74]:
# Base of the Wikipedia REST endpoint for rendered page HTML; the scraping loop
# appends the page title and "?redirect=true".
wikipedia_api = "https://en.wikipedia.org/api/rest_v1/page/html/" # e.g. followed by Afghan_Hound?redirect=true

In [75]:
# Download the Wikipedia page HTML for each (dataset breed name, Wikipedia
# title) pair and store its URL and HTML on the MongoDB document whose 'breed'
# equals the dataset name.  Only the pair at index 3 is processed here.  Titles
# that could not be fetched are collected in failed_breeds.
failed_breeds = []
for dog_breed, wikipedia_breed in zip([dog_breeds[3]], [wikipedia_breeds[3]]):
    dog_url = wikipedia_api + wikipedia_breed + "?redirect=true"
    try:
        # Without a timeout a stalled connection would block the loop forever;
        # a timeout surfaces as a RequestException and is handled below.
        r = requests.get(dog_url, headers=headers, timeout=30)
        r.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(e)
        print(dog_breed+" failed!")
        # Record the Wikipedia title of this iteration, not a variable left
        # over from the dogbreedinfo loop.
        failed_breeds.append(wikipedia_breed)
    else:
        db.dogbreeds.update_one({'breed' : dog_breed},{"$set" : {'wikipedia_url' : dog_url,
                                'wikipedia_content' : str(r.text)}})
    # Random pause between consecutive requests (on success and on failure).
    time.sleep(random.uniform(1, 2))

### Clean up data for each website

In [76]:
# Connect to the MongoDB server that stores the scraped breed documents.
client = pymongo.MongoClient(host="mongodb://54.67.76.74/dogbreeds")

In [77]:
# Handle on the "dogbreeds" database.
db = client["dogbreeds"]

In [78]:
db.dogbreeds.find_one({"breed" : "beagle"})['dogtime_content']

'<div class="category-article-main"><div id="adunit"></div><div class="pw-widget pw-size-medium"><a class="pw-button-facebook"></a><a class="pw-button-twitter"></a><a class="pw-button-googleplus"></a><a class="pw-button-pinterest"></a><a class="pw-button-reddit"></a></div><header><h1>Beagle</h1><h2></h2><p>Small, compact, and hardy, Beagles are active companions for kids and adults alike. Canines in this <a href="http://dogtime.com/dog-breeds">dog breed</a> are merry and fun loving, but being <a href="http://dogtime.com/hounds.html">hounds</a>, they can also be stubborn and require patient, creative <a href="http://dogtime.com/training.html">training techniques</a>. Their noses guide them through life, and they’re never happier than when following an interesting scent. The Beagle originally was bred as a <a href="http://dogtime.com/games-scent-hounds.html">scenthound</a> to track small game, mostly rabbits and hare. He is still used for this purpose in many countries, including the Uni

In [79]:


# Connect to MongoDB and load the stored Dogtime HTML of the "beagle" document.
client = pymongo.MongoClient(host="mongodb://54.67.76.74/dogbreeds")
db = client["dogbreeds"]

beagle_doc = db.dogbreeds.find_one({"breed" : "beagle"})
dogtime_test = beagle_doc['dogtime_content']
dog_soup = BeautifulSoup(dogtime_test, "lxml")

# Text of the first <p> inside the first <header>.  It is evaluated but not
# shown, because a cell only displays its last expression.
dog_soup.find("header").find("p").text

# Map each characteristic title to an integer rating.  The rating is the number
# after the last "-" in the second CSS class of the element reached by two
# find_next() steps from the title element.
char_dict = {}
for title_tag in dog_soup.find_all(class_="characteristic item-trigger-title"):
    rating_tag = title_tag.find_next().find_next()
    rating_class = rating_tag['class'][1]
    char_dict[title_tag.text.strip()] = int(rating_class.split('-')[-1])

char_dict

{'Adaptability': 3,
 'Adapts Well to Apartment Living': 4,
 'Affectionate with Family': 5,
 'All Around Friendliness': 5,
 'Amount Of Shedding': 3,
 'Dog Friendly': 5,
 'Drooling Potential': 1,
 'Easy To Groom': 4,
 'Easy To Train': 1,
 'Energy Level': 4,
 'Exercise Needs': 4,
 'Friendly Toward Strangers': 5,
 'General Health': 1,
 'Good For Novice Owners': 3,
 'Health Grooming': 3,
 'Incredibly Kid Friendly Dogs': 5,
 'Intelligence': 4,
 'Intensity': 5,
 'Potential For Mouthiness': 3,
 'Potential For Playfulness': 5,
 'Potential For Weight Gain': 5,
 'Prey Drive': 5,
 'Sensitivity Level': 4,
 'Size': 2,
 'Tendency To Bark Or Howl': 5,
 'Tolerates Being Alone': 1,
 'Tolerates Cold Weather': 2,
 'Tolerates Hot Weather': 4,
 'Trainability': 4,
 'Wanderlust Potential': 5}

In [None]:
# Search terms associated with each Dogtime characteristic.
# The keys are spelled exactly like the characteristic titles scraped into
# char_dict, including their inconsistent capitalisation ("Adapts Well to ..."
# but "Amount Of Shedding", "Easy To Groom", "Easy To Train",
# "Good For Novice Owners"), so that a lookup by scraped title finds its entry.
trait_synonyms = dict()
trait_synonyms['Adaptability'] = ['adaptable', 'flexible']
trait_synonyms['Adapts Well to Apartment Living'] = ['apartment', 'indoor', 'indoors']
trait_synonyms['Affectionate with Family'] = ['loving', 'affectionate', 'cuddly']
trait_synonyms['All Around Friendliness'] = ['friendly', 'loving']
trait_synonyms['Amount Of Shedding'] = ['shedding', 'hair', 'fur']
trait_synonyms['Dog Friendly'] = ['dogs', 'playful', 'other dogs', 'good dogs']
trait_synonyms['Drooling Potential'] = ['drool', 'slobber']
trait_synonyms['Easy To Groom'] = ['easy groom', 'simple grooming', 'easy grooming', 'simple groom', 'low maintenance']
trait_synonyms['Easy To Train'] = ['easy train', 'trainable', 'obedient', 'disciplined']
trait_synonyms['Energy Level'] = ['high-energy', 'energetic', 'hyper', 'high energy', 'play']
trait_synonyms['Exercise Needs'] = ['exercise', 'walks', 'runs', 'walking', 'running', 'fetch']
trait_synonyms['Friendly Toward Strangers'] = ['friendly', 'likes people', 'people dog']
trait_synonyms['General Health'] = ['healthy', 'long-lived']
trait_synonyms['Good For Novice Owners'] = ['novice', 'new owner', 'easygoing', 'adaptable']
trait_synonyms['Health Grooming'] = ['low maintenance', 'healthy']
trait_synonyms['Incredibly Kid Friendly Dogs'] = ['good kids', 'playful', 'gentle', 'tolerant']