In [None]:
# We're going to be gathering data on the animals up for adoption
# at the local animal shelter in Franklin.
# Then we'll use some NLP techniques to decide which pet to adopt ;)

In [18]:
# The overall goal we're seeking to accomplish is to use the animal shelter's listing of pets that are up for adoption to gather metadata on each pet (name, breed, age, life story, etc).
# For your reference, this is the page I'm talking about: https://www.petfinder.com/search/pets-for-adoption/?shelter_id%5B0%5D=TN75&sort%5B0%5D=recently_added
# We'll then clean the data using some string and regex techniques and put the cleaned data into a Pandas dataframe.
# After that, we'll perform some NLP analysis on the content to find the best pet currently up for adoption.


# First, we need to get the data.
# Normally, it is common to use Python's built-in HTTP library, called "requests", for this task.
# However, the "requests" library only works for static websites. Our website is not static and uses Javascript to display content.
# So instead, we need to use a library that can understand Javascript so that we can get the data we need from the site.
# The library we want is called Selenium.

# Selenium is what's called a "headless web browser", and it basically behaves same way that your browser does when you navigate around the web.
# There is some setup involved in order to get Selenium running on our machine.
# We need to install a driver that will allow us to use Selenium.
# Lets install using the package manager called Homebrew
# Install homebrew by using this command in your terminal:
# /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
# then follow the installation prompts

# after Homebrew finishes installing, we can install our driver
# in the terminal, inter the following command to install the driver:
# brew install geckodriver
# Note that geckodriver is the driver used by the selenium Firefox emulator, which is what we'll be using in this exercise.

# Now we're ready to use selenium
# We'll start by making a request to the main website page that lists all the pets up for adoption.
# We start by importing the library like so:
from selenium import webdriver

# and then we declare the url of the web page we want to fetch
url = 'https://www.petfinder.com/search/pets-for-adoption/?shelter_id%5B0%5D=TN75&sort%5B0%5D=recently_added'

# and now we're ready to make the request
# Note we'll be using what is called a GET request. Just FYI, there are 4 main types of requests under the HTTP protocol: GET, POST, PUT, and PATCH.
driver = webdriver.Firefox()
driver.get(url)

In [19]:
# Okay great, the driver object now has the response. The driver object has many properties, but the one we're interested in is "page_source".
# We want to use the "page_source" property on driver to access the page's content, which is the HTML markup for the page.

# A sidenote about HTML:
# HTML is a markup language that controls how browsers interpret the structure of a webpage for display purposes.
# HTML markup is essentially a series of so-called "tags" that developers use to indicate what content on the page is an image, a header, a paragraph, a link, etc.
# For example, a header tag, which we would use to indicate maybe the title of a page, like The New York Times, looks like this <h1>The New York Times</h1>
# Links, usually displayed as text that when you can click on them bring you to a new page, also have a special tag in HTML, called an "a" tag, written as <a>
# All <a> tags have a property called an "href", and this href is how the browser knows where to send the user to when they click the link
# I say all this as background, because now that we have our response content from our earlier request, we need to now extract from that HTML content a list of the links to each pet's profile page
# For example, this is a pet profile page: https://www.petfinder.com/cat/raley-55475696/tn/franklin/williamson-county-animal-center-tn75/
# Once we have that list of links, we'll use it to visit each page and capture the relevant pet metadata that will go into our dataframe

# In order to find the links in the HTML content, we'll need another Python library called BeautifulSoup (also called bs4)
# BeautifulSoup makes it easy read and extract information from HTML documents, like the webpage we just fetched!
# So let's import the BeautifulSoup class from bs4

from bs4 import BeautifulSoup

# First we'll pass our HTML content to BeautifulSoup, preparing the HTML document for processing, and save the resulting object to a variable
soup = BeautifulSoup(driver.page_source, 'html.parser')

# And then let's use BeautifulSoup to extract all the links to the pet profile pages and save them to a list. There were 25 pets listed when I ran this the other day.
pet_profiles = soup.find_all('a', {'class': 'petCard-link'})


In [21]:
# Now that we have our link elements, we can access their href values like so:
pet_profiles[0].attrs['href']

'https://www.petfinder.com/cat/kyiv-55517590/tn/franklin/williamson-county-animal-center-tn75/'

In [None]:
# If you manually inspect the profile page using your browser's Developer Tools, you'll see which HTML elements have the info we want.
# All of the elements are marked with a property named "data-test". The property will have a value, such as "Pet_Name".
# Here's a list of the data-test values we want to capture:

# spans:
## Pet_Name
## Pet_Breeds
## Pet_Age
## Pet_Sex
## Pet_Full_Grown_Size
## Pet_Primary_Color

# divs:
## Pet_About_Section
## Pet_Story_Section

In [67]:
# we're getting ready to create our dataset, so let's import pandas now and create an empty dataset
import pandas as pd

df = pd.DataFrame()

In [68]:
# Now that we have our dataframe ready, let's get some pet profile data!
# Since the pet profile pages are static pages (meaning they don't rely on Javascript to render their content), we don't need Selenium anymore.
# So lets instead use Python's built-in HTTP requests library to fetch the individual pet profile pages.
import requests

# Now this is where things get tricky.
# We have a list of links, so we know that we'll need to loop over the list.
# As we loop, we'll make a request for a single profile page, then extract its content.
# And then we'll append that content to the dataframe we made above.

# So lets first make our loop
for link in pet_profiles:
    # Then let's use the link's href to fetch the profile page
    # We make our GET requests by passing the href into the get method and save the result to a variable we'll call "response".
    response = requests.get(pet_profiles[3].attrs['href'])

    # Great! We have our HTML page. Now let's use BeautifulSoup to extract the text data we want for our dataframe.
    soup = BeautifulSoup(response.content, 'html.parser')
    # Lets find our span elements first
    name = soup.find('span', {'data-test': 'Pet_Name'}).text
    breed = soup.find('span', {'data-test': 'Pet_Breeds'}).text
    age = soup.find('span', {'data-test': 'Pet_Age'}).text
    sex = soup.find('span', {'data-test': 'Pet_Sex'}).text
    size = soup.find('span', {'data-test': 'Pet_Full_Grown_Size'}).text
    color = soup.find('span', {'data-test': 'Pet_Primary_Color'}).text

    # Then lets find our div elements
    about = soup.find('div', {'data-test': 'Pet_About_Section'}).text
    story = soup.find('div', {'data-test': 'Pet_Story_Section'}).text

    # Sweet! We're ready to stash all this in our dataframe using the append() method
    df = df.append({'name': name, 'breed': breed, 'age': age, 'sex': sex, 'size': size, 'color': color, 'about': about, 'story': story}, ignore_index=True)


  df = df.append({'name': name, 'breed': breed, 'age': age, 'sex': sex, 'size': size, 'color': color, 'about': about, 'story': story}, ignore_index=True)
  df = df.append({'name': name, 'breed': breed, 'age': age, 'sex': sex, 'size': size, 'color': color, 'about': about, 'story': story}, ignore_index=True)
  df = df.append({'name': name, 'breed': breed, 'age': age, 'sex': sex, 'size': size, 'color': color, 'about': about, 'story': story}, ignore_index=True)
  df = df.append({'name': name, 'breed': breed, 'age': age, 'sex': sex, 'size': size, 'color': color, 'about': about, 'story': story}, ignore_index=True)
  df = df.append({'name': name, 'breed': breed, 'age': age, 'sex': sex, 'size': size, 'color': color, 'about': about, 'story': story}, ignore_index=True)
  df = df.append({'name': name, 'breed': breed, 'age': age, 'sex': sex, 'size': size, 'color': color, 'about': about, 'story': story}, ignore_index=True)
  df = df.append({'name': name, 'breed': breed, 'age': age, 'sex': sex, 'siz

In [49]:
# Now lets inspect how our data looks!
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    25 non-null     object
 1   breed   25 non-null     object
 2   age     25 non-null     object
 3   sex     25 non-null     object
 4   size    25 non-null     object
 5   color   25 non-null     object
 6   about   25 non-null     object
 7   story   25 non-null     object
dtypes: object(8)
memory usage: 1.7+ KB


In [50]:
df.head()

Unnamed: 0,name,breed,age,sex,size,color,about,story
0,\n Nova\n,\nSiberian Husky\n,\n Adult\n ...,\n Male\n ...,\n Large\n ...,White / Cream,\n\n\n About\n \n\n\n\nChara...,\n\n\n Meet Nova\n \n\n Meet ...
1,\n Nova\n,\nSiberian Husky\n,\n Adult\n ...,\n Male\n ...,\n Large\n ...,White / Cream,\n\n\n About\n \n\n\n\nChara...,\n\n\n Meet Nova\n \n\n Meet ...
2,\n Nova\n,\nSiberian Husky\n,\n Adult\n ...,\n Male\n ...,\n Large\n ...,White / Cream,\n\n\n About\n \n\n\n\nChara...,\n\n\n Meet Nova\n \n\n Meet ...
3,\n Nova\n,\nSiberian Husky\n,\n Adult\n ...,\n Male\n ...,\n Large\n ...,White / Cream,\n\n\n About\n \n\n\n\nChara...,\n\n\n Meet Nova\n \n\n Meet ...
4,\n Nova\n,\nSiberian Husky\n,\n Adult\n ...,\n Male\n ...,\n Large\n ...,White / Cream,\n\n\n About\n \n\n\n\nChara...,\n\n\n Meet Nova\n \n\n Meet ...


In [70]:
# Uh oh! There seems to be some clutter in the strings, like lots of whitespace and newline markers.
# Lets write a popular string function called "strip" to clean that up.
# And then let's use it on our dataframe's data by leveraging the "apply" method
# Note that you'll need to use a lambda for this

df = df.apply(lambda x: x.str.strip())


In [71]:
# let's inspect our data again
df.head()

Unnamed: 0,name,breed,age,sex,size,color,about,story
0,Nova,Siberian Husky,Adult,Male,Large,White / Cream,About\n \n\n\n\nCharacteristics\nHigh energ...,"Meet Nova\n \n\n Meet Nova, Age 5\nW..."
1,Nova,Siberian Husky,Adult,Male,Large,White / Cream,About\n \n\n\n\nCharacteristics\nHigh energ...,"Meet Nova\n \n\n Meet Nova, Age 5\nW..."
2,Nova,Siberian Husky,Adult,Male,Large,White / Cream,About\n \n\n\n\nCharacteristics\nHigh energ...,"Meet Nova\n \n\n Meet Nova, Age 5\nW..."
3,Nova,Siberian Husky,Adult,Male,Large,White / Cream,About\n \n\n\n\nCharacteristics\nHigh energ...,"Meet Nova\n \n\n Meet Nova, Age 5\nW..."
4,Nova,Siberian Husky,Adult,Male,Large,White / Cream,About\n \n\n\n\nCharacteristics\nHigh energ...,"Meet Nova\n \n\n Meet Nova, Age 5\nW..."


In [73]:
# hmm, close but not quite, strip removed the surrounding whitespace, but there are still newline characters inside the strings
# lets use regex to get rid of those along with pandas's "replace" method

df2 = df.replace(r'\n+', ' ', regex=True)
df2.head()

Unnamed: 0,name,breed,age,sex,size,color,about,story
0,Nova,Siberian Husky,Adult,Male,Large,White / Cream,"About Characteristics High energy, Not Ho...","Meet Nova Meet Nova, Age 5 Weight..."
1,Nova,Siberian Husky,Adult,Male,Large,White / Cream,"About Characteristics High energy, Not Ho...","Meet Nova Meet Nova, Age 5 Weight..."
2,Nova,Siberian Husky,Adult,Male,Large,White / Cream,"About Characteristics High energy, Not Ho...","Meet Nova Meet Nova, Age 5 Weight..."
3,Nova,Siberian Husky,Adult,Male,Large,White / Cream,"About Characteristics High energy, Not Ho...","Meet Nova Meet Nova, Age 5 Weight..."
4,Nova,Siberian Husky,Adult,Male,Large,White / Cream,"About Characteristics High energy, Not Ho...","Meet Nova Meet Nova, Age 5 Weight..."
