# Animal Shelter Intake Analysis in Austin, TX by Location
## Data Preparation
#### Stephen Schadt, Group 2 Team

In [None]:
# Dependencies
import pandas as pd
import requests

# Google API Key
from config import gkey

#### Build animal / location dataframes

In [None]:
#
# Filtered Dataframe: Only intakes from 1/1/16, 12/1/16, 6/1/17, or 9/1/17 and forward 
#  (depending on how many API calls we can make)
#

# *** The files below are much larger; only load one of them if you have ample Google Maps API calls available ***
# df_intakes_2016_and_on = pd.read_csv('raw data/Austin_Animal_Center_Intakes_2016_and_on.csv', encoding='latin-1')
# df_intakes_2017_and_on = pd.read_csv('raw data/Austin_Animal_Center_Intakes_2017_and_on.csv', encoding='latin-1')
# df_intakes_2017_and_on = pd.read_csv('raw data/Austin_Animal_Center_Intakes_092017_and_on.csv', encoding='latin-1')

# *** Small sample (roughly 1000 records) - use this one for testing purposes ***
df_intakes_2017_and_on = pd.read_csv(
    'raw data/Austin_Animal_Center_Intakes_2017-11_and_on.csv',
    encoding='latin-1',
)

# Empty frame that will receive only the rows with applicable addresses
df_intakes_clean = pd.DataFrame(
    columns=[
        "DateTime",
        "Found Address",
        "Intake Type",
        "Intake Condition",
        "Animal Type",
        "Sex upon Intake",
        "Age upon Intake",
        "Breed",
        "Color",
    ]
)

In [None]:
# Report how many raw intake rows were loaded, then preview the first 70
print(f"Length of dataset: {df_intakes_2017_and_on.shape[0]}")
df_intakes_2017_and_on.head(70)

In [None]:
# *********************************************
# *** Function to clean address column data ***
# *********************************************
def clean_address(addr):
    '''
    Function: clean_address
    Purpose:  Turn a raw "Found Location" value into an address string suitable for geocoding
    Argument: addr - raw location text, e.g. "123 Main St in Austin (TX)"
                     (a missing value such as None / NaN is tolerated)
    Return values: tuple final address (string), is_full_address (boolean)
        - ("NA", False)                  for "Outside Jurisdiction" or a missing / non-string value
        - ("<street>,<city>,TX", True)   when the text has the "<street> in <city>" form
        - ("<text>,TX", False)           for anything else (e.g. a single-city listing)
    '''
    # pandas hands back NaN (a float) for empty CSV cells; treat that as "not applicable"
    #  instead of failing on .replace()
    if not isinstance(addr, str):
        return ("NA", False)

    # Clean off the (TX) part
    addressclean = addr.replace(" (TX)", "")

    # First find out if this address is not applicable
    if addressclean == "Outside Jurisdiction":
        return ("NA", False)

    # Split the address from the city; more than one piece means an actual street address
    address = addressclean.split(" in ")
    if len(address) > 1:
        address_street = address[0]
        address_city = address[1]

        # Clean up addresses with "/" characters into [street1 and street2] syntax
        #  (only the first two streets are used, any further "/" segments are ignored)
        address_corner = address_street.split("/")
        if len(address_corner) > 1:
            address_street = f"{address_corner[0]} and {address_corner[1]}"

        return (f"{address_street},{address_city},TX", True)

    # Finally, for non-address strings...single-city listing
    return (f"{addressclean},TX", False)


In [None]:
#
# Loop through the loaded intakes dataset and keep only the rows whose address cleans up
#  into a full street address
#
# Rows are collected in a plain list and converted to a DataFrame once at the end:
#  DataFrame.append was removed in pandas 2.0, and appending row-by-row re-copies the whole
#  frame on every iteration. Rebuilding the frame also means re-running this cell no longer
#  duplicates rows.
#
clean_columns = ["DateTime", "Found Address", "Intake Type", "Intake Condition",
                 "Animal Type", "Sex upon Intake", "Age upon Intake", "Breed", "Color"]
clean_rows = []

for index, row in df_intakes_2017_and_on.iterrows():
    # Call function to clean up address into something we can pass to Google API
    address, is_full_address = clean_address(row["Found Location"])

    if address == "NA":
        print("Outside jurisdiction - skipping")
        continue
    if not is_full_address:
        print("No actual address - skipping")
        continue

    # Copy every column straight from the source row, except the address which is replaced
    #  by its cleaned form
    clean_row = {column: row[column] for column in clean_columns if column != "Found Address"}
    clean_row["Found Address"] = address
    clean_rows.append(clean_row)

df_intakes_clean = pd.DataFrame(clean_rows, columns=clean_columns)
        

In [None]:
# Visualize cleaned dataset
print(f"Length of entire dataset: {len(df_intakes_clean)}")

# A notebook cell only renders its last expression, so the intake-type breakdown has to be
#  printed explicitly to be seen
print(df_intakes_clean["Intake Type"].value_counts())

df_intakes_clean.head()

### Google Maps API integration

In [None]:
# Empty summary frame holding address, lat, long, a boolean telling whether the pet lived
#  in a home, and the animal type
df_summary = pd.DataFrame(
    columns=[
        "Address",
        "Latitude",
        "Longitude",
        "Pet at Home",
        "Animal Type",
    ]
)

#### Function and For-loop to grab lat/lng from Google API

In [None]:
# When the summary dataframe has been populated before, discard every row whose Latitude is
#  zero so the appending done further down starts from a clean state
df_summary = df_summary.loc[df_summary["Latitude"] != 0]

# Show the size of the trimmed dataframe and preview it
print(f"Length of dataset: {df_summary.shape[0]}")
df_summary.head(50)

In [None]:
# **************************************************************
# *** Function to pull lat / lng values from Google Maps API ***
# **************************************************************
def get_lat_long(address):
    '''
    Function: get_lat_long
    Purpose:  Get lat and long codes from Google maps API, given an address string
    Argument: address - free-form address text
    Returns:  (lat, long) tuple taken from the first result;
              (0, 0) when the response carries no usable result
    Raises:   requests exceptions (connection failure, timeout) are not caught here
    '''
    # Let requests build the query string so the address is percent-encoded. With the address
    #  pasted into the URL by hand, a "#" (e.g. a unit number) starts a URL fragment and cuts
    #  off everything after it - including the API key - and a "&" splits the parameter.
    response = requests.get(
        "https://maps.googleapis.com/maps/api/geocode/json",
        params={"address": address, "key": gkey},
        timeout=10,  # without a timeout one stalled connection blocks the whole lookup loop
    )
    google_api_json = response.json()

    # Use try / except to skip addresses with errors: an empty "results" list raises
    #  IndexError, a payload missing one of the expected keys raises KeyError
    try:
        location = google_api_json["results"][0]["geometry"]["location"]
        return (location["lat"], location["lng"])
    except (IndexError, KeyError):
        return (0, 0)
# Initialize loop variables
row_count = 0

# Addresses geocoded during this run (a set, so the membership test is O(1))
processed_addresses = set()

# Addresses that already have a row in the summary, e.g. from an earlier run of this cell
known_addresses = set(df_summary["Address"])

# Intake types that mean the animal was living in a home
home_intake_types = ("Owner Surrender", "Euthanasia Request", "Public Assist")

# New rows are gathered here and added to df_summary in one step: DataFrame.append was removed
#  in pandas 2.0 and re-copied the whole frame on every call
new_summary_rows = []

#
# Loop through cleaned dataset and determine lat/lng using Google maps geocoding API
#
# ** NOTE: this loop must not exceed 25,000 calls to the Google API in a 24 hour period, per the Google free API terms ***
#
try:
    for index, row in df_intakes_clean.iterrows():
        row_count += 1
        addr = row["Found Address"]

        # If we've already done a lookup for this address, no need to call Maps API
        if addr in processed_addresses:
            print(f"Address already processed: {addr}")
            continue

        # The address already has a row (with coordinates) in the summary table
        if addr in known_addresses:
            print(f"Address already populated: {addr}")
            continue

        # New address: call Google API to populate lat/lon and mark it as processed
        print(f"New address being processed: {addr}: {row_count}")
        (latitude, longitude) = get_lat_long(addr)
        processed_addresses.add(addr)

        # Only the first intake seen for an address contributes "Pet at Home" / "Animal Type"
        new_summary_rows.append({"Address": addr,
                                 "Latitude": latitude,
                                 "Longitude": longitude,
                                 "Pet at Home": row["Intake Type"] in home_intake_types,
                                 "Animal Type": row["Animal Type"]})
finally:
    # Store whatever was geocoded even when the loop is interrupted (API error, Ctrl-C), so a
    #  re-run of this cell resumes instead of repeating the same API calls
    if new_summary_rows:
        new_summary = pd.DataFrame(new_summary_rows, columns=df_summary.columns)
        if df_summary.empty:
            df_summary = new_summary
        else:
            df_summary = pd.concat([df_summary, new_summary], ignore_index=True)
        

In [None]:
print(f"Length of summary dataset: {len(df_summary)}")

# Save the DataFrame as a csv
df_summary.to_csv("animal_shelter_analysis_summary_clean_LocationData.csv", encoding="utf-8", index=False)

# Preview goes last: a notebook cell only renders its final expression, so a head() placed
#  before to_csv() was never displayed
df_summary.head(50)

### Create summary dataframes: 2016 to Present and Animals from Homes

#### Create filtered datasets (animals in homes, cats, dogs)

In [None]:
# *** Split the cleaned intakes into smaller datasets ***

# Animals that came from a home, i.e. every "Intake Type" other than 'Wildlife' or 'Stray':
#     - Owner surrender
#     - Euthanasia request
#     - Public assist
df_animals_homes = df_intakes_clean.loc[
    df_intakes_clean["Intake Type"].isin(["Owner Surrender", "Euthanasia Request", "Public Assist"])
]

# Dogs only
df_animals_dogs = df_intakes_clean.loc[df_intakes_clean["Animal Type"] == "Dog"]

# Cats only
df_animals_cats = df_intakes_clean.loc[df_intakes_clean["Animal Type"] == "Cat"]

# Preview the homes dataset
df_animals_homes.head(30)

In [None]:
# Preview the first 30 rows of the dogs-only dataset
df_animals_dogs.head(30)

In [None]:
# Stray intakes only, followed by a quick size check
df_animals_strays = df_intakes_clean.loc[df_intakes_clean["Intake Type"] == "Stray"]
print(f"Length of strays dataset: {df_animals_strays.shape[0]}")

#### Create filtered dataframes of unique address lists

In [None]:
# Calculate address counts for master dataset
address_counts_all = df_intakes_clean["Found Address"].value_counts()

# Name the index and the values explicitly instead of renaming whatever reset_index() produces:
#  pandas 2.0 changed value_counts() to name the result "count" and the index after the column,
#  so the old rename of 'index' / 'Found Address' no longer yields an "Address" column there
df_address_counts_all = address_counts_all.rename_axis("Address").reset_index(name="Count")
df_address_counts_all.head(30)

In [None]:
# Function to create a cleaned dataframe from each filtered address-counts Series
def convert_address_counts_to_df(address_counts):
    '''
    Function: convert_address_counts_to_df
    Description: Convert address value counts to dataframe
    Arguments: Series - result of value_counts() (index: addresses, values: number of intakes)
    Returns:  Dataframe with exactly two columns, "Address" and "Count"
    '''
    # Name the index and the values explicitly rather than renaming reset_index() output:
    #  pandas 2.0 names the value_counts() result "count" and its index after the source column,
    #  so renaming 'index' / 'Found Address' would leave the frame without an "Address" column
    return address_counts.rename_axis("Address").reset_index(name="Count")

# Step 1: number of intakes per address for each filtered dataset
address_counts_homes = df_animals_homes["Found Address"].value_counts()
address_counts_dogs = df_animals_dogs["Found Address"].value_counts()
address_counts_cats = df_animals_cats["Found Address"].value_counts()
address_counts_strays = df_animals_strays["Found Address"].value_counts()

# Step 2: turn each counts Series into its own counts dataframe
#  (pets-in-homes, dogs, cats, strays)
df_address_counts_homes = convert_address_counts_to_df(address_counts_homes)
df_address_counts_dogs = convert_address_counts_to_df(address_counts_dogs)
df_address_counts_cats = convert_address_counts_to_df(address_counts_cats)
df_address_counts_strays = convert_address_counts_to_df(address_counts_strays)

# Preview the strays address dataframe
df_address_counts_strays.head(30)

In [None]:
# Visualize pets-in-homes addresses counts (first 30 rows)
df_address_counts_homes.head(30)

#### Create filtered summary dataframes with only valid Latitude/Longitude values

In [None]:
# Rows whose geocoding lookup failed carry a Latitude of 0; every subset below excludes them
has_valid_latitude = df_summary["Latitude"] != 0

# .copy() makes each subset an independent frame. New columns are assigned to these frames
#  later in the notebook, which on a plain slice triggers pandas' SettingWithCopyWarning.

# All animals
df_animals_summary_all = df_summary.loc[has_valid_latitude].copy()

# All animals in homes
df_animals_summary_homes = df_summary.loc[(df_summary["Pet at Home"] == True) &
                                          has_valid_latitude].copy()

# Dogs only
df_animals_summary_dogs = df_summary.loc[(df_summary["Animal Type"] == "Dog") &
                                         has_valid_latitude].copy()

# Cats only
df_animals_summary_cats = df_summary.loc[(df_summary["Animal Type"] == "Cat") &
                                         has_valid_latitude].copy()


In [None]:
# Report the size of the filtered summary frames, then preview the all-animals one
print(f"Length of 'All Animals' summary dataframe: {df_animals_summary_all.shape[0]}")
print(f"Length of 'Animals in Homes' summary dataframe: {df_animals_summary_homes.shape[0]}")
df_animals_summary_all.head(25)

In [None]:
# Preview the first 20 rows of the cats address counts
df_address_counts_cats.head(20)

#### Merge intake address counts into summary dataframes

In [None]:
# Attach the per-address intake count to the master summary dataset (joined on "Address")
df_summary_all = df_animals_summary_all.merge(df_address_counts_all, on="Address")

In [None]:
# Write the all-animals address counts summary to disk, then preview it
df_summary_all.to_csv(
    'raw data/animal_shelter_analysis_address_counts_ALL.csv',
    encoding='latin-1',
    index=False,
)
df_summary_all.head(30)

In [None]:
# Attach the per-address intake count to the pets-in-homes dataset (joined on "Address")
df_summary_homes = df_animals_summary_homes.merge(df_address_counts_homes, on="Address")

In [None]:
# Write the pets-in-homes address counts summary to disk, then preview it
df_summary_homes.to_csv(
    'raw data/animal_shelter_analysis_address_counts_HOMES.csv',
    encoding='latin-1',
    index=False,
)
df_summary_homes.head(30)

In [None]:
# Attach the per-address intake counts to the dogs and cats datasets (joined on "Address")
df_summary_dogs = df_animals_summary_dogs.merge(df_address_counts_dogs, on="Address")
df_summary_cats = df_animals_summary_cats.merge(df_address_counts_cats, on="Address")

In [None]:
# Write the dogs address counts summary to disk, then preview it
df_summary_dogs.to_csv(
    'raw data/animal_shelter_analysis_address_counts_DOGS.csv',
    encoding='latin-1',
    index=False,
)
df_summary_dogs.head(20)

In [None]:
# Write the cats address counts summary to disk, then preview it
df_summary_cats.to_csv(
    'raw data/animal_shelter_analysis_address_counts_CATS.csv',
    encoding='latin-1',
    index=False,
)
df_summary_cats.head(20)

In [None]:
# Merge address count into strays dataset
# Rows with a Latitude of 0 are addresses whose geocoding lookup returned nothing; leave them
#  out so this summary, like the other ones, only contains valid Latitude/Longitude values
df_summary_strays = pd.merge(df_summary.loc[df_summary["Latitude"] != 0],
                             df_address_counts_strays,
                             on="Address")

In [None]:
# Write the strays address counts summary to disk, then preview it
df_summary_strays.to_csv(
    'raw data/animal_shelter_analysis_address_counts_STRAYS.csv',
    encoding='latin-1',
    index=False,
)
df_summary_strays.head(30)

### Dataframes for plotting number of veterinarians vs. number of pet intakes

In [None]:
# Give every plotting dataset an (initially blank) "Vet Count" column
for summary_frame in (df_animals_summary_all,
                      df_animals_summary_homes,
                      df_animals_summary_dogs,
                      df_animals_summary_cats):
    summary_frame["Vet Count"] = ""

In [None]:
# Set Google API key value
from config import gkey

#### Loop through dataset and assign Vet count values by calling Google Radarsearch API

In [None]:
# Preview the dataset that the vet-count loop below iterates over
df_animals_summary_all.head()

In [None]:
# Counter
row_count = 0

# Loop through every geocoded address and ask Google how many veterinary practices lie
#  within 1700 meters (roughly 1 mile) of it
for index, row in df_animals_summary_all.iterrows():

    # Progress output
    print(f"Now retrieving address #{row_count}: {row['Address']}")
    row_count += 1

    # Only query addresses whose count is still unset ("" placeholder) or 0, so an interrupted
    #  run can be resumed without repeating API calls
    current_vet_count = df_animals_summary_all.at[index, "Vet Count"]
    if current_vet_count == 0 or current_vet_count == "":
        # Create endpoint url using Google Places Radar and the lat/lng we identified earlier
        #  - Radius search of roughly 1 mile
        #  - Places type "veterinary_care" only
        # NOTE(review): Google has announced the Places Radar Search endpoint as deprecated -
        #  confirm it still returns results before relying on these counts
        target_url = f"https://maps.googleapis.com/maps/api/place/radarsearch/json?location={row['Latitude']},{row['Longitude']}&radius=1700&type=veterinary_care&key={gkey}"

        # Run request to retrieve JSON from target URL
        vet_data = requests.get(target_url).json()

        # Retrieve vet count via number of results within the radius (1700 meters)
        vet_count = len(vet_data["results"])
        print(f"Final Vet Count for address '{row['Address']}': {vet_count}")
        print("")

        # Store the vet count into the Data Frame
        #  (.at replaces DataFrame.set_value, which was removed in pandas 1.0)
        df_animals_summary_all.at[index, "Vet Count"] = vet_count
    else:
        print(f"Vet Count already set for this address: {row['Address']}")

# Visualize the new dataset
df_animals_summary_all.head()

In [None]:
# Persist the summary, now including the vet counts, as CSV
df_animals_summary_all.to_csv(
    'raw data/animal_shelter_analysis_with_Vet_data.csv',
    encoding='latin-1',
    index=False,
)

#### Add number of intake addresses within a 1.7 km radius of each intake address to dataframe
   

In [None]:
import math

In [None]:
# Add new tracking column for number of intakes within radius
#  (starts out as an empty string for every row; the loop further down fills in the counts)
df_animals_summary_all["Intakes within Radius"] = ""

# Function telling whether one lat/long point falls inside a radius (in kilometers) around another
def is_location_within_1700meters(check_point_lat, check_point_long, center_point_lat, center_point_long, radius_km):
    '''
    Function: is_location_within_1700meters
    Purpose:  Decide whether the check point lies no further than radius_km kilometers
                from the center point (in our case the radius is 1.7)
    Arguments: check_point_lat, check_point_long   - point being tested (degrees)
               center_point_lat, center_point_long - center of the circle (degrees)
               radius_km                           - radius of the circle in kilometers
    Returns:  True when the flat-map distance between the points is <= radius_km
    '''
    # Kilometers covered by one degree of latitude, using 40000 km for the full circle
    km_per_degree_lat = 40000 / 360
    # One degree of longitude is scaled by the cosine of the center latitude
    km_per_degree_lng = math.cos(math.pi * center_point_lat/180) * km_per_degree_lat

    # East-west and north-south offsets in kilometers
    offset_x_km = math.fabs(center_point_long - check_point_long) * km_per_degree_lng
    offset_y_km = math.fabs(center_point_lat - check_point_lat) * km_per_degree_lat

    # Straight-line distance via Pythagoras, compared against the radius
    distance_km = math.sqrt(offset_x_km * offset_x_km + offset_y_km * offset_y_km)
    return distance_km <= radius_km

In [None]:
# Radius should be 1.7km, to match 1700 meter Google Radarsearch lookup radius
radius = 1.7

# Take the coordinates out of the dataframe once, as plain (lat, long) tuples. The inner scan
#  then runs over a list instead of a second iterrows(), which builds a Series for every row
#  on every pass.
all_coordinates = list(zip(df_animals_summary_all["Latitude"], df_animals_summary_all["Longitude"]))

# Loop through dataset and assign how many intake locations are within given lat/long combination
row_count = 0
for index, row in df_animals_summary_all.iterrows():
    # Progress output
    print(f"Now retrieving address #{row_count}: {row['Address']}")
    row_count += 1

    # Count the lat/long combinations inside the current row's 1700 meter radius
    #  (the row's own location is at distance 0, so it is counted as well)
    num_found = sum(
        1
        for check_lat, check_long in all_coordinates
        if is_location_within_1700meters(check_lat, check_long, row["Latitude"], row["Longitude"], radius)
    )

    # Set the number of found intakes for this center point
    #  (.at replaces DataFrame.set_value, which was removed in pandas 1.0)
    df_animals_summary_all.at[index, "Intakes within Radius"] = num_found
    

In [None]:
# Persist the summary with vet counts and radius counts as CSV, then preview it
df_animals_summary_all.to_csv(
    'raw data/animal_shelter_analysis_with_VetAndRadius_data.csv',
    encoding='latin-1',
    index=False,
)
df_animals_summary_all.head(30)