In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done


  current version: 4.5.11
  latest version: 4.8.1

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2019.11.28         |           py36_0         149 KB  conda-forge
    scikit-learn-0.20.1        |   py36h22eb022_0         5.7 MB
    liblapack-3.8.0            |      11_openblas          10 KB  conda-forge
    liblapacke-3.8.0           |      11_openblas          10 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    libopenblas-0.3.6          |       h5a2b251_2         7.7 MB
    scipy-1.4.1                |   py36h921218d_0        18.9 MB  conda-forge
    libcblas-3.8.0             |      11_openblas        

In [3]:
# install lxml in order to get the wikipedia postal code table data
!pip install lxml



In [4]:
# Pennsylvania County Name List, Income and Population (pa_cnty_list_1)
# Read in the table from the Web Page and then display sample contents of the dataframe
pa_cnty_list_1 = pd.read_html("https://en.wikipedia.org/wiki/List_of_Pennsylvania_counties_by_per_capita_income",skiprows=0)[1]
pa_cnty_list_1.head()

Unnamed: 0,Rank,County,Per capitaincome,Medianhouseholdincome,Medianfamilyincome,Population,Number ofhouseholds
0,1.0,Chester,"$41,251","$84,741","$101,760",498886,182900
1,2.0,Montgomery,"$40,076","$76,380","$94,592",799874,307750
2,3.0,Bucks,"$35,687","$74,828","$90,274",625249,234849
3,4.0,Delaware,"$32,067","$61,876","$77,879",558979,208700
4,5.0,Cumberland,"$30,119","$60,219","$75,000",235406,93943


In [5]:
# Drop un-needed columns from dataframe: pa_cnty_list_1: 'Rank', 'Per capitaincome', 'Medianfamilyincome', 'Number ofhouseholds'
pa_cnty_list_1.drop(['Rank', 'Per capitaincome','Medianfamilyincome','Number ofhouseholds'], axis=1, inplace=True)
pa_cnty_list_1.head()

Unnamed: 0,County,Medianhouseholdincome,Population
0,Chester,"$84,741",498886
1,Montgomery,"$76,380",799874
2,Bucks,"$74,828",625249
3,Delaware,"$61,876",558979
4,Cumberland,"$60,219",235406


In [6]:
# Pennsylvania County Name List, Latitude & Longitude (pa_cnty_list_2)
# Read in the table from the CSV file and then display sample contents of the dataframe
pa_cnty_list_2 = pd.read_csv("PA_County_Latitude_Longitude.csv")
pa_cnty_list_2.head()

Unnamed: 0,County Name,County Code Number,County Code text,Longitude,Latitude,New Georeferenced Column
0,Adams,1,1,-77.222243,39.872096,POINT (-77.22224271 39.87209565)
1,Berks,6,6,-75.930773,40.419396,POINT (-75.93077327 40.41939635)
2,Cambria,11,11,-78.718942,40.491275,POINT (-78.71894174 40.49127491)
3,Sullivan,57,57,-76.514792,41.448099,POINT (-76.5147922 41.4480994)
4,Mercer,43,43,-80.260094,41.302378,POINT (-80.26009411 41.30237777)


In [9]:
# Drop un-needed columns from dataframe: pa_cnty_list_2: 'County Code Number', 'County Code text', 'New Georeferenced Column' 
pa_cnty_list_2.drop(['County Code Number', 'County Code text','New Georeferenced Column'], axis=1, inplace=True)
pa_cnty_list_2.head()

KeyError: "['County Code Number' 'County Code text' 'New Georeferenced Column'] not found in axis"

In [8]:
# Now combine (merge) the two dataframes into one dataframe called: pa_cnty_list_final
pa_cnty_list_final = pa_cnty_list_1.merge(pa_cnty_list_2, left_on=['County'], right_on=['County Name'])

In [11]:
# Display sample contents of the dataframe 'pa_cnty_list_final' after merging
pa_cnty_list_final.head()

Unnamed: 0,County,Medianhouseholdincome,Population,County Name,Longitude,Latitude
0,Chester,"$84,741",498886,Chester,-75.756265,39.974871
1,Montgomery,"$76,380",799874,Montgomery,-75.37252,40.209899
2,Bucks,"$74,828",625249,Bucks,-75.112912,40.335011
3,Delaware,"$61,876",558979,Delaware,-75.406277,39.916579
4,Cumberland,"$60,219",235406,Cumberland,-77.268663,40.167598


In [12]:
# Drop un-needed columns from dataframe: pa_cnty_list_final: 'County Name' 
pa_cnty_list_final.drop(['County Name'], axis=1, inplace=True)
pa_cnty_list_final.head()

Unnamed: 0,County,Medianhouseholdincome,Population,Longitude,Latitude
0,Chester,"$84,741",498886,-75.756265,39.974871
1,Montgomery,"$76,380",799874,-75.37252,40.209899
2,Bucks,"$74,828",625249,-75.112912,40.335011
3,Delaware,"$61,876",558979,-75.406277,39.916579
4,Cumberland,"$60,219",235406,-77.268663,40.167598


In [13]:
# Now, Re-order the columns in dataframe: 'pa_cnty_list_final'
pa_cnty_list_final = pa_cnty_list_final[['County', 'Medianhouseholdincome', 'Population', 'Latitude', 'Longitude']] 

In [14]:
# Now display sample data from the dataframe after column re-ordering
pa_cnty_list_final.head()

Unnamed: 0,County,Medianhouseholdincome,Population,Latitude,Longitude
0,Chester,"$84,741",498886,39.974871,-75.756265
1,Montgomery,"$76,380",799874,40.209899,-75.37252
2,Bucks,"$74,828",625249,40.335011,-75.112912
3,Delaware,"$61,876",558979,39.916579,-75.406277
4,Cumberland,"$60,219",235406,40.167598,-77.268663


In [15]:
# Remove the dollar sign ($) from the column: 'Medianhouseholdincome'
pa_cnty_list_final['Medianhouseholdincome'] = pa_cnty_list_final['Medianhouseholdincome'].replace({'\$':''}, regex = True)

# Remove the comma (,) from the column: 'Medianhouseholdincome'
pa_cnty_list_final['Medianhouseholdincome'] = pa_cnty_list_final['Medianhouseholdincome'].replace({'\,':''}, regex = True)

In [16]:
# Show sample data from the dataframe - this is part of the source of the data needed for this project.
pa_cnty_list_final.head()

Unnamed: 0,County,Medianhouseholdincome,Population,Latitude,Longitude
0,Chester,84741,498886,39.974871,-75.756265
1,Montgomery,76380,799874,40.209899,-75.37252
2,Bucks,74828,625249,40.335011,-75.112912
3,Delaware,61876,558979,39.916579,-75.406277
4,Cumberland,60219,235406,40.167598,-77.268663


In [17]:
# Now, Display ALL of the dataframe data, but sorted by County.  This is being done just to make the dataframe data look more-organized, but is not a requirement.
pa_cnty_list_final.sort_values(['County'])

Unnamed: 0,County,Medianhouseholdincome,Population,Latitude,Longitude
17,Adams,56529,101407,39.872096,-77.222243
5,Allegheny,47961,1223348,40.467355,-79.986198
36,Armstrong,42752,68941,40.815095,-79.473169
22,Beaver,46190,170539,40.683492,-80.351074
54,Bedford,40249,49762,40.007375,-78.491165
19,Berks,53470,411442,40.419396,-75.930773
29,Blair,42363,127089,40.48555,-78.349077
48,Bradford,40543,62622,41.791178,-76.518256
2,Bucks,74828,625249,40.335011,-75.112912
6,Butler,56878,183862,40.910832,-79.917118


Use geopy library to get the latitude and longitude values of the Commonwealth of Pennsylvania.

In [18]:
# Define the user agent here
address = 'Pennsylvania'

geolocator = Nominatim(user_agent="pa_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinates of Pennsylvania are: {}, {}.'.format(latitude, longitude))

The geographical coordinates of Pennsylvania are: 40.9699889, -77.7278831.


In [19]:
# create map of Pennsylvania using latitude and longitude values
map_pa = folium.Map(location=[latitude, longitude], zoom_start=8)
    
# add markers to map
for  lat, lng, county, income, popl in zip(pa_cnty_list_final['Latitude'], pa_cnty_list_final['Longitude'], pa_cnty_list_final['County'], pa_cnty_list_final['Medianhouseholdincome'], pa_cnty_list_final['Population']):
    label = '{}'.format(county)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_pa)      
    
map_pa

Next, we are going to start utilizing the Foursquare API to explore the counties and segment them.

Define Foursquare Credentials and Version

In [20]:
CLIENT_ID = 'GM24OWQ25PEZHAV3PLKTL5BHKRV25IVZ55G2YRHPM1TPIVY3' # your Foursquare ID
CLIENT_SECRET = '5TQSDFRDYQCALQGHJJI5TBGGUSNHF0KXRNML4NRIPW40MTQU' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentials:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentials:
CLIENT_ID: GM24OWQ25PEZHAV3PLKTL5BHKRV25IVZ55G2YRHPM1TPIVY3
CLIENT_SECRET:5TQSDFRDYQCALQGHJJI5TBGGUSNHF0KXRNML4NRIPW40MTQU


In [21]:
# Get the first Pennsylavania County name here
pa_cnty_list_final.loc[0, 'County']

'Chester'

Get the county's latitude and longitude values.

In [22]:
# Get the county's latitude and longitude values.
county_latitude = pa_cnty_list_final.loc[0, 'Latitude'] # county latitude value
county_longitude = pa_cnty_list_final.loc[0, 'Longitude'] # county longitude value

county_name = pa_cnty_list_final.loc[0, 'County'] # county name

print('Latitude and longitude values of {} County are: {}, {}.'.format(county_name, 
                                                               county_latitude, 
                                                               county_longitude))

Latitude and longitude values of Chester County are: 39.97487056, -75.75626498.


Now, let's get the top 50 Urgent Care Centers that are in Chester County within a radius of 50 miles (or, 80,467 meters)

First, let's create the GET request URL. Name your URL url.

In [24]:
# Create the URL that will be used to get a list of Urgent Care Centers in Chester County
radius = 80467
LIMIT = 50
categoryId='56aa371be4b08b9a8d573526' # Urgent Care Center 
search_query = county_name
url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}&categoryId={}'.format(CLIENT_ID, CLIENT_SECRET, county_latitude, county_longitude, VERSION, radius, LIMIT, categoryId)
url

'https://api.foursquare.com/v2/venues/explore?client_id=GM24OWQ25PEZHAV3PLKTL5BHKRV25IVZ55G2YRHPM1TPIVY3&client_secret=5TQSDFRDYQCALQGHJJI5TBGGUSNHF0KXRNML4NRIPW40MTQU&ll=39.97487056,-75.75626498&v=20180605&radius=80467&limit=50&categoryId=56aa371be4b08b9a8d573526'

In [25]:
# Call the GET using the previously-created URL
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5e290c0f963d29001b00c48c'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Current map view',
  'headerFullLocation': 'Current map view',
  'headerLocationGranularity': 'unknown',
  'query': 'urgent care center',
  'totalResults': 167,
  'suggestedBounds': {'ne': {'lat': 40.699074284203725,
    'lng': -74.81299634644621},
   'sw': {'lat': 39.250666835796274, 'lng': -76.69953361355378}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4c2e69487cc0c9b62203ea9a',
       'name': 'Patient First - Aberdeen',
       'location': {'address': '995 Hospitality Way',
        'lat': 39.52415940108369,
        'lng': -76.18540018631576,
        'labeledLatLn

In [26]:
# define a function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [27]:
# Use this block of code to get a list of the Urgent Care Centers and then display a sample of them
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Patient First - Aberdeen,Urgent Care Center,39.524159,-76.1854
1,AFC Urgent Care Downingtown,Urgent Care Center,40.008127,-75.701885
2,Patient First - Downingtown,Urgent Care Center,40.0177,-75.681019
3,Premier Urgent Care,Urgent Care Center,40.055876,-75.66744
4,Premier Urgent Care,Urgent Care Center,39.862024,-75.677863


In [28]:
# Show how many venues were returned by using the Foursquare API
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

50 venues were returned by Foursquare.


Let's create a function to repeat the same process to all the Counties in Pennsylvania

In [29]:
# Define a function that will process ALL the Counties in Pennsylvania
def getNearbyHospitals(names, latitudes, longitudes, radius=80467, categoryId='56aa371be4b08b9a8d573526'):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}&categoryId={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT,
            categoryId)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['County', 
                  'County Latitude', 
                  'County Longitude', 
                  'Hospital', 
                  'Hospital Latitude', 
                  'Hospital Longitude', 
                  'Hospital Category']
    
    return(nearby_venues)

In [30]:
# Now write the code to run the above function on each county and create a new dataframe called pa_counties.
pa_counties = getNearbyHospitals(names=pa_cnty_list_final['County'],
                                   latitudes=pa_cnty_list_final['Latitude'],
                                   longitudes=pa_cnty_list_final['Longitude']
                                  )

Chester
Montgomery
Bucks
Delaware
Cumberland
Allegheny
Butler
Northampton
Dauphin
Pike
Lehigh
York
Pennsylvania
Montour
Washington
Lancaster
Westmoreland
Adams
Lebanon
Berks
Franklin
Monroe
Beaver
Lackawanna
Centre
Perry
Luzerne
Carbon
Wyoming
Blair
Elk
Erie
Wayne
Columbia
Susquehanna
Warren
Armstrong
Lycoming
Mercer
Fulton
Union
Lawrence
Schuylkill
Cameron
Cambria
Philadelphia
Snyder
McKean
Bradford
Juniata
Northumberland
Huntingdon
Potter
Indiana
Bedford
Venango
Crawford
Tioga
Jefferson
Clarion
Greene
Clearfield
Somerset
Sullivan
Clinton
Fayette
Mifflin
Forest


In [33]:
# Now display the size of the dataframe that contains ALL the Urgent Care Centers, plus a sample list of the Urgent Care Centers in that dataframe
print(pa_counties.shape)
pa_counties.head()

(1828, 7)


Unnamed: 0,County,County Latitude,County Longitude,Hospital,Hospital Latitude,Hospital Longitude,Hospital Category
0,Chester,39.974871,-75.756265,Patient First - Aberdeen,39.524159,-76.1854,Urgent Care Center
1,Chester,39.974871,-75.756265,AFC Urgent Care Downingtown,40.008127,-75.701885,Urgent Care Center
2,Chester,39.974871,-75.756265,Patient First - Downingtown,40.0177,-75.681019,Urgent Care Center
3,Chester,39.974871,-75.756265,Premier Urgent Care,40.055876,-75.66744,Urgent Care Center
4,Chester,39.974871,-75.756265,Premier Urgent Care,39.862024,-75.677863,Urgent Care Center


In [35]:
# Get a count of the number of Urgent Care Centers in each County here then show a sample listing
pa_counties_cnt = pa_counties.groupby('County').count()
pa_counties_cnt.head()

Unnamed: 0_level_0,County Latitude,County Longitude,Hospital,Hospital Latitude,Hospital Longitude,Hospital Category
County,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Adams,50,50,50,50,50,50
Allegheny,40,40,40,40,40,40
Armstrong,38,38,38,38,38,38
Beaver,43,43,43,43,43,43
Bedford,14,14,14,14,14,14


In [36]:
# Reset the dataframe index and then show a sample listing again
pa_counties_cnt = pa_counties_cnt.reset_index()
pa_counties_cnt.head()

Unnamed: 0,County,County Latitude,County Longitude,Hospital,Hospital Latitude,Hospital Longitude,Hospital Category
0,Adams,50,50,50,50,50,50
1,Allegheny,40,40,40,40,40,40
2,Armstrong,38,38,38,38,38,38
3,Beaver,43,43,43,43,43,43
4,Bedford,14,14,14,14,14,14


In [40]:
# Pull into a new dataframe the County Name along with its Urgent Care Center count (the 'County Latitude' column will hold the count for now)
hosp_count = pa_counties_cnt.loc[:,'County':'County Latitude']

In [44]:
# Display a sample of the Pennsylvania Counties along with their Urgent Care Center count
hosp_count.head()

Unnamed: 0,County,County Latitude
0,Adams,50
1,Allegheny,40
2,Armstrong,38
3,Beaver,43
4,Bedford,14


In [45]:
# Rename the incorrect 'County Latitude' column name to be 'CenterCount'
hosp_count.rename(columns = {'County Latitude':'CenterCount'}, inplace = True) 

In [46]:
# Display a small sample from the dataframe with the corrected column name
hosp_count.head()

Unnamed: 0,County,CenterCount
0,Adams,50
1,Allegheny,40
2,Armstrong,38
3,Beaver,43
4,Bedford,14


In [47]:
# Save a copy of the 'hosp_count' dataframe in case we need to recover it later
hosp_count_save = hosp_count

In [48]:
# Now combine (merge) the two dataframes into one dataframe called: pa_cnty_list_final2
pa_cnty_demog = hosp_count.merge(pa_cnty_list_final, left_on=['County'], right_on=['County'])

In [50]:
# Display a small sample of the new 'pa_cnty_demog' dataframe.
# THIS IS THE DATAFRAME THAT WILL BE USED TO DETERMINE IF THE INVESTORS WILL BUILD NEW FOR-PROFIT URGENT CARE CENTERS IN ANY OF THE COUNTIES.
# THE DECISION TO BUILD WILL BE BASED ON CRITERIA IN THE NEXT CODE BLOCK
pa_cnty_demog.head()

Unnamed: 0,County,CenterCount,Medianhouseholdincome,Population,Latitude,Longitude
0,Adams,50,56529,101407,39.872096,-77.222243
1,Allegheny,40,47961,1223348,40.467355,-79.986198
2,Armstrong,38,42752,68941,40.815095,-79.473169
3,Beaver,43,46190,170539,40.683492,-80.351074
4,Bedford,14,40249,49762,40.007375,-78.491165


In [51]:
# Now we need to determine what Counties the Investors would want to build new for-profit Urgent Care Centers based on the following criteria:
# 
#      Existing Urgent Care Centers in a County:  Ten (10) or Less
#      Total County Population:                   30,000 people or more
#      Median Household Income in a County:       $40,000 or more
#
# Apply the following criteria to the 'pa_cnty_list_final2' dataframe to get the prospective county name list:
#
# Filter: Number of Existing Urgent Care Centers:  Must be Ten (10) or Less
centerLE10 =  pa_cnty_demog['CenterCount']<=10
pa_cnty_demog2 = pa_cnty_demog[centerLE10]

# Filter: Total County Population:       Must be at least 30,000 people or more
poplGE30k =  pa_cnty_demog2['Population']>=30000
pa_cnty_demog3 = pa_cnty_demog2[poplGE30k]

# Filter: Median Household Income:       Must be at least $40,000 or more
pa_cnty_demog3["Medianhouseholdincome"] = pd.to_numeric(pa_cnty_demog3["Medianhouseholdincome"])

incGE40k =  pa_cnty_demog3['Medianhouseholdincome']>=40000
pa_cnty_demog4 = pa_cnty_demog3[incGE40k]

# NOTICE: THE RESULTING 'WARNING' MESSAGE WILL NOT FAIL THE NOTEBOOK OR THE OUTCOME OF THIS PROJECT.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [53]:
# Now display ALL the Counties that meet the Investor's criteria.   These are the Counties that the investors plan to build new for-profit Urgent Care Centers
pa_cnty_demog4

Unnamed: 0,County,CenterCount,Medianhouseholdincome,Population,Latitude,Longitude
6,Blair,9,42363,127089,40.48555,-78.349077
13,Centre,5,47016,153990,40.919314,-77.825001
15,Clarion,7,40028,39988,41.192791,-79.424836
23,Elk,2,43745,31946,41.428194,-78.649477
30,Huntingdon,10,41700,45913,40.416105,-77.982766
40,Lycoming,3,42689,116111,41.344598,-77.069425
41,McKean,4,40097,43450,41.810371,-78.57122
59,Tioga,4,40338,41981,41.773338,-77.257881


In [55]:
# display the size of the resulting dataframe
print(pa_cnty_demog4.shape)

(8, 6)


In [None]:
# THIS CONCLUDES THE ANALYSIS FOR CAPSTONE PROJECT OF WEEK 5 #