# Capstone Notebook
#### IBM Data Science Capstone Project on Coursera

## Import/Install Required Libraries

In [1]:
import pandas as pd
pd.set_option('max_rows', 200)
import numpy as np
! pip install geocoder
import geocoder as gc
import requests
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim
!conda install -c conda-forge folium=0.5.0
import folium
import random
import matplotlib.cm as cm
import matplotlib.colors as colors
print('--Imports Completed--')

Collecting geocoder
[?25l  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K    100% |████████████████████████████████| 102kB 17.7MB/s 
Collecting ratelim (from geocoder)
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6
Collecting package metadata: done
Solving environment: \ 
The environment is inconsistent, please check the package plan carefully
The following packages are causing the inconsistency:

  - defaults/linux-64::anaconda==5.3.1=py37_0
  - defaults/linux-64::astropy==3.0.4=py37h14c3975_0
  - defaults/linux-64::bkcharts==0.2=py37_0
  - defaults/linux-64::blaze==0.11.3=py37_0
  - defaults/linux-64::bokeh==0.13.0=py37_0
  - defaults/linux-64::bottleneck==1

## Los Angeles County Zipcodes

#### Retrieve/Format Table of CA Zip Codes

In [2]:
#Scrape the California postal-code listing and trim it to the usable columns
url = 'https://www.geonames.org/postal-codes/US/CA/california.html'
laData = pd.read_html(url)[2]  #the third table parsed from the page is the one used
#Remove the leading column, then everything after the first five remaining columns
laData = laData.drop(columns=laData.columns[0])
laData = laData.drop(columns=laData.columns[5:]).reset_index(drop=True)
#Relabel the 4th and 5th columns
cols = list(laData.columns)
cols[3], cols[4] = 'State', 'County'
laData.columns = cols
#Discard the extraneous last row and renumber the index
laData = laData.drop(index=len(laData.index) - 1).reset_index(drop=True)

In [3]:
#Extract coordinates from the odd rows and reinsert them as columns.
#Odd rows carry a 'latitude/longitude' string in the 'Place' column; even rows
#are the zip code records. Slicing replaces the row-by-row drop inside a loop,
#which rebuilt the whole frame once per coordinate row.
coords = laData['Place'].iloc[1::2].str.split('/', expand=True)
lat = coords[0].tolist()
long = coords[1].tolist()
laData = laData.iloc[::2].reset_index(drop=True)

#add Latitude and Longitude to LA Dataframe
data = {'Latitude':lat,'Longitude':long}
temp = pd.DataFrame(data,dtype='float')
laData = laData.join(temp)
#Select only zip codes in Los Angeles; na=False treats a missing County as "no match"
#instead of producing a NaN mask that boolean indexing rejects
laData = laData[laData['County'].str.contains('Los Angeles', na=False)]

#### Remove Zipcodes with Duplicate Coordinates

In [4]:
#Order the rows so that zip codes sharing a coordinate pair sit next to each other
laData = laData.sort_values(by=['Latitude', 'Longitude']).reset_index(drop=True)

#Flag every row whose coordinates equal those of the row directly below it;
#dropping the flagged rows keeps only the last row of each run of identical coordinates
same_lat = laData['Latitude'] == laData['Latitude'].shift(-1)
same_lng = laData['Longitude'] == laData['Longitude'].shift(-1)
laData = laData[~(same_lat & same_lng)]

laData = laData.sort_values('Code').reset_index(drop=True) #resort by zipcode

#### Complete LA County Zip Codes

In [5]:
#Report the size of the cleaned dataframe, then render it
n_rows, n_cols = laData.shape
print('Rows:', n_rows, '\nColumns:', n_cols)
laData

Rows: 123 
Columns: 7


Unnamed: 0,Place,Code,Country,State,County,Latitude,Longitude
0,Los Angeles,90001,United States,California,Los Angeles,33.973,-118.248
1,Los Angeles,90002,United States,California,Los Angeles,33.95,-118.246
2,Los Angeles,90003,United States,California,Los Angeles,33.965,-118.273
3,Los Angeles,90004,United States,California,Los Angeles,34.076,-118.303
4,Los Angeles,90005,United States,California,Los Angeles,34.059,-118.301
5,Los Angeles,90006,United States,California,Los Angeles,34.049,-118.292
6,Los Angeles,90007,United States,California,Los Angeles,34.029,-118.287
7,Los Angeles,90008,United States,California,Los Angeles,34.012,-118.341
8,Los Angeles,90010,United States,California,Los Angeles,34.061,-118.303
9,Los Angeles,90011,United States,California,Los Angeles,34.008,-118.258


## Map of Zip Code Locations

In [6]:
#Get Coordinates for Map
address = 'Los Angeles, CA'
geolocator = Nominatim(user_agent="LA_agent")
location = geolocator.geocode(address)
#geocode() yields None when the address cannot be resolved; fail with a clear
#message instead of an AttributeError on the attribute access below
if location is None:
    raise ValueError('Could not geocode address: {}'.format(address))
latitude = location.latitude
longitude = location.longitude
print(address,'\nLatitude:',latitude,'\nLongitude:',longitude)

Los Angeles, CA 
Latitude: 34.0536909 
Longitude: -118.2427666


In [7]:
#Build the base map centred on (latitude, longitude)
laMap = folium.Map(location=[latitude, longitude], tiles='Stamen Toner', zoom_start=10)

#Place one circle marker per zip code, with a "<place>: <code>" popup
zip_points = zip(laData['Latitude'], laData['Longitude'], laData['Code'], laData['Place'])
for lat, lng, code, place in zip_points:
    popup = folium.Popup('{}: {}'.format(place, code), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='blue',
        fill_opacity=0.5,
        parse_html=False)
    marker.add_to(laMap)

#display map
display(laMap)

## Retrieve Venues via Foursquare API

In [8]:
#Foursquare API settings -- replace the two placeholders with your own credentials
VERSION = '20180605'  #value sent as the "v" parameter of each API request
CLIENT_ID = 'Insert Your Foursquare ID'  #Foursquare ID
CLIENT_SECRET = 'Insert Your Foursquare Secret'  #Foursquare Secret

In [None]:
#Define Foursquare Credentials
#Real credentials must never be stored in the notebook: read them from the
#environment instead. When a variable is unset, the value assigned earlier in
#the notebook is kept.
#NOTE: the key pair that used to be hard-coded in this cell should be revoked/rotated.
import os
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', CLIENT_ID) #Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', CLIENT_SECRET) #Foursquare Secret
VERSION = '20180605'

In [9]:
#Function to Create Dataframe of Venues from Foursquare API
def getVenues(code, latitudes, longitudes, radius=500):
    """Collect nearby Foursquare venues for each zip code location.

    Calls the Foursquare ``venues/explore`` endpoint once per
    (code, latitude, longitude) triple, authenticating with the module-level
    CLIENT_ID, CLIENT_SECRET and VERSION.

    Parameters
    ----------
    code : iterable
        Zip code identifiers, one per location.
    latitudes, longitudes : iterable
        Coordinates of each location, aligned with ``code``.
    radius : int, optional
        Search radius passed to the API (default 500).

    Returns
    -------
    (nearby_venues, dropList)
        ``nearby_venues`` is a DataFrame with one row per venue and the columns
        Zipcode, Zipcode Latitude, Zipcode Longitude, Venue, Venue Latitude,
        Venue Longitude, Venue Category. It is empty (but still has these
        columns) when no venue was found at all. ``dropList`` lists the codes
        that contributed no rows.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    columns = ['Zipcode',
               'Zipcode Latitude',
               'Zipcode Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    venue_rows = []
    dropList = []
    # loop names are distinct from the ``code`` argument so it is not shadowed
    for zip_code, lat, lng in zip(code, latitudes, longitudes):

        # query parameters of the API request; requests handles the URL encoding
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': 500,
        }

        # make the GET request; the timeout keeps a stalled connection from hanging the run
        response = requests.get('https://api.foursquare.com/v2/venues/explore',
                                params=params, timeout=30)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only relevant information for each nearby venue; venues without
        # any category are skipped because there is no category name to read
        rows = [(
            zip_code,
            lat,
            lng,
            v['venue']['name'],
            v['venue']['location']['lat'],
            v['venue']['location']['lng'],
            v['venue']['categories'][0]['name'])
            for v in results if v['venue'].get('categories')]

        if not rows:
            print('Warning no venues returned for: ',zip_code)
            dropList.append(zip_code)
        venue_rows.extend(rows)

    # passing the column names to the constructor also works for an empty result
    nearby_venues = pd.DataFrame(venue_rows, columns=columns)
    return(nearby_venues, dropList)

In [10]:
#Query Foursquare for every LA zip code; laDrop receives the codes the function reports as empty
laVenues, laDrop = getVenues(
    code=laData['Code'],
    latitudes=laData['Latitude'],
    longitudes=laData['Longitude'])



In [11]:
#Preview the first five venue rows
laVenues.head(n=5)

Unnamed: 0,Zipcode,Zipcode Latitude,Zipcode Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,90001,33.973,-118.248,Superior Grocers,33.97328,-118.247079,Grocery Store
1,90001,33.973,-118.248,Rite Aid,33.974383,-118.246351,Pharmacy
2,90001,33.973,-118.248,Bill's Drive In,33.9745,-118.244225,Burger Joint
3,90001,33.973,-118.248,SUBWAY,33.975311,-118.248038,Sandwich Place
4,90001,33.973,-118.248,Jack in the Box,33.975167,-118.250313,Fast Food Restaurant


## K-Means Cluster Analysis

In [12]:
#Build the k-means input: one indicator column per venue category
laEncode = pd.get_dummies(laVenues[['Venue Category']], prefix="", prefix_sep="")
#Attach the zip code of each venue and move that column to the front
laEncode['Zipcode'] = laVenues['Zipcode']
fixed_columns = [laEncode.columns[-1], *laEncode.columns[:-1]]
laEncode = laEncode.loc[:, fixed_columns]
#Average the indicators per zip code, i.e. the share of each category among its venues
laSorted = laEncode.groupby(by='Zipcode').mean().reset_index()
#Clustering features: the category shares without the zip code label
laCluster = laSorted.drop(columns='Zipcode')

#### Mean Venue Category Frequency by Zip Code

In [13]:
#Preview the per-zipcode category means (first ten zipcodes)
laSorted.head(n=10)

Unnamed: 0,Zipcode,ATM,Accessories Store,Afghan Restaurant,African Restaurant,Alternative Healer,American Restaurant,Antique Shop,Arcade,Art Gallery,...,Video Store,Vietnamese Restaurant,Watch Shop,Weight Loss Center,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Yoshoku Restaurant
0,90001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,90002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0
2,90003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,90004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,90005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.025641,0.0,0.0,0.0,0.025641,0.0
5,90006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,90007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0
7,90008,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0
8,90010,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.019231,0.0,0.0,0.0,0.019231,0.0
9,90011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Top 5 Venue Categories for each Zipcode

In [24]:
#Create DataFrame of Top 5 Venue Categories for each Zipcode
categoryMeans = laSorted.drop('Zipcode', axis=1)
#Rank the categories of each zipcode row by mean value and keep the five highest.
#The rows are gathered in a list and converted once: DataFrame.append was removed
#in pandas 2.0 and copied the whole frame on every call.
topRows = []
for rowLabel in categoryMeans.index:
    #sort the columns of this single row by its values, highest first
    ranked = categoryMeans.loc[[rowLabel]].sort_values(by=rowLabel, axis=1, ascending=False)
    topRows.append(list(ranked.columns[:5]))
topVen = pd.DataFrame(topRows, columns=['#1 Venue','#2 Venue','#3 Venue','#4 Venue','#5 Venue'])

#Display shape & first 5 rows
print(topVen.shape)
topVen.head()

(121, 5)


Unnamed: 0,#1 Venue,#2 Venue,#3 Venue,#4 Venue,#5 Venue
0,Mexican Restaurant,Burger Joint,Fast Food Restaurant,Pizza Place,Grocery Store
1,Women's Store,Park,ATM,Nail Salon,Noodle House
2,Taco Place,ATM,Nail Salon,Noodle House,Nightclub
3,Convenience Store,Cocktail Bar,Mexican Restaurant,Spa,Sushi Restaurant
4,Korean Restaurant,Coffee Shop,Karaoke Bar,Hotel,Café


#### Create k Clusters from Category Data

In [25]:
#Group the zipcode rows of laCluster into k clusters
k = 7
kmeans = KMeans(n_clusters=k, random_state=0)
kmeans.fit(laCluster)

In [26]:
#Prepend the cluster label of each row (shifted to start at 1) to the Top Venues dataframe
topVen.insert(loc=0, column='Cluster Labels', value=kmeans.labels_ + 1)

#### Create Final Dataframe:
##### Zipcode, Location Name, Coordinates, k Cluster Group, Top 5 Venues

In [27]:
# Join Zipcodes & Coordinates with Cluster Labels and Top Venues
laFinal = laData.loc[:,['Code','Place','Latitude','Longitude']].sort_values(by='Code').reset_index(drop=True)
# Remove the zipcodes listed in laDrop (exact match on the code)
laFinal = laFinal[~laFinal['Code'].isin(laDrop)].reset_index(drop=True)
# topVen was built row by row from laSorted, so row i of topVen belongs to
# laSorted['Zipcode'][i]. Attach that zipcode and join on it, so a repeated or
# missing code cannot shift every following row the way a positional join would.
venuesByCode = topVen.assign(Code=laSorted['Zipcode'].values)
laFinal = laFinal.merge(venuesByCode, on='Code', how='left')
laFinal = laFinal.sort_values('Cluster Labels').reset_index(drop=True)
laFinal

Unnamed: 0,Code,Place,Latitude,Longitude,Cluster Labels,#1 Venue,#2 Venue,#3 Venue,#4 Venue,#5 Venue
0,90001,Los Angeles,33.973,-118.248,1,Mexican Restaurant,Burger Joint,Fast Food Restaurant,Pizza Place,Grocery Store
1,90033,Los Angeles,34.049,-118.208,1,Mexican Restaurant,Burger Joint,Fast Food Restaurant,Pharmacy,Taco Place
2,90032,Los Angeles,34.082,-118.175,1,Thrift / Vintage Store,Neighborhood,Liquor Store,Mexican Restaurant,North Indian Restaurant
3,90031,Los Angeles,34.078,-118.211,1,Fast Food Restaurant,Mexican Restaurant,Convenience Store,Sandwich Place,Fried Chicken Joint
4,90255,Huntington Park,33.977,-118.216,1,Mexican Restaurant,Convenience Store,Pizza Place,Fast Food Restaurant,Chinese Restaurant
5,90301,Inglewood,33.955,-118.356,1,Mexican Restaurant,Rental Car Location,Donut Shop,BBQ Joint,Grocery Store
6,90023,Los Angeles,34.024,-118.197,1,Grocery Store,Mexican Restaurant,Astrologer,Liquor Store,Pizza Place
7,90022,Los Angeles,34.025,-118.156,1,Mexican Restaurant,Donut Shop,Shoe Store,ATM,New American Restaurant
8,90304,Inglewood,33.938,-118.359,1,Mexican Restaurant,Taco Place,Mobile Phone Shop,Park,Burger Joint
9,90065,Los Angeles,34.107,-118.227,1,Café,Park,Mexican Restaurant,Salad Place,Newsstand


#### Create Map of Clusters

In [38]:
##### initialize map
clusterMap = folium.Map(location=[latitude, longitude],tiles='Stamen Toner', zoom_start=10)

# set color scheme for the clusters: k evenly spaced colors from the rainbow colormap
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(i) for i in colors_array]
# shuffle the color order with a fixed seed so the map is colored identically on every run
random.Random(0).shuffle(rainbow)

# add markers to the map; rows without a cluster label cannot be colored and are skipped
labelled = laFinal.dropna(subset=['Cluster Labels'])
for lat, lon, poi, cluster in zip(labelled['Latitude'], labelled['Longitude'], labelled['Code'], labelled['Cluster Labels']):
    cluster = int(cluster)  # a float label would not be usable as a list index
    label = folium.Popup(str(poi) + ': Cluster ' + str(cluster), parse_html=True)
    folium.Circle(
        [lat, lon],
        radius=500,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.75).add_to(clusterMap)
print('K    Count\n-----------')
print(laFinal['Cluster Labels'].value_counts(sort=False))
display(clusterMap)

K    Count
-----------
1    14
2     3
3     1
4    99
5     1
6     1
7     2
Name: Cluster Labels, dtype: int64
