## IBM Data Science Capstone Project

#### First import everything

In [1]:
import os

import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans

#### get the data

In [9]:
LA = requests.get('http://www.laalmanac.com/communications/cm02a90001-90899.php').text
soup = BeautifulSoup(LA, "lxml")

#create data frame
PostalCode = []
Neighborhood = []

for fields in soup.tbody.find_all('tr')[1:]:
    PostalCode.append(fields.find('td').text)
    
    Neighborhood.append(fields.find_all('td')[1].text[:-1])
    

df_data = {"PostalCode": PostalCode,
        "Neighborhood": Neighborhood}
df = pd.DataFrame(df_data)

# #
df = df.reset_index().drop('index', axis = 1)
df.head()


Unnamed: 0,PostalCode,Neighborhood
0,90002,"Los Angeles (Southeast Los Angeles, Watts"
1,90003,"Los Angeles (South Los Angeles, Southeast Los ..."
2,90004,"Los Angeles (Hancock Park, Rampart Village, Vi..."
3,90005,"Los Angeles (Hancock Park, Koreatown, Wilshire..."
4,90006,"Los Angeles (Byzantine-Latino Quarter, Harvard..."


In [6]:
# combine the neighbourhoods with the same zip code
i = 1
while(i < len(df)):
    if df['PostalCode'].iloc[i] == df['PostalCode'].iloc[i - 1]:
        df.at[i - 1, 'Neighborhood'] = df.Neighborhood.iloc[i - 1] +', ' + df.Neighborhood.iloc[i]
        df.drop(index = i, inplace = True)
        df = df.reset_index().drop('index', axis = 1)
    else:
        i += 1
        

df.head(10)

Unnamed: 0,PostalCode,Neighborhood
0,90002,"Los Angeles (Southeast Los Angeles, Watts"
1,90003,"Los Angeles (South Los Angeles, Southeast Los ..."
2,90004,"Los Angeles (Hancock Park, Rampart Village, Vi..."
3,90005,"Los Angeles (Hancock Park, Koreatown, Wilshire..."
4,90006,"Los Angeles (Byzantine-Latino Quarter, Harvard..."
5,90007,"Los Angeles (Southeast Los Angeles, Univerity ..."
6,90008,"Los Angeles (Baldwin Hills, Crenshaw, Leimert ..."
7,90009,Los Angeles (Los Angeles International Airport...
8,90010,"Los Angeles (Hancock Park, Wilshire Center, Wi..."
9,90011,Los Angeles (Southeast Los Angeles


In [7]:
# Show the (rows, columns) dimensions of df.
df.shape

(257, 2)

## Part 3: clustering and segmentation

In [None]:
import folium


#### this is from the foursquare api class

In [118]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, limit=100, version='20181111',
                    client_id=None, client_secret=None):
    """Collect Foursquare "explore" venues around each of the given locations.

    One GET request is sent to the Foursquare v2 ``venues/explore`` endpoint
    per (name, latitude, longitude) triple. The venues in the first result
    group of each response are flattened into a single table.

    Parameters
    ----------
    names, latitudes, longitudes : iterable
        Iterated in lockstep; ``names`` supplies the value stored in the
        ``PostalCode`` column for every venue found around that location.
    radius : int
        Forwarded as the ``radius`` query parameter.
    limit : int
        Forwarded as the ``limit`` query parameter.
    version : str
        Forwarded as the ``v`` (API version date) query parameter.
    client_id, client_secret : str, optional
        Foursquare credentials. When omitted they are read from the
        ``FOURSQUARE_CLIENT_ID`` / ``FOURSQUARE_CLIENT_SECRET`` environment
        variables so that secrets are not stored in the notebook.

    Returns
    -------
    pandas.DataFrame
        Columns ``PostalCode``, ``Venue``, ``Venue Latitude``,
        ``Venue Longitude`` and ``Venue Category``; empty (but with these
        columns) when no venues are found.

    Raises
    ------
    ValueError
        If no credentials are passed and the environment variables are unset.
    requests.HTTPError
        If Foursquare answers with an error status.
    """
    client_id = client_id or os.environ.get('FOURSQUARE_CLIENT_ID')
    client_secret = client_secret or os.environ.get('FOURSQUARE_CLIENT_SECRET')
    if not client_id or not client_secret:
        raise ValueError(
            'Foursquare credentials missing: pass client_id/client_secret or set '
            'FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET.')

    columns = ['PostalCode',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests build and escape the query string instead of formatting
        # the URL by hand.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': client_id,
                'client_secret': client_secret,
                'v': version,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': limit,
            },
            timeout=30,
        )
        response.raise_for_status()

        # Only the first result group is used; tolerate a response without groups.
        groups = response.json()['response'].get('groups') or []
        items = groups[0]['items'] if groups else []

        # Keep only the relevant information for each nearby venue.
        for item in items:
            venue = item['venue']
            venue_categories = venue.get('categories') or []
            rows.append((
                name,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # A venue without categories gets None rather than raising IndexError.
                venue_categories[0]['name'] if venue_categories else None,
            ))

    # Passing columns= to the constructor also works when rows is empty.
    return pd.DataFrame(rows, columns=columns)

In [119]:
# Look up the venues around every postal code in df.
# NOTE(review): df must carry Latitude/Longitude columns here; the cells that
# add them are not visible in this part of the notebook (the original remark
# was "this is a trimmed down version of df").
nearby_venues_df = getNearbyVenues(
    names=df['PostalCode'],
    latitudes=df['Latitude'],
    longitudes=df['Longitude'],
)

In [124]:
#find the number of categories in each postal code 
categories = nearby_venues_df.groupby('PostalCode').mean()

#### clustering starts with a cluster number of 3

In [126]:
# Number of groups to split the postal codes into.
clusters = 3

# A fixed random_state makes the cluster assignment reproducible between runs.
kmeans = KMeans(n_clusters=clusters, random_state=5)
kmeans.fit(categories)

# Sanity check: look at the labels assigned to the first 20 postal codes.
kmeans.labels_[:20]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1])

#### It seems many of them fall into the three categories pretty evenly

In [135]:
# Build the PostalCode -> Cluster lookup from the fitted model. `categories`
# is indexed by postal code and kmeans.labels_ follows its row order.
# (`clusters` is just the integer cluster count and cannot be merged.)
cluster_labels = pd.DataFrame({'PostalCode': categories.index,
                               'Cluster': kmeans.labels_})

# Drop any Cluster column left over from a previous run so re-executing this
# cell does not produce Cluster_x / Cluster_y duplicates.
df = pd.merge(left=df.drop(columns='Cluster', errors='ignore'),
              right=cluster_labels, on='PostalCode')
nearby_venues_df = (
    nearby_venues_df.drop(columns='Cluster', errors='ignore')
                    .merge(right=cluster_labels, on='PostalCode')
)

#### generate maps with different colors representing different clusters

In [132]:

def make_map(df, label_col='PostalCode', center=(34.05, -118.24), zoom_start=11):
    """Draw one circle marker per row of ``df``, coloured by its cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Latitude``, ``Longitude``, ``Cluster`` and ``label_col``.
    label_col : str
        Column whose value is shown in each marker's popup.
    center : (float, float)
        Latitude/longitude the map is centred on. Defaults to Los Angeles;
        the longitude is negative because LA lies west of Greenwich.
    zoom_start : int
        Initial zoom level passed to ``folium.Map``.

    Returns
    -------
    folium.Map
        The map with all markers added.
    """
    # create map
    map_clusters = folium.Map(location=list(center), zoom_start=zoom_start)

    # One Set1 colour per distinct cluster label, keyed by the label itself so
    # the lookup does not depend on labels being 0..k-1.
    cluster_ids = sorted(df['Cluster'].unique())
    palette = cm.Set1(np.linspace(0, 1, len(cluster_ids)))
    color_of = {cid: colors.rgb2hex(rgba) for cid, rgba in zip(cluster_ids, palette)}

    # add markers to the map
    for lat, lon, poi, cluster in zip(df['Latitude'], df['Longitude'], df[label_col], df['Cluster']):
        label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
        folium.CircleMarker(
            [lat, lon],
            radius=5,
            popup=label,
            color=color_of[cluster],
            fill=True,
            fill_color=color_of[cluster],
            fill_opacity=0.5).add_to(map_clusters)

    return map_clusters

In [None]:
# Render the cluster map for df; the returned map is the cell's output.
make_map(df)

#### list the categories of businesses in each cluster

In [136]:
def cluster_summery(df, top=10):
    """Print the most frequent venue categories of every cluster.

    For each label ``0 .. df.Cluster.nunique() - 1`` the rows of that cluster
    are counted per ``Venue Category`` and ordered by descending count. The
    first ``top`` rows of the count table's second column are printed under a
    ``Cluster #<label>`` heading. Nothing is returned.
    """
    n_clusters = df['Cluster'].nunique()
    for cluster_id in range(n_clusters):
        members = df[df['Cluster'] == cluster_id]
        counts = members.groupby('Venue Category').count()
        ranked = counts.sort_values(by='Cluster', ascending=False)
        print(f'Cluster #{cluster_id} summery:')
        # Column 1 of the count table is printed (selected by position).
        print(ranked.iloc[:top, 1:2], '\n')

In [137]:
# Print the 10 most common venue categories in each cluster.
cluster_summery(nearby_venues_df, top=10)

Cluster #0 summery:
                      Venue
Venue Category             
Fast Food Restaurant     13
Coffee Shop              10
Pizza Place               7
Clothing Store            7
Bakery                    6
Breakfast Spot            5
Chinese Restaurant        5
Skating Rink              4
Park                      4
Pharmacy                  4 

Cluster #1 summery:
                     Venue
Venue Category            
Coffee Shop            178
Café                    98
Restaurant              57
Hotel                   44
Italian Restaurant      44
Park                    39
Bar                     38
Japanese Restaurant     37
Bakery                  35
Pizza Place             35 

Cluster #2 summery:
                      Venue
Venue Category             
Pizza Place              12
Coffee Shop              12
Sandwich Place            9
Café                      7
Grocery Store             7
Park                      7
Fast Food Restaurant      6
Fried Chicken Joint     

#### From the result above, we can see that cluster #1 is the most popular area.
#### Conclusion: cluster #1 is the downtown area, with more merchants and lots of restaurants. Clusters #0 and #2 are suburban areas with sparsely distributed merchants.