# Segmenting and Clustering Neighborhoods in Dallas
By Alex P. Blizzard

## Data Preprocessing

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import requests
import os
from sklearn.cluster import KMeans
import folium 
!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim 
import matplotlib.cm as cm
import matplotlib.colors as colors

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs:
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    geopy-1.22.0               |     pyh9f0ad1d_0          63 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          97 KB

The following NEW packages will be INSTALLED:

  geographiclib      conda-forge/noarch::geographiclib-1.50-py_0
  geopy              conda-forge/noarch::geopy-1.22.0-pyh9f0ad1d_0



Downloading and Extracting Packages
geopy-1.22.0         | 63 KB     | ##################################### | 100% 
geographiclib-1.50   | 34 KB     | ###############################

In [2]:
#Import  Dallas data
dallas_df = pd.read_csv('Dallas_Data.csv')
print('Dallas data read successfully!')
dallas_df.set_index('Neighborhood')

Dallas data read successfully!


Unnamed: 0_level_0,Latitude,Longitude
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1
Baylor District,32.797826,-96.778186
The Cedars,32.769897,-96.785852
Civic Center District,32.776255,-96.7985
Dallas Arts District,32.788885,-96.798924
Dallas Farmers Market,32.778543,-96.788246
Deep Ellum,32.784001,-96.778559
Design District,32.789055,-96.82159
Main Street District,32.780931,-96.798114
Reunion District,32.773613,-96.807676
Riverfront District,32.768468,-96.812929


In [3]:
# Create map of downtown Dallas
map_dallas = folium.Map(location=[32.7791, -96.8003], zoom_start=13)
for lat, lng, label in zip(dallas_df['Latitude'], dallas_df['Longitude'], \
dallas_df['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_dallas)  
    
map_dallas

In [4]:
#Define Foursquare credientials
CLIENT_ID = 'UXIK2GMZJZPTNAV4P5XYRUFRJIK2BDHKBUAAYV4HRBQVLRLI' # your Foursquare ID
CLIENT_SECRET = '35SMWRDEEFGMXGSVETWBVLNKMZWRUQSXMU4MZLSFD430SZZL' # your Foursquare Secret
VERSION = '20180604'

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: UXIK2GMZJZPTNAV4P5XYRUFRJIK2BDHKBUAAYV4HRBQVLRLI
CLIENT_SECRET:35SMWRDEEFGMXGSVETWBVLNKMZWRUQSXMU4MZLSFD430SZZL


In [5]:
#Define API from Foursquare
LIMIT = 100
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
        
        results = requests.get(url).json()["response"]['groups'][0]['items']
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

#Get nearby venues in Dallas
dallas_venues = getNearbyVenues(names=dallas_df['Neighborhood'],
latitudes=dallas_df['Latitude'], longitudes=dallas_df['Longitude'])
dallas_venues.groupby('Neighborhood').count()

Baylor District
The Cedars
Civic Center District
Dallas Arts District
Dallas Farmers Market
Deep Ellum
Design District
Main Street District
Reunion District
Riverfront District
South Side
Thanksgiving Commercial Center
Uptown
Victory Park
West End Historic District


Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Baylor District,27,27,27,27,27,27
Civic Center District,28,28,28,28,28,28
Dallas Arts District,37,37,37,37,37,37
Dallas Farmers Market,21,21,21,21,21,21
Deep Ellum,50,50,50,50,50,50
Design District,23,23,23,23,23,23
Main Street District,100,100,100,100,100,100
Reunion District,22,22,22,22,22,22
Riverfront District,6,6,6,6,6,6
South Side,25,25,25,25,25,25


## Data Processing

In [6]:
#Analyzing each neighborhood
dallas_onehot = pd.get_dummies(dallas_venues[['Venue Category']], prefix="", prefix_sep="")
dallas_onehot['Neighborhood'] = dallas_venues['Neighborhood'] 
fixed_columns = [dallas_onehot.columns[-1]] + list(dallas_onehot.columns[:-1])
dallas_onehot = dallas_onehot[fixed_columns]

#Group average Neighborhood
dallas_grouped = dallas_onehot.groupby('Neighborhood').mean().reset_index()

#5 most common venues
num_top_venues = 10
for hood in dallas_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = dallas_grouped[dallas_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Baylor District----
                venue  freq
0         Pizza Place  0.11
1      Discount Store  0.07
2  Mexican Restaurant  0.07
3      Sandwich Place  0.07
4          Taco Place  0.07
5   Convenience Store  0.04
6         Supermarket  0.04
7        Dance Studio  0.04
8          Donut Shop  0.04
9         Coffee Shop  0.04


----Civic Center District----
                venue  freq
0               Hotel  0.18
1                Café  0.07
2               Plaza  0.07
3                 Bar  0.07
4         Coffee Shop  0.07
5         IT Services  0.04
6  Mexican Restaurant  0.04
7   Indian Restaurant  0.04
8   Mobile Phone Shop  0.04
9           Gift Shop  0.04


----Dallas Arts District----
                     venue  freq
0               Food Truck  0.11
1    Performing Arts Venue  0.08
2               Steakhouse  0.08
3                  Theater  0.05
4      American Restaurant  0.05
5               Art Museum  0.05
6      Japanese Restaurant  0.05
7                      Gym  0.03


In [7]:
#Define and sort 10 most common venues
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]
num_top_venues = 10
indicators = ['st', 'nd', 'rd']

#Create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

#Create a new dataframe of sorted neighborhoods
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = dallas_grouped['Neighborhood']

for ind in np.arange(dallas_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(dallas_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Baylor District,Pizza Place,Discount Store,Sandwich Place,Mexican Restaurant,Taco Place,Grocery Store,Mobile Phone Shop,Donut Shop,Nail Salon,Fast Food Restaurant
1,Civic Center District,Hotel,Coffee Shop,Bar,Café,Plaza,IT Services,Seafood Restaurant,Salad Place,Cocktail Bar,Department Store
2,Dallas Arts District,Food Truck,Performing Arts Venue,Steakhouse,American Restaurant,Japanese Restaurant,Art Museum,Theater,Sandwich Place,Seafood Restaurant,Dog Run
3,Dallas Farmers Market,American Restaurant,Farmers Market,Food Truck,Vietnamese Restaurant,Pool,Dessert Shop,Sandwich Place,Dog Run,Coffee Shop,Design Studio
4,Deep Ellum,Bar,BBQ Joint,Music Venue,American Restaurant,Mexican Restaurant,Dessert Shop,Dive Bar,Coffee Shop,Art Gallery,Pizza Place


## Data Modeling

In [8]:
#Clustering Dallas venues data
kclusters = 5
dallas_grouped_clustering = dallas_grouped.drop('Neighborhood', 1)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(dallas_grouped_clustering)
kmeans.labels_[0:10] 
neighborhoods_venues_sorted.insert(0, 'Cluster Labels2', kmeans.labels_)
dallas_merged = dallas_df

# Merge Dallas data
dallas_merged = dallas_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
dallas_merged.head()

Unnamed: 0,Neighborhood,Latitude,Longitude,Cluster Labels2,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Baylor District,32.797826,-96.778186,4,Pizza Place,Discount Store,Sandwich Place,Mexican Restaurant,Taco Place,Grocery Store,Mobile Phone Shop,Donut Shop,Nail Salon,Fast Food Restaurant
1,The Cedars,32.769897,-96.785852,0,Skate Park,Food Court,Museum,History Museum,Art Gallery,Coffee Shop,Pharmacy,Dive Bar,BBQ Joint,Dog Run
2,Civic Center District,32.776255,-96.7985,4,Hotel,Coffee Shop,Bar,Café,Plaza,IT Services,Seafood Restaurant,Salad Place,Cocktail Bar,Department Store
3,Dallas Arts District,32.788885,-96.798924,1,Food Truck,Performing Arts Venue,Steakhouse,American Restaurant,Japanese Restaurant,Art Museum,Theater,Sandwich Place,Seafood Restaurant,Dog Run
4,Dallas Farmers Market,32.778543,-96.788246,1,American Restaurant,Farmers Market,Food Truck,Vietnamese Restaurant,Pool,Dessert Shop,Sandwich Place,Dog Run,Coffee Shop,Design Studio


## Results

In [9]:
#Locate and show Cluster 0
cluster_0 = dallas_merged.loc[dallas_merged['Cluster Labels2'] == 0]
cluster_0

Unnamed: 0,Neighborhood,Latitude,Longitude,Cluster Labels2,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,The Cedars,32.769897,-96.785852,0,Skate Park,Food Court,Museum,History Museum,Art Gallery,Coffee Shop,Pharmacy,Dive Bar,BBQ Joint,Dog Run


In [10]:
#Locate and show Cluster 1
cluster_1 = dallas_merged.loc[dallas_merged['Cluster Labels2'] == 1]
cluster_1

Unnamed: 0,Neighborhood,Latitude,Longitude,Cluster Labels2,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,Dallas Arts District,32.788885,-96.798924,1,Food Truck,Performing Arts Venue,Steakhouse,American Restaurant,Japanese Restaurant,Art Museum,Theater,Sandwich Place,Seafood Restaurant,Dog Run
4,Dallas Farmers Market,32.778543,-96.788246,1,American Restaurant,Farmers Market,Food Truck,Vietnamese Restaurant,Pool,Dessert Shop,Sandwich Place,Dog Run,Coffee Shop,Design Studio
5,Deep Ellum,32.784001,-96.778559,1,Bar,BBQ Joint,Music Venue,American Restaurant,Mexican Restaurant,Dessert Shop,Dive Bar,Coffee Shop,Art Gallery,Pizza Place
10,South Side,32.765736,-96.79498,1,Music Venue,Bar,American Restaurant,Coffee Shop,Rock Club,Business Service,Record Shop,Pub,Cocktail Bar,Convenience Store
12,Uptown,32.802121,-96.800784,1,Cocktail Bar,Bar,Seafood Restaurant,Pizza Place,Café,Gym,Italian Restaurant,Burger Joint,Breakfast Spot,Salon / Barbershop


In [11]:
#Locate and show Cluster 2
cluster_2 = dallas_merged.loc[dallas_merged['Cluster Labels2'] == 2]
cluster_2

Unnamed: 0,Neighborhood,Latitude,Longitude,Cluster Labels2,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
9,Riverfront District,32.768468,-96.812929,2,Liquor Store,BBQ Joint,Nightclub,Gay Bar,Business Service,Design Studio,Dive Bar,Discount Store,Diner,Dessert Shop


In [12]:
#Locate and show Cluster 3
cluster_3 = dallas_merged.loc[dallas_merged['Cluster Labels2'] == 3]
cluster_3

Unnamed: 0,Neighborhood,Latitude,Longitude,Cluster Labels2,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,Design District,32.789055,-96.82159,3,Art Gallery,Furniture / Home Store,Antique Shop,Mexican Restaurant,Convenience Store,Shipping Store,Brewery,Pizza Place,Steakhouse,Bridal Shop


In [13]:
#Locate and show Cluster 4
cluster_4 = dallas_merged.loc[dallas_merged['Cluster Labels2'] == 4]
cluster_4

Unnamed: 0,Neighborhood,Latitude,Longitude,Cluster Labels2,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Baylor District,32.797826,-96.778186,4,Pizza Place,Discount Store,Sandwich Place,Mexican Restaurant,Taco Place,Grocery Store,Mobile Phone Shop,Donut Shop,Nail Salon,Fast Food Restaurant
2,Civic Center District,32.776255,-96.7985,4,Hotel,Coffee Shop,Bar,Café,Plaza,IT Services,Seafood Restaurant,Salad Place,Cocktail Bar,Department Store
7,Main Street District,32.780931,-96.798114,4,Hotel,Coffee Shop,Mexican Restaurant,Sandwich Place,Bar,Cocktail Bar,Salad Place,Park,Café,Sports Bar
8,Reunion District,32.773613,-96.807676,4,Hotel,American Restaurant,Scenic Lookout,Plaza,Bar,Restaurant,Event Space,Coffee Shop,Beer Garden,Food Truck
11,Thanksgiving Commercial Center,32.784866,-96.796783,4,Gym,Coffee Shop,Cocktail Bar,Italian Restaurant,Hotel,Steakhouse,Taco Place,New American Restaurant,Boutique,Sandwich Place
13,Victory Park,32.788206,-96.810155,4,Restaurant,Lounge,Sports Bar,Bar,Yoga Studio,Cocktail Bar,Ramen Restaurant,Furniture / Home Store,Pizza Place,Pharmacy
14,West End Historic District,32.780877,-96.807525,4,History Museum,Sandwich Place,Plaza,Hotel,Music Venue,Liquor Store,Pharmacy,Coffee Shop,Convenience Store,Lounge


In [15]:
# Create clusters map of downtown Dallas
map_clusters = folium.Map(location=[32.7791, -96.8003], zoom_start=13)
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Assign colors to cluster markers
markers_colors = []
for lat, lon, poi, cluster in zip(dallas_merged['Latitude'], dallas_merged['Longitude'], \
    dallas_merged['Neighborhood'], dallas_merged['Cluster Labels2']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters