## Part 1: Distance statistics between nightlife businesses within each neighborhood

In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np 
from scipy import stats
from sklearn.neighbors import NearestNeighbors

In [2]:
nightlife = pd.read_csv('XYZ_wOutliers.csv')

In [3]:
#Get all rows that have longlats within the zip code boundaries
#1.8% of all data has points outside the zip code boundaries which will not be counted in this calculation

nightlife_within = nightlife[nightlife['outsideZip'] == 0]

In [4]:
#Create list of all neighborhoods
hoods = nightlife_within['Neighborho'].unique()

In [5]:
#function to calculate distance
def distance(p1, p2):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees)
    """
    lon1, lat1 = p1
    lon2, lat2 = p2
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    km = 6367 * c
    ft = km * 3280.84
    return ft

In [6]:
#create new dataframe
columns = ['mean (ft)','min (ft)','max (ft)','variance','skewness']
finalStats = pd.DataFrame(index=hoods, columns=columns)

In [7]:
#calculate distance statistics in feet
for x in hoods:
    workingHood = nightlife_within[nightlife_within['Neighborho'] == x].copy()
    
    points = [(x, y) for x,y in zip(workingHood['latitude'], workingHood['longitude'])]
    nbrs = NearestNeighbors(n_neighbors=2, metric=distance).fit(points)
    distances, indices = nbrs.kneighbors(points)
    result = distances[:, 1]
    
    finalStats.loc[x, 'mean (ft)'] = result.mean()
    finalStats.loc[x, 'min (ft)'] = result.min()
    finalStats.loc[x, 'max (ft)'] = result.max()
    finalStats.loc[x, 'variance'] = result.var()
    finalStats.loc[x, 'skewness'] = stats.skew(result)
    
finalStats

Unnamed: 0,mean (ft),min (ft),max (ft),variance,skewness
Elmhurst,111.577,4.22628,884.339,22286.4,2.55031
Chelsea,61.1815,2.35878,49370.2,1049490.0,46.0548
South Bronx,96.4843,5.21117,465.701,7927.57,1.78995
Williamsburg,67.1671,1.62975,540.753,3944.66,2.68974
Astoria/LIC,91.4487,3.75649,10790.5,144471.0,25.4733
Bed-Stuy,63.5672,1.46827,614.529,6687.12,3.11284
Ridgewood,117.782,5.67773,1102.88,19269.2,3.94466
LES,34.9395,3.2355,471.175,1304.53,5.17127
Inwood,63.2641,5.74869,1354.21,12375.7,8.67937
Flushing,74.9585,2.78351,1450.77,17931.5,4.9472


In [17]:
finalStats.to_csv('neighborhoodsDistanceStats.csv')

## Part 2: Category variety within each neighborhood (XYZ vs. Yelp)

In [8]:
#new pivot for neighborhoods and their business categories
xyzCat = pd.crosstab(nightlife.Neighborho, nightlife.primary_ca)

Calculate percent of category of all categories per neighborhood

In [9]:
#new column for total num businesses/neighborhood
xyzCat['Total'] = xyzCat.sum(axis=1)

In [10]:
for x in range(len(xyzCat)):
    for y in range(14):
        xyzCat.iloc[x,y] = (xyzCat.iloc[x,y]/xyzCat.iloc[x,14]*100).round(2)

Calculate percent of category of all categories in all neighborhoods

In [11]:
#new table for categories
xyzCat_counts = nightlife.groupby(['primary_ca']).size().reset_index(name='counts')

In [12]:
#count of businesses
total = xyzCat_counts['counts'].sum()

In [13]:
#calculate percent of total (percent distribution)
xyzCat_counts['nycPercentDist'] = (xyzCat_counts['counts']/total*100).round(2)
xyzCat_counts

Unnamed: 0,primary_ca,counts,nycPercentDist
0,Arts & Culture,471,5.51
1,Body,4,0.05
2,Drinks,1212,14.18
3,Entertainment,291,3.4
4,Essentials,562,6.57
5,Fashion,9,0.11
6,Fitness,3,0.04
7,Food,5448,63.73
8,Groups,220,2.57
9,Home & Hobby,16,0.19


Use the new column <b>xyzCat_counts['nycPercentDist']</b> as threshold for being counted into variety within a neighborhood

In [14]:
#new dataframe xyzCat, add new column for num of existing categories per neighborhood and for variety count

xyzCat_new = xyzCat.drop(['Total'],axis=1)
xyzCat_new['NumCategories'] = 0
xyzCat_new['VarietyCount'] = 0

In [15]:
#pos = count num of existing categories per neighborhood
#tally = count number of existing categories that have a percentage equal to or greater than the city distribution

for x in range(len(xyzCat_new)):
    pos = 0
    tally = 0
    for y in range(14):
        if xyzCat_new.iloc[x,y] > 0:
            pos += 1
        if ((xyzCat_new.iloc[x,y]) >= (xyzCat_counts.iloc[y, 2])):
            tally += 1
    xyzCat_new.iloc[x,14] = pos
    xyzCat_new.iloc[x,15] = tally

In [16]:
xyzCat_new['VarietyPerc'] = (xyzCat_new['VarietyCount']/xyzCat_new['NumCategories']*100).round(2)
xyzCat_new

primary_ca,Arts & Culture,Body,Drinks,Entertainment,Essentials,Fashion,Fitness,Food,Groups,Home & Hobby,Lodging,Misc,Parks & Rec,Services,NumCategories,VarietyCount,VarietyPerc
Neighborho,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Astoria/LIC,2.35,0.11,15.79,1.9,4.26,0.0,0.11,69.09,1.79,0.22,4.26,0.0,0.0,0.11,11,7,63.64
Bed-Stuy,3.54,0.13,12.71,0.79,11.8,0.13,0.0,66.06,3.54,0.13,1.18,0.0,0.0,0.0,10,5,50.0
Bed-Stuy/Bushwick,5.08,0.0,9.6,1.13,13.56,0.56,0.56,64.97,2.82,0.0,1.69,0.0,0.0,0.0,9,5,55.56
Bushwick,2.42,0.0,11.29,1.08,5.65,0.0,0.0,75.81,2.15,0.27,0.81,0.54,0.0,0.0,9,3,33.33
Chelsea,8.45,0.04,16.42,6.4,1.37,0.12,0.0,56.46,3.18,0.16,7.28,0.08,0.04,0.0,12,7,58.33
Elmhurst,0.0,0.0,8.23,1.73,8.66,0.0,0.0,77.06,2.6,0.0,1.73,0.0,0.0,0.0,6,3,50.0
Flushing,1.31,0.0,6.99,3.06,1.31,0.0,0.0,83.84,0.44,0.22,2.84,0.0,0.0,0.0,8,2,25.0
Greenpoint,4.37,0.44,18.78,3.93,6.11,0.0,0.0,58.52,6.55,0.44,0.87,0.0,0.0,0.0,9,5,55.56
Harlem,3.52,0.0,7.54,3.77,17.84,0.0,0.0,63.82,2.26,0.0,1.01,0.25,0.0,0.0,8,4,50.0
Inwood,1.55,0.0,7.25,1.55,13.99,0.0,0.0,74.61,0.52,0.0,0.0,0.52,0.0,0.0,7,3,42.86


In [18]:
xyzCat_new.to_csv('neighborhoodsVariety.csv')