In [1]:
import pandas as pd
import geopandas as gp
import numpy as np
from scipy import spatial

In [2]:
df_house = pd.read_csv('../Data/Cleaned/House/house_with_neigh.csv')
df_airbnb = pd.read_csv('../Data/Cleaned/Airbnb/airbnb_master_listing_reduced.csv')

Dropping duplicates from airbnb dataset

In [3]:
dedup_airbnb = df_airbnb.drop_duplicates(subset=['id'],keep='last')

In [4]:
print(f'Without duplicates = {dedup_airbnb.shape} with duplicates = {df_airbnb.shape}')

Without duplicates = (27806, 10) with duplicates = (142101, 10)


In [5]:
gpf_house = gp.GeoDataFrame(df_house, crs="EPSG:4326", geometry=gp.points_from_xy(df_house.long,df_house.lat))
gpf_airbnb = gp.GeoDataFrame(dedup_airbnb, crs="EPSG:4326", geometry=gp.points_from_xy(dedup_airbnb.longitude,dedup_airbnb.latitude))

In [6]:
df_house.head(1)

Unnamed: 0,lat,long,sqft,parking,mean_district_income,bedrooms_bg,bedrooms_ag,bathrooms,final_price,type_Att/Row/Twnhouse,...,type_Comm Element Condo,type_Condo Apt,type_Condo Townhouse,type_Detached,type_Link,type_Plex,type_Semi-Detached,type_Store W/Apt/Offc,neighbourhood,geometry
0,43.753182,-79.507053,,4,29958,1,3,2,930000,0,...,0,0,0,1,0,0,0,0,York University Heights,POINT (-79.50705 43.75318)


In [7]:
df_airbnb.head(1)

Unnamed: 0,id,host_id,neighbourhood,latitude,longitude,room_type,price,calculated_host_listings_count,availability_365,file
0,1419,1565,Little Portugal,43.64617,-79.42451,Entire home/apt,468,1,0,04june2019_listings.csv


*The following method take input as two dataset house and airbnb listings, <br>
followed by column name to create and distance for radius <br>
The ouput is number of airbnbs in the neighborhoods based on house*

In [8]:
# distance should be in meteres
def return_neighbors(houses, airbnbs, col_name, distance):
    houses['geometry'] = houses.geometry.to_crs('EPSG:5234')
    airbnbs['geometry'] = airbnbs.geometry.to_crs('EPSG:5234')
    

    list_house = np.array(list(houses.geometry.apply(lambda point: (point.x, point.y))))
    list_airbnb = np.array(list(airbnbs.geometry.apply(lambda point: (point.x, point.y))))
    
    btree = spatial.cKDTree(list_airbnb)
        
    dist = btree.query_ball_point(list_house, distance)
    
    house_gdf = pd.concat(
        [
            houses.reset_index(drop=True),
            pd.Series(dist, name=col_name)
        ], 
        axis=1,)
    
    house_gdf[col_name] = house_gdf[col_name].apply(lambda x: len(x))
    
    return house_gdf

In [9]:
dummy_house_gpf = gpf_house.copy()

In [10]:
dummy_house_gpf = return_neighbors(dummy_house_gpf,gpf_airbnb, 'within_500_m' ,500)

In [11]:
dummy_house_gpf = return_neighbors(dummy_house_gpf,gpf_airbnb, 'within_1_km' ,1000)

In [13]:
dummy_house_gpf['within_500_1km'] = dummy_house_gpf.apply(lambda row: row.within_1_km - row.within_500_m,axis=1)

In [14]:
dummy_house_gpf.head()

Unnamed: 0,lat,long,sqft,parking,mean_district_income,bedrooms_bg,bedrooms_ag,bathrooms,final_price,type_Att/Row/Twnhouse,...,type_Detached,type_Link,type_Plex,type_Semi-Detached,type_Store W/Apt/Offc,neighbourhood,geometry,within_500_m,within_1_km,within_500_1km
0,43.753182,-79.507053,,4,29958,1,3,2,930000,0,...,1,0,0,0,0,York University Heights,POINT (-1388957.608 14388213.367),26,45,19
1,43.75309,-79.491536,1300.0,1,29958,0,3,3,413000,0,...,0,0,0,0,0,York University Heights,POINT (-1390210.682 14387913.391),12,31,19
2,43.75309,-79.491536,1300.0,3,29958,1,3,2,400000,0,...,0,0,0,0,0,York University Heights,POINT (-1390210.682 14387913.391),12,31,19
3,43.753177,-79.490821,950.0,1,29958,0,2,1,343000,0,...,0,0,0,0,0,York University Heights,POINT (-1390265.892 14387889.376),11,28,17
4,43.753177,-79.490821,650.0,1,29958,0,1,1,318000,0,...,0,0,0,0,0,York University Heights,POINT (-1390265.892 14387889.376),11,28,17


In [15]:
dummy_house_gpf.to_csv('../Data/Cleaned/House/House_num_airbnb.csv', index = False)