In [32]:
import pandas as pd
# Read the raw Airbnb listings export.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns.
air_df = pd.read_csv(
    '../data/raw/Airbnb_Open_Data.csv',
    low_memory=False,
)

In [33]:
# Columns dropped from the Airbnb listings as unnecessary.
AIRBNB_UNUSED_COLUMNS = [
    'host id',
    'host_identity_verified',
    'host name',
    'country',
    'country code',
    'instant_bookable',
    'cancellation_policy',
    'Construction year',
    'service fee',
    'last review',
    'reviews per month',
    'calculated host listings count',
    'availability 365',
    'license',
]

# Re-assign instead of mutating in place, and ignore labels that are already
# gone, so this cell can be re-run without raising KeyError on the second run.
# Rows without coordinates are removed as well.
air_df = (
    air_df
    .drop(columns=AIRBNB_UNUSED_COLUMNS, errors='ignore')
    .dropna(subset=['lat', 'long'])
)
air_df.head()

Unnamed: 0,id,NAME,neighbourhood group,neighbourhood,lat,long,room type,price,minimum nights,number of reviews,review rate number,house_rules
0,1001254,Clean & quiet apt home by the park,Brooklyn,Kensington,40.64749,-73.97237,Private room,$966,10.0,9.0,4.0,Clean up and treat the home the way you'd like...
1,1002102,Skylit Midtown Castle,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,$142,30.0,45.0,4.0,Pet friendly but please confirm with me if the...
2,1002403,THE VILLAGE OF HARLEM....NEW YORK !,Manhattan,Harlem,40.80902,-73.9419,Private room,$620,3.0,0.0,5.0,"I encourage you to use my kitchen, cooking and..."
3,1002755,,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,$368,30.0,270.0,4.0,
4,1003689,Entire Apt: Spacious Studio/Loft by central park,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,$204,10.0,9.0,3.0,"Please no smoking in the house, porch or on th..."


In [41]:
# Coerce both coordinate columns to numbers; anything that cannot be parsed
# becomes NaN and is flagged by the masks below.
lat_numeric = pd.to_numeric(air_df['lat'], errors='coerce')
long_numeric = pd.to_numeric(air_df['long'], errors='coerce')

mask_bad_lat = lat_numeric.isna()
mask_bad_long = long_numeric.isna()

# Rows where at least one of the two coordinates is not numeric.
bad_rows = air_df.loc[mask_bad_lat | mask_bad_long]
print(f"{len(bad_rows)} lignes non numériques trouvées :")
# display(bad_rows[['id', 'lat', 'long']])  # uncomment to inspect the offending rows

0 lignes non numériques trouvées :


In [35]:
# Load the common-places dataset, drop the columns listed below, and remove
# rows that have no geometry.
COMMON_PLACE_UNUSED_COLUMNS = [
    'SEGMENTID',
    'COMPLEXID',
    'SAFTYPE',
    'BIN',
    'SOURCE',
    'SOS INDICATOR',
    'SOURCE ID',
    'CREATED_BY',
    'CREATED_DATE',
    'MODIFIED_BY',
    'MODIFIED_DATE',
    'B7SC',
    'PRIMARY ADDRESS POINT ID',
    'SECURITY LEVEL',
]

# dropna() returns a new frame; the result must be assigned, otherwise rows
# with a missing 'the_geom' silently stay in common_df.
common_df = (
    pd.read_csv('../data/raw/CommonPlace_20251109.csv')
    .drop(columns=COMMON_PLACE_UNUSED_COLUMNS)
    .dropna(subset=['the_geom'])
)
common_df


Unnamed: 0,the_geom,PLACEID,OBJECTID,FACILITY DOMAINS,BOROUGH CODE,FACILITY TYPE,FEATURE NAME
0,POINT (-74.097961931446 40.634604200807),11947,10555,2,5.0,2,IS 61 WILLIAM A MORRIS
1,POINT (-73.981379489555 40.589105561411),12280,5120,3,3.0,2,PS 721 BROOKLYN OCCUPATIONAL TRAINING CENTER
2,POINT (-73.943478646583 40.724827480825),1036755,20345,10,3.0,4,MCGOLRICK PLAYGROUND COMFORT STATION
3,POINT (-73.858490015009 40.708424926703),3068,7507,2,4.0,7,HOME DEPOT WOODHAVEN BLVD
4,POINT (-74.024085858699 40.672444940613),6914,7005,11,3.0,6,BAY RIDGE CHANNEL LIGHTED GONG BUOY 11
...,...,...,...,...,...,...,...
20581,POINT (-73.844473884361 40.85251373917),1041027,21203,5,2.0,1,EINSTEIN COLLEGE EASTCHESTER ROAD RESIDENCE CO...
20582,POINT (-73.908288302158 40.752918727974),1041002,21206,3,4.0,2,MOTION PICTURE TECHNICAL HIGH SCHOOL
20583,POINT (-73.846407149463 40.852797606314),1041031,21207,8,2.0,2,EINSTEIN COLLEGE VAN ETTEN BUILDING
20584,POINT (-73.845280010227 40.850863253613),1027172,17110,7,2.0,2,EINSTEIN COLLEGE BELFER CENTER


In [36]:
# Strip the 'POINT ' prefix and the surrounding parentheses, leaving the two
# space-separated numbers.
coords_text = (
    common_df['the_geom']
    .str.replace('POINT ', '', regex=False)
    .str.strip('()')
)
common_df['the_geom'] = coords_text

# Split into longitude and latitude (in that order) and convert to float.
lon_lat = coords_text.str.split(' ', expand=True).astype(float)
common_df[['longitude', 'latitude']] = lon_lat

In [40]:
import numpy as np
from sklearn.neighbors import BallTree

# Use a clean 0..n-1 index on both frames so that the row positions returned
# by the BallTree line up with DataFrame rows.
air_df = air_df.reset_index(drop=True)
common_df = common_df.reset_index(drop=True)

# The haversine metric works on [latitude, longitude] pairs in radians.
airbnb_rad = np.deg2rad(air_df[['lat', 'long']])
lieux_rad = np.deg2rad(common_df[['latitude', 'longitude']])

lieux_tree = BallTree(lieux_rad, metric='haversine')

X = 5  # number of nearest places to keep for each listing
EARTH_RADIUS_KM = 6371  # converts haversine distances (radians) to kilometres

# distances / indices have one row per listing and X columns.
distances, indices = lieux_tree.query(airbnb_rad, k=X)
distances_km = distances * EARTH_RADIUS_KM

# Build the long-format table (one row per listing/place pair) in a single
# vectorised step instead of a Python double loop with scalar .loc lookups.
# Row order is unchanged: all X neighbours of listing 0, then listing 1, ...
df_proximite_airbnb = pd.DataFrame({
    'id_airbnb': np.repeat(air_df['id'].to_numpy(), X),
    'id_lieu': common_df['PLACEID'].to_numpy()[indices.ravel()],
    'distance_km': distances_km.ravel(),
})
df_proximite_airbnb.head()

Unnamed: 0,id_airbnb,id_lieu,distance_km
0,1001254,1027376,0.072064
1,1001254,14710,0.168645
2,1001254,16271,0.227847
3,1001254,6297,0.271033
4,1001254,1006705,0.285519
