In [1]:
import pandas as pd
from scipy.spatial.distance import cdist

In [2]:
def closest_point(point, points):
    """Return the element of ``points`` that lies nearest to ``point``.

    Args:
        point: A single coordinate tuple.
        points: An indexable sequence of coordinate tuples to search.

    Returns:
        The entry of ``points`` with the smallest ``cdist`` distance to
        ``point`` (``cdist`` is called with its default metric). On ties the
        first such entry wins, because ``argmin`` returns the first minimum.
    """
    # cdist yields a 1 x len(points) matrix; argmin() on it is the flat
    # index, which equals the position inside ``points``.
    distances = cdist([point], points)
    nearest_idx = distances.argmin()
    return points[nearest_idx]

def create_zipped_coords_series(df):
    """Pair the ``lat`` and ``lon`` columns of ``df`` row by row.

    Args:
        df: Object exposing ``'lat'`` and ``'lon'`` columns via ``df[...]``.

    Returns:
        A plain list (not a pandas Series) of ``(lat, lon)`` tuples, one per
        row, in row order. Note the order inside each tuple: latitude first.
    """
    latitudes = df['lat']
    longitudes = df['lon']
    return list(zip(latitudes, longitudes))

def unzip_coord_series_to_lon_and_lat(df, zipped_colname):
    """Split a column of ``(lat, lon)`` tuples into ``lat`` and ``lon`` columns.

    Inverse of ``create_zipped_coords_series``: element 0 of every tuple goes
    to ``lat`` and element 1 to ``lon``.

    Args:
        df: DataFrame containing the zipped coordinate column.
        zipped_colname: Name of the column holding the coordinate tuples.

    Returns:
        A new DataFrame with ``lat`` and ``lon`` set from the tuples and the
        ``zipped_colname`` column removed. The input ``df`` is not modified.
    """
    coords = df[zipped_colname]
    # assign() returns a new frame, so the caller's DataFrame is left as-is
    # (the previous implementation wrote lat/lon into the caller's frame and
    # then returned a different, dropped copy).
    unzipped = df.assign(
        lat=coords.apply(lambda pair: pair[0]),
        lon=coords.apply(lambda pair: pair[1]),
    )
    return unzipped.drop(zipped_colname, axis=1)

In [3]:
# Reference table of coordinates: one row per (lat, lon) point together with
# its coords_mean_price_m2 and coords_category values.
coords_map = pd.read_parquet(path='2019_09_30T18_48_16.parquet')
coords_map.head(n=5)

Unnamed: 0,coords_mean_price_m2,lat,lon,coords_category
0,500.0,52.7072,16.6101,1
1,500.03,53.2665,18.661,2
2,500.62,54.0566,20.6824,3
3,500.95,53.738,23.3934,4
4,501.92,53.316,15.6326,5


In [4]:
# Cleaned sale offers; this frame also carries its own lat / lon columns.
sale_df = pd.read_parquet('sale_clean_2019_09_09T19_29_51.parquet')
# Work on a copy: later cells add helper columns to `df`, and a plain alias
# (`df = sale_df`) would silently add them to the loaded frame as well.
df = sale_df.copy()
df.head()

Unnamed: 0,balcony,building_height,building_material,building_type,building_year,desc_len,direct,flat_state,floor,heating,...,internet_no_info,gas_no_info,building_material_brick,building_material_concrete_slab,building_material_other,building_type_apart,building_type_block,building_type_hist,building_type_house,building_type_other
0,1.0,4.0,3,2,0.0,880,0,3,4.0,3.0,...,1,1,1,0,0,0,0,0,0,1
1,0.0,4.0,3,1,61.0,1227,0,1,3.0,3.0,...,1,1,1,0,0,0,1,0,0,0
2,0.0,1.0,3,2,99.0,947,0,3,1.0,3.0,...,1,1,1,0,0,0,0,0,0,1
3,0.0,3.0,1,1,37.0,1537,0,4,3.0,3.0,...,1,1,0,1,0,0,1,0,0,0
4,1.0,2.0,1,1,27.0,1523,0,4,0.0,3.0,...,1,1,0,1,0,0,1,0,0,0


In [5]:
# Give both frames a (lat, lon) tuple column so points can be compared and
# later used as a join key.
df['coords_tuple'] = create_zipped_coords_series(df)
coords_map['coords_tuple'] = create_zipped_coords_series(coords_map)

# Build the candidate list once instead of once per row of `df`.
candidate_points = list(coords_map['coords_tuple'])

# assign a closest point
# NOTE(review): closest_point compares raw (lat, lon) values through cdist,
# i.e. not a geographic distance; rows with NaN coordinates are not treated
# specially here - confirm that is acceptable for this data.
df['coords_closest_tuple'] = [
    closest_point(offer_point, candidate_points) for offer_point in df['coords_tuple']
]


In [11]:
# Attach the coords_map attributes of the nearest reference point to every
# row of `df`. Columns present in both frames keep their name on the left
# side and get the 'duplicate' suffix on the right side.
final = pd.merge(
    df,
    coords_map,
    left_on='coords_closest_tuple',
    right_on='coords_tuple',
    how='left',
    suffixes=('', 'duplicate'),
)

# Remove the join helpers (any column whose name contains 'tuple') and the
# right-hand copies (name contains 'duplicate') in a single drop, instead of
# re-creating the whole frame once per dropped column.
helper_cols = [col for col in final.columns if 'tuple' in col or 'duplicate' in col]
final = final.drop(columns=helper_cols)

In [12]:
# TODO: add a feature with the distance from each (lat, lon) point to a center coordinate; see https://stackoverflow.com/questions/19412462/getting-distance-between-two-points-based-on-latitude-longitude

balcony
building_height
building_material
building_type
building_year
desc_len
direct
flat_state
floor
heating
lat
lift
lon
market_type
offer_id
parking
price
price_m2
promotion_counter
room_n
size
taras
view_count
basement
telecom
driveway
fence
parking_spot
date_added_days_from_2018
date_refreshed_days_from_2018
furniture
kitchen_furniture
foor_n
internet
water
gas
electricity
sewers
balcony_no_info
bthrm_wc_together_no_info
building_height_no_info
building_material_no_info
building_year_no_info
celling_height_no_info
energy_cons_est_no_info
floor_no_info
heating_no_info
kitchen_type_no_info
lat_no_info
lift_no_info
lon_no_info
ownership_type_no_info
parking_no_info
room_n_no_info
taras_no_info
basement_no_info
telecom_no_info
driveway_no_info
fence_no_info
parking_spot_no_info
furniture_no_info
kitchen_furniture_no_info
foor_n_no_info
internet_no_info
gas_no_info
building_material_brick
building_material_concrete_slab
building_material_other
building_type_apart
building_type_block
b

In [14]:
# Preview the merged frame: the coords_mean_price_m2 and coords_category
# columns taken from coords_map appear at the end.
final.head(n=5)

Unnamed: 0,balcony,building_height,building_material,building_type,building_year,desc_len,direct,flat_state,floor,heating,...,building_material_brick,building_material_concrete_slab,building_material_other,building_type_apart,building_type_block,building_type_hist,building_type_house,building_type_other,coords_mean_price_m2,coords_category
0,1.0,4.0,3,2,0.0,880,0,3,4.0,3.0,...,1,0,0,0,0,0,0,1,9795.452994,1365
1,0.0,4.0,3,1,61.0,1227,0,1,3.0,3.0,...,1,0,0,0,1,0,0,0,3170.73,775
2,0.0,1.0,3,2,99.0,947,0,3,1.0,3.0,...,1,0,0,0,0,0,0,1,4700.798,1142
3,0.0,3.0,1,1,37.0,1537,0,4,3.0,3.0,...,0,1,0,0,1,0,0,0,5877.983805,1258
4,1.0,2.0,1,1,27.0,1523,0,4,0.0,3.0,...,0,1,0,0,1,0,0,0,1613.543333,231
