In [1]:
import pandas as pd
from scipy.spatial.distance import cdist
from geopy.distance import great_circle

In [2]:
def closest_point(point, points):
    """
    Return the entry of `points` that lies nearest to `point`.

    Distances are computed with scipy's `cdist` using its default
    (Euclidean) metric on the raw coordinate values.

    Parameters
    ----------
    point : tuple
        Coordinates of the query location.
    points : sequence of tuple
        Candidate coordinates; must be indexable by integer position.

    Returns
    -------
    tuple
        The element of `points` with the smallest distance to `point`.
    """
    # cdist yields a 1 x len(points) matrix; argmin over the flattened
    # matrix is therefore the position of the nearest candidate.
    distance_row = cdist([point], points)
    nearest_position = distance_row.argmin()
    return points[nearest_position]

def create_zipped_coords_series(df):
    """
    Pair the `lat` and `lon` columns of `df` row by row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing `lat` and `lon` columns.

    Returns
    -------
    list of tuple
        One `(lat, lon)` tuple per row, in row order. Note the result is a
        plain list (latitude first), not a pandas Series.
    """
    latitudes = df['lat']
    longitudes = df['lon']
    return list(zip(latitudes, longitudes))

def unzip_coord_series_to_lon_and_lat(df, zipped_colname):
    """
    Split a column of coordinate tuples back into `lat` and `lon` columns.

    Element 0 of each tuple becomes `lat`, element 1 becomes `lon`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the tuple column. NOTE: `lat` and `lon` are assigned
        on this frame directly, so the caller's object gains those columns.
    zipped_colname : str
        Name of the column containing the coordinate tuples.

    Returns
    -------
    pandas.DataFrame
        A new frame with `lat` and `lon` present and `zipped_colname` removed.
    """
    # Tuple position -> target column; `i=position` binds the index per
    # iteration so each lambda reads its own element.
    for position, target in enumerate(('lat', 'lon')):
        df[target] = df[zipped_colname].apply(lambda pair, i=position: pair[i])
    return df.drop(columns=[zipped_colname])

In [3]:
# Load the coordinate lookup table (one row per reference point, with its
# lat/lon, mean price per m2 and category) from parquet.
coords_map = pd.read_parquet('2019_09_30T18_48_16.parquet')
# Preview the first rows.
coords_map.head()

Unnamed: 0,coords_mean_price_m2,lat,lon,coords_category
0,500.0,52.7072,16.6101,1
1,500.03,53.2665,18.661,2
2,500.62,54.0566,20.6824,3
3,500.95,53.738,23.3934,4
4,501.92,53.316,15.6326,5


In [4]:
# Load the sale dataset from parquet.
sale_df = pd.read_parquet('sale_clean_2019_10_05T09_42_17.parquet')
# NOTE: `df` is a second name for the same object, not a copy, so any
# columns assigned on `df` later also appear on `sale_df`.
df = sale_df
# Preview the first rows.
df.head()

Unnamed: 0,balcony,building_height,building_material,building_type,building_year,desc_len,direct,flat_state,floor,heating,...,internet_no_info,gas_no_info,building_material_brick,building_material_concrete_slab,building_material_other,building_type_apart,building_type_block,building_type_hist,building_type_house,building_type_other
0,1.0,4.0,3,2,0.0,880,0,3,4.0,3.0,...,1,1,1,0,0,0,0,0,0,1
1,0.0,4.0,3,1,61.0,1227,0,1,3.0,3.0,...,1,1,1,0,0,0,1,0,0,0
2,0.0,1.0,3,2,99.0,947,0,3,1.0,3.0,...,1,1,1,0,0,0,0,0,0,1
3,0.0,3.0,1,1,37.0,1537,0,4,3.0,3.0,...,1,1,0,1,0,0,1,0,0,0
4,1.0,2.0,1,1,27.0,1523,0,4,0.0,3.0,...,1,1,0,1,0,0,1,0,0,0


In [5]:
# Build a (lat, lon) tuple per row on both frames so listings can be
# compared against the reference points.
df['temp_coords_tuple'] = create_zipped_coords_series(df)
coords_map['temp_coords_tuple'] = create_zipped_coords_series(coords_map)

# The candidate list is identical for every listing, so build it once
# instead of re-materialising it inside the comprehension for each row.
reference_points = list(coords_map['temp_coords_tuple'])

# assign a closest point
df['temp_coords_closest_tuple'] = [
    closest_point(x, reference_points) for x in df['temp_coords_tuple']
]


In [6]:
# Left-join the reference table onto the listings: each listing's nearest
# reference tuple is matched against the reference table's own tuple, so
# every listing row is kept.
final = df.merge(
    coords_map,
    how='left',
    left_on='temp_coords_closest_tuple',
    right_on='temp_coords_tuple',
    # add temp suffix to indicate duplicate col
    suffixes=('', 'temp'),
)

def get_distance(coords_df):
    """
    Calculate the great-circle distance between two columns of coords tuples.

    Parameters
    ----------
    coords_df : pandas.DataFrame
        Frame whose first two columns (by position) each hold one coordinate
        tuple per row, in a form accepted by geopy's `great_circle`.
        Any further columns are ignored.

    Returns
    -------
    list of float
        One distance per row, in kilometres, rounded to 3 decimal places.
        Empty if `coords_df` has no rows.
    """
    # Walk the two columns in lockstep rather than via iterrows(), which
    # builds a throwaway Series for every row; rounding happens in the same
    # pass instead of a second loop over the results.
    origins = coords_df.iloc[:, 0]
    targets = coords_df.iloc[:, 1]
    return [
        round(great_circle(origin, target).km, 3)
        for origin, target in zip(origins, targets)
    ]

In [7]:

# Distance between each listing's own coordinates and its matched reference
# point, as computed by get_distance.
distance_inputs = final[['temp_coords_tuple', 'temp_coords_closest_tuple']]
final['coords_center_distance'] = get_distance(distance_inputs)

# Remove every helper column whose name contains 'temp' in one call.
temp_columns = [col for col in final.columns if 'temp' in col]
final = final.drop(columns=temp_columns)

In [8]:
# Preview the merged frame after the helper columns were dropped.
final.head()

Unnamed: 0,balcony,building_height,building_material,building_type,building_year,desc_len,direct,flat_state,floor,heating,...,building_material_concrete_slab,building_material_other,building_type_apart,building_type_block,building_type_hist,building_type_house,building_type_other,coords_mean_price_m2,coords_category,coords_center_distance
0,1.0,4.0,3,2,0.0,880,0,3,4.0,3.0,...,0,0,0,0,0,0,1,9795.452994,1365,2.291
1,0.0,4.0,3,1,61.0,1227,0,1,3.0,3.0,...,0,0,0,1,0,0,0,3170.73,775,0.0
2,0.0,1.0,3,2,99.0,947,0,3,1.0,3.0,...,0,0,0,0,0,0,1,4700.798,1142,0.0
3,0.0,3.0,1,1,37.0,1537,0,4,3.0,3.0,...,1,0,0,1,0,0,0,5877.983805,1258,5.144
4,1.0,2.0,1,1,27.0,1523,0,4,0.0,3.0,...,1,0,0,1,0,0,0,1613.543333,231,0.756


In [9]:
# (rows, columns) of the merged frame.
final.shape

(170874, 76)

In [10]:
# Write the merged frame to parquet.
final.to_parquet('morizon_sale_2019_10_05T09_42_17.parquet')
