In [1]:
import pandas as pd
import shapely
import geopandas as gpd

from tqdm.auto import tqdm

In [2]:
crmls = pd.read_csv('data/all_listings_lat_long_city_crmls.csv')
for i in range(2,47):
    crmls = pd.concat([crmls, pd.read_csv(f'data/all_listings_lat_long_city_crmls_{i}.csv')])
len(crmls)

11178605

In [3]:
crmls.head()

Unnamed: 0,mls_area_major,city,county_or_parish,latitude,longitude
0,61,STAN,OR,33.784077,-117.989219
1,82,BP,OR,33.849421,-117.983828
2,CS,CDM,OR,33.591342,-117.857918
3,TUL,TUL,LA,34.144482,-118.364059
4,M3,LAM,LA,33.90745,-117.992478


In [4]:
lookups = pd.read_csv('data/CRMLS_mapping_fields_and_lookups.csv', delimiter='\t')
lookups.head()

Unnamed: 0,resource,field_long,field_short,searchable,data_type,lookup_long,lookup_short,raw_value
0,Property,City,City,True,Character,Wrightwood,WRIW,WRIW
1,Property,City,City,True,Character,Acton,AC,AC
2,Property,City,City,True,Character,Agoura,AGO,AGO
3,Property,City,City,True,Character,Agoura Hills,AGHI,AGHI
4,Property,City,City,True,Character,Agua Dulce,ADUL,ADUL


In [5]:
lookups['field_long'].unique()

array(['City', 'CountyOrParish', 'MLSAreaMajor'], dtype=object)

In [6]:
crmls[crmls['mls_area_major'] == '61']

Unnamed: 0,mls_area_major,city,county_or_parish,latitude,longitude
0,61,STAN,OR,33.784077,-117.989219
18,61,STAN,OR,33.801741,-118.009193
414,61,STAN,OR,33.791337,-117.997725
1415,61,STAN,OR,33.778590,-117.995679
1701,61,STAN,OR,33.808336,-117.994360
...,...,...,...,...,...
1597,61,GG,OR,33.776020,-117.943912
1807,61,GG,OR,33.779013,-117.946726
2025,61,ANA,OR,33.842293,-117.968842
2382,61,ANA,OR,33.793134,-117.988333


In [7]:
crmls['geometry'] = [shapely.Point(long, lat) for lat, long in tqdm(zip(crmls['longitude'], crmls['latitude']), total=len(crmls))]
crmls = gpd.GeoDataFrame(crmls, crs='EPSG:4326')

  0%|          | 0/11178605 [00:00<?, ?it/s]

In [8]:
mls_area_major = {'mls_area_major': crmls['mls_area_major'].unique()}
centroids = [shapely.MultiPoint(crmls[crmls['mls_area_major'] == area]['geometry'].values).centroid for area in tqdm(mls_area_major['mls_area_major'], total=len(crmls['mls_area_major'].unique()))]

mls_area_major['centroid'] = centroids
mls_area_major = pd.DataFrame(mls_area_major)

  0%|          | 0/1315 [00:00<?, ?it/s]

In [9]:
city = {'city': crmls['city'].unique()}
centroids = [shapely.MultiPoint(crmls[crmls['city'] == area]['geometry'].values).centroid for area in tqdm(city['city'], total=len(crmls['city'].unique()))]

city['centroid'] = centroids
city = pd.DataFrame(city)

  0%|          | 0/1524 [00:00<?, ?it/s]

In [10]:
county_or_parish = {'county_or_parish': crmls['county_or_parish'].unique()}
centroids = [shapely.MultiPoint(crmls[crmls['county_or_parish'] == area]['geometry'].values).centroid for area in tqdm(county_or_parish['county_or_parish'], total=len(crmls['county_or_parish'].unique()))]

county_or_parish['centroid'] = centroids
county_or_parish = pd.DataFrame(county_or_parish)

  0%|          | 0/63 [00:00<?, ?it/s]

In [11]:
county_or_parish.head()

Unnamed: 0,county_or_parish,centroid
0,OR,POINT (33.731145635215384 -117.69873425734731)
1,LA,POINT (34.07895466681212 -118.20558358787336)
2,RI,POINT (33.79116320489932 -116.9300519756651)
3,SB,POINT (34.276407741010075 -117.25810459182603)
4,SD,POINT (32.922413048353185 -117.10506099287146)


In [12]:
def get_radius(listings: gpd.GeoDataFrame, centroid: shapely.Point, thresh=0.95, radius_incr=0.005) -> float:
    if len(listings) == 0:
        return 0
    radius = 0 + radius_incr
    check = gpd.GeoDataFrame({'check': [0], 'geometry': [centroid.buffer(radius)]}, crs='EPSG:4326')
    while len(listings.sjoin(check, predicate='intersects'))/len(listings) < thresh:
        radius += radius_incr
        check = gpd.GeoDataFrame({'check': [0], 'geometry': [centroid.buffer(radius)]}, crs='EPSG:4326')
    # print(len(listings.sjoin(check, predicate='intersects'))/len(listings))
    return radius
        

In [13]:
get_radius(crmls[crmls['county_or_parish'] == 'OR'], county_or_parish['centroid'][0])

0.3450000000000002

In [14]:
radii = [get_radius(crmls[crmls['mls_area_major'] == l], c) for l, c in tqdm(zip(mls_area_major['mls_area_major'],
                                                                                  mls_area_major['centroid']),
                                                                            total=len(mls_area_major))]
mls_area_major['radius'] = radii

  0%|          | 0/1315 [00:00<?, ?it/s]

In [15]:
radii = [get_radius(crmls[crmls['city'] == l], c) for l, c in tqdm(zip(city['city'],
                                                                       city['centroid']), 
                                                                   total=len(city))]
city['radius'] = radii

  0%|          | 0/1524 [00:00<?, ?it/s]

In [16]:
radii = [get_radius(crmls[crmls['county_or_parish'] == l], c, radius_incr=0.01) for l, c in tqdm(zip(county_or_parish['county_or_parish'],
                                                                                   city['centroid']),
                                                                               total=len(county_or_parish))]
county_or_parish['radius'] = radii

  0%|          | 0/63 [00:00<?, ?it/s]

In [17]:
city = city.merge(lookups[lookups['field_long'] == 'City'][['lookup_long', 'lookup_short']], how='left', 
           left_on='city', right_on='lookup_short')[['city', 'lookup_long', 'centroid', 'radius']]

mls_area_major = mls_area_major.merge(lookups[lookups['field_long'] == 'MLSAreaMajor'][['lookup_long','lookup_short']],
                                       how='left',left_on='mls_area_major', 
                                       right_on='lookup_short')[['mls_area_major', 'lookup_long', 'centroid', 'radius']]

county_or_parish = county_or_parish.merge(lookups[lookups['field_long'] == 'CountyOrParish'][['lookup_long','lookup_short']], 
                                          how='left', left_on='county_or_parish', 
                                          right_on='lookup_short')[['county_or_parish', 'lookup_long', 'centroid', 'radius']]

In [18]:
mls_area_major.to_csv('data/mls_area_major.csv', index=False)
city.to_csv('data/city.csv', index=False)
county_or_parish.to_csv('data/county_or_parish.csv', index=False)