### Installing Packages

#### Conda environment
You can create the same conda environment that was used to run this notebook using the following command:

```
conda env create -f requirements.yml
```

In [3]:
# standard python packages
import pandas as pd
import ast
import numpy as np
import sys
import json


#geospatial manipulation
import geopandas
from geopy.distance import geodesic
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon
from shapely.ops import nearest_points
from shapely import wkt

#### Import the Properati and Census data 

In [4]:
# Load the Properati rental listings and the cleaned census table.
# NOTE(review): both files are decoded as ISO-8859-1, yet the `description`
# preview below shows garbled accented characters -- the listing text may be
# stored in a different encoding (or already double-encoded). Confirm before
# relying on the free-text columns.
properati = pd.read_csv(
    'properati_data/rent/rent-combined.csv',
    encoding="ISO-8859-1",
)
census = pd.read_csv(
    'census/cleaned_data2.csv',
    encoding="ISO-8859-1",
)

In [5]:
# Preview the first five raw listings to check which columns were loaded.
properati.head(5)

Unnamed: 0,dataset_date,id,created_on,operation,property_type,place_name,place_with_parent_names,country_name,state_name,geonames_id,...,lat,lon,price,currency,price_aprox_local_currency,price_aprox_usd,properati_url,description,title,image_thumbnail
0,201501,592e5e03a301d299a7bd9dd5cb25b99bd2dba9cc,2014-12-22,rent,apartment,Capital Federal,|Argentina|Capital Federal|,Argentina,Capital Federal,3433955.0,...,-38.416097,-63.616672,8000.0,ARS,9171.37,530.92,http://www.properati.com.ar/k4pm_alquiler_depa...,Departamento dÃÂºplex de 70 m2 con cochera in...,Departamentos alquiler,https://thumbs4.properati.com/6/CmiBjApL3aZtLb...
1,201501,9b254ef5797b842cf18729c4df9827332fdca285,2014-08-26,rent,apartment,Capital Federal,|Argentina|Capital Federal|,Argentina,Capital Federal,3433955.0,...,-34.607902,-58.390337,12000.0,ARS,13757.06,796.38,http://www.properati.com.ar/i5yi_alquiler_ofic...,DUEÃâO DIRECTO CENTRO TRIBUNALES SAN NICOLA...,Departamentos alquiler,https://thumbs4.properati.com/5/EsfoslNeUvEOJF...
2,201501,6052eff893a6541b80eff7c92e2a84244a4a52f7,2015-01-22,rent,apartment,Capital Federal,|Argentina|Capital Federal|,Argentina,Capital Federal,3433955.0,...,-34.581524,-58.433261,700.0,USD,12092.15,700.0,http://www.properati.com.ar/kk3l_alquiler_depa...,Alquilar vivienda de 3 habitaciones totalmente...,Departamentos alquiler,https://thumbs4.properati.com/0/xboIRcD66_k4Ao...
3,201501,0127fd9d7382e8e8e98654b0f74c09a82d54f049,2014-12-02,rent,apartment,Capital Federal,|Argentina|Capital Federal|,Argentina,Capital Federal,3433955.0,...,-34.596975,-58.408897,9000.0,ARS,10317.88,597.29,http://www.properati.com.ar/jx0w_venta_departa...,EXCELENTE DEPARTAMENTO DE 2 AMBIENTES AMPLIOS!...,Departamentos alquiler,https://thumbs4.properati.com/0/qw_5oLQ2qj9kK0...
4,201501,c7fc7d25b11f8cb7fa5ca9d8ea913422f99cde64,2014-07-14,rent,apartment,Capital Federal,|Argentina|Capital Federal|,Argentina,Capital Federal,3433955.0,...,-34.558106,-58.455842,19000.0,ARS,21782.28,1260.95,http://www.properati.com.ar/hfyj_alquiler_depa...,Elerosa,Departamento alquiler,https://thumbs4.properati.com/2/SrozX6FK39DkAr...


In [6]:
# Preview the census table; `coordinates` is still a string at this point and
# is parsed into polygons in a later cell.
census.head(5)

Unnamed: 0,id,AREA,Commune,DEPTO,FRAC,Computer Percent,Computer Quantile,Cellular Percent,Cellular Quantile,Rent Percent,...,RADIO,RADPAIS_,REDCODE,Regular Percent,Regular Quantile,TIPO,Uninhabited Percent,Uninhabited Quantile,coordinates,type
0,32794,17608.88,1,1.0,9.0,75.0,6,91.3,8,62.2,...,7.0,32794.0,20010907.0,89.9,6,,11.8,9,"[[[-58.37828, -34.598910000000004], [-58.37826...",Polygon
1,32795,14524.75,5,5.0,2.0,87.5,9,95.4,9,31.2,...,8.0,32795.0,20050208.0,86.4,3,,4.6,6,"[[[-58.41259, -34.60161], [-58.4128, -34.60251...",Polygon
2,32796,34959.75,1,1.0,9.0,70.5,4,89.0,6,51.9,...,9.0,32796.0,20010909.0,85.2,3,,6.0,8,"[[[-58.37537, -34.59871], [-58.37534, -34.5998...",Polygon
3,32799,17023.88,1,1.0,9.0,71.1,4,87.2,4,54.4,...,8.0,32799.0,20010908.0,79.9,1,,9.9,9,"[[[-58.37684, -34.59881], [-58.3768, -34.59995...",Polygon
4,32801,73197.38,11,11.0,6.0,69.2,4,87.0,4,21.0,...,3.0,32801.0,20110603.0,87.6,4,,3.7,5,"[[[-58.51344, -34.60832], [-58.51267, -34.6091...",Polygon


In [7]:
def modify_poly(data):
    """Convert a sequence of coordinate pairs into a list of tuples.

    Parameters
    ----------
    data : iterable
        Iterable whose items are themselves iterable (e.g. ``[x, y]`` lists).

    Returns
    -------
    list of tuple
        One tuple per item of ``data``, in the original order.
    """
    return [tuple(vertex) for vertex in data]

In [8]:
# --- Listings: attach a point geometry to every Properati row ----------------
# Shapely points are (x, y), hence the (lon, lat) order.
properati['coordinates'] = list(zip(properati['lon'], properati['lat']))
properati['coordinates'] = properati['coordinates'].apply(Point)

# Wrap the listings in a GeoDataFrame whose geometry is the new point column.
properati_gdf = geopandas.GeoDataFrame(properati, geometry='coordinates')
n_listings = len(properati_gdf)
print('Number of rows in Properati before merge: {}'.format(n_listings))

# --- Census: parse the stringified coordinates into shapely Polygons ---------
# Each cell holds a nested list literal; only its first element is used as the
# polygon's vertex ring. NOTE: this overwrites `census.coordinates` in place, so
# re-running the cell without reloading `census` will fail.
census['coordinates'] = census['coordinates'].apply(
    lambda raw: Polygon(modify_poly(ast.literal_eval(raw)[0]))
)

census_gdf = geopandas.GeoDataFrame(census, geometry='coordinates')
print('Number of rows in Census dataset: {}'.format(len(census_gdf)))

# --- Spatial join: keep only listings that fall inside a census polygon ------
# NOTE(review): newer geopandas releases rename `op=` to `predicate=`; keep `op`
# only as long as the pinned environment requires it.
properati_census_gdf = geopandas.sjoin(properati_gdf, census_gdf, how="inner", op='within')
n_matched = len(properati_census_gdf)
print('Percent of properties within Buenos Aires: {:.2f}%'.format((n_matched / n_listings) * 100))
print('Number of Properati rows within Buenos Aires: {}'.format(n_matched))

Number of rows in Properati before merge: 265181
Number of rows in Census dataset: 3552


  outputs = ufunc(*inputs)


Percent of properties within Buenos Aires: 24.73%
Number of Properati rows within Buenos Aires: 65574


In [9]:
# Inspect the joined frame: census attributes are now attached to each listing.
# The same listing id (`id_left`) can appear once per `dataset_date` snapshot.
properati_census_gdf.head(10)

Unnamed: 0,dataset_date,id_left,created_on,operation,property_type,place_name,place_with_parent_names,country_name,state_name,geonames_id,...,PROV,RADIO,RADPAIS_,REDCODE,Regular Percent,Regular Quantile,TIPO,Uninhabited Percent,Uninhabited Quantile,type
1,201501,9b254ef5797b842cf18729c4df9827332fdca285,2014-08-26,rent,apartment,Capital Federal,|Argentina|Capital Federal|,Argentina,Capital Federal,3433955.0,...,2.0,13.0,33303.0,20011113.0,87.3,4,,7.6,8,Polygon
2868,201502,9b254ef5797b842cf18729c4df9827332fdca285,2014-08-26,rent,apartment,Capital Federal,|Argentina|Capital Federal|,Argentina,Capital Federal,3433955.0,...,2.0,13.0,33303.0,20011113.0,87.3,4,,7.6,8,Polygon
59432,201605,693f3f98f8b3c4c8e8774c6b02fce12f260f5bc5,2016-05-19,rent,store,Congreso,|Argentina|Capital Federal|Congreso|,Argentina,Capital Federal,3435259.0,...,2.0,13.0,33303.0,20011113.0,87.3,4,,7.6,8,Polygon
63470,201606,693f3f98f8b3c4c8e8774c6b02fce12f260f5bc5,2016-05-19,rent,store,Congreso,|Argentina|Capital Federal|Congreso|,Argentina,Capital Federal,3435259.0,...,2.0,13.0,33303.0,20011113.0,87.3,4,,7.6,8,Polygon
63478,201606,7d7bc7ff8756413088e8f672c82c579b48272821,2016-06-10,rent,apartment,Congreso,|Argentina|Capital Federal|Congreso|,Argentina,Capital Federal,3435259.0,...,2.0,13.0,33303.0,20011113.0,87.3,4,,7.6,8,Polygon
67260,201607,7d7bc7ff8756413088e8f672c82c579b48272821,2016-06-10,rent,apartment,Congreso,|Argentina|Capital Federal|Congreso|,Argentina,Capital Federal,3435259.0,...,2.0,13.0,33303.0,20011113.0,87.3,4,,7.6,8,Polygon
67270,201607,693f3f98f8b3c4c8e8774c6b02fce12f260f5bc5,2016-05-19,rent,store,Congreso,|Argentina|Capital Federal|Congreso|,Argentina,Capital Federal,3435259.0,...,2.0,13.0,33303.0,20011113.0,87.3,4,,7.6,8,Polygon
71086,201608,7d7bc7ff8756413088e8f672c82c579b48272821,2016-06-10,rent,apartment,Congreso,|Argentina|Capital Federal|Congreso|,Argentina,Capital Federal,3435259.0,...,2.0,13.0,33303.0,20011113.0,87.3,4,,7.6,8,Polygon
75207,201609,7d7bc7ff8756413088e8f672c82c579b48272821,2016-06-10,rent,apartment,Congreso,|Argentina|Capital Federal|Congreso|,Argentina,Capital Federal,3435259.0,...,2.0,13.0,33303.0,20011113.0,87.3,4,,7.6,8,Polygon
108455,201702,799aca1e37fb3d2010fa1c54d5b851e311aa81a9,2017-03-09,rent,apartment,Congreso,|Argentina|Capital Federal|Congreso|,Argentina,Capital Federal,3435259.0,...,2.0,13.0,33303.0,20011113.0,87.3,4,,7.6,8,Polygon


In [10]:
# Re-wrap the joined GeoDataFrame as a plain DataFrame and persist the full
# listing-by-census join (duplicated listing ids are still present here).
prop_census_df = pd.DataFrame(properati_census_gdf)
prop_census_df.to_csv('properati_data/rent/properati_census_rent.csv')

In [12]:
# Keep a single row per listing id (the first occurrence wins).
prop_census_df = prop_census_df.drop_duplicates(subset=['id_left'], keep='first')

# Export just the listing id and its position.
prop_census_df.to_csv(
    'properati_data/rent/rent_census_stripped.csv',
    columns=['id_left', 'lat', 'lon'],
)

# NOTE: this rebinds `properati_census_gdf` to the deduplicated rows, replacing
# the full join result built in the earlier cell.
properati_census_gdf = geopandas.GeoDataFrame(prop_census_df, geometry='coordinates')

In [13]:
# Spot-check the deduplicated listings (one row per `id_left`).
prop_census_df.head(5)

Unnamed: 0,dataset_date,id_left,created_on,operation,property_type,place_name,place_with_parent_names,country_name,state_name,geonames_id,...,PROV,RADIO,RADPAIS_,REDCODE,Regular Percent,Regular Quantile,TIPO,Uninhabited Percent,Uninhabited Quantile,type
1,201501,9b254ef5797b842cf18729c4df9827332fdca285,2014-08-26,rent,apartment,Capital Federal,|Argentina|Capital Federal|,Argentina,Capital Federal,3433955.0,...,2.0,13.0,33303.0,20011113.0,87.3,4,,7.6,8,Polygon
59432,201605,693f3f98f8b3c4c8e8774c6b02fce12f260f5bc5,2016-05-19,rent,store,Congreso,|Argentina|Capital Federal|Congreso|,Argentina,Capital Federal,3435259.0,...,2.0,13.0,33303.0,20011113.0,87.3,4,,7.6,8,Polygon
63478,201606,7d7bc7ff8756413088e8f672c82c579b48272821,2016-06-10,rent,apartment,Congreso,|Argentina|Capital Federal|Congreso|,Argentina,Capital Federal,3435259.0,...,2.0,13.0,33303.0,20011113.0,87.3,4,,7.6,8,Polygon
108455,201702,799aca1e37fb3d2010fa1c54d5b851e311aa81a9,2017-03-09,rent,apartment,Congreso,|Argentina|Capital Federal|Congreso|,Argentina,Capital Federal,3435259.0,...,2.0,13.0,33303.0,20011113.0,87.3,4,,7.6,8,Polygon
109346,201703,2377705bd05e4626070b206396b66e99f28be369,2017-04-28,rent,apartment,Congreso,|Argentina|Capital Federal|Congreso|,Argentina,Capital Federal,3435259.0,...,2.0,13.0,33303.0,20011113.0,87.3,4,,7.6,8,Polygon


In [14]:
# Load the barrio outlines and parse their WKT strings into shapely geometries.
barrios = pd.read_csv('shape files/barrios.csv')
barrios['WKT'] = barrios['WKT'].apply(wkt.loads)

# Normalize barrio names (presumably so they match the spellings used in
# barrio_table.csv, which is merged on this column below).
# The fixes are plain substring replacements applied in this exact order; the
# order matters because 'VILLA GRAL. MITR' is a prefix of 'VILLA GRAL. MITRE'.
barrio_name_fixes = [
    ('VELEZ SARSFIELD', 'VELEZ SARFIELD'),
    ('VILLA GRAL. MITRE', 'VILLA GENERAL MITRE'),
    ('VILLA GRAL. MITR', 'VILLA GENERAL MITRE'),
    ('VILLA GRAL MITRE', 'VILLA GENERAL MITRE'),
    ('VERSALLES', 'VERSAILLES'),
    ('MONSERRAT', 'MONTSERRAT'),
]
for old_name, new_name in barrio_name_fixes:
    barrios['barrio'] = barrios['barrio'].apply(
        lambda name: name.replace(old_name, new_name)
    )

# Lookup table mapping each barrio name to its numeric id.
barrio_df = pd.read_csv('barrio_table.csv')

# The previous implementation looked every name up again in the same table
# (quadratic, and it called int() on a one-element Series, which newer pandas
# deprecates). With unique names that lookup simply returns the row's own `id`,
# so cast the column directly and make the uniqueness requirement explicit.
if not barrio_df['Barrio'].is_unique:
    raise ValueError('barrio_table.csv contains duplicated Barrio names')
barrio_df['b_id'] = barrio_df['id'].astype('int64')
barrio_df['barrio'] = barrio_df['Barrio']

# Left merge keeps every outline; outlines without a match in the lookup table
# end up with missing `id` / `b_id`.
barrios = pd.merge(barrios, barrio_df, on='barrio', how='left')

barrios_gdf = geopandas.GeoDataFrame(barrios, geometry='WKT')

In [15]:
# Check the merged barrio frame: each outline should now carry the numeric id
# (`id` / `b_id`) taken from barrio_table.csv.
barrios_gdf.head(5)

Unnamed: 0,WKT,barrio,comuna,perimetro,area,Barrio,id,b_id
0,"POLYGON ((-58.4528200492791 -34.5959886570639,...",CHACARITA,15,7725.695228,3118101.0,CHACARITA,26,26
1,"POLYGON ((-58.4655768128541 -34.5965577078058,...",PATERNAL,15,7087.513295,2229829.0,PATERNAL,27,27
2,"POLYGON ((-58.4237529813037 -34.5978273383243,...",VILLA CRESPO,15,8132.699348,3613584.0,VILLA CRESPO,6,6
3,"POLYGON ((-58.4946097568899 -34.6148652395239,...",VILLA DEL PARQUE,11,7705.389797,3399596.0,VILLA DEL PARQUE,29,29
4,"POLYGON ((-58.4128700313089 -34.6141162515854,...",ALMAGRO,5,8537.901368,4050752.0,ALMAGRO,12,12


In [16]:
# Remove the id / join-bookkeeping columns left over from the census join
# (presumably so the next sjoin can add its own `index_right` without a clash).
# NOTE: the frame is rebound in place, so re-running this cell raises a KeyError
# because the columns are already gone.
properati_census_gdf = properati_census_gdf.drop(
    columns=['id_left', 'id_right', 'index_right']
)
print(properati_census_gdf.columns.values)

# Attach the barrio each listing falls within.
# NOTE(review): newer geopandas releases rename `op=` to `predicate=`.
final_gdf = geopandas.sjoin(
    properati_census_gdf,
    barrios_gdf,
    how="inner",
    op='within',
)
print(final_gdf.columns.values)


['dataset_date' 'created_on' 'operation' 'property_type' 'place_name'
 'place_with_parent_names' 'country_name' 'state_name' 'geonames_id'
 'lat_lon' 'lat' 'lon' 'price' 'currency' 'price_aprox_local_currency'
 'price_aprox_usd' 'properati_url' 'description' 'title' 'image_thumbnail'
 'coordinates' 'AREA' 'Commune' 'DEPTO' 'FRAC' 'Computer Percent'
 'Computer Quantile' 'Cellular Percent' 'Cellular Quantile' 'Rent Percent'
 'Rent Quantile' 'LINK' 'Immigration Percent' 'Immigration Quantile'
 'Education Percent' 'Education Quantile' 'PERIMETER' 'Owner Percent'
 'Owner Quantile' 'PROV' 'RADIO' 'RADPAIS_' 'REDCODE' 'Regular Percent'
 'Regular Quantile' 'TIPO' 'Uninhabited Percent' 'Uninhabited Quantile'
 'type']
['dataset_date' 'created_on' 'operation' 'property_type' 'place_name'
 'place_with_parent_names' 'country_name' 'state_name' 'geonames_id'
 'lat_lon' 'lat' 'lon' 'price' 'currency' 'price_aprox_local_currency'
 'price_aprox_usd' 'properati_url' 'description' 'title' 'image_thumbnai

In [17]:
# Spot-check the final join: every listing now also carries the barrio columns.
final_gdf.head(5)

Unnamed: 0,dataset_date,created_on,operation,property_type,place_name,place_with_parent_names,country_name,state_name,geonames_id,lat_lon,...,Uninhabited Quantile,type,index_right,barrio,comuna,perimetro,area,Barrio,id,b_id
1,201501,2014-08-26,rent,apartment,Capital Federal,|Argentina|Capital Federal|,Argentina,Capital Federal,3433955.0,"-34.607902,-58.390337",...,8,Polygon,42,SAN NICOLAS,1,6548.084741,2289008.0,SAN NICOLAS,43,43
59432,201605,2016-05-19,rent,store,Congreso,|Argentina|Capital Federal|Congreso|,Argentina,Capital Federal,3435259.0,"-34.608204,-58.389802",...,8,Polygon,42,SAN NICOLAS,1,6548.084741,2289008.0,SAN NICOLAS,43,43
63478,201606,2016-06-10,rent,apartment,Congreso,|Argentina|Capital Federal|Congreso|,Argentina,Capital Federal,3435259.0,"-34.607908569,-58.3893042187",...,8,Polygon,42,SAN NICOLAS,1,6548.084741,2289008.0,SAN NICOLAS,43,43
108455,201702,2017-03-09,rent,apartment,Congreso,|Argentina|Capital Federal|Congreso|,Argentina,Capital Federal,3435259.0,"-34.6089971818,-58.3893797591",...,8,Polygon,42,SAN NICOLAS,1,6548.084741,2289008.0,SAN NICOLAS,43,43
109346,201703,2017-04-28,rent,apartment,Congreso,|Argentina|Capital Federal|Congreso|,Argentina,Capital Federal,3435259.0,"-34.6084511721,-58.3904457092",...,8,Polygon,42,SAN NICOLAS,1,6548.084741,2289008.0,SAN NICOLAS,43,43


In [18]:
# Keep only the columns needed downstream; the order below is the column order
# of the exported CSV.
filtered_gdf = final_gdf[[
    # listing metadata
    'dataset_date', 'created_on', 'operation', 'property_type',
    'place_name', 'place_with_parent_names', 'state_name',
    # prices
    'price', 'currency', 'price_aprox_local_currency', 'price_aprox_usd',
    # census indicators
    'Commune',
    'Computer Percent', 'Computer Quantile',
    'Cellular Percent', 'Cellular Quantile',
    'Rent Percent', 'Rent Quantile',
    'Immigration Percent', 'Immigration Quantile',
    'Education Percent', 'Education Quantile',
    'Owner Percent', 'Owner Quantile',
    'Regular Percent', 'Regular Quantile',
    'Uninhabited Percent', 'Uninhabited Quantile',
    # barrio attributes
    'barrio', 'b_id', 'comuna', 'perimetro', 'area',
]]

# Persist the selection as a plain DataFrame.
filter_df = pd.DataFrame(filtered_gdf)
filter_df.to_csv('properati_data/rent/properati_census_barrios.csv')


In [19]:
# Price summary per barrio: min / max / mean / std of both price columns plus
# the number of non-null `created_on` values in each group.
barrios_filter_df = filter_df[[
    'barrio', 'b_id', 'comuna', 'perimetro', 'area',
    'price_aprox_local_currency', 'price_aprox_usd', 'created_on',
]]
grouped = barrios_filter_df.groupby(['b_id', 'barrio'], as_index=False).agg({
    "price_aprox_local_currency": ["min", "max", "mean", "std"],
    "price_aprox_usd": ["min", "max", "mean", "std"],
    "created_on": "count",
})

# Flatten the two-level column index into single names ("<column>_<stat>").
# Iterating the MultiIndex yields the same tuples as the former `.ravel()` call
# without relying on the deprecated `Index.ravel`. The key columns have an
# empty second level, so they come out as 'b_id_' and 'barrio_'.
grouped.columns = ["_".join(level_names) for level_names in grouped.columns]
grouped.to_csv('properati_data/rent/properati_barrios_grouped.csv')


In [20]:
# Preview the per-barrio price summary. The key columns appear as 'b_id_' and
# 'barrio_' because flattening joins each (name, '') pair with "_".
grouped.head(5)

Unnamed: 0,b_id_,barrio_,price_aprox_local_currency_min,price_aprox_local_currency_max,price_aprox_local_currency_mean,price_aprox_local_currency_std,price_aprox_usd_min,price_aprox_usd_max,price_aprox_usd_mean,price_aprox_usd_std,created_on_count
0,0,PALERMO,0.0,1501039.0,34566.886943,49085.363402,0.0,94000.0,2016.866082,2889.67763,4433
1,1,BELGRANO,0.0,705780.0,35751.130246,50794.044685,0.0,40000.0,2077.634832,2947.750069,1727
2,2,NUÑEZ,0.0,419657.92,26568.049533,37840.219108,0.0,26280.36,1555.494518,2275.854456,672
3,3,VILLA ORTUZAR,0.0,237583.5,23799.356016,35600.828397,0.0,12100.0,1371.153125,2009.757447,131
4,4,BALVANERA,0.0,1390000.0,30939.876621,73083.385352,0.0,80000.0,1807.461825,4266.068559,995


In [21]:
# Export the per-barrio summary as JSON, one record per barrio.
# NOTE(review): this file goes to properati_data/, whereas the other exports in
# this notebook go to properati_data/rent/ -- confirm that is intended.
grouped.to_json(
    "properati_data/properati_barrios.json",
    orient="records",
    date_format="epoch",
    double_precision=10,
    force_ascii=True,
    date_unit="ms",
    default_handler=None,
)



In [22]:
# Parse the listing creation date and derive calendar month / year columns.
# The vectorized `.dt` accessor replaces the former per-row `apply(lambda ...)`.
filter_df['created_on'] = pd.to_datetime(filter_df['created_on'])
filter_df['month'] = filter_df['created_on'].dt.month
filter_df['year'] = filter_df['created_on'].dt.year

# Columns needed for the per-barrio, per-month aggregation.
barrios_filter_df = filter_df[[
    'barrio', 'b_id', 'dataset_date', 'comuna', 'perimetro', 'area',
    'price_aprox_local_currency', 'price_aprox_usd',
    'Computer Percent', 'Cellular Percent', 'Rent Percent',
    'Immigration Percent', 'Education Percent', 'Owner Percent',
    'created_on', 'month', 'year',
]]

In [23]:
# Preview the slimmed-down frame, including the derived `month` / `year` columns.
barrios_filter_df.head(5)

Unnamed: 0,barrio,b_id,dataset_date,comuna,perimetro,area,price_aprox_local_currency,price_aprox_usd,Computer Percent,Cellular Percent,Rent Percent,Immigration Percent,Education Percent,Owner Percent,created_on,month,year
1,SAN NICOLAS,43,201501,1,6548.084741,2289008.0,13757.06,796.38,70.8,89.7,41.6,11.3,51.9,45.7,2014-08-26,8,2014
59432,SAN NICOLAS,43,201605,1,6548.084741,2289008.0,14425.14,903.35,70.8,89.7,41.6,11.3,51.9,45.7,2016-05-19,5,2016
63478,SAN NICOLAS,43,201606,1,6548.084741,2289008.0,8670.09,542.95,70.8,89.7,41.6,11.3,51.9,45.7,2016-06-10,6,2016
108455,SAN NICOLAS,43,201702,1,6548.084741,2289008.0,12070.26,755.88,70.8,89.7,41.6,11.3,51.9,45.7,2017-03-09,3,2017
109346,SAN NICOLAS,43,201703,1,6548.084741,2289008.0,,,70.8,89.7,41.6,11.3,51.9,45.7,2017-04-28,4,2017


In [24]:
# Price summary per barrio and calendar month / year of listing creation.
# fillna(0) is meant for the std of single-listing groups (std is NaN there).
# NOTE(review): it also zero-fills any other NaN aggregate, e.g. min / max / mean
# of a group whose prices are all missing -- confirm that is acceptable.
grouped_month = (
    barrios_filter_df
    .groupby(['barrio', 'b_id', 'month', 'year'], as_index=False)
    .agg({
        "price_aprox_local_currency": ["min", "max", "mean", "std"],
        "price_aprox_usd": ["min", "max", "mean", "std"],
        "created_on": "count",
    })
    .fillna(0)
)

# Flatten the two-level column index into "<column>_<stat>" names. Iterating the
# MultiIndex yields the same tuples as the former `.ravel()` call without
# relying on the deprecated `Index.ravel`; key columns keep a trailing "_".
grouped_month.columns = ["_".join(level_names) for level_names in grouped_month.columns]



In [25]:
# Preview the per-barrio, per-month summary; std shows 0.0 for single-listing
# groups because of the fillna(0) applied when the summary was built.
grouped_month.head(5)

Unnamed: 0,barrio_,b_id_,month_,year_,price_aprox_local_currency_min,price_aprox_local_currency_max,price_aprox_local_currency_mean,price_aprox_local_currency_std,price_aprox_usd_min,price_aprox_usd_max,price_aprox_usd_mean,price_aprox_usd_std,created_on_count
0,AGRONOMIA,25,1,2015,3212.36,3212.36,3212.36,0.0,185.96,185.96,185.96,0.0,1
1,AGRONOMIA,25,1,2017,20117.11,20117.11,20117.11,0.0,1259.8,1259.8,1259.8,0.0,1
2,AGRONOMIA,25,1,2018,4137.68,12503.18,8124.037143,2540.152002,210.73,636.78,413.752857,129.368453,7
3,AGRONOMIA,25,3,2015,11121.84,11121.84,11121.84,0.0,643.83,643.83,643.83,0.0,1
4,AGRONOMIA,25,3,2017,4466.22,7543.83,6420.608,1274.775346,279.69,472.42,402.08,79.83061,5


In [26]:
# Persist the monthly summary as both CSV and JSON (one record per
# barrio / month / year combination).
grouped_month.to_csv('properati_data/rent/properati_barrios_grouped_month.csv')
grouped_month.to_json(
    "properati_data/rent/properati_barrios_month.json",
    orient="records",
    date_format="epoch",
    double_precision=10,
    force_ascii=True,
    date_unit="ms",
    default_handler=None,
)

