In [108]:
# standard python packages
import pandas as pd
import ast
import numpy as np
import sys
import json


# geospatial manipulation
import geopandas
from geopy.distance import geodesic
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon
from shapely.ops import nearest_points
from shapely import wkt

#### Import the Properati and Census data 

In [109]:
# Both CSV files are read with the same ISO-8859-1 encoding.
CSV_ENCODING = "ISO-8859-1"

properati = pd.read_csv('properati_data/rent-combined.csv', encoding=CSV_ENCODING)
census = pd.read_csv('census/cleaned_data2.csv', encoding=CSV_ENCODING)

In [110]:
def modify_poly(data):
    """Convert every item of ``data`` to a tuple and return them as a list.

    Parameters
    ----------
    data : iterable of iterables
        Each element is passed to ``tuple``; order is preserved.

    Returns
    -------
    list of tuple
        One tuple per element of ``data`` (empty list for empty input).
    """
    return [tuple(coord) for coord in data]

In [111]:
# Build one shapely Point per listing from its (lon, lat) pair and store it
# in a new 'coordinates' column of the properati frame.
properati['coordinates'] = list(zip(properati.lon, properati.lat))
properati['coordinates'] = properati['coordinates'].apply(Point)

# Convert the properati data into a GeoDataFrame keyed on those points.
properati_gdf = geopandas.GeoDataFrame(properati, geometry='coordinates')


def _census_polygon(raw):
    """Turn one raw census 'coordinates' cell into a shapely Polygon.

    ``raw`` is the literal text of a nested coordinate list; only its first
    element (``[0]``) is used, converted to tuples via ``modify_poly``.
    Values that are already Polygons are returned untouched so that this
    cell can be re-run without failing in ``ast.literal_eval``.
    """
    if isinstance(raw, Polygon):
        return raw
    return Polygon(modify_poly(ast.literal_eval(raw)[0]))


# Parse the census polygon text into Polygon objects (single pass, idempotent).
census['coordinates'] = census.coordinates.apply(_census_polygon)

census_gdf = geopandas.GeoDataFrame(census, geometry='coordinates')

# Keep only listings whose point lies within a census polygon.
# NOTE(review): newer geopandas releases renamed `op=` to `predicate=`;
# confirm against the installed version before upgrading.
properati_census_gdf = geopandas.sjoin(properati_gdf, census_gdf, how="inner", op='within')

# Share of listings that matched a census polygon, then the absolute count.
print ('{:.2f}%'.format((len(properati_census_gdf)/len(properati_gdf))*100))

print (len(properati_census_gdf))

24.73%
65574


In [112]:
# Re-wrap the spatial-join result in a plain pandas DataFrame.
prop_census_df = pd.DataFrame(properati_census_gdf)
# Optional export of the full joined table (currently disabled):
#prop_census_df.to_csv('properati_data/properati_census_rent.csv')

In [98]:
# Keep a single row (the first encountered) for every distinct 'id_left'.
prop_census_df = prop_census_df.drop_duplicates(subset=['id_left'])
print (prop_census_df.head())

# Write a slim CSV holding only the listing id and its lat/lon.
prop_census_df.to_csv(
    'properati_data/rent_census_stripped.csv',
    columns=['id_left', 'lat', 'lon'],
)

# Rebuild the GeoDataFrame from the de-duplicated rows.
properati_census_gdf = geopandas.GeoDataFrame(prop_census_df, geometry='coordinates')

        dataset_date                                   id_left  created_on  \
1             201501  9b254ef5797b842cf18729c4df9827332fdca285  2014-08-26   
59432         201605  693f3f98f8b3c4c8e8774c6b02fce12f260f5bc5  2016-05-19   
63478         201606  7d7bc7ff8756413088e8f672c82c579b48272821  2016-06-10   
108455        201702  799aca1e37fb3d2010fa1c54d5b851e311aa81a9  2017-03-09   
109346        201703  2377705bd05e4626070b206396b66e99f28be369  2017-04-28   

       operation property_type       place_name  \
1           rent     apartment  Capital Federal   
59432       rent         store         Congreso   
63478       rent     apartment         Congreso   
108455      rent     apartment         Congreso   
109346      rent     apartment         Congreso   

                     place_with_parent_names country_name       state_name  \
1                |Argentina|Capital Federal|    Argentina  Capital Federal   
59432   |Argentina|Capital Federal|Congreso|    Argentina  Capital F

In [99]:
# Barrio polygons: the WKT column holds geometry text, parsed here into
# shapely objects with wkt.loads.
barrios = pd.read_csv('shape files/barrios.csv')
barrios.WKT = barrios.WKT.apply(wkt.loads)

# Barrio lookup table mapping each 'Barrio' name to a numeric 'id'.
barrio_df = pd.read_csv('barrio_table.csv')
# Integer copy of 'id'. This replaces a per-row lookup that filtered the whole
# frame once for every row (quadratic) and called int() on a one-element
# Series, which newer pandas deprecates. The result is the same whenever each
# 'Barrio' name appears once.
barrio_df['b_id'] = barrio_df['id'].astype('int64')
# Lower-case alias so the table can be merged on the 'barrio' key below.
barrio_df['barrio'] = barrio_df.Barrio

# Left merge keeps every polygon row even when the lookup has no match.
barrios = pd.merge(barrios, barrio_df, on='barrio', how='left')
print (barrios.head(5))

barrios_gdf = geopandas.GeoDataFrame(barrios, geometry='WKT')

                                                 WKT            barrio  \
0  POLYGON ((-58.4528200492791 -34.5959886570639,...         CHACARITA   
1  POLYGON ((-58.4655768128541 -34.5965577078058,...          PATERNAL   
2  POLYGON ((-58.4237529813037 -34.5978273383243,...      VILLA CRESPO   
3  POLYGON ((-58.4946097568899 -34.6148652395239,...  VILLA DEL PARQUE   
4  POLYGON ((-58.4128700313089 -34.6141162515854,...           ALMAGRO   

   comuna    perimetro          area  Unnamed: 0            Barrio    id  b_id  
0      15  7725.695228  3.118101e+06        26.0         CHACARITA  26.0  26.0  
1      15  7087.513295  2.229829e+06        27.0          PATERNAL  27.0  27.0  
2      15  8132.699348  3.613584e+06         6.0      VILLA CRESPO   6.0   6.0  
3      11  7705.389797  3.399596e+06        29.0  VILLA DEL PARQUE  29.0  29.0  
4       5  8537.901368  4.050752e+06        12.0           ALMAGRO  12.0  12.0  


In [100]:
# Remove the id/index columns left over from the first spatial join before
# joining again.
# NOTE(review): sjoin is expected to reject frames that already carry an
# 'index_right' column — confirm against the installed geopandas version.
# errors='ignore' keeps this cell re-runnable: because the result is assigned
# back to the same name, a second run would otherwise raise KeyError on the
# already-dropped columns.
properati_census_gdf = properati_census_gdf.drop(
    columns=['id_left', 'id_right', 'index_right'],
    errors='ignore',
)
print (properati_census_gdf.columns.values)

# Attach barrio attributes to every listing whose point lies within a barrio polygon.
final_gdf = geopandas.sjoin(properati_census_gdf , barrios_gdf, how="inner", op='within')
print (final_gdf.columns.values)
print (len(final_gdf))


['dataset_date' 'created_on' 'operation' 'property_type' 'place_name'
 'place_with_parent_names' 'country_name' 'state_name' 'geonames_id'
 'lat_lon' 'lat' 'lon' 'price' 'currency' 'price_aprox_local_currency'
 'price_aprox_usd' 'properati_url' 'description' 'title' 'image_thumbnail'
 'coordinates' 'AREA' 'Commune' 'DEPTO' 'FRAC' 'Computer Percent'
 'Computer Quantile' 'Cellular Percent' 'Cellular Quantile' 'Rent Percent'
 'Rent Quantile' 'LINK' 'Immigration Percent' 'Immigration Quantile'
 'Education Percent' 'Education Quantile' 'PERIMETER' 'Owner Percent'
 'Owner Quantile' 'PROV' 'RADIO' 'RADPAIS_' 'REDCODE' 'Regular Percent'
 'Regular Quantile' 'TIPO' 'Uninhabited Percent' 'Uninhabited Quantile'
 'type']
['dataset_date' 'created_on' 'operation' 'property_type' 'place_name'
 'place_with_parent_names' 'country_name' 'state_name' 'geonames_id'
 'lat_lon' 'lat' 'lon' 'price' 'currency' 'price_aprox_local_currency'
 'price_aprox_usd' 'properati_url' 'description' 'title' 'image_thumbnai

In [101]:

# Optional shapefile exports (currently disabled):
#ba_data_gdf.coordinates.to_file(driver = 'ESRI Shapefile', filename= "shape files/ba_data.shp")
#final_gdf.coordinates.to_file(driver = 'ESRI Shapefile', filename= "shape files/final_data.shp")

# Columns carried forward from the joined frame, in output order.
FILTER_COLUMNS = [
    # listing fields
    'dataset_date', 'created_on', 'operation', 'property_type', 'place_name',
    'place_with_parent_names', 'state_name', 'price', 'currency',
    'price_aprox_local_currency', 'price_aprox_usd',
    # census fields
    'Commune',
    'Computer Percent', 'Computer Quantile',
    'Cellular Percent', 'Cellular Quantile',
    'Rent Percent', 'Rent Quantile',
    'Immigration Percent', 'Immigration Quantile',
    'Education Percent', 'Education Quantile',
    'Owner Percent', 'Owner Quantile',
    'Regular Percent', 'Regular Quantile',
    'Uninhabited Percent', 'Uninhabited Quantile',
    # barrio fields
    'barrio', 'b_id', 'comuna', 'perimetro', 'area',
]

filtered_gdf = final_gdf[FILTER_COLUMNS]

# Plain DataFrame view of the filtered columns for the tabular steps below.
filter_df = pd.DataFrame(filtered_gdf)
# Optional CSV export (currently disabled):
#filter_df.to_csv('properati_data/properati_census_barrios.csv')


In [102]:
# Write only the barrio geometry column (WKT) out as an ESRI Shapefile.
barrios_gdf.WKT.to_file(
    filename="shape files/barrios.shp",
    driver='ESRI Shapefile',
)

In [103]:
# Columns needed for the per-barrio price summary.
barrios_filter_df = filter_df[[
    'barrio', 'b_id', 'comuna', 'perimetro', 'area',
    'price_aprox_local_currency', 'price_aprox_usd', 'created_on',
]]

# Same four statistics for both price columns, plus a count of 'created_on'.
price_stats = ["min", "max", "mean", "std"]
grouped = barrios_filter_df.groupby(['b_id', 'barrio'], as_index=False).agg({
    "price_aprox_local_currency": price_stats,
    "price_aprox_usd": price_stats,
    "created_on": "count",
})

# Flatten the two-level column index into "<column>_<stat>" names. The group
# keys have an empty second level, so they come out as 'b_id_' and 'barrio_'.
grouped.columns = ["_".join(levels) for levels in grouped.columns]
print (grouped.head())
grouped.to_csv('properati_data/properati_barrios_grouped.csv')


   b_id_        barrio_  price_aprox_local_currency_min  \
0    0.0        PALERMO                             0.0   
1    1.0       BELGRANO                             0.0   
2    2.0          NUÑEZ                             0.0   
3    3.0  VILLA ORTUZAR                             0.0   
4    4.0      BALVANERA                             0.0   

   price_aprox_local_currency_max  price_aprox_local_currency_mean  \
0                      1501039.00                     34566.886943   
1                       705780.00                     35751.130246   
2                       419657.92                     26568.049533   
3                       237583.50                     23799.356016   
4                      1390000.00                     30939.876621   

   price_aprox_local_currency_std  price_aprox_usd_min  price_aprox_usd_max  \
0                    49085.363402                  0.0             94000.00   
1                    50794.044685                  0.0            

In [104]:
# Dump the per-barrio summary as a JSON array with one object per row.
grouped.to_json(
    "properati_data/properati_barrios.json",
    orient="records",
    date_format="epoch",
    date_unit="ms",
    double_precision=10,
    force_ascii=True,
    default_handler=None,
)



In [105]:
# Parse 'created_on' into datetimes, then derive calendar month and year with
# the vectorised .dt accessor instead of a per-row Python lambda.
filter_df['created_on'] = pd.to_datetime(filter_df['created_on'])
filter_df['month'] = filter_df['created_on'].dt.month
filter_df['year'] = filter_df['created_on'].dt.year

# Columns used for the per-barrio, per-month summary.
barrios_filter_df = filter_df[[
    'barrio', 'b_id', 'dataset_date', 'comuna', 'perimetro', 'area',
    'price_aprox_local_currency', 'price_aprox_usd',
    'Computer Percent', 'Cellular Percent', 'Rent Percent',
    'Immigration Percent', 'Education Percent', 'Owner Percent',
    'created_on', 'month', 'year',
]]
print (barrios_filter_df.head(5))

             barrio  b_id  dataset_date  comuna    perimetro          area  \
1       SAN NICOLAS  43.0        201501       1  6548.084741  2.289008e+06   
59432   SAN NICOLAS  43.0        201605       1  6548.084741  2.289008e+06   
63478   SAN NICOLAS  43.0        201606       1  6548.084741  2.289008e+06   
108455  SAN NICOLAS  43.0        201702       1  6548.084741  2.289008e+06   
109346  SAN NICOLAS  43.0        201703       1  6548.084741  2.289008e+06   

        price_aprox_local_currency  price_aprox_usd  Computer Percent  \
1                         13757.06           796.38              70.8   
59432                     14425.14           903.35              70.8   
63478                      8670.09           542.95              70.8   
108455                    12070.26           755.88              70.8   
109346                         NaN              NaN              70.8   

        Cellular Percent  Rent Percent  Immigration Percent  \
1                   89.7     

In [106]:
# Price statistics per barrio and calendar month/year, plus a count of 'created_on'.
month_price_stats = ["min", "max", "mean", "std"]
month_keys = ['barrio', 'b_id', 'month', 'year']
grouped_month = (
    barrios_filter_df
    .groupby(month_keys, as_index=False)
    .agg({
        "price_aprox_local_currency": month_price_stats,
        "price_aprox_usd": month_price_stats,
        "created_on": "count",
    })
    # Intended for std values that come out as NaN; note that fillna(0)
    # replaces NaN in every aggregated column, not only the std ones.
    .fillna(0)
)

# Flatten the two-level column index into "<column>_<stat>" names; the group
# keys have an empty second level and therefore end with a trailing underscore.
grouped_month.columns = ["_".join(levels) for levels in grouped_month.columns]
print (grouped_month.head())
print (len(grouped_month))
#grouped.to_csv('properati_data/properati_barrios_grouped.csv')

     barrio_  b_id_  month_  year_  price_aprox_local_currency_min  \
0  AGRONOMIA   25.0       1   2015                         3212.36   
1  AGRONOMIA   25.0       1   2017                        20117.11   
2  AGRONOMIA   25.0       1   2018                         4137.68   
3  AGRONOMIA   25.0       3   2015                        11121.84   
4  AGRONOMIA   25.0       3   2017                         4466.22   

   price_aprox_local_currency_max  price_aprox_local_currency_mean  \
0                         3212.36                      3212.360000   
1                        20117.11                     20117.110000   
2                        12503.18                      8124.037143   
3                        11121.84                     11121.840000   
4                         7543.83                      6420.608000   

   price_aprox_local_currency_std  price_aprox_usd_min  price_aprox_usd_max  \
0                        0.000000               185.96               185.96   


In [107]:
# Persist the per-barrio/month summary as CSV and as a JSON array of records.
grouped_month.to_csv('properati_data/properati_barrios_grouped_month.csv')
grouped_month.to_json(
    "properati_data/properati_barrios_month.json",
    orient="records",
    date_format="epoch",
    date_unit="ms",
    double_precision=10,
    force_ascii=True,
    default_handler=None,
)

