In [103]:
# standard python packages
import pandas as pd
import ast
import numpy as np
import sys
import json


# geospatial manipulation
import geopandas
from geopy.distance import geodesic
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon
from shapely.ops import nearest_points
from shapely import wkt

#### Import the Properati and Census data 

In [61]:
# Both input CSVs are read with the same ISO-8859-1 encoding.
csv_encoding = "ISO-8859-1"
properati = pd.read_csv('properati_data/rent-combined.csv', encoding=csv_encoding)
census = pd.read_csv('census/cleaned_data2.csv', encoding=csv_encoding)

#### Change the format of the geojson file to something that can be imported into Pandas

In [62]:
# Load the barrio GeoJSON. The file is only read here, so it is opened
# read-only; 'r+' would additionally require write permission on the file.
with open('geojson/baires.json', 'r') as fd:
    ba_dict = json.load(fd)

# Flatten every GeoJSON feature into one row: the feature's type and id, its
# raw geometry coordinates, and the barrio properties stored with it.
ba_polygon = ba_dict['features']
header = ['type', 'id', 'coordinates', 'barrio', 'perimeter', 'area', 'commune']
ba_data_list = [
    [
        feature['type'],
        feature['id'],
        feature['geometry']['coordinates'],
        feature['properties']['barrios'],
        feature['properties']['perimetro'],
        feature['properties']['area'],
        feature['properties']['comunas'],
    ]
    for feature in ba_polygon
]

ba_data = pd.DataFrame(ba_data_list, columns=header)

    
    
    

In [63]:
def modify_poly(data):
    """Return the items of ``data`` as a list of tuples.

    Each element of ``data`` (e.g. an ``[x, y]`` pair) is converted with
    ``tuple``; the order of the elements is preserved.
    """
    return [tuple(vertex) for vertex in data]

In [64]:
# Build one shapely Point per listing from its (lon, lat) pair and store it
# in a new 'coordinates' column, which becomes the geometry of the listings.
lon_lat_pairs = pd.Series(list(zip(properati.lon, properati.lat)), index=properati.index)
properati['coordinates'] = lon_lat_pairs.apply(Point)
properati_gdf = geopandas.GeoDataFrame(properati, geometry='coordinates')

# The census 'coordinates' column holds a Python-literal string; parse it,
# take its first element, turn the points into tuples and build a Polygon.
# NOTE: this overwrites census.coordinates in place, so the cell can only be
# run once per load of the census CSV.
census['coordinates'] = census['coordinates'].apply(
    lambda raw: Polygon(modify_poly(ast.literal_eval(raw)[0]))
)
census_gdf = geopandas.GeoDataFrame(census, geometry='coordinates')

# Keep only the listings that fall within a census polygon.
properati_census_gdf = geopandas.sjoin(properati_gdf, census_gdf, how="inner", op='within')

# Report the size of the join result relative to the listings, and its row count.
matched_share = (len(properati_census_gdf) / len(properati_gdf)) * 100
print('{:.2f}%'.format(matched_share))
print(len(properati_census_gdf))

24.73%
65574


In [65]:
def modify_json_coordinates(coordinates):
    """Return the points stored at ``coordinates[0][0]`` as a list of tuples.

    Only the first element of the first element is used; any further
    entries of ``coordinates`` are ignored.
    NOTE(review): presumably this is the outer ring of the first polygon of
    a GeoJSON MultiPolygon -- confirm against the input file.
    """
    return [tuple(point) for point in coordinates[0][0]]
        

In [67]:
# Replace each raw GeoJSON coordinate structure with a shapely Polygon built
# from the tuples returned by modify_json_coordinates, then use that column
# as the geometry of the GeoDataFrame.
ba_data['coordinates'] = ba_data['coordinates'].apply(
    lambda raw_coords: Polygon(modify_json_coordinates(raw_coords))
)
ba_data_gdf = geopandas.GeoDataFrame(ba_data, geometry='coordinates')

In [104]:
# Read the barrio table, parse its 'WKT' text column into shapely geometries,
# and use that column as the geometry of the GeoDataFrame.
barrios = pd.read_csv('shape files/barrios.csv')
barrios['WKT'] = barrios['WKT'].apply(wkt.loads)
barrios_gdf = geopandas.GeoDataFrame(barrios, geometry='WKT')

In [126]:
# Attach the barrio attributes to every listing that lies within a barrio polygon.
#
# properati_census_gdf is itself the output of an earlier sjoin, which adds
# an 'index_right' column (and may leave suffixed 'id_left'/'id_right'
# columns). The geopandas releases that accept `op=` refuse to join a frame
# that already contains 'index_right', so those leftovers are dropped first.
# errors='ignore' keeps the cell runnable whether or not the columns are
# present, and the input frame itself is left unmodified so the cell can be
# re-run safely.
# NOTE(review): newer geopandas releases rename `op` to `predicate`; `op` is
# kept here to match the earlier sjoin call in this notebook.
join_leftovers = ['id_left', 'id_right', 'index_right']
listings_for_barrio_join = properati_census_gdf.drop(columns=join_leftovers, errors='ignore')
print (listings_for_barrio_join.columns.values)
final_gdf = geopandas.sjoin(listings_for_barrio_join, barrios_gdf, how="inner", op='within')
print (final_gdf.columns.values)
print (len(final_gdf))


['dataset_date' 'created_on' 'operation' 'property_type' 'place_name'
 'place_with_parent_names' 'country_name' 'state_name' 'geonames_id'
 'lat_lon' 'lat' 'lon' 'price' 'currency' 'price_aprox_local_currency'
 'price_aprox_usd' 'properati_url' 'description' 'title' 'image_thumbnail'
 'coordinates' 'AREA' 'Commune' 'DEPTO' 'FRAC' 'Computer Percent'
 'Computer Quantile' 'Cellular Percent' 'Cellular Quantile' 'Rent Percent'
 'Rent Quantile' 'LINK' 'Immigration Percent' 'Immigration Quantile'
 'Education Percent' 'Education Quantile' 'PERIMETER' 'Owner Percent'
 'Owner Quantile' 'PROV' 'RADIO' 'RADPAIS_' 'REDCODE' 'Regular Percent'
 'Regular Quantile' 'TIPO' 'Uninhabited Percent' 'Uninhabited Quantile'
 'type']
['dataset_date' 'created_on' 'operation' 'property_type' 'place_name'
 'place_with_parent_names' 'country_name' 'state_name' 'geonames_id'
 'lat_lon' 'lat' 'lon' 'price' 'currency' 'price_aprox_local_currency'
 'price_aprox_usd' 'properati_url' 'description' 'title' 'image_thumbnai

In [127]:

#ba_data_gdf.coordinates.to_file(driver = 'ESRI Shapefile', filename= "shape files/ba_data.shp")
#final_gdf.coordinates.to_file(driver = 'ESRI Shapefile', filename= "shape files/final_data.shp")
# Columns kept for the flat output table, grouped by kind: listing fields,
# census indicators, and barrio fields.
listing_columns = [
    'dataset_date', 'created_on', 'operation', 'property_type', 'place_name',
    'place_with_parent_names', 'state_name', 'price', 'currency',
    'price_aprox_local_currency', 'price_aprox_usd',
]
census_columns = [
    'Commune', 'Computer Percent', 'Computer Quantile', 'Cellular Percent',
    'Cellular Quantile', 'Rent Percent', 'Rent Quantile', 'Immigration Percent',
    'Immigration Quantile', 'Education Percent', 'Education Quantile',
    'Owner Percent', 'Owner Quantile', 'Regular Percent', 'Regular Quantile',
    'Uninhabited Percent', 'Uninhabited Quantile',
]
barrio_columns = ['barrio', 'comuna', 'perimetro', 'area']

filtered_gdf = final_gdf[listing_columns + census_columns + barrio_columns]

# Re-wrap the selection as a plain pandas DataFrame.
filter_df = pd.DataFrame(filtered_gdf)
#filter_df.to_csv('properati_data/properati_census_barrios.csv')


In [109]:
# Write the barrio geometries (the 'WKT' geometry column) to an ESRI Shapefile.
barrios_gdf['WKT'].to_file(filename="shape files/barrios.shp", driver='ESRI Shapefile')

In [133]:
# Restrict the flat table to the barrio fields, prices, census percentages
# and the listing creation date.
barrio_summary_columns = [
    'barrio', 'comuna', 'perimetro', 'area',
    'price_aprox_local_currency', 'price_aprox_usd',
    'Computer Percent', 'Cellular Percent', 'Rent Percent',
    'Immigration Percent', 'Education Percent', 'Owner Percent',
    'created_on',
]
barrios_filter_df = filter_df[barrio_summary_columns]

# Per barrio: min/max/mean/std of both price columns, the mean of each census
# percentage, and the number of non-null 'created_on' values.
price_stats = ["min", "max", "mean", "std"]
aggregations = {
    "price_aprox_local_currency": price_stats,
    "price_aprox_usd": price_stats,
    "Computer Percent": "mean",
    "Cellular Percent": "mean",
    "Rent Percent": "mean",
    "Immigration Percent": "mean",
    "Education Percent": "mean",
    "Owner Percent": "mean",
    "created_on": "count",
}
grouped = barrios_filter_df.groupby('barrio').agg(aggregations)

# Flatten the (column, statistic) MultiIndex into 'column_statistic' names.
grouped.columns = ["_".join(column_and_stat) for column_and_stat in grouped.columns]
print (grouped.head(5))
grouped.to_csv('properati_data/properati_barrios_grouped.csv')


           price_aprox_local_currency_min  price_aprox_local_currency_max  \
barrio                                                                      
AGRONOMIA                            0.00                        98175.00   
ALMAGRO                              0.00                       878267.50   
BALVANERA                            0.00                      1408440.00   
BARRACAS                          3461.49                       312417.98   
BELGRANO                             0.00                       705780.00   

           price_aprox_local_currency_mean  price_aprox_local_currency_std  \
barrio                                                                       
AGRONOMIA                     12696.511158                    17617.961264   
ALMAGRO                       20709.412613                    59404.834040   
BALVANERA                     37086.928929                    79472.735289   
BARRACAS                      37424.249225                    52830.38