In [14]:
import pandas as pd
import ast
import numpy as np
import sys
import geopandas
from geopy.distance import geodesic
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon
from shapely.ops import nearest_points
from shapely import wkt

### import data

In [5]:
properati = pd.read_csv('properati_data/combined_sell.csv', encoding= "ISO-8859-1")
prop_data = pd.read_csv('prop_data/cleaned_data.csv', encoding= "ISO-8859-1")
census = pd.read_csv('census/cleaned_data2.csv',encoding= "ISO-8859-1")

### Create GeoPandas dataframes

In [6]:
def modify_poly(data):
    coord_list = []
    #row = ast.literal_eval(data)
    for coord in data:
        coord_list.append(tuple(coord))
    return coord_list

In [7]:
# create a new column called coordinates on properati data frame
properati['coordinates'] = list(zip(properati.lon, properati.lat))
properati['coordinates'] = properati['coordinates'].apply(Point)

# convert properati data into geopandas dataframe
properati_gdf = geopandas.GeoDataFrame(properati, geometry='coordinates')

# convert the longitudes and latitudes in the properaty values dataframe into float (they were originally in string)
prop_data.Latitude = pd.to_numeric(prop_data.Latitude, errors='coerce')
prop_data.Longitude = pd.to_numeric(prop_data.Longitude, errors='coerce')

#print (properati_gdf.coordinates.head(5))
print (properati_gdf.coordinates.isnull().values.any())
print (len(properati_gdf))

# convert property values data into geopandas dataframe
prop_data['coordinates'] = list(zip(prop_data.Longitude, prop_data.Latitude))
#prop_data['coordinates'] = list(zip(prop_data.Latitude, prop_data.Longitude))
prop_data['coordinates'] = prop_data['coordinates'].apply(Point)
prop_data_gdf = geopandas.GeoDataFrame(prop_data, geometry='coordinates')


# convert census polygon data into tuples and polygon
census.coordinates = census.coordinates.apply(lambda x: ast.literal_eval(x)[0])
census.coordinates = census.coordinates.apply(lambda x: modify_poly(x))
census.coordinates = census.coordinates.apply(Polygon)

census_gdf = geopandas.GeoDataFrame(census, geometry='coordinates')
print (len(census_gdf))
#print (census_gdf.coordinates[0])
print (census_gdf.coordinates.isnull().values.any())

False
1871732
3552
False


### Filter only data that is in the city of the Buenos Aires

In [8]:
properati_census_gdf = geopandas.sjoin(properati_gdf, census_gdf, how="inner", op='within')
print ('{:.2f}%'.format(len(properati_census_gdf)/len(properati_gdf)))

print (len(properati_census_gdf))

  outputs = ufunc(*inputs)


0.25%
464869


#### save the properati/census combined data because the sjoin takes forever

In [15]:
temp_df = pd.DataFrame(properati_census_gdf)
temp_df.to_csv('properati_data/sell_census.csv')

#### import barrios geojson

In [16]:
barrios = pd.read_csv('shape files/barrios.csv')
barrios.WKT = barrios.WKT.apply(wkt.loads)
barrios_gdf = geopandas.GeoDataFrame(barrios, geometry='WKT')

In [20]:
#properati_census_gdf = properati_census_gdf.drop(columns=['id_left', 'id_right', 'index_right'], axis=1)
print (properati_census_gdf.columns.values)
final_gdf = geopandas.sjoin(properati_census_gdf , barrios_gdf, how="inner", op='within')
print (final_gdf.columns.values)
print (len(final_gdf))


['dataset_date' 'created_on' 'operation' 'property_type' 'place_name'
 'country_name' 'state_name' 'geonames_id' 'lat_lon' 'lat' 'lon' 'price'
 'currency' 'price_aprox_local_currency' 'price_aprox_usd' 'coordinates'
 'AREA' 'Commune' 'DEPTO' 'FRAC' 'Computer Percent' 'Computer Quantile'
 'Cellular Percent' 'Cellular Quantile' 'Rent Percent' 'Rent Quantile'
 'LINK' 'Immigration Percent' 'Immigration Quantile' 'Education Percent'
 'Education Quantile' 'PERIMETER' 'Owner Percent' 'Owner Quantile' 'PROV'
 'RADIO' 'RADPAIS_' 'REDCODE' 'Regular Percent' 'Regular Quantile' 'TIPO'
 'Uninhabited Percent' 'Uninhabited Quantile' 'type']
['dataset_date' 'created_on' 'operation' 'property_type' 'place_name'
 'country_name' 'state_name' 'geonames_id' 'lat_lon' 'lat' 'lon' 'price'
 'currency' 'price_aprox_local_currency' 'price_aprox_usd' 'coordinates'
 'AREA' 'Commune' 'DEPTO' 'FRAC' 'Computer Percent' 'Computer Quantile'
 'Cellular Percent' 'Cellular Quantile' 'Rent Percent' 'Rent Quantile'
 'LINK'

In [21]:
filtered_gdf = final_gdf[['dataset_date', 'created_on', 'operation', 'property_type', 'place_name',\
                          'state_name', 'price', 'currency', 'price_aprox_local_currency', 'price_aprox_usd',\
                         'Commune', 'Computer Percent', 'Computer Quantile', 'Cellular Percent', 'Cellular Quantile',\
                          'Rent Percent','Rent Quantile','Immigration Percent', 'Immigration Quantile','Education Percent', \
                          'Education Quantile', 'Owner Percent','Owner Quantile','Regular Percent','Regular Quantile', \
                          'Uninhabited Percent','Uninhabited Quantile','barrio', 'comuna', 'perimetro','area']]

filter_df = pd.DataFrame(filtered_gdf)

In [22]:
barrios_filter_df = filter_df[['barrio','comuna','perimetro','area','price_aprox_local_currency', 'price_aprox_usd', 'Computer Percent',\
                              'Cellular Percent', 'Rent Percent','Immigration Percent', 'Education Percent',\
                              'Owner Percent', 'created_on']]
grouped = barrios_filter_df.groupby('barrio', as_index=False).agg({"price_aprox_local_currency": ["min", "max", "mean", "std"], "price_aprox_usd": ["min", "max", "mean", "std"],\
                                                "Computer Percent": "mean", "Cellular Percent": "mean", "Rent Percent": "mean",\
                                                "Immigration Percent": "mean", "Education Percent": "mean", "Owner Percent": "mean",\
                                                "created_on": "count"})
grouped.columns = ["_".join(x) for x in grouped.columns.ravel()]
print (grouped.head(5))
grouped.to_csv('properati_data/properati_sell_grouped.csv')

     barrio_  price_aprox_local_currency_min  price_aprox_local_currency_max  \
0  AGRONOMIA                             0.0                      11388300.0   
1    ALMAGRO                             0.0                     197603750.0   
2  BALVANERA                             0.0                      88222500.0   
3   BARRACAS                             0.0                     265072500.0   
4   BELGRANO                             0.0                     107992500.0   

   price_aprox_local_currency_mean  price_aprox_local_currency_std  \
0                     2.855905e+06                    2.077619e+06   
1                     2.700272e+06                    3.901816e+06   
2                     2.747228e+06                    3.509469e+06   
3                     3.983442e+06                    1.030250e+07   
4                     6.334873e+06                    8.465324e+06   

   price_aprox_usd_min  price_aprox_usd_max  price_aprox_usd_mean  \
0                  0.0       

In [23]:
grouped.to_json("properati_data/properati_sell.json", orient = "records", date_format = "epoch", double_precision = 10, force_ascii = True, date_unit = "ms", default_handler = None)



In [25]:
filter_df['created_on'] = pd.to_datetime(filter_df['created_on'])
filter_df['month'] = filter_df.created_on.apply(lambda x: x.month)
filter_df['year'] = filter_df.created_on.apply(lambda x: x.year)

barrios_filter_df = filter_df[['barrio','dataset_date','comuna','perimetro','area','price_aprox_local_currency', 'price_aprox_usd', 'Computer Percent',\
                              'Cellular Percent', 'Rent Percent','Immigration Percent', 'Education Percent',\
                              'Owner Percent', 'created_on', 'month', 'year']]
print (barrios_filter_df.head(5))

          barrio  dataset_date  comuna     perimetro          area  \
1       BARRACAS        201501       4  13018.210271  7.961000e+06   
13547   BARRACAS        201501       4  13018.210271  7.961000e+06   
38029   BARRACAS        201502       4  13018.210271  7.961000e+06   
51933   BARRACAS        201502       4  13018.210271  7.961000e+06   
109853  BARRACAS        201502       4  13018.210271  7.961000e+06   

        price_aprox_local_currency  price_aprox_usd  Computer Percent  \
1                       3109410.00        180000.00              67.1   
13547                   2591175.00        150000.00              67.1   
38029                   1627587.50         95000.00              67.1   
51933                     85275.81          4977.43              67.1   
109853                  2826862.50        165000.00              67.1   

        Cellular Percent  Rent Percent  Immigration Percent  \
1                   88.1          39.7                 13.0   
13547         

In [26]:
grouped_month = barrios_filter_df.groupby(['barrio', 'month', 'year'], as_index=False).agg({"price_aprox_local_currency": ["min", "max", "mean", "std"], "price_aprox_usd": ["min", "max", "mean", "std"],\
                                                "created_on": "count"}).fillna(0) #for the std deviation values that resulted in NaN 
grouped_month.columns = ["_".join(x) for x in grouped_month.columns.ravel()]
print (grouped_month.head(5))
print (len(grouped_month))

     barrio_  month_  year_  price_aprox_local_currency_min  \
0  AGRONOMIA       1   2015                      1347411.00   
1  AGRONOMIA       1   2016                      1037952.50   
2  AGRONOMIA       1   2017                            0.00   
3  AGRONOMIA       1   2018                      1366497.82   
4  AGRONOMIA       2   2015                            0.00   

   price_aprox_local_currency_max  price_aprox_local_currency_mean  \
0                       1347411.0                     1.347411e+06   
1                       1516410.0                     1.251206e+06   
2                       4774581.5                     2.259034e+06   
3                      11388300.0                     4.791015e+06   
4                       1209215.0                     3.847502e+05   

   price_aprox_local_currency_std  price_aprox_usd_min  price_aprox_usd_max  \
0                    0.000000e+00              78000.0              78000.0   
1                    2.147109e+05         

In [27]:
grouped_month.to_csv('properati_data/properati_sell_grouped_month.csv')
grouped_month.to_json("properati_data/properati_sell_month.json", orient = "records", date_format = "epoch", double_precision = 10, force_ascii = True, date_unit = "ms", default_handler = None)

