In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [2]:
# Path to the raw listings CSV (relative to the notebook's working directory).
location = 'Data/properatti.csv'

# Load the listings; the first CSV column is used as the DataFrame index.
df = pd.read_csv(filepath_or_buffer=location, index_col=0)

In [3]:
# Keep only listings in Capital Federal (CABA) whose property type is
# apartment, house or PH; any other property_type (e.g. stores) is discarded.
mask_property_type = df['property_type'].isin(['apartment', 'house', 'PH'])
mask_caba = df['state_name'] == 'Capital Federal'
df2 = df[mask_property_type & mask_caba]

# Check for NaN per column.
# DataFrame.info() prints its report and returns None, so it must not be
# wrapped in display(): doing so renders a stray "None" below the report.
df2.info()

# Drop the columns that are not kept for the analysis:
#   - state_name: constant after the filter above
#   - operation, country_name, place_with_parent_names: no NaN in info();
#     NOTE(review): the original note says operation is single-valued - not verified here
#   - properati_url, expenses
#   - geonames_id, lat, lon: geo fields with NaN (the combined 'lat-lon' column is kept)
#   - price, currency, price_aprox_local_currency: price_aprox_usd is kept instead
df2 = df2.drop(
    columns=[
        'properati_url', 'expenses', 'operation', 'state_name',
        'lat', 'lon', 'place_with_parent_names', 'country_name',
        'geonames_id', 'price', 'currency', 'price_aprox_local_currency',
    ]
)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30952 entries, 0 to 121219
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   operation                   30952 non-null  object 
 1   property_type               30952 non-null  object 
 2   place_name                  30952 non-null  object 
 3   place_with_parent_names     30952 non-null  object 
 4   country_name                30952 non-null  object 
 5   state_name                  30952 non-null  object 
 6   geonames_id                 29737 non-null  float64
 7   lat-lon                     22631 non-null  object 
 8   lat                         22631 non-null  float64
 9   lon                         22631 non-null  float64
 10  price                       27828 non-null  float64
 11  currency                    27828 non-null  object 
 12  price_aprox_local_currency  27828 non-null  float64
 13  price_aprox_usd             27

None

In [4]:
# Number of listings per place_name (neighbourhood) in the filtered frame, most frequent first
df2['place_name'].value_counts()

Belgrano           2925
Palermo            2773
Caballito          2235
Villa Urquiza      1605
Recoleta           1453
                   ... 
Palermo Viejo        36
Villa Real           36
Villa Soldati        13
Villa Riachuelo       6
Catalinas             3
Name: place_name, Length: 62, dtype: int64

In [5]:
# Columns still present in df2 after the drop
df2.columns

Index(['property_type', 'place_name', 'lat-lon', 'price_aprox_usd',
       'surface_total_in_m2', 'surface_covered_in_m2', 'price_usd_per_m2',
       'price_per_m2', 'floor', 'rooms', 'description', 'title',
       'image_thumbnail'],
      dtype='object')

## Columnas a agregar/eliminar

- Amenities: Booleano. Pileta, garage/cochera, parrilla, sum, garden/jardin
- Cuota: Booleano. Ver si sigue aplicando para planes, y si eliminamos o no.
- superficie: unificar los regex de los dos grupos
- Sup total y cubierta: Tomas buscará en los trabajos del resto. Julio había imputado esto, traer el código al nuevo dataset. Revisar el tema de las que tienen valores diferentes.
- image_thumbnail: Ver si los duplicados que se encontraron son relevantes en el nuevo dataset, y luego eliminarla
- Floor: Aplicar el regex y ver con cuantos datos nos quedamos.
- rooms y ambientes: Ver los regex que se armaron, como aplican al nuevo dataset
- expensas por ahora se elimina
- Description y title: Se sacan todos los datos y se eliminan

## Feature engineering

## test modelado básico

In [43]:
# importo las librerías para el modelo
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn import metrics

In [25]:
# Drop every feature that would need preprocessing before it can be used
# (free text, image URL, unparsed coordinates, sparse rooms/floor, etc.).
df3 = df2.drop(
    columns=[
        'description', 'title', 'image_thumbnail',
        'price_per_m2', 'lat-lon', 'rooms', 'floor',
    ]
)

In [26]:
# Restrict the sample to apartments located in Belgrano or Caballito.
mask_barrios = df3['place_name'].isin(['Belgrano', 'Caballito'])
mask_appt = df3['property_type'].eq('apartment')

df3 = df3.loc[mask_barrios & mask_appt]

In [27]:
# Dtypes and non-null counts of the Belgrano/Caballito apartment subset
df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4840 entries, 7 to 121215
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   property_type          4840 non-null   object 
 1   place_name             4840 non-null   object 
 2   price_aprox_usd        4224 non-null   float64
 3   surface_total_in_m2    4119 non-null   float64
 4   surface_covered_in_m2  4643 non-null   float64
 5   price_usd_per_m2       3530 non-null   float64
dtypes: float64(4), object(2)
memory usage: 264.7+ KB


In [28]:
# Discard EVERY row that has at least one NaN, with no further criterion.
df3 = df3.dropna(axis=0, how='any')

In [46]:
# Rows left per place_name after dropping the incomplete rows
df3['place_name'].value_counts()

Belgrano     1874
Caballito    1528
Name: place_name, dtype: int64

In [35]:
# Target: price per square metre in USD.
y = df3['price_usd_per_m2']

# Features: neighbourhood plus total and covered surface. The categorical
# place_name is one-hot encoded; drop_first=True drops the first category,
# leaving a single dummy column for the two neighbourhoods.
X = pd.get_dummies(
    df3[['place_name', 'surface_total_in_m2', 'surface_covered_in_m2']],
    drop_first=True,
)

# Train/test split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=33)

# Shapes of the full data and of each partition, as a sanity check.
tuple(part.shape for part in (X, y, X_train, X_test, y_train, y_test))

((3402, 3), (3402,), (2551, 3), (851, 3), (2551,), (851,))

In [38]:
# Ordinary least-squares baseline fitted on the training partition
# (fit() returns the estimator itself, so it can be chained).
lr = LinearRegression().fit(X_train, y_train)

# Intercept and coefficients, in the column order of X_train.
lr.intercept_, lr.coef_

(3031.862519234963, array([  -9.35812977,   12.63713738, -600.1193796 ]))

In [44]:
# Predict on the held-out partition and report R^2 against the true values.
y_pred = lr.predict(X_test)
metrics.r2_score(y_true=y_test, y_pred=y_pred)

0.27414053361027124

In [52]:
# Trial with Ridge regression, choosing alpha by 5-fold cross-validation
# over a logarithmic grid from 1e-4 to 1e4.
rcv = RidgeCV(
    alphas=(0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000),
    cv=5,
).fit(X_train, y_train)

# Intercept, coefficients and the alpha selected by cross-validation.
rcv.intercept_, rcv.coef_, rcv.alpha_

(3031.8624621393838,
 array([  -9.35812972,   12.63713749, -600.11928047]),
 0.0001)