### Cargue de librerías

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

# Render floats in pandas output with six fixed decimal places.
pd.set_option('display.float_format', '{:.6f}'.format)

### Lectura del dataset de Properati listo para modelos

In [35]:
# Load the model-ready dataset from a CSV file given by a relative path.
data_modelo = pd.read_csv('propertti_to_model_price_m2.csv')

In [37]:
# Collect the five most frequent state_name values in the dataset
# (value_counts sorts by descending frequency, so the first five are the top five).

top_5_states = data_modelo['state_name'].value_counts().head(5).index
top_5_states

Index(['Capital Federal', 'Bs.As. G.B.A. Zona Norte',
       'Buenos Aires Costa Atlántica', 'Bs.As. G.B.A. Zona Sur',
       'Bs.As. G.B.A. Zona Oeste'],
      dtype='object')

In [38]:
# Build a new table that keeps only apartments and houses located in the
# five most frequent states of the dataset.

df = data_modelo.loc[
    data_modelo['property_type'].isin(['apartment', 'house'])
    & data_modelo['state_name'].isin(top_5_states)
]

In [39]:
# Check the size (rows, columns) of the filtered dataset.

df.shape

(27285, 7)

In [40]:
# Count how many listings exist for every (property_type, place_name) pair
# and display those counts in ascending order.

cant_prop_type_place = (
    df.groupby(['property_type', 'place_name'])
    .size()
    .reset_index(name='cant_prop_type_place')
)
cant_prop_type_place.sort_values(by='cant_prop_type_place')

Unnamed: 0,property_type,place_name,cant_prop_type_place
122,apartment,San Antonio De Padua,1
313,house,Punta Alta,1
283,house,Matheu,1
113,apartment,Presidente Perón,1
29,apartment,Cañuelas,1
...,...,...,...
23,apartment,Caballito,1150
345,house,Tigre,1266
138,apartment,Tigre,1358
101,apartment,Palermo,1441


In [41]:
# Attach the per-group listing count to every row, then keep only the rows whose
# (property_type, place_name) group has at least 400 listings.

df = (
    df.merge(cant_prop_type_place, how='left', on=['property_type', 'place_name'])
    .loc[lambda merged: merged['cant_prop_type_place'] >= 400]
)
df.shape

(14069, 8)

In [42]:
# Drop the helper count column 'cant_prop_type_place' and the 'state_name' column.
# The result is reassigned instead of using inplace=True so the existing frame
# is never mutated in place.

df = df.drop(columns=['cant_prop_type_place', 'state_name'])

In [43]:
# Separate the target 'y' (the 'price_usd_per_m2' column) from the predictors 'X',
# then one-hot encode the predictors, dropping the first level of each encoded column.
# 'feature_cols' keeps the resulting column labels.

y = df['price_usd_per_m2']
X = pd.get_dummies(df.drop(columns=['price_usd_per_m2']), drop_first=True)
feature_cols = X.columns

In [44]:
# Display the first 3 rows of the predictor variables.

X.head(3)

Unnamed: 0,lat,lon,surface_total_in_m2,property_type_house,place_name_Barrio Norte,place_name_Belgrano,place_name_Caballito,place_name_Flores,place_name_Mar del Plata,place_name_Palermo,place_name_Pilar,place_name_Recoleta,place_name_San Telmo,place_name_Tigre,place_name_Villa Crespo,place_name_Villa Urquiza
2,-34.559873,-58.443362,45.0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,-34.559873,-58.443362,65.0,0,0,1,0,0,0,0,0,0,0,0,0,0
6,-34.425087,-58.579658,270.0,1,0,0,0,0,0,0,0,0,0,1,0,0


In [45]:
# Standardize the predictor matrix with StandardScaler; X is replaced by the
# transformed array returned by the scaler.
# NOTE(review): the scaler is fit on every column of X (dummy columns included) and
# before the train/test split further down, so test rows contribute to the fitted
# statistics — confirm this is intended.

scaler = StandardScaler()
X = scaler.fit_transform(X)

In [46]:
# Split the data into train and test sets.
# shuffle=False keeps the original row order, so the test set is the trailing block of rows.
# The former random_state argument was removed: it only seeds shuffling and is ignored
# when shuffle=False, so it wrongly suggested a seeded random split.
# NOTE(review): if the rows are ordered in any meaningful way, consider shuffle=True
# with a fixed random_state instead — that would change the reported metrics.

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, shuffle=False)

# Fit a linear regression on the training data, predict on the test data,
# and report RMSE and R2 on the test set.

linreg = LinearRegression()
linreg.fit(Xtrain, ytrain)
ypred = linreg.predict(Xtest)
print ('RMSE:', np.sqrt(metrics.mean_squared_error(ytest, ypred)))
print ('R2:', metrics.r2_score(ytest, ypred))

RMSE: 570.9004997546965
R2: 0.47641782297647683


In [47]:
# Print the fitted intercept, then show each feature name paired with its coefficient.
print(linreg.intercept_)
[(feature, weight) for feature, weight in zip(feature_cols, linreg.coef_)]

2273.344586606613


[('lat', -196.3292211730504),
 ('lon', 75.61719146374514),
 ('surface_total_in_m2', -167.41957263545996),
 ('property_type_house', -251.7329635425373),
 ('place_name_Barrio Norte', 131.47271803598213),
 ('place_name_Belgrano', 193.2186243727997),
 ('place_name_Caballito', 56.576651448225405),
 ('place_name_Flores', -35.666790017375504),
 ('place_name_Mar del Plata', -419.5029724816185),
 ('place_name_Palermo', 259.8715415082768),
 ('place_name_Pilar', 8.793750713312882),
 ('place_name_Recoleta', 206.441516488081),
 ('place_name_San Telmo', 8.070283429923855),
 ('place_name_Tigre', 149.49190260967947),
 ('place_name_Villa Crespo', 22.884299005092736),
 ('place_name_Villa Urquiza', 77.26410799897207)]