# Data Extraction

In [2]:
# Normal imports
import warnings
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

#Specific imports
import rtree
import pygeos
import geopandas
from geopandas import GeoDataFrame
from shapely import wkt
from shapely.geometry import Point
from sklearn.model_selection import train_test_split


#Internal imports
from src.data_extraction.data_extraction import extract_initial_data
from src.constants import (
    IDEALISTA_COLORS, 
    NUM_VARIABLES_TO_SEE_DISTRIBUTION
)
from src.preprocessing.preprocessing_utils import (
    generate_pandas_profiling_report, visualize_distribution)

# Settings
# Silence every warning for the rest of the session (this also hides deprecation notices)
warnings.filterwarnings("ignore")
# Render matplotlib figures inline, below the cell that creates them
%matplotlib inline
# Show every column when a DataFrame is displayed
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

# Read csv train

In [4]:
# Load the training set from the project's output_data folder.
# The path is relative to the project root (the same working directory that
# extract_initial_data(root_dir="input_data") relies on), so the notebook is not tied
# to one user's machine or operating system.
df_train = pd.read_csv('output_data/df_train_util.csv')

# Insights

Pandas Profiling

In [None]:
# Profile every column except the geometry and identifier columns listed below
columns_to_profile = df_train.columns.difference(['geometry','cusec', 'barrio_id'])
generate_pandas_profiling_report(df=df_train[columns_to_profile])

Duplex

In [24]:
# Ratio between the mean price per square metre of duplex and non-duplex houses
is_duplex = df_train['duplex'] == 1
mean_price_duplex = df_train.loc[is_duplex, 'precio_unitario_m2'].mean()
mean_price_no_duplex = df_train.loc[~is_duplex, 'precio_unitario_m2'].mean()

duplex_ratio = round(mean_price_duplex / mean_price_no_duplex, 2)
print('Duplex prices are, in average,', duplex_ratio, 'times the price of a non duplex house')

Duplex prices are, in average, 1.01 times the price of a non duplex house


In [25]:
# Ratio between the mean total price of duplex and non-duplex houses
is_duplex = df_train['duplex'] == 1
mean_price_duplex = df_train.loc[is_duplex, 'precio'].mean()
mean_price_no_duplex = df_train.loc[~is_duplex, 'precio'].mean()

duplex_ratio = round(mean_price_duplex / mean_price_no_duplex, 2)
print('Duplex prices are, in average,', duplex_ratio, 'times the price of a non duplex house')

Duplex prices are, in average, 1.28 times the price of a non duplex house


Studio

In [26]:
# Ratio between the mean price per square metre of studios and non-studios
is_studio = df_train['estudio'] == 1
mean_price_estudio = df_train.loc[is_studio, 'precio_unitario_m2'].mean()
mean_price_no_estudio = df_train.loc[~is_studio, 'precio_unitario_m2'].mean()

studio_ratio = round(mean_price_estudio / mean_price_no_estudio, 2)
print('Studio prices are, in average,', studio_ratio, 'times the price of a non studio house')

# How many bedrooms the listings flagged as studios report
studio_bedroom_counts = df_train.loc[is_studio, ['n_habitaciones']].value_counts()
print('Studios distinct number of bedrooms:', studio_bedroom_counts)

Studio prices are, in average, 1.22 times the price of a non studio house
Studios distinct number of bedrooms: n_habitaciones
0                 1536
1                    4
2                    2
3                    1
Name: count, dtype: int64


In [27]:
# Compare built area per bedroom between non-studios and studios.
# Only listings with at least one bedroom are kept, to avoid dividing by zero.
area_room_columns = ['area_construida', 'n_habitaciones']
has_bedrooms = df_train.n_habitaciones > 0

# Non-studio listings (.copy() so the new column is added to an independent frame,
# not to a slice of df_train)
df_aux = df_train.loc[(df_train.estudio == 0) & has_bedrooms, area_room_columns].copy()
df_aux['m2_n_habitaciones'] = df_aux.area_construida / df_aux.n_habitaciones
print('Metros cuadrados por habitacion en casas NO ESTUDIO', np.mean(df_aux.m2_n_habitaciones))
print('Media de area total NO ESTUDIO', np.mean(df_aux.area_construida))

# Studio listings
df_aux_1 = df_train.loc[(df_train.estudio == 1) & has_bedrooms, area_room_columns].copy()
df_aux_1['m2_n_habitaciones'] = df_aux_1.area_construida / df_aux_1.n_habitaciones
print('Metros cuadrados por habitacion en casas ESTUDIO', np.mean(df_aux_1.m2_n_habitaciones))
# Label fixed: this figure is for studios (it was previously printed as "NO ESTUDIO")
print('Media de area total ESTUDIO', np.mean(df_aux_1.area_construida))

# List the studios that report one or more bedrooms
print(df_aux_1[area_room_columns])


Metros cuadrados por habitacion en casas NO ESTUDIO 39.228213467750535
Media de area total NO ESTUDIO 99.38311885510382
Metros cuadrados por habitacion en casas ESTUDIO 51.333333333333336
Media de area total NO ESTUDIO 69.28571428571429
       area_construida  n_habitaciones
23073               30               1
30486               35               1
32320              106               3
38957               41               2
41300               69               2
48969              180               1
55380               24               1


In [48]:
# Listings flagged as having no parking that still carry a parking price above 1
no_parking_but_priced = (df_train['parking'] == 0) & (df_train['precio_parking'] > 1)
df_train.loc[no_parking_but_priced, 'precio_parking'].value_counts()

precio_parking
20001     137
30001     130
25001     115
15001     102
40001      92
         ... 
770001      1
21201       1
150001      1
4001        1
161         1
Name: count, Length: 129, dtype: int64

Orientaciones

In [None]:
# Add up the four orientation flags: the total shows whether a house has any
# orientation information at all
df_train['orientacion_total'] = (
    df_train['orientacion_s']
    + df_train['orientacion_e']
    + df_train['orientacion_n']
    + df_train['orientacion_o']
)

# Contingency table of the orientation total against the 'interior' flag
cross_tab = pd.crosstab(index=df_train['orientacion_total'], columns=df_train['interior'])
print('Compare total orientation with if is an interior house or not')
print(cross_tab)

### Visualizations

In [None]:
# Plot the distribution of each numerical variable listed in the project constants
visualize_distribution(df=df_train, numerical_columns=NUM_VARIABLES_TO_SEE_DISTRIBUTION)

#### Pairwise relationships and distributions between multiple variables

In [None]:
# Scatter matrix of the selected numerical variables, with KDE curves on the diagonal
pairplot_data = df_train[NUM_VARIABLES_TO_SEE_DISTRIBUTION]
sns.pairplot(pairplot_data, diag_kind='kde')

#### Mean average price duplex vs non duplex

In [None]:
# Bar chart comparing the mean price of duplex and non-duplex houses.
# Uses mean_price_duplex / mean_price_no_duplex computed in the "Duplex" cells above
# (the cell that ran last decides which price column is being compared).
plt.figure(figsize=(8, 6))
bars = plt.bar(['Duplex', 'Non-Duplex'], [mean_price_duplex, mean_price_no_duplex], color=['blue', 'orange'])

# Write the rounded value on top of each bar, centred horizontally.
# The format spec works for plain Python floats as well as NumPy scalars.
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval, f'{yval:.0f}', ha='center', va='bottom')

plt.xlabel('House Type')
plt.ylabel('Mean Price')
plt.title('Mean Price Comparison between Duplex and Non-Duplex Houses')
plt.show()

#### Price Density (Precio del Vuelo)

In [None]:
# Kernel density estimate of the total price
df_train['precio'].plot.kde()

In [None]:
# Same density, with the x axis limited to prices between 0 and 1M EUR
ax = df_train['precio'].plot.kde()
ax.set_xlim(0, 1_000_000)
plt.show()

In [None]:
# Store the natural logarithm of the price as a new column and plot its density
log_price = np.log(df_train['precio'])
df_train['precio_logaritmico'] = log_price
ax = df_train['precio_logaritmico'].plot.kde()
plt.show()


Temas a tener en cuenta:

* La distribución de valores es multimodal
* La distribución de valores no es simétrica
* El rango de valores puede ser muy amplio

El precio depende de muchos factores, pero en la literatura existen dos grandes factores:

* Precio del suelo (el suelo donde está construido)
* Precio del vuelo (lo que está construido)

Para empezar, una forma de controlar el precio del suelo es incorporar información de la zona, y una forma de controlar el precio del vuelo es normalizar por metros cuadrados (es nuestra variable __UNITPRICE__).

En la siguiente gráfica observamos el fenómeno de la multimodalidad: esto significa que podemos encontrarnos inmuebles con las mismas características constructivas pero con distintos precios €/m². ¿Por qué? Principalmente por el otro factor: __el precio del suelo__.

#### Unit Price Density (Precio del suelo)

In [None]:
# Density of the price per square metre, with the x axis limited to 0-10,000
ax = df_train['precio_unitario_m2'].plot.kde()
ax.set_xlim(0, 10_000)
plt.show()

#### Mean price by Barrio

In [None]:
# Median, mean and standard deviation of the total and unit price for each barrio
price_columns = ['precio', 'precio_unitario_m2']
statistics = ['median', 'mean', 'std']

df_metrics_barrios = (
    df_train
    .groupby(['barrio'])
    .agg({column: statistics for column in price_columns})
    .reset_index()
)

# Flatten the two-level column index into '<column>_<statistic>_barrio' names
df_metrics_barrios.columns = ['barrio'] + [
    f'{column}_{statistic}_barrio'
    for column in price_columns
    for statistic in statistics
]

In [51]:
# Display the per-barrio price metrics table
df_metrics_barrios

Unnamed: 0,barrio,precio_median_barrio,precio_mean_barrio,precio_std_barrio,precio_unitario_m2_median_barrio,precio_unitario_m2_mean_barrio,precio_unitario_m2_std_barrio
0,12 de Octubre-Orcasur,317500.0,444223.118280,442760.662201,4309.894737,4396.394570,1663.131772
1,Abrantes,172500.0,256714.285714,219183.983168,2446.428571,2768.754521,1239.535470
2,Acacias,274000.0,355086.261981,266947.656267,3965.217391,3967.685006,1522.527774
3,Adelfas,365000.0,413043.927649,279776.058917,4264.367816,4068.375849,1524.887937
4,Aeropuerto,233000.0,262979.310345,102375.091166,2663.461538,2703.018024,738.911528
...,...,...,...,...,...,...,...
130,Vinateros,175000.0,246149.253731,270734.422658,2467.532468,2627.260940,1073.183097
131,Virgen del Cortijo - Manoteras,291500.0,325268.292683,217325.155991,3453.736878,3360.139762,1540.457857
132,Vista Alegre,152000.0,199939.597315,154957.414734,2035.714286,2197.922056,772.098222
133,Zofío,275000.0,416016.393443,369095.813455,3862.500000,3834.199355,1506.978940


In [None]:
# Order the barrios from the highest to the lowest mean price
df_metrics_barrios_sorted = df_metrics_barrios.sort_values('precio_mean_barrio', ascending=False)

# Register the project palette as the seaborn default
sns.set_palette(IDEALISTA_COLORS)

# One bar per barrio, already ordered by mean price
plt.figure(figsize=(20, 10))
plt.bar(
    x=df_metrics_barrios_sorted['barrio'],
    height=df_metrics_barrios_sorted['precio_mean_barrio'],
    color='skyblue',
)
plt.title('Mean Price by Barrio (Ordered by Highest Price)')
plt.xlabel('Barrio')
plt.ylabel('Mean Price')
plt.xticks(rotation=90)  # vertical tick labels so the barrio names do not overlap
plt.tight_layout()
plt.show()


#### Box Plot

In [None]:
# Horizontal box plot of the total price (raw values, no rescaling is applied)
plt.figure(figsize=(10, 6))
sns.boxplot(x='precio', data=df_train)
plt.title('Box plot')
plt.xlabel('Price')

In [None]:
# Built-area box plot with one row per barrio, drawn on a tall 20x30 figure
plt.figure(figsize=(20, 30))
sns.boxplot(x="area_construida", y="barrio", data=df_train)

#### Coordenadas

In [None]:
# Colormap shared by both maps. plt.get_cmap replaces plt.cm.get_cmap, which was
# deprecated in Matplotlib 3.7 and removed in 3.9.
cm = plt.get_cmap('magma')

# Total price (EUR) by location
ax = df_train.plot.scatter(x='longitud', y='latitud', c='precio', figsize=(10, 10), cmap=cm)
ax.set_xlabel('Longitud', fontsize=15)
ax.set_ylabel('Latitud', fontsize=15)
ax.set_title('Madrid: Total price')
# plt.show() renders the figure under the inline backend (Figure.show() does not)
plt.show()

# Price per square metre (EUR/m^2) by location
ax = df_train.plot.scatter(x='longitud', y='latitud', c='precio_unitario_m2', figsize=(10, 10), cmap=cm)
ax.set_xlabel('Longitud', fontsize=15)
ax.set_ylabel('Latitud', fontsize=15)
ax.set_title('Madrid: Total price by m^2')
plt.show()

#### Polygons

In [None]:
# Load the five raw input tables from the input_data folder
(df, df_ine, df_osm, df_pois, df_polygons) = extract_initial_data(root_dir="input_data")

In [None]:
# Parse the polygon WKT strings into Shapely geometries and wrap them in a GeoDataFrame
df_polygons['geometry'] = df_polygons['WKT'].apply(wkt.loads)
gdf_polygons = geopandas.GeoDataFrame(df_polygons['geometry'], crs='epsg:4326')

# Carry the location id and name over to the GeoDataFrame
gdf_polygons['barrio_id'] = df_polygons['LOCATIONID']
gdf_polygons['barrio'] = df_polygons['LOCATIONNAME']

# One Point per row of df_train. Both coordinates must come from df_train: taking the
# latitude from the raw `df` frame pairs each longitude with another row's latitude.
geometry = [Point(xy) for xy in zip(df_train.longitud, df_train.latitud)]

# GeoDataFrame with the df_train data plus the point geometry
gdf_train_train = GeoDataFrame(df_train, crs="EPSG:4326", geometry=geometry)

# Natural logarithm of 'precio', used to colour the maps
gdf_train_train['precio_logaritmico'] = np.log(gdf_train_train['precio'])


In [None]:
# Base layer: polygons from gdf_polygons drawn white with black borders
base = gdf_polygons.plot(
    figsize=(10, 10),
    color='white',
    edgecolor='black',
)

# Top layer: gdf_train_train points coloured by the 'precio_unitario_m2' column
gdf_train_train.plot(
    ax=base,
    column='precio_unitario_m2',
    cmap='inferno',
    marker='o',
    markersize=5,
)

In [None]:
# Same map, coloured by the logarithmic price instead
# Base layer: polygons from gdf_polygons drawn white with black borders
base = gdf_polygons.plot(
    figsize=(10, 10),
    color='white',
    edgecolor='black',
)

# Top layer: gdf_train_train points coloured by the 'precio_logaritmico' column
gdf_train_train.plot(
    ax=base,
    column='precio_logaritmico',
    cmap='inferno',
    marker='o',
    markersize=5,
)


#### INE Censal Polygons

In [None]:
# Parse the census WKT strings into Shapely geometries and store them on df_ine
census_geometries = df_ine['WKT'].apply(wkt.loads)
df_ine['geometry'] = census_geometries

# GeoDataFrame of the census polygons, with the 'CUSEC' column copied over as 'cusec'
gdf_polygons_census = geopandas.GeoDataFrame(df_ine['geometry'], crs='epsg:4326')
gdf_polygons_census['cusec'] = df_ine['CUSEC']

In [None]:
# Base layer: census polygons drawn white with black borders
base = gdf_polygons_census.plot(
    figsize=(10, 10),
    color='white',
    edgecolor='black',
)

# Top layer: gdf_train_train points coloured by the 'precio_unitario_m2' column
gdf_train_train.plot(
    ax=base,
    column='precio_unitario_m2',
    cmap='inferno',
    marker='o',
    markersize=5,
)


#### Points of Interest - Open Street Map - Basis

In [None]:
# TODO - nice to have: counts normalised by each barrio's surface area
# Show the points of interest, one colour per distinct CODE value.
# plt.get_cmap replaces plt.cm.get_cmap, which was deprecated in Matplotlib 3.7 and
# removed in 3.9; the second argument resamples the map to one colour per code.
unique_codes = df_osm['CODE'].unique()
cmap = plt.get_cmap('magma', len(unique_codes))

# Create scatter plot: one scatter call per code so each gets its own legend entry
fig, ax = plt.subplots(figsize=(10, 10))
for i, code in enumerate(unique_codes):
    subset = df_osm[df_osm['CODE'] == code]
    # color= (not c=): an RGBA tuple passed as c is read as per-point values
    # whenever the subset happens to contain exactly four points
    ax.scatter(subset['LNG'], subset['LAT'], color=cmap(i), label=code)

# Set labels and title
ax.set_xlabel('Longitud', fontsize=15)
ax.set_ylabel('Latitud', fontsize=15)
ax.set_title('Points of Interest - Open Street Map')

# Add legend
ax.legend()
plt.show()
