In [1]:
import pandas as pd
import seaborn as sns

In [2]:
df_madrid = pd.read_csv("../data/madrid-coordinates.csv")

In [3]:
df_madrid.columns

Index(['id', 'title', 'subtitle', 'sq_mt_built', 'sq_mt_useful', 'n_rooms',
       'n_bathrooms', 'n_floors', 'sq_mt_allotment', 'latitude', 'longitude',
       'raw_address', 'is_exact_address_hidden', 'street_name',
       'street_number', 'portal', 'floor', 'is_floor_under', 'door',
       'neighborhood_id', 'operation', 'rent_price', 'rent_price_by_area',
       'is_rent_price_known', 'buy_price', 'buy_price_by_area',
       'is_buy_price_known', 'house_type_id', 'is_renewal_needed',
       'is_new_development', 'built_year', 'has_central_heating',
       'has_individual_heating', 'are_pets_allowed', 'has_ac',
       'has_fitted_wardrobes', 'has_lift', 'is_exterior', 'has_garden',
       'has_pool', 'has_terrace', 'has_balcony', 'has_storage_room',
       'is_furnished', 'is_kitchen_equipped', 'is_accessible',
       'has_green_zones', 'energy_certificate', 'has_parking',
       'has_private_parking', 'has_public_parking',
       'is_parking_included_in_price', 'parking_price', 'is

#### Columns to use:
- id
- title
- subtitle
- sq_mt_built
- sq_mt_useful
- n_rooms
- n_bathrooms
- latitude
- longitude
- raw_address
- neighborhood_id
- rent_price
- buy_price
- house_type_id
- is_new_development
- is_renewal_needed
- has_lift
- is_exterior
- energy_certificate
- has_parking
- has_garden
- has_pool
- has_terrace
- has_balcony
- has_storage_room
- floor

In [4]:
columns = ["id",
          "title",
          "subtitle",
          "sq_mt_built",
          "sq_mt_useful",
          "n_rooms",
          "n_bathrooms",
           "latitude",
           "longitude",
           "raw_address",
           "neighborhood_id",
           "rent_price",
           "buy_price",
           "house_type_id",
           "is_new_development",
           "is_renewal_needed",
           "has_lift",
           "is_exterior",
           "energy_certificate",
           "has_parking",
           "has_garden",
           "has_pool",
          "has_terrace",
           "has_balcony",
           "has_storage_room",
           "floor"
          ]

In [5]:
new_madrid = df_madrid[columns]

In [6]:
new_madrid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16277 entries, 0 to 16276
Data columns (total 26 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  16277 non-null  int64  
 1   title               16277 non-null  object 
 2   subtitle            16277 non-null  object 
 3   sq_mt_built         16216 non-null  float64
 4   sq_mt_useful        6753 non-null   float64
 5   n_rooms             16277 non-null  int64  
 6   n_bathrooms         16264 non-null  float64
 7   latitude            14266 non-null  float64
 8   longitude           14266 non-null  float64
 9   raw_address         16277 non-null  object 
 10  neighborhood_id     16277 non-null  object 
 11  rent_price          16277 non-null  int64  
 12  buy_price           16277 non-null  int64  
 13  house_type_id       15946 non-null  object 
 14  is_new_development  15514 non-null  object 
 15  is_renewal_needed   16277 non-null  bool   
 16  has_

#### Let's filter by latitude and longitude:

In [7]:
new_madrid = new_madrid[(new_madrid.latitude > 40.34) & 
                      (new_madrid.latitude < 40.51) & 
                      (new_madrid.longitude > -3.8) & 
                      (new_madrid.longitude < -3.6)]

#### We can estimate "useful square meters" from the other apartments:

### Cleaning "sq_mt_built" and "sq_mt_useful":

In [8]:
new_madrid["sq_mt_coef"] = new_madrid["sq_mt_useful"]/new_madrid["sq_mt_built"]

In [9]:
sq_mt_ratio = new_madrid["sq_mt_coef"].mean()

In [10]:
sq_mt_ratio

0.856862301863558

In [11]:
new_madrid["sq_mt_useful"].fillna(round(new_madrid["sq_mt_built"] * sq_mt_ratio, 2), inplace=True)

In [12]:
new_madrid["sq_mt_built"].fillna(round(new_madrid["sq_mt_useful"] / sq_mt_ratio, 2), inplace=True)

### Cleaning "has_garden", "has_balcony", "has_pool", "has_terrace", "has_storage_room":

In [13]:
new_madrid.has_garden.unique()

array([nan, True], dtype=object)

In [14]:
def clean_cols_has(df, column):
    df[column].fillna(0, inplace=True)
    df.loc[(df[column] == True), column] = 1
    df[column] = df[column].astype(int)
    return df

In [15]:
new_madrid = clean_cols_has(new_madrid, "has_garden")
new_madrid = clean_cols_has(new_madrid, "has_balcony")
new_madrid = clean_cols_has(new_madrid, "has_pool")
new_madrid = clean_cols_has(new_madrid, "has_terrace")
new_madrid = clean_cols_has(new_madrid, "has_storage_room")

In [16]:
new_madrid.reset_index(drop = True)

Unnamed: 0,id,title,subtitle,sq_mt_built,sq_mt_useful,n_rooms,n_bathrooms,latitude,longitude,raw_address,...,is_exterior,energy_certificate,has_parking,has_garden,has_pool,has_terrace,has_balcony,has_storage_room,floor,sq_mt_coef
0,21741,Piso en venta en calle de la del Manojo de Rosas,"Los Ángeles, Madrid",70.0,59.98,3,1.0,40.359340,-3.698374,Calle de la del Manojo de Rosas,...,True,en trámite,False,0,0,1,0,0,4,
1,21740,"Piso en venta en calle del Talco, 68","San Andrés, Madrid",94.0,54.00,2,2.0,40.344630,-3.715191,"Calle del Talco, 68",...,True,no indicado,False,0,0,0,0,1,1,0.574468
2,21738,Piso en venta en carretera de Villaverde a Val...,"Los Rosales, Madrid",108.0,90.00,2,2.0,40.357722,-3.685029,Carretera de Villaverde a Vallecas,...,True,en trámite,True,0,1,0,0,1,4,0.833333
3,21737,Piso en venta en geologia,"San Andrés, Madrid",126.0,114.00,4,2.0,40.343389,-3.712416,geologia,...,True,en trámite,True,0,0,1,1,0,3,0.904762
4,21736,Piso en venta en avenida Real de Pinto,"San Andrés, Madrid",120.0,100.00,5,2.0,40.348149,-3.706239,Avenida Real de Pinto,...,True,F,True,0,1,1,1,1,1,0.833333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10877,32,Piso en venta en calle Teresa López Valcárcel,"Legazpi, Madrid",228.0,195.36,4,2.0,40.389037,-3.692823,Calle Teresa López Valcárcel,...,True,G,True,0,0,1,1,1,5,
10878,20,Piso en venta en calle Timón,"Timón, Madrid",74.0,63.41,2,2.0,40.423552,-3.796780,Calle Timón,...,,en trámite,True,0,1,0,0,1,2,
10879,11,Chalet adosado en venta en calle Siroco,"Timón, Madrid",276.0,236.49,4,4.0,40.408853,-3.791879,Calle Siroco,...,,G,True,1,1,1,0,0,,
10880,9,Casa o chalet independiente en venta en aveni...,"Campo de las Naciones-Corralejos, Madrid",267.0,215.00,5,3.0,40.450461,-3.608672,Avenida de Logroño,...,,en trámite,True,1,1,1,0,1,,0.805243


In [17]:
new_madrid.has_garden.value_counts()

0    10574
1      308
Name: has_garden, dtype: int64

#### House Type:

In [18]:
new_madrid.house_type_id.value_counts()

HouseType 1: Pisos            9401
HouseType 5: Áticos            481
HouseType 2: Casa o chalet     442
HouseType 4: Dúplex            315
Name: house_type_id, dtype: int64

In [19]:
house_types = {
    'HouseType 1: Pisos': 0,
    'HouseType 2: Casa o chalet': 3,
    'HouseType 5: Áticos': 2,
    'HouseType 4: Dúplex': 1
}

In [20]:
new_madrid["house_type"] = new_madrid.house_type_id.map(house_types)

In [21]:
new_madrid.house_type.value_counts()

0.0    9401
2.0     481
3.0     442
1.0     315
Name: house_type, dtype: int64

#### Energy certification:

In [24]:
new_madrid.energy_certificate.value_counts()

en trámite         5443
no indicado        1664
E                  1464
D                   595
G                   466
F                   411
C                   311
A                   274
B                   193
inmueble exento      61
Name: energy_certificate, dtype: int64

In [25]:
energy_certification = {
     'en trámite': 0,
     'no indicado': 0,
     'E': 3,
     'D': 4,
     'G': 1,
     'F': 2,
     'A': 7,
     'C': 5,
     'B': 6,
     'inmueble exento': 0
   }

In [26]:
new_madrid["energy_certificate"] = new_madrid.energy_certificate.map(energy_certification)

#### Drop nulls:

In [27]:
new_madrid.drop("sq_mt_coef", axis=1, inplace=True)

In [28]:
new_madrid.isna().sum()

id                       0
title                    0
subtitle                 0
sq_mt_built              6
sq_mt_useful             6
n_rooms                  0
n_bathrooms             10
latitude                 0
longitude                0
raw_address              0
neighborhood_id          0
rent_price               0
buy_price                0
house_type_id          243
is_new_development     548
is_renewal_needed        0
has_lift               685
is_exterior           1019
energy_certificate       0
has_parking              0
has_garden               0
has_pool                 0
has_terrace              0
has_balcony              0
has_storage_room         0
floor                  725
house_type             243
dtype: int64

In [29]:
madrid = new_madrid.dropna(axis=0, how="any")

In [30]:
madrid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9130 entries, 1 to 16250
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  9130 non-null   int64  
 1   title               9130 non-null   object 
 2   subtitle            9130 non-null   object 
 3   sq_mt_built         9130 non-null   float64
 4   sq_mt_useful        9130 non-null   float64
 5   n_rooms             9130 non-null   int64  
 6   n_bathrooms         9130 non-null   float64
 7   latitude            9130 non-null   float64
 8   longitude           9130 non-null   float64
 9   raw_address         9130 non-null   object 
 10  neighborhood_id     9130 non-null   object 
 11  rent_price          9130 non-null   int64  
 12  buy_price           9130 non-null   int64  
 13  house_type_id       9130 non-null   object 
 14  is_new_development  9130 non-null   object 
 15  is_renewal_needed   9130 non-null   bool   
 16  has_l

### Let's prepare some last columns:

In [31]:
def boolean_to_binary(df, column):
    df.loc[(df[column] == True), column] = 1
    df.loc[(df[column] == False), column] = 0
    df[column] = df[column].astype(int)
    return df

#### Has_lift column:

In [32]:
madrid = boolean_to_binary(madrid, "has_lift")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].astype(int)


#### Has_parking column:

In [33]:
madrid.has_parking.value_counts()

False    6475
True     2655
Name: has_parking, dtype: int64

In [34]:
madrid = boolean_to_binary(madrid, "has_parking")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].astype(int)


#### n_bathrooms column:

In [35]:
madrid["n_bathrooms"] = madrid["n_bathrooms"].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  madrid["n_bathrooms"] = madrid["n_bathrooms"].astype(int)


#### Floor column:

In [36]:
madrid.floor.value_counts()

1                       2104
2                       1676
3                       1417
4                       1159
Bajo                     967
5                        625
6                        423
7                        280
8                        164
Entreplanta exterior     144
9                         81
Semi-sótano exterior      32
Entreplanta interior      21
Sótano interior           18
Semi-sótano interior      17
Sótano exterior            2
Name: floor, dtype: int64

In [37]:
floors = {
    "Bajo" : 0,
    "Entreplanta" : 0.5,
    "Entreplanta exterior" : 0.5,
    "Entreplanta interior" : 0.5,
    "Semi-sótano" : -0.5,
    "Semi-sótano exterior" : -0.5,
    "Semi-sótano interior" : -0.5,
    "Sótano" : -1,
    "Sótano interior" : -1,
    "Sótano exterior" : -1,    
}

In [38]:
madrid = madrid.replace({"floor": floors})

In [39]:
madrid["floor"] = madrid["floor"].astype(float)

#### Let's change type of "is_exterior", "is_renewal_needed" and "is_new_development":

In [40]:
madrid = boolean_to_binary(madrid, "is_renewal_needed")

In [41]:
madrid = boolean_to_binary(madrid, "is_exterior")

In [42]:
madrid = boolean_to_binary(madrid, "is_new_development")

#### Let's drop house_type_id column because we already has a house_type column:

In [43]:
madrid.drop("house_type_id", axis=1, inplace=True)

#### Final cleaning: neigborhood_id column:

In [44]:
madrid["value_m2"] = madrid["neighborhood_id"].str.split("(", n=1, expand=True)[1].str.split(" ", n=1, expand=True)[0]

In [45]:
madrid["neighborhood"] = madrid["neighborhood_id"].str.split(":", n=1, expand=True)[1].str.split("(", n=1, expand=True)[0].str.strip()

In [46]:
madrid["district"] = madrid["neighborhood_id"].str.split(":", n=2, expand=True)[2].str.strip()

In [47]:
madrid.loc[(madrid["value_m2"] == "None"), "value_m2"] = 0

In [48]:
madrid.value_m2 = madrid.value_m2.astype(float)

#### I will modify type of house_type to int:

In [None]:
madrid.house_type = madrid.house_type.astype(int)

#### I will check the final dataframe clean:

In [49]:
madrid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9130 entries, 1 to 16250
Data columns (total 29 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  9130 non-null   int64  
 1   title               9130 non-null   object 
 2   subtitle            9130 non-null   object 
 3   sq_mt_built         9130 non-null   float64
 4   sq_mt_useful        9130 non-null   float64
 5   n_rooms             9130 non-null   int64  
 6   n_bathrooms         9130 non-null   int64  
 7   latitude            9130 non-null   float64
 8   longitude           9130 non-null   float64
 9   raw_address         9130 non-null   object 
 10  neighborhood_id     9130 non-null   object 
 11  rent_price          9130 non-null   int64  
 12  buy_price           9130 non-null   int64  
 13  is_new_development  9130 non-null   int64  
 14  is_renewal_needed   9130 non-null   int64  
 15  has_lift            9130 non-null   int64  
 16  is_ex

#### Rent price column cleaning. It has some negative values:

In [56]:
madrid = madrid[madrid.rent_price > 0]

In [57]:
madrid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8666 entries, 1 to 16250
Data columns (total 29 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  8666 non-null   int64  
 1   title               8666 non-null   object 
 2   subtitle            8666 non-null   object 
 3   sq_mt_built         8666 non-null   float64
 4   sq_mt_useful        8666 non-null   float64
 5   n_rooms             8666 non-null   int64  
 6   n_bathrooms         8666 non-null   int64  
 7   latitude            8666 non-null   float64
 8   longitude           8666 non-null   float64
 9   raw_address         8666 non-null   object 
 10  neighborhood_id     8666 non-null   object 
 11  rent_price          8666 non-null   int64  
 12  buy_price           8666 non-null   int64  
 13  is_new_development  8666 non-null   int64  
 14  is_renewal_needed   8666 non-null   int64  
 15  has_lift            8666 non-null   int64  
 16  is_ex

#### Let's export the final result:

In [61]:
madrid.to_csv("../data/clean-madrid.csv", index = False)