# Data cleaning Immobiliare.it

Importing the libraries

In [125]:
import numpy as np
import pandas as pd
import re
from datetime import date

I check the raw dataframe and the datatypes

In [126]:
# NOTE: point this at the raw scrape you want to clean before running.
raw_csv_path = 'house_prices_italy_2023_7.csv'
df_raw = pd.read_csv(raw_csv_path)
df_raw.head()

Unnamed: 0.1,Unnamed: 0,region,city,area,rooms,toilets,price,date
0,0,abruzzo,Pescara,295m²,5+,3+,€ 257.000,2023-07-31
1,1,abruzzo,Francavilla al Mare,88m²,3,1,€ 168.000,2023-07-31
2,2,abruzzo,Pescara,227m²,5,3+,€ 299.000,2023-07-31
3,3,abruzzo,Appartamenti di nuova costruzione a Tortoreto,43m²,2 - 4,1,da € 165.000,2023-07-31
4,4,abruzzo,Francavilla al Mare,83m²,4,1,€ 139.000,2023-07-31


In [127]:
# Print each column's dtype and non-null count for the raw frame.
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39999 entries, 0 to 39998
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  39999 non-null  int64 
 1   region      39999 non-null  object
 2   city        39999 non-null  object
 3   area        39974 non-null  object
 4   rooms       39057 non-null  object
 5   toilets     39999 non-null  object
 6   price       39999 non-null  object
 7   date        39999 non-null  object
dtypes: int64(1), object(7)
memory usage: 2.4+ MB


I know that I will need to change datatypes at least on 'area' and 'price'.

Are there any null values? Let's see:

In [128]:
# Count the missing values in each column of the raw frame.
df_raw.isna().sum()

Unnamed: 0      0
region          0
city            0
area           25
rooms         942
toilets         0
price           0
date            0
dtype: int64

I decided to drop the first column since it does not carry any info:

In [129]:
# Work on a copy without the leftover index column; df_raw stays untouched.
df = df_raw.drop(columns="Unnamed: 0")

In [130]:
# Preview the first rows after dropping the "Unnamed: 0" column.
df.head()

Unnamed: 0,region,city,area,rooms,toilets,price,date
0,abruzzo,Pescara,295m²,5+,3+,€ 257.000,2023-07-31
1,abruzzo,Francavilla al Mare,88m²,3,1,€ 168.000,2023-07-31
2,abruzzo,Pescara,227m²,5,3+,€ 299.000,2023-07-31
3,abruzzo,Appartamenti di nuova costruzione a Tortoreto,43m²,2 - 4,1,da € 165.000,2023-07-31
4,abruzzo,Francavilla al Mare,83m²,4,1,€ 139.000,2023-07-31


I cast the "area" column to string; otherwise the regex step below raises an error on the rows where the area is missing.

In [131]:
# Cast every cell to text so the regex search applied to this column later
# can run on all rows, including the ones where the area is missing.
df['area'] = df['area'].astype(str)

I do not like the column names. I rename them:

In [132]:
# Put the unit in the column label so the cleaned values can be bare numbers.
column_labels = {'area': 'area[m2]', 'price': 'price[€]'}
df = df.rename(columns=column_labels)
df.sample(10)

Unnamed: 0,region,city,area[m2],rooms,toilets,price[€],date
9675,emilia-romagna,Pianoro,114m²,3,2,€ 455.000,2023-07-31
15638,liguria,Sanremo,53m²,2,1,€ 85.000,2023-07-31
5305,campania,Nocera Inferiore,120m²,3,2,€ 250.000,2023-07-31
1388,abruzzo,Montesilvano,220m²,5+,2,€ 240.000,2023-07-31
16322,lombardia,Appartamenti e Ville unifamiliari di nuova cos...,65m²,2 - 4,1,da € 330.000,2023-07-31
22349,piemonte,Torino,198m²,5+,2,€ 950.000,2023-07-31
39259,veneto,Silea,80m²,2,1,Prezzo su richiesta,2023-07-31
10332,friuli-venezia-giulia,Gorizia,200m²,4,2,"€ 217.000€ 259.000(-16,2%)",2023-07-31
19671,marche,Macerata,230m²,5+,3,€ 499.000,2023-07-31
21966,molise,Chiauci,320m²,5+,2,€ 60.000,2023-07-31


Regex on "area" column. Let's start cleaning from here:

In [133]:
# Compile the pattern once: the first run of ASCII digits in a cell.
p = re.compile('[0-9]+')


def _first_digit_run(text):
    """Return the first digit run in ``text`` as a string, or the int 0 if there is none."""
    found = p.search(text)
    if found is None:
        return 0
    return found.group()


# NOTE(review): only the first digit run is kept, so a value written with a
# thousands separator (e.g. "1.200m²") would be read as 1 — confirm whether the
# scrape contains such values.
df['area[m2]'] = df['area[m2]'].map(_first_digit_run)
df.head()

Unnamed: 0,region,city,area[m2],rooms,toilets,price[€],date
0,abruzzo,Pescara,295,5+,3+,€ 257.000,2023-07-31
1,abruzzo,Francavilla al Mare,88,3,1,€ 168.000,2023-07-31
2,abruzzo,Pescara,227,5,3+,€ 299.000,2023-07-31
3,abruzzo,Appartamenti di nuova costruzione a Tortoreto,43,2 - 4,1,da € 165.000,2023-07-31
4,abruzzo,Francavilla al Mare,83,4,1,€ 139.000,2023-07-31


I convert the 'area' column to an integer dtype (`int64`):

In [134]:
# The area cells are now digit strings (or the sentinel 0), so a plain integer
# cast is enough.
df = df.astype({'area[m2]': 'int'})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39999 entries, 0 to 39998
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   region    39999 non-null  object
 1   city      39999 non-null  object
 2   area[m2]  39999 non-null  int64 
 3   rooms     39057 non-null  object
 4   toilets   39999 non-null  object
 5   price[€]  39999 non-null  object
 6   date      39999 non-null  object
dtypes: int64(1), object(6)
memory usage: 2.1+ MB


Now it is the turn of the "price" column:

In [135]:
# Remove the thousands separator and the currency sign as literal text; any
# other text in the cell (e.g. a leading "da") is left in place.
price_text = df['price[€]']
for token in ('.', '€'):
    price_text = price_text.str.replace(token, '', regex=False)
df['price[€]'] = price_text
df['price[€]']


0            257000
1            168000
2            299000
3        da  165000
4            139000
            ...    
39994        598000
39995        440000
39996       1200000
39997        450000
39998        380000
Name: price[€], Length: 39999, dtype: object

In [136]:
# Raw string: '\d' inside a plain string literal is an invalid escape sequence
# that newer Python versions warn about.
p_price = re.compile(r'\d+')

# Keep a price only when, once stripped of surrounding whitespace, the whole
# cell is numeric; every other cell (e.g. "da  165000", "Prezzo su richiesta")
# is set to the sentinel 0. The digits are extracted with this cell's own
# pattern, p_price, rather than the pattern `p` compiled for the area column.
df['price[€]'] = df['price[€]'].apply(
    lambda x: p_price.search(x).group() if x.strip().isnumeric() else 0
)

In [137]:
# The price cells are now digit strings (or the sentinel 0), so a plain integer
# cast is enough.
df = df.astype({'price[€]': 'int'})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39999 entries, 0 to 39998
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   region    39999 non-null  object
 1   city      39999 non-null  object
 2   area[m2]  39999 non-null  int64 
 3   rooms     39057 non-null  object
 4   toilets   39999 non-null  object
 5   price[€]  39999 non-null  int64 
 6   date      39999 non-null  object
dtypes: int64(2), object(5)
memory usage: 2.1+ MB


In [138]:
# The sentinel 0 written by the regex steps means "no usable number": turn it
# into a missing value. Casting to the nullable 'Int64' dtype first keeps both
# columns integer-typed (no float round-trip) and lets a failed cast raise
# instead of being silently ignored.
for column in ('area[m2]', 'price[€]'):
    as_nullable = df[column].astype('Int64')
    df[column] = as_nullable.mask(as_nullable == 0)

df

Unnamed: 0,region,city,area[m2],rooms,toilets,price[€],date
0,abruzzo,Pescara,295,5+,3+,257000,2023-07-31
1,abruzzo,Francavilla al Mare,88,3,1,168000,2023-07-31
2,abruzzo,Pescara,227,5,3+,299000,2023-07-31
3,abruzzo,Appartamenti di nuova costruzione a Tortoreto,43,2 - 4,1,,2023-07-31
4,abruzzo,Francavilla al Mare,83,4,1,139000,2023-07-31
...,...,...,...,...,...,...,...
39994,veneto,San Zeno di Montagna,400,5+,3+,598000,2023-07-31
39995,veneto,Bassano del Grappa,260,5+,3+,440000,2023-07-31
39996,veneto,Garda,219,5+,2,1200000,2023-07-31
39997,veneto,Mirano,185,5+,2,450000,2023-07-31


The 'rooms' and 'toilets' columns are still strings. In their current form they can be used as categorical values, so I leave them as they are.

I now rename the regions to properly capitalised display names:

In [139]:
# Lower-case region slug as found in the data -> display name.
regions_dict = {
    'abruzzo': 'Abruzzo',
    'basilicata': 'Basilicata',
    'campania': 'Campania',
    'calabria': 'Calabria',
    'emilia-romagna': 'Emilia Romagna',
    'friuli-venezia-giulia': 'Friuli-Venezia Giulia',
    'lazio': 'Lazio',
    'liguria': 'Liguria',
    'lombardia': 'Lombardia',
    'marche': 'Marche',
    'molise': 'Molise',
    'piemonte': 'Piemonte',
    'puglia': 'Puglia',
    'sardegna': 'Sardegna',
    'sicilia': 'Sicilia',
    'toscana': 'Toscana',
    'trentino-alto-adige': 'Trentino-Alto Adige',
    'umbria': 'Umbria',
    'valle-d-aosta': "Valle d'Aosta",
    'veneto': 'Veneto',
}

# The nested mapping restricts the replacement to the 'region' column.
df = df.replace({'region': regions_dict})
df.isna().sum()


region         0
city           0
area[m2]      25
rooms        942
toilets        0
price[€]    4660
date           0
dtype: int64

In [140]:
# Spot-check 20 random rows of the cleaned frame; no random_state is passed,
# so the rows shown differ from run to run.
df.sample(20)

Unnamed: 0,region,city,area[m2],rooms,toilets,price[€],date
28843,Sicilia,Messina,110,3,2,180000.0,2023-07-31
31208,Toscana,Casciana Terme Lari,75,3,1,52000.0,2023-07-31
22257,Piemonte,Torino,61,2,1,146000.0,2023-07-31
1151,Abruzzo,Chieti,98,3,2,135000.0,2023-07-31
18044,Marche,Appartamenti di nuova costruzione a Falconara ...,50,2 - 4,1,,2023-07-31
15227,Liguria,Finale Ligure,71,3,1,399000.0,2023-07-31
11681,Friuli-Venezia Giulia,Gorizia,90,3,1,195000.0,2023-07-31
22957,Piemonte,San Mauro Torinese,96,5,1,255000.0,2023-07-31
13521,Lazio,Roma,65,2,1,175000.0,2023-07-31
22195,Piemonte,Torino,190,5,2,670000.0,2023-07-31


I believe this is enough. It will be possible to load the data into a DataViz software and keep going from there with some visualizations.
I save the csv file.

In [141]:
# Read the clock once so year and month always come from the same day
# (two separate date.today() calls could straddle midnight).
# NOTE(review): this is the run date, not the scrape date stored in the 'date'
# column — confirm that this is the label wanted for the output file name.
today = date.today()
year = today.year
month = today.month

In [142]:
# index=False: the RangeIndex carries no information, and writing it would add
# an unnamed first column to the file (read back by pandas as "Unnamed: 0").
df.to_csv(f'house_prices_italy_{year}_{month}_cleaned.csv', index=False)