# Data cleaning Immobiliare.it

Importing the libraries

In [927]:
import numpy as np
import pandas as pd
import re

I check the raw dataframe and the datatypes

In [928]:
# Load the raw listings CSV into a DataFrame, leaving dtype inference to pandas.
df_raw = pd.read_csv('house_prices_italy.csv')
# Preview the first rows to see how each column is formatted.
df_raw.head()

Unnamed: 0.1,Unnamed: 0,region,city,area,rooms,toilets,price,year,month
0,0,abruzzo,Pescara,295m²,5+,3+,€ 257.000,2023,7
1,1,abruzzo,Spoltore,199m²,5+,3+,€ 235.000,2023,7
2,2,abruzzo,Pescara,227m²,5,3+,€ 299.000,2023,7
3,3,abruzzo,Appartamenti di nuova costruzione a Tortoreto,43m²,2 - 4,1,da € 165.000,2023,7
4,4,abruzzo,Roseto degli Abruzzi,978m²,5+,3+,€ 1.500.000,2023,7


In [929]:
# Show the dtype and non-null count of every raw column.
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  40000 non-null  int64 
 1   region      40000 non-null  object
 2   city        40000 non-null  object
 3   area        39974 non-null  object
 4   rooms       39053 non-null  object
 5   toilets     40000 non-null  object
 6   price       40000 non-null  object
 7   year        40000 non-null  int64 
 8   month       40000 non-null  int64 
dtypes: int64(3), object(6)
memory usage: 2.7+ MB


I know that I will need to change datatypes at least on 'area' and 'price'.

Are there any null values? Let's see:

In [930]:
# Count the missing values in each raw column.
df_raw.isna().sum()

Unnamed: 0      0
region          0
city            0
area           26
rooms         947
toilets         0
price           0
year            0
month           0
dtype: int64

I decided to drop the first column since it does not carry any info:

In [931]:
# Work on a copy without the leftover index column; df_raw stays untouched.
df = df_raw.drop(columns=["Unnamed: 0"])

In [932]:
# Confirm the "Unnamed: 0" column is no longer present.
df.head()

Unnamed: 0,region,city,area,rooms,toilets,price,year,month
0,abruzzo,Pescara,295m²,5+,3+,€ 257.000,2023,7
1,abruzzo,Spoltore,199m²,5+,3+,€ 235.000,2023,7
2,abruzzo,Pescara,227m²,5,3+,€ 299.000,2023,7
3,abruzzo,Appartamenti di nuova costruzione a Tortoreto,43m²,2 - 4,1,da € 165.000,2023,7
4,abruzzo,Roseto degli Abruzzi,978m²,5+,3+,€ 1.500.000,2023,7


I cast the "area" column to string; otherwise the regex step below would raise an error on the missing (NaN) values.

In [933]:
# Cast every value (including missing ones) to str so the regex search used
# in the area-cleaning step always receives a string.
df['area'] = df['area'].astype(str)

I do not like the column names. I rename them:

In [934]:
# Put the unit of measure into the names of the two numeric columns.
column_labels = {'area': 'area[m2]', 'price': 'price[€]'}
df = df.rename(columns=column_labels)
df.sample(10)

Unnamed: 0,region,city,area[m2],rooms,toilets,price[€],year,month
24244,puglia,Castro,155m²,5+,2,€ 650.000,2023,7
36518,valle-d-aosta,Brusson,122m²,4,1,€ 160.000,2023,7
1538,abruzzo,Silvi,270m²,5+,3,€ 278.000,2023,7
20538,molise,Montenero di Bisaccia,118m²,2,1,"da € 11.714,86",2023,7
11926,friuli-venezia-giulia,San Daniele del Friuli,115m²,3,2,€ 56.000,2023,7
37364,valle-d-aosta,Gressan,70m²,4,2,€ 90.000,2023,7
16237,lombardia,Milano,75m²,3,1,€ 420.000,2023,7
22368,piemonte,Settimo Torinese,241m²,5+,3,€ 370.000,2023,7
10880,friuli-venezia-giulia,Trivignano Udinese,147m²,2,1,€ 70.000,2023,7
22443,piemonte,Avigliana,131m²,4,1,€ 129.000,2023,7


Regex on "area" column. Let's start cleaning from here:

In [935]:
# Compile the digit pattern once. The name `p` is kept unchanged in case
# later cells reference it.
p = re.compile('[0-9]+')


def _extract_area(text):
    """Return the first integer found in `text` as a digit string, or 0 if none.

    Dots are removed before matching: assuming areas follow the same Italian
    number format as the prices ('.' as thousands separator), a value such as
    '1.200m²' must yield '1200' rather than '1'.
    The integer 0 is a placeholder for "no number found" (e.g. the string
    'nan' produced by casting a missing value to str).
    """
    match = p.search(text.replace('.', ''))  # search once instead of twice per row
    return match.group() if match else 0


df['area[m2]'] = df['area[m2]'].apply(_extract_area)
df.head()

Unnamed: 0,region,city,area[m2],rooms,toilets,price[€],year,month
0,abruzzo,Pescara,295,5+,3+,€ 257.000,2023,7
1,abruzzo,Spoltore,199,5+,3+,€ 235.000,2023,7
2,abruzzo,Pescara,227,5,3+,€ 299.000,2023,7
3,abruzzo,Appartamenti di nuova costruzione a Tortoreto,43,2 - 4,1,da € 165.000,2023,7
4,abruzzo,Roseto degli Abruzzi,978,5+,3+,€ 1.500.000,2023,7


I convert the 'area' column to an integer (int64) dtype:

In [936]:
# The column now holds only digit strings or the placeholder 0, so it can be
# cast to a plain integer dtype.
df = df.astype({'area[m2]': int})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   region    40000 non-null  object
 1   city      40000 non-null  object
 2   area[m2]  40000 non-null  int64 
 3   rooms     39053 non-null  object
 4   toilets   40000 non-null  object
 5   price[€]  40000 non-null  object
 6   year      40000 non-null  int64 
 7   month     40000 non-null  int64 
dtypes: int64(3), object(5)
memory usage: 2.4+ MB


Now it is the turn of the "price" column:

In [937]:
# Remove the thousands separator and the currency sign as literal text
# (regex=False), one symbol at a time.
price_text = df['price[€]']
for symbol in ('.', '€'):
    price_text = price_text.str.replace(symbol, '', regex=False)
df['price[€]'] = price_text
df['price[€]']


0            257000
1            235000
2            299000
3        da  165000
4           1500000
            ...    
39995        140000
39996         95000
39997        394000
39998        245000
39999        310000
Name: price[€], Length: 40000, dtype: object

In [938]:
# Raw string so that "\d" is a regex escape rather than an invalid Python
# string escape.
p_price = re.compile(r'\d+')


def _price_digits(text):
    """Return `text` stripped of whitespace if it consists only of digits, else 0.

    Uses this cell's own `p_price` pattern instead of the pattern compiled in
    the area cell, so the cell no longer depends on another cell's variable.
    `fullmatch` performs the "digits only" check and the extraction in one
    step, so a value that passes the check can never fail the extraction.

    NOTE(review): any value with extra text (e.g. the 'da' prefix on
    starting prices, or a decimal comma) is mapped to the placeholder 0 and
    therefore ends up missing after cleaning — confirm this is intended.
    """
    match = p_price.fullmatch(text.strip())
    return match.group() if match else 0


df['price[€]'] = df['price[€]'].apply(_price_digits)

In [939]:
# Prices are now digit strings or the placeholder 0, so they can be cast to
# a plain integer dtype.
df = df.astype({'price[€]': int})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   region    40000 non-null  object
 1   city      40000 non-null  object
 2   area[m2]  40000 non-null  int64 
 3   rooms     39053 non-null  object
 4   toilets   40000 non-null  object
 5   price[€]  40000 non-null  int64 
 6   year      40000 non-null  int64 
 7   month     40000 non-null  int64 
dtypes: int64(4), object(4)
memory usage: 2.4+ MB


In [940]:
# A 0 in these columns is the placeholder the cleaning steps wrote for values
# they could not parse; turn every 0 into a proper missing value.
for column in ('area[m2]', 'price[€]'):
    df[column] = df[column].mask(df[column] == 0)
df

Unnamed: 0,region,city,area[m2],rooms,toilets,price[€],year,month
0,abruzzo,Pescara,295.0,5+,3+,257000.0,2023,7
1,abruzzo,Spoltore,199.0,5+,3+,235000.0,2023,7
2,abruzzo,Pescara,227.0,5,3+,299000.0,2023,7
3,abruzzo,Appartamenti di nuova costruzione a Tortoreto,43.0,2 - 4,1,,2023,7
4,abruzzo,Roseto degli Abruzzi,978.0,5+,3+,1500000.0,2023,7
...,...,...,...,...,...,...,...,...
39995,veneto,Rosà,264.0,5,1,140000.0,2023,7
39996,veneto,Cerea,66.0,2,1,95000.0,2023,7
39997,veneto,Bassano del Grappa,140.0,4,1,394000.0,2023,7
39998,veneto,Carrè,219.0,5+,3,245000.0,2023,7


The 'rooms' and 'toilets' columns are still stored as strings. In their current form they can be used as categorical values, so I leave them as they are.

I now rename the regions to properly capitalized names:

In [941]:
# Lower-case, hyphenated region keys as found in the data -> display names.
regions_dict = {
    'abruzzo': 'Abruzzo',
    'basilicata': 'Basilicata',
    'campania': 'Campania',
    'calabria': 'Calabria',
    'emilia-romagna': 'Emilia Romagna',
    'friuli-venezia-giulia': 'Friuli-Venezia Giulia',
    'lazio': 'Lazio',
    'liguria': 'Liguria',
    'lombardia': 'Lombardia',
    'marche': 'Marche',
    'molise': 'Molise',
    'piemonte': 'Piemonte',
    'puglia': 'Puglia',
    'sardegna': 'Sardegna',
    'sicilia': 'Sicilia',
    'toscana': 'Toscana',
    'trentino-alto-adige': 'Trentino-Alto Adige',
    'umbria': 'Umbria',
    'valle-d-aosta': 'Valle d\'Aosta',
    'veneto': 'Veneto',
}

# Only the 'region' column is touched; values without an entry stay as they are.
df['region'] = df['region'].replace(regions_dict)
df.isna().sum()


region         0
city           0
area[m2]      26
rooms        947
toilets        0
price[€]    4706
year           0
month          0
dtype: int64

In [942]:
# Spot-check 20 random rows of the cleaned frame (no random_state is set, so
# the rows shown differ on every run).
df.sample(20)

Unnamed: 0,region,city,area[m2],rooms,toilets,price[€],year,month
19961,Marche,Civitanova Marche,101.0,4,1,149000.0,2023,7
4282,Campania,Napoli,136.0,5,2,470000.0,2023,7
4346,Campania,Napoli,200.0,5+,3+,1500000.0,2023,7
12040,Lazio,Roma,150.0,5,2,829000.0,2023,7
1972,Abruzzo,Sant'Egidio alla Vibrata,66.0,3,1,95000.0,2023,7
37023,Valle d'Aosta,Courmayeur,80.0,4,2,695000.0,2023,7
28166,Sicilia,Siracusa,250.0,5+,3,680000.0,2023,7
36445,Valle d'Aosta,Gignod,81.0,4,1,98000.0,2023,7
33850,Trentino-Alto Adige,Borgo Valsugana,855.0,5+,3+,160000.0,2023,7
35896,Umbria,Castiglione del Lago,200.0,5+,3+,449000.0,2023,7


I believe this is enough. The data can now be loaded into a data-visualization tool to continue with some visualizations.
I save the cleaned data to a CSV file.

In [943]:
# Label the export with the most recent period present in the data.
# The month is taken from the rows of the latest year only: two independent
# maxima would pair, for example, year 2023 with month 12 for data that spans
# 2022-12 to 2023-01.
year = df['year'].max()
month = df.loc[df['year'] == year, 'month'].max()

In [944]:
# index=False: the default integer index carries no information, and writing
# it would add an unnamed first column that shows up as "Unnamed: 0" when the
# file is read back.
df.to_csv(f'house_prices_italy_{year}_{month}_cleaned.csv', index=False)