# Data cleaning Immobiliare.it

Importing the libraries

In [814]:
import numpy as np
import pandas as pd
import re
from datetime import date

I check the raw dataframe and the datatypes

In [815]:
# Location of the scraped listings export, relative to the notebook.
RAW_CSV = 'house_prices_italy.csv'

# Load the raw listings and preview the first rows.
df_raw = pd.read_csv(RAW_CSV)
df_raw.head()

Unnamed: 0.1,Unnamed: 0,region,city,area,rooms,toilets,price
0,0,abruzzo,Pescara,89m²,3,1,€ 75.000
1,1,abruzzo,Spoltore,199m²,5+,3+,€ 235.000
2,2,abruzzo,Pescara,227m²,5,3+,€ 299.000
3,3,abruzzo,Appartamenti di nuova costruzione a Tortoreto,43m²,2 - 4,1,da € 165.000
4,4,abruzzo,Roseto degli Abruzzi,978m²,5+,3+,€ 1.500.000


In [816]:
# Column names, dtypes and non-null counts of the raw frame.
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  40000 non-null  int64 
 1   region      40000 non-null  object
 2   city        40000 non-null  object
 3   area        39975 non-null  object
 4   rooms       39050 non-null  object
 5   toilets     40000 non-null  object
 6   price       40000 non-null  object
dtypes: int64(1), object(6)
memory usage: 2.1+ MB


I know that I will need to change datatypes at least on 'area' and 'price'.

Are there any null values? Let's see:

In [817]:
# Number of missing values in each column of the raw data.
df_raw.isnull().sum()

Unnamed: 0      0
region          0
city            0
area           25
rooms         950
toilets         0
price           0
dtype: int64

I decided to drop the first column since it does not carry any info:

In [818]:
# Work on a copy without the index-like column; df_raw itself stays untouched.
df = df_raw.drop(columns=["Unnamed: 0"])

In [819]:
# Preview the first rows after dropping the column.
df.head()

Unnamed: 0,region,city,area,rooms,toilets,price
0,abruzzo,Pescara,89m²,3,1,€ 75.000
1,abruzzo,Spoltore,199m²,5+,3+,€ 235.000
2,abruzzo,Pescara,227m²,5,3+,€ 299.000
3,abruzzo,Appartamenti di nuova costruzione a Tortoreto,43m²,2 - 4,1,da € 165.000
4,abruzzo,Roseto degli Abruzzi,978m²,5+,3+,€ 1.500.000


I cast the "area" column to string; otherwise the regex step below raises errors on the rows where the area is missing (NaN).

In [820]:
# Cast 'area' to text so the regex step can call string methods on every row.
# NOTE(review): missing areas presumably become a digit-free text value here — confirm.
df = df.astype({'area': str})

I do not like the column names. I rename them:

In [821]:
# Put the unit of measure into the column names.
column_labels = {'area': 'area[m2]', 'price': 'price[€]'}
df = df.rename(columns=column_labels)

# Random spot-check of the renamed frame.
df.sample(10)

Unnamed: 0,region,city,area[m2],rooms,toilets,price[€]
35896,umbria,Spoleto,70m²,3,1,€ 89.000
8907,emilia-romagna,Castelfranco Emilia,190m²,5,3,€ 400.000
15240,liguria,Cairo Montenotte,85m²,4,1,€ 98.000
2627,basilicata,Maratea,172m²,5,3+,Prezzo su richiesta
11572,friuli-venezia-giulia,Trieste,137m²,3,2,€ 180.000
17536,lombardia,Milano,50m²,2,1,€ 308.000
8701,emilia-romagna,Castel San Pietro Terme,188m²,5,2,€ 350.000
4189,campania,Napoli,125m²,4,1,€ 289.000
37282,valle-d-aosta,Gressan,97m²,3,1,€ 180.000
14712,liguria,Genova,200m²,5+,2,€ 450.000


Regex on "area" column. Let's start cleaning from here:

In [822]:
# Compile the pattern once: a run of ASCII digits.
p = re.compile('[0-9]+')


def _leading_digits(text):
    """Return the first run of digits found in `text`, or 0 when there is none."""
    found = p.search(text)
    return 0 if found is None else found.group()


# NOTE(review): only the first digit run is kept, so an area written with a
# thousands separator (e.g. '1.200m²') would come out as '1' — verify against the raw data.
df['area[m2]'] = df['area[m2]'].apply(_leading_digits)
df.head()

Unnamed: 0,region,city,area[m2],rooms,toilets,price[€]
0,abruzzo,Pescara,89,3,1,€ 75.000
1,abruzzo,Spoltore,199,5+,3+,€ 235.000
2,abruzzo,Pescara,227,5,3+,€ 299.000
3,abruzzo,Appartamenti di nuova costruzione a Tortoreto,43,2 - 4,1,da € 165.000
4,abruzzo,Roseto degli Abruzzi,978,5+,3+,€ 1.500.000


I convert the 'area' column to the `int64` dtype:

In [823]:
# Every entry is now a digit string or the placeholder 0, so the integer cast is safe.
area_as_int = df['area[m2]'].astype(int)
df['area[m2]'] = area_as_int

# Confirm the new dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   region    40000 non-null  object
 1   city      40000 non-null  object
 2   area[m2]  40000 non-null  int64 
 3   rooms     39050 non-null  object
 4   toilets   40000 non-null  object
 5   price[€]  40000 non-null  object
dtypes: int64(1), object(5)
memory usage: 1.8+ MB


Now it is the turn of the "price" column:

In [824]:
# Remove the thousands separators and the currency symbol as literal text
# (regex=False, so '.' is not treated as a wildcard).
price_text = df['price[€]']
for symbol in ('.', '€'):
    price_text = price_text.str.replace(symbol, '', regex=False)
df['price[€]'] = price_text

# Inspect what is left: bare numbers plus a few textual entries.
df['price[€]']


0                      75000
1                     235000
2                     299000
3                 da  165000
4                    1500000
                ...         
39995                 890000
39996                  79000
39997                 260000
39998                 590000
39999    Prezzo su richiesta
Name: price[€], Length: 40000, dtype: object

In [825]:
# Raw string so that \d reaches the regex engine as written; this cell now
# uses its own pattern instead of depending on `p` from the area cell.
p_price = re.compile(r'\d+')


def _clean_price(text):
    """Return the digits of a bare numeric price, or 0 for anything else.

    `text` is accepted only when, after stripping surrounding whitespace, it
    consists of digits alone. Entries such as 'Prezzo su richiesta' or
    'da  165000' do not qualify and map to the placeholder 0.
    """
    whole_number = p_price.fullmatch(text.strip())
    return whole_number.group() if whole_number else 0


# NOTE(review): prices prefixed with 'da' are discarded here even though they
# contain an amount — confirm that dropping them is intended.
df['price[€]'] = df['price[€]'].apply(_clean_price)

In [826]:
# Prices are now digit strings or the placeholder 0, so they can be cast to integers.
price_as_int = df['price[€]'].astype(int)
df['price[€]'] = price_as_int

# Confirm the new dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   region    40000 non-null  object
 1   city      40000 non-null  object
 2   area[m2]  40000 non-null  int64 
 3   rooms     39050 non-null  object
 4   toilets   40000 non-null  object
 5   price[€]  40000 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 1.8+ MB


In [827]:
# 0 was only a placeholder for "no usable value": turn it into a proper missing value.
for numeric_col in ('area[m2]', 'price[€]'):
    df[numeric_col] = df[numeric_col].replace(0, np.nan)

df

Unnamed: 0,region,city,area[m2],rooms,toilets,price[€]
0,abruzzo,Pescara,89.0,3,1,75000.0
1,abruzzo,Spoltore,199.0,5+,3+,235000.0
2,abruzzo,Pescara,227.0,5,3+,299000.0
3,abruzzo,Appartamenti di nuova costruzione a Tortoreto,43.0,2 - 4,1,
4,abruzzo,Roseto degli Abruzzi,978.0,5+,3+,1500000.0
...,...,...,...,...,...,...
39995,veneto,Caorle,106.0,5,2,890000.0
39996,veneto,Vicenza,48.0,2,1,79000.0
39997,veneto,Venezia,100.0,4,2,260000.0
39998,veneto,Bussolengo,305.0,5,3,590000.0


The 'rooms' and 'toilets' columns are still stored as strings. In their current form they can be used as categorical values, so I leave them as they are.

I now rename the regions in a more appropriate way:

In [828]:
# Raw region value -> display name.
# NOTE(review): 'Emilia Romagna' is written without a hyphen, unlike the other
# compound names — confirm this is intentional.
regions_dict = {
    'abruzzo': 'Abruzzo',
    'basilicata': 'Basilicata',
    'campania': 'Campania',
    'calabria': 'Calabria',
    'emilia-romagna': 'Emilia Romagna',
    'friuli-venezia-giulia': 'Friuli-Venezia Giulia',
    'lazio': 'Lazio',
    'liguria': 'Liguria',
    'lombardia': 'Lombardia',
    'marche': 'Marche',
    'molise': 'Molise',
    'piemonte': 'Piemonte',
    'puglia': 'Puglia',
    'sardegna': 'Sardegna',
    'sicilia': 'Sicilia',
    'toscana': 'Toscana',
    'trentino-alto-adige': 'Trentino-Alto Adige',
    'umbria': 'Umbria',
    'valle-d-aosta': "Valle d'Aosta",
    'veneto': 'Veneto',
}

# Apply the mapping to the 'region' column only; values without an entry are left as they are.
df = df.replace({'region': regions_dict})

# Missing values per column after all the cleaning steps.
df.isna().sum()


region         0
city           0
area[m2]      25
rooms        950
toilets        0
price[€]    4683
dtype: int64

In [829]:
# Stamp every row with the date on which this notebook is executed.
# NOTE(review): this is the run date, which may differ from the day the listings were collected — confirm.
today = date.today()
df['date'] = today

In [830]:
# Random spot-check of 20 rows (no seed is set, so the selection changes on every run).
df.sample(20)

Unnamed: 0,region,city,area[m2],rooms,toilets,price[€],date
37237,Valle d'Aosta,Verrès,120.0,5,1,,2023-07-30
10191,Friuli-Venezia Giulia,Cividale del Friuli,350.0,5+,2,225000.0,2023-07-30
2736,Basilicata,Montescaglioso,84.0,4,1,,2023-07-30
33925,Trentino-Alto Adige,Sanzeno,80.0,3,1,120000.0,2023-07-30
9454,Emilia Romagna,Ravenna,88.0,4,1,167000.0,2023-07-30
27043,Sardegna,Sassari,50.0,2,1,85000.0,2023-07-30
23046,Piemonte,Torino,166.0,5,2,648000.0,2023-07-30
11797,Friuli-Venezia Giulia,San Giorgio di Nogaro,223.0,5+,1,139000.0,2023-07-30
8876,Emilia Romagna,Parma,50.0,2,1,,2023-07-30
16962,Lombardia,Limbiate,127.0,4,1,189000.0,2023-07-30


In [831]:
# Parse the stamp once, then derive the calendar parts from the parsed values.
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates
df['year'] = parsed_dates.dt.year
df['month'] = parsed_dates.dt.month

# Random spot-check of the new columns.
df.sample(10)

Unnamed: 0,region,city,area[m2],rooms,toilets,price[€],date,year,month
4978,Campania,Napoli,200.0,4,2,675000.0,2023-07-30,2023,7
11899,Friuli-Venezia Giulia,San Vito al Tagliamento,260.0,5+,2,183000.0,2023-07-30,2023,7
39302,Veneto,Venezia,60.0,2,1,320000.0,2023-07-30,2023,7
25983,Puglia,Trepuzzi,229.0,5,2,160000.0,2023-07-30,2023,7
39769,Veneto,Scorzè,101.0,5,2,270000.0,2023-07-30,2023,7
14878,Liguria,Savignone,210.0,5+,1,225000.0,2023-07-30,2023,7
2058,Basilicata,Matera,91.0,4,1,180000.0,2023-07-30,2023,7
30976,Toscana,Viareggio,140.0,5,3,799000.0,2023-07-30,2023,7
39139,Veneto,Peschiera del Garda,86.0,3,1,230000.0,2023-07-30,2023,7
21645,Molise,Castel del Giudice,70.0,4,2,150000.0,2023-07-30,2023,7


I believe this is enough. The data can now be loaded into a data-visualization tool to continue from there with some visualizations.

In [832]:
# Write the cleaned data without the positional index: saving it would add an
# unnamed first column to the file, like the 'Unnamed: 0' column that had to be
# dropped from the raw export.
df.to_csv('house_prices_italy_cleaned.csv', index=False)