# Data Preparation

This notebook enriches the apartment data with data from other files, creates new features (feature engineering) and cleans up the data by deleting rows with missing values and duplicated rows:

- Aggregate data from external data sources
- Feature engineering
- Data cleaning

In [1]:
# Libraries
import os
import re
import fnmatch
import datetime
import numpy as np
import pandas as pd

# Ignore warnings
# This installs a blanket "ignore" filter for the rest of the session, so
# deprecation / FutureWarning messages raised by pandas in later cells are hidden too.
import warnings
warnings.filterwarnings("ignore")

## Aggregate data from external data sources

The data originally comes from the Data Analytics module. The first part is to clean up the data and aggregate it with data from other sources.

The main idea originates from data analytics week 3.

### Import the data

In [2]:
# Load the apartment listings into a pandas data frame
apartments_csv = 'apartments_data_zurich_with_bfs.csv'
df = pd.read_csv(apartments_csv, sep=',', encoding='utf-8')

# Show the dimensions as (rows, columns)
df.shape

(870, 8)

### Add municipality data

In [3]:
# Column glossary for the municipality table:
#   bfs_number : official municipality id
#   bfs_name   : official municipality name
#   pop        : number of residents (= population)
#   pop_dens   : population density (pop per km2)
#   frg_pct    : percentage of foreigners
#   emp        : number of employees

# Read the sheet that was prepared for import from the municipality workbook
municipality_xlsx = 'municipality_data.xlsx'
df_municip = pd.read_excel(municipality_xlsx, sheet_name='data_for_import')

# Preview the first five municipalities
df_municip.head()

Unnamed: 0,bfs_number,bfs_name,pop,pop_dens,frg_pct,emp
0,1,Aeugst am Albis,1981,250.442478,14.184755,442.0
1,2,Affoltern am Albis,12303,1161.756374,28.700317,6920.0
2,3,Bonstetten,5572,749.932705,16.564968,1014.0
3,4,Hausen am Albis,3751,275.808824,16.022394,1021.0
4,5,Hedingen,3778,578.56049,16.410799,1478.0


In [4]:
# Municipality attributes that are attached to every apartment listing
municip_cols = ['bfs_number', 'bfs_name', 'pop', 'pop_dens', 'frg_pct', 'emp']

# Join on the shared key 'bfs_number', which must be identical in both data sets
df = pd.merge(df, df_municip[municip_cols], on="bfs_number")
df.head()

Unnamed: 0,bfs_number,rooms,area,price,postalcode,address,town,description_raw,bfs_name,pop,pop_dens,frg_pct,emp
0,112,3.5,122,3180,8633,Sunnenbergstrasse 15,Wolfhausen,"«Grosse Galerie, Terrasse mit Pergola, Berg- u...",Bubikon,7344,632.55814,11.410675,3617.0
1,112,4.0,87,1690,8633,Blumenbergstrasse 7,Wolfhausen,«Sehr grosse 4 Zimmer Wohnung»,Bubikon,7344,632.55814,11.410675,3617.0
2,112,3.5,92,2350,8608,,Bubikon,«Eigentumsstandard mit grossem Garten»,Bubikon,7344,632.55814,11.410675,3617.0
3,112,3.5,130,2500,8608,,Bubikon,«CHARMANT HELL UND ZENTRAL»,Bubikon,7344,632.55814,11.410675,3617.0
4,261,2.5,78,3760,8002,Lavaterstr. 63,Zürich,«Wunderschöne Wohnung im Enge-Quartier»,Zürich,420217,4778.994655,32.458468,491193.0


In [5]:
# Dimensions (rows, columns) of the data frame after adding the municipality columns
df.shape

(870, 13)

### Add tax income per municipality

In [6]:

# Read the tax income table. header=0 combined with names=... replaces the
# file's own header row with the column names listed here.
tax_income_cols = ['bfs_number', 'bfs_name', 'yearly_income_in_mio', 'tax_income']
df_tax_income = pd.read_csv(
    'steuerbares_einkommen_2017.csv',
    sep=',',
    encoding='utf-8',
    header=0,
    names=tax_income_cols,
)
df_tax_income.head()

Unnamed: 0,bfs_number,bfs_name,yearly_income_in_mio,tax_income
0,1,Aeugst am Albis,98,108'788
1,2,Affoltern am Albis,391,72'583
2,3,Bonstetten,224,91'002
3,4,Hausen am Albis,148,91'766
4,5,Hedingen,155,94'456


In [7]:
# 'tax_income' is read as text with an apostrophe as thousands separator
# (e.g. 108'788). Remove the apostrophes literally and convert to integer.
# Casting to str first keeps this cell re-runnable: after the first run the
# column is already integer and the .str accessor would otherwise raise.
df_tax_income['tax_income'] = (
    df_tax_income['tax_income']
    .astype(str)
    .str.replace("'", "", regex=False)
    .astype(int)
)

In [8]:
# Preview the first four apartment rows before the tax income is merged in
df.head(4)

Unnamed: 0,bfs_number,rooms,area,price,postalcode,address,town,description_raw,bfs_name,pop,pop_dens,frg_pct,emp
0,112,3.5,122,3180,8633,Sunnenbergstrasse 15,Wolfhausen,"«Grosse Galerie, Terrasse mit Pergola, Berg- u...",Bubikon,7344,632.55814,11.410675,3617.0
1,112,4.0,87,1690,8633,Blumenbergstrasse 7,Wolfhausen,«Sehr grosse 4 Zimmer Wohnung»,Bubikon,7344,632.55814,11.410675,3617.0
2,112,3.5,92,2350,8608,,Bubikon,«Eigentumsstandard mit grossem Garten»,Bubikon,7344,632.55814,11.410675,3617.0
3,112,3.5,130,2500,8608,,Bubikon,«CHARMANT HELL UND ZENTRAL»,Bubikon,7344,632.55814,11.410675,3617.0


In [9]:
# Attach each municipality's tax income to the listings via the key 'bfs_number'.
# how="inner" is the pandas default and is spelled out on purpose: the shapes
# recorded in this notebook go from 870 rows before this merge to 825 after it,
# i.e. listings whose bfs_number has no match in df_tax_income are dropped here.
tax_cols = ['bfs_number', 'tax_income']
df = df.merge(df_tax_income[tax_cols], how="inner", on="bfs_number")
df.head()

Unnamed: 0,bfs_number,rooms,area,price,postalcode,address,town,description_raw,bfs_name,pop,pop_dens,frg_pct,emp,tax_income
0,112,3.5,122,3180,8633,Sunnenbergstrasse 15,Wolfhausen,"«Grosse Galerie, Terrasse mit Pergola, Berg- u...",Bubikon,7344,632.55814,11.410675,3617.0,82162
1,112,4.0,87,1690,8633,Blumenbergstrasse 7,Wolfhausen,«Sehr grosse 4 Zimmer Wohnung»,Bubikon,7344,632.55814,11.410675,3617.0,82162
2,112,3.5,92,2350,8608,,Bubikon,«Eigentumsstandard mit grossem Garten»,Bubikon,7344,632.55814,11.410675,3617.0,82162
3,112,3.5,130,2500,8608,,Bubikon,«CHARMANT HELL UND ZENTRAL»,Bubikon,7344,632.55814,11.410675,3617.0,82162
4,261,2.5,78,3760,8002,Lavaterstr. 63,Zürich,«Wunderschöne Wohnung im Enge-Quartier»,Zürich,420217,4778.994655,32.458468,491193.0,85446


### Count number of rows and columns in the data frame

In [10]:
# Unpack the dimensions of the data frame once ...
n_rows, n_cols = df.shape

# ... and report them: the (rows, columns) tuple first, then each part
print('Dimension:', (n_rows, n_cols))
print('Number of rows:', n_rows)
print('Number of columns:', n_cols)

Dimension: (825, 14)
Number of rows: 825
Number of columns: 14


## Feature engineering

### Create additional variables from the apartment's descriptions

#### Create new binary (0/1) variable 'luxurious'

In [11]:
# Keywords that mark a listing as luxurious, searched in 'description_raw'.
# Plain alternation is used instead of capture groups: str.contains() only
# answers yes/no, and capture groups make pandas warn about unused match groups.
# NOTE(review): matching is case-sensitive, so only upper-case keywords count
# (mixed-case 'Exklusive' or 'Luxuriös' is not flagged) - confirm this is intended.
pattern = 'LOFT|SEESICHT|ATTIKA|LUXURIÖS|POOL|EXKLUSIV'

# Create new variable 'luxurious' as binary dummy (0/1) variable.
# na=False counts a missing description as "no match" instead of yielding NaN,
# which the integer cast could not represent.
df['luxurious'] = (
    df['description_raw']
    .str.contains(pattern, regex=True, na=False)
    .astype(int)
)
print(df['luxurious'].sum())

# Show values
df[['description_raw','rooms','area','price','luxurious']]

7


Unnamed: 0,description_raw,rooms,area,price,luxurious
0,"«Grosse Galerie, Terrasse mit Pergola, Berg- u...",3.5,122,3180,0
1,«Sehr grosse 4 Zimmer Wohnung»,4.0,87,1690,0
2,«Eigentumsstandard mit grossem Garten»,3.5,92,2350,0
3,«CHARMANT HELL UND ZENTRAL»,3.5,130,2500,0
4,«Wunderschöne Wohnung im Enge-Quartier»,2.5,78,3760,0
...,...,...,...,...,...
820,«geräumig & weitläufig»,4.5,136,2610,0
821,«Wohnung an zentraler Lage zu vermieten»,4.5,85,1710,0
822,«Moderne Wohnung an idyllischer Lage»,3.5,94,2600,0
823,«Erstbezug im Eigentumsstandard»,4.5,115,2580,0


In [12]:
# List all listings that were flagged as luxurious
df.loc[df['luxurious'].eq(1)]

Unnamed: 0,bfs_number,rooms,area,price,postalcode,address,town,description_raw,bfs_name,pop,pop_dens,frg_pct,emp,tax_income,luxurious
174,261,4.5,171,7900,8053,,Zürich,«EXKLUSIV MÖBLIERT ODER UNMÖBLIERT»,Zürich,420217,4778.994655,32.458468,491193.0,85446,1
215,121,3.5,123,3350,8620,,Wetzikon ZH,"«LUXURIÖS, AN SCHÖNSTER LAGE»",Wetzikon (ZH),24990,1486.61511,25.994398,14163.0,68951,1
518,53,4.5,124,3250,8180,,Bülach,«EXKLUSIV UND HOCHWERTIG MIT IDEALER AUSSICHT»,Bülach,21372,1328.278434,28.032004,10973.0,78194,1
527,53,4.5,115,3250,8180,,Bülach,«EXKLUSIVE AUSSTATTUNG MIT GROSSARTIGER AUSSICHT»,Bülach,21372,1328.278434,28.032004,10973.0,78194,1
537,111,2.0,110,2150,8344,Pfarrhausstrasse 18a,Bäretswil,«EXKLUSIVES WOHNEN»,Bäretswil,5053,227.715187,10.152385,1647.0,77877,1
548,96,5.5,86,2500,8105,,Watt,"«ZENTRAL, RUHIG, MIT EXKLUSIVER TERRASSE, GARTEN»",Regensdorf,18540,1268.125855,35.571737,11126.0,73522,1
711,224,4.5,130,2950,8422,Dättlikonerstrasse 16,Pfungen,«GROSSE FANTASTISCHE TERRASSEN-ATTIKA-WOHNUNG»,Pfungen,3900,781.563126,24.846154,1400.0,71575,1


In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,bfs_number,rooms,area,price,postalcode,pop,pop_dens,frg_pct,emp,tax_income,luxurious
count,825.0,825.0,825.0,825.0,825.0,825.0,825.0,825.0,817.0,825.0,825.0
mean,179.933333,3.313939,85.104242,2290.633939,8341.315152,129338.147879,2175.27443,27.008149,140181.637699,82109.425455,0.008485
std,82.245926,1.108873,33.27378,940.825789,281.640466,173285.858114,1656.633466,7.38409,207775.841297,15367.783878,0.091777
min,2.0,1.0,12.0,16.0,8001.0,577.0,82.310984,7.54717,129.0,65147.0,0.0
25%,112.0,2.5,65.0,1700.0,8057.0,8689.0,826.99115,23.188001,2946.0,71979.0,0.0
50%,199.0,3.5,82.0,2080.0,8307.0,21372.0,1662.597326,25.994398,14163.0,80449.0,0.0
75%,261.0,4.0,100.0,2600.0,8603.0,420217.0,4778.994655,32.458468,491193.0,85446.0,0.0
max,298.0,9.0,300.0,8900.0,8955.0,420217.0,4778.994655,46.226483,491193.0,193412.0,1.0


#### Create new categorical variable based on apartment area

In [14]:
# Bin the apartment area into three size classes.
# The labels describe left-closed ranges ('50 - 99' starts at 50), so the bins
# are left-closed as well (right=False): [0, 50), [50, 100), [100, 501).
# With pd.cut's default right-closed bins an area of exactly 50 was labelled
# '0 - 49' and an area of exactly 100 was labelled '50 - 99'.
# The last edge is 501 so that an area of 500 still falls into '100 - 500'.
labels = ['0 - 49', '50 - 99', '100 - 500']
df["area_cat"] = pd.cut(df.area, bins=[0, 50, 100, 501], labels=labels, right=False)
df[['area', 'area_cat']].head(10)

Unnamed: 0,area,area_cat
0,122,100 - 500
1,87,50 - 99
2,92,50 - 99
3,130,100 - 500
4,78,50 - 99
5,195,100 - 500
6,59,50 - 99
7,75,50 - 99
8,58,50 - 99
9,52,50 - 99


#### Create new numeric variable 'price_per_m2'

In [15]:
# Price divided by area, rounded to two decimals
df['price_per_m2'] = df['price'].div(df['area']).round(2)

# Display the new column next to the variables it is derived from
overview_cols = ['description_raw','rooms','area', 'area_cat','price','luxurious', 'price_per_m2']
df[overview_cols]

Unnamed: 0,description_raw,rooms,area,area_cat,price,luxurious,price_per_m2
0,"«Grosse Galerie, Terrasse mit Pergola, Berg- u...",3.5,122,100 - 500,3180,0,26.07
1,«Sehr grosse 4 Zimmer Wohnung»,4.0,87,50 - 99,1690,0,19.43
2,«Eigentumsstandard mit grossem Garten»,3.5,92,50 - 99,2350,0,25.54
3,«CHARMANT HELL UND ZENTRAL»,3.5,130,100 - 500,2500,0,19.23
4,«Wunderschöne Wohnung im Enge-Quartier»,2.5,78,50 - 99,3760,0,48.21
...,...,...,...,...,...,...,...
820,«geräumig & weitläufig»,4.5,136,100 - 500,2610,0,19.19
821,«Wohnung an zentraler Lage zu vermieten»,4.5,85,50 - 99,1710,0,20.12
822,«Moderne Wohnung an idyllischer Lage»,3.5,94,50 - 99,2600,0,27.66
823,«Erstbezug im Eigentumsstandard»,4.5,115,100 - 500,2580,0,22.43


## Data cleaning

### Count, identify and remove missing values

In [16]:
# Number of missing values in each column
print('Count missing values per variable')
missing_per_column = df.isna().sum()
print(missing_per_column, '\n')

# Rows that contain at least one missing value (selected columns only)
print('Identify rows with missing values')
has_missing = df.isna().any(axis=1)
print(df.loc[has_missing, ['rooms', 'area', 'price', 'address', 'emp']], '\n')

# Keep only complete rows in a new frame; df itself stays unchanged
df2 = df.dropna()
df2.head()

Count missing values per variable
bfs_number          0
rooms               0
area                0
price               0
postalcode          0
address            50
town                0
description_raw     0
bfs_name            0
pop                 0
pop_dens            0
frg_pct             0
emp                 8
tax_income          0
luxurious           0
area_cat            0
price_per_m2        0
dtype: int64 

Identify rows with missing values
     rooms  area  price               address       emp
2      3.5    92   2350                   NaN    3617.0
3      3.5   130   2500                   NaN    3617.0
30     3.5   110   2880                   NaN  491193.0
31     5.0    80   2900                   NaN  491193.0
48     4.5   137     16                   NaN  491193.0
68     4.5   135   4490                   NaN  491193.0
72     3.5   105   1990                   NaN  491193.0
75     1.0    28   2699                   NaN  491193.0
79     2.5    60   2780                

Unnamed: 0,bfs_number,rooms,area,price,postalcode,address,town,description_raw,bfs_name,pop,pop_dens,frg_pct,emp,tax_income,luxurious,area_cat,price_per_m2
0,112,3.5,122,3180,8633,Sunnenbergstrasse 15,Wolfhausen,"«Grosse Galerie, Terrasse mit Pergola, Berg- u...",Bubikon,7344,632.55814,11.410675,3617.0,82162,0,100 - 500,26.07
1,112,4.0,87,1690,8633,Blumenbergstrasse 7,Wolfhausen,«Sehr grosse 4 Zimmer Wohnung»,Bubikon,7344,632.55814,11.410675,3617.0,82162,0,50 - 99,19.43
4,261,2.5,78,3760,8002,Lavaterstr. 63,Zürich,«Wunderschöne Wohnung im Enge-Quartier»,Zürich,420217,4778.994655,32.458468,491193.0,85446,0,50 - 99,48.21
5,261,5.5,195,6900,8002,Parkring 59,Zürich,«Wohnanlage Im Parkring - Exklusive Wohnung zu...,Zürich,420217,4778.994655,32.458468,491193.0,85446,0,100 - 500,35.38
6,261,2.5,59,2920,8044,Flobotstrasse 2,Zürich,«Erstvermietung am Zürichberg: Charmante 2.5-Z...,Zürich,420217,4778.994655,32.458468,491193.0,85446,0,50 - 99,49.49


In [17]:
# Fill missing employee counts in df with the column median.
# The filled column is assigned back instead of calling
# fillna(..., inplace=True) on df['emp']: that chained in-place call operates on
# an intermediate Series, raises a FutureWarning in pandas 2.x and leaves df
# unchanged once Copy-on-Write is active.
# Note: df2 was created with df.dropna() before this step, so df2 is not affected.
median = df['emp'].median()
df['emp'] = df['emp'].fillna(median)

In [18]:
from sklearn.impute import SimpleImputer

# Numeric columns whose medians are compared
numeric_cols = ['rooms', 'area', 'price', 'pop', 'pop_dens', 'emp', 'tax_income']

# Fit a median imputer on these columns (it is only fitted here; no transform is applied)
imputer = SimpleImputer(strategy="median")
imputer.fit(df[numeric_cols])

# Print the medians learned by the imputer without scientific notation ...
np.set_printoptions(suppress=True)
print(imputer.statistics_)
print('-----')
# ... and, for comparison, the medians computed directly with pandas
print(df[numeric_cols].median())


[    3.5           82.          2080.         21372.
  1662.59732628 14163.         80449.        ]
-----
rooms             3.500000
area             82.000000
price          2080.000000
pop           21372.000000
pop_dens       1662.597326
emp           14163.000000
tax_income    80449.000000
dtype: float64


### Count, identify & remove duplicated values

In [19]:
# Count fully duplicated rows in the whole data set.
# (The printed label previously read 'Sum of missing values', which mislabelled this count.)
print('Sum of duplicated values:', df.duplicated().sum(), '\n')

# Show the duplicated rows; all columns are compared, and keep='last' marks
# every occurrence except the last one
print('Duplicated values')
print(df.loc[df.duplicated(keep = 'last')])

# Drop the duplicated rows from df2 (the frame without missing values)
df3 = df2.drop_duplicates()

Sum of missing values: 7 

Duplicated values
     bfs_number  rooms  area  price  postalcode                 address  \
17          261    1.5    32   2100        8049           Am Wasser 161   
53          261    2.5    60   2500        8048            Herrligweg 9   
84          261    3.5    74   4200        8004     Badenerstrasse  250   
123         261    2.5    42   3000        8049           Am Wasser 161   
166         261    1.5    22   1750        8049           Am Wasser 161   
502          66    2.5    46   1630        8152  Boulevard Lilienthal 5   
679          52    2.5    78   2600        8303                     NaN   

            town                                  description_raw  \
17        Zürich         «MÖBLIERTE WOHNUNG DIREKT AN DER LIMMAT»   
53        Zürich            «Wohnung im Grünen und doch zentral!»   
84        Zürich            «Neu - Luxuriös - Am Puls von Zürich»   
123       Zürich       «Traumhafte Wohnoase direkt an der Limmat»   
166      

#### Save data to file

In [20]:
# Columns of df3, the frame that is written to file in the next cell
df3.columns

Index(['bfs_number', 'rooms', 'area', 'price', 'postalcode', 'address', 'town',
       'description_raw', 'bfs_name', 'pop', 'pop_dens', 'frg_pct', 'emp',
       'tax_income', 'luxurious', 'area_cat', 'price_per_m2'],
      dtype='object')

In [21]:
# Write the cleaned and enriched data to a CSV file, without the index column
output_csv = 'apartments_data_enriched.csv'
df3.to_csv(output_csv, sep=",", encoding='utf-8', index=False)