# Data Preparation Data Analytics

The goal of this notebook is to prepare the housing data from data analytics, add the bfs_number (Bundesamt für Statistik number, i.e. the official municipality id), and save the result as a new CSV file.

The goal is to have the following columns: 
Index(['bfs_number', 'bfs_name', 'lat', 'lon', 'rooms', 'area', 'price', 'postalcode',
       'address', 'town'],
      dtype='object')

In [1]:
# Libraries
import os
import re
import fnmatch
import datetime
import numpy as np
import pandas as pd

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the scraped apartment listings into a pandas data frame
df = pd.read_csv(
    'apartments_data_zurich.csv',
    sep=',',
    encoding='utf-8',
)

# Show the dimensions as (rows, columns)
df.shape

(1008, 7)

In [3]:
# Preview the first five rows of the raw listings
df.head(n=5)

Unnamed: 0,web-scraper-order,web-scraper-start-url,rooms_area_price_raw,address_raw,price_raw,description_raw,text_raw
0,1662023695-433,https://www.immoscout24.ch/de/wohnung/mieten/k...,"3,5 Zimmer, 122 m², CHF 3180.—","Sunnenbergstrasse 15, 8633 Wolfhausen, ZH",CHF 3180.—,"«Grosse Galerie, Terrasse mit Pergola, Berg- u...","3,5 Zimmer, 122 m², CHF 3180.—Sunnenbergstrass..."
1,1662023745-820,https://www.immoscout24.ch/de/wohnung/mieten/k...,"2,5 Zimmer, 78 m², CHF 3760.—","Lavaterstr. 63, 8002 Zürich, ZH",CHF 3760.—,«Wunderschöne Wohnung im Enge-Quartier»,"2,5 Zimmer, 78 m², CHF 3760.—Lavaterstr. 63, 8..."
2,1662023742-807,https://www.immoscout24.ch/de/wohnung/mieten/k...,"5,5 Zimmer, 115 m², CHF 2860.—","Langfurrenstrasse 5c, 8623 Wetzikon ZH, ZH",CHF 2860.—,«Wohnmomente zum Festhalten»,"5,5 Zimmer, 115 m², CHF 2860.—Langfurrenstrass..."
3,1662023804-1290,https://www.immoscout24.ch/de/wohnung/mieten/k...,"3,5 Zimmer, 74 m², CHF 2165.—","Sandbuckweg 5A, 8157 Dielsdorf, ZH",CHF 2165.—,"«3,5 pièces, 74 m²»","top3,5 Zimmer, 74 m², CHF 2165.—Sandbuckweg 5A..."
4,1662023739-771,https://www.immoscout24.ch/de/wohnung/mieten/k...,"5,5 Zimmer, 195 m², CHF 6900.—","Parkring 59, 8002 Zürich, ZH",CHF 6900.—,«Wohnanlage Im Parkring - Exklusive Wohnung zu...,"5,5 Zimmer, 195 m², CHF 6900.—Parkring 59, 800..."


In [4]:
# Columns provided by the geocoded file:
#   lat        - geographical latitude
#   lon        - geographical longitude
#   bfs_number - official municipality id
#   bfs_name   - official municipality name

# Load the geocoded listings (i.e. data with latitude and longitude)
df_geo = pd.read_csv('apartments_data_geocoded.csv', sep=',', encoding='utf-8')

# Preview the first five rows
df_geo.head(n=5)

Unnamed: 0,web-scraper-order,address_raw,lat,lon,bfs_number,bfs_name
0,1662023695-433,"Sunnenbergstrasse 15, 8633 Wolfhausen, ZH",47.255714,8.804976,112,Bubikon
1,1662023720-634,"Blumenbergstrasse 7, 8633 Wolfhausen, ZH",47.254879,8.793746,112,Bubikon
2,1662023745-834,"8608 Bubikon, ZH",47.277386,8.800306,112,Bubikon
3,1662023701-503,"8608 Bubikon, ZH",47.277386,8.800306,112,Bubikon
4,1662023745-820,"Lavaterstr. 63, 8002 Zürich, ZH",47.361378,8.533339,261,Zürich


In [5]:
# Attach coordinates and municipality information to every listing by matching
# rows on the scraper id. No `how` is given, so pandas performs its default
# inner join: only ids present in both frames are kept.
df = pd.merge(
    df,
    df_geo.loc[:, ['web-scraper-order', 'lat', 'lon', 'bfs_number', 'bfs_name']],
    on="web-scraper-order",
)

# Preview the merged result
df.head(n=5)

Unnamed: 0,web-scraper-order,web-scraper-start-url,rooms_area_price_raw,address_raw,price_raw,description_raw,text_raw,lat,lon,bfs_number,bfs_name
0,1662023695-433,https://www.immoscout24.ch/de/wohnung/mieten/k...,"3,5 Zimmer, 122 m², CHF 3180.—","Sunnenbergstrasse 15, 8633 Wolfhausen, ZH",CHF 3180.—,"«Grosse Galerie, Terrasse mit Pergola, Berg- u...","3,5 Zimmer, 122 m², CHF 3180.—Sunnenbergstrass...",47.255714,8.804976,112,Bubikon
1,1662023745-820,https://www.immoscout24.ch/de/wohnung/mieten/k...,"2,5 Zimmer, 78 m², CHF 3760.—","Lavaterstr. 63, 8002 Zürich, ZH",CHF 3760.—,«Wunderschöne Wohnung im Enge-Quartier»,"2,5 Zimmer, 78 m², CHF 3760.—Lavaterstr. 63, 8...",47.361378,8.533339,261,Zürich
2,1662023742-807,https://www.immoscout24.ch/de/wohnung/mieten/k...,"5,5 Zimmer, 115 m², CHF 2860.—","Langfurrenstrasse 5c, 8623 Wetzikon ZH, ZH",CHF 2860.—,«Wohnmomente zum Festhalten»,"5,5 Zimmer, 115 m², CHF 2860.—Langfurrenstrass...",47.328632,8.8104,121,Wetzikon (ZH)
3,1662023804-1290,https://www.immoscout24.ch/de/wohnung/mieten/k...,"3,5 Zimmer, 74 m², CHF 2165.—","Sandbuckweg 5A, 8157 Dielsdorf, ZH",CHF 2165.—,"«3,5 pièces, 74 m²»","top3,5 Zimmer, 74 m², CHF 2165.—Sandbuckweg 5A...",47.477493,8.456285,86,Dielsdorf
4,1662023739-771,https://www.immoscout24.ch/de/wohnung/mieten/k...,"5,5 Zimmer, 195 m², CHF 6900.—","Parkring 59, 8002 Zürich, ZH",CHF 6900.—,«Wohnanlage Im Parkring - Exklusive Wohnung zu...,"5,5 Zimmer, 195 m², CHF 6900.—Parkring 59, 800...",47.366898,8.528817,261,Zürich


In [6]:
# List the column labels of the merged data frame
df.columns

Index(['web-scraper-order', 'web-scraper-start-url', 'rooms_area_price_raw',
       'address_raw', 'price_raw', 'description_raw', 'text_raw', 'lat', 'lon',
       'bfs_number', 'bfs_name'],
      dtype='object')

### Extract and save relevant information from raw data using regular expressions (regex)

#### Extract number of rooms

In [7]:
 # Extract values from 'rooms_area_price_raw' strings
# Capture only the number that directly precedes "Zimmer" (e.g. "3,5 Zimmer").
# Series.str.extract yields NaN for missing or non-matching entries instead of
# raising, and anchoring on digits keeps unrelated leading text out of the match.
rooms = df['rooms_area_price_raw'].str.extract(
    r'(\d+(?:[.,]\d+)?)\s*Zimmer', expand=False
)

# Turn the decimal comma into a decimal point; anything unparsable becomes NaN
rooms = pd.to_numeric(rooms.str.replace(',', '.', regex=False), errors='coerce')

# Save as new variable in the pandas data frame. The Series still carries
# df's own index, so every value stays aligned with its row.
df['rooms'] = rooms.astype('float64')

# Print first 5 values
print(df['rooms_area_price_raw'].head(5), '\n')
print(df['rooms'].head(5), '\n')

0    3,5 Zimmer, 122 m², CHF 3180.—
1     2,5 Zimmer, 78 m², CHF 3760.—
2    5,5 Zimmer, 115 m², CHF 2860.—
3     3,5 Zimmer, 74 m², CHF 2165.—
4    5,5 Zimmer, 195 m², CHF 6900.—
Name: rooms_area_price_raw, dtype: object 

0    3.5
1    2.5
2    5.5
3    3.5
4    5.5
Name: rooms, dtype: float64 



#### Extract living area

In [8]:
# Extract the living area from the 'rooms_area_price_raw' strings.
# The number in front of "m²" is captured on its own, so a listing that has no
# "… Zimmer, " prefix still yields its area. Missing or non-matching entries
# become NaN instead of raising.
area = df['rooms_area_price_raw'].str.extract(
    r'(\d+(?:[.,]\d+)?)\s*m²', expand=False
)

# Normalise a possible decimal comma; anything unparsable becomes NaN
area = pd.to_numeric(area.str.replace(',', '.', regex=False), errors='coerce')

# Save as new variable in the pandas data frame. Values are rounded to whole
# square metres so the nullable integer cast cannot fail on a fractional area;
# the Series keeps df's index, so rows stay aligned.
df['area'] = area.round().astype('Int64')

# Print first 5 values
print(df['rooms_area_price_raw'].head(5), '\n')
print(df['area'].head(5), '\n')

0    3,5 Zimmer, 122 m², CHF 3180.—
1     2,5 Zimmer, 78 m², CHF 3760.—
2    5,5 Zimmer, 115 m², CHF 2860.—
3     3,5 Zimmer, 74 m², CHF 2165.—
4    5,5 Zimmer, 195 m², CHF 6900.—
Name: rooms_area_price_raw, dtype: object 

0    122
1     78
2    115
3     74
4    195
Name: area, dtype: Int64 



#### Extract rental price

In [9]:
# Extract the rental price from the 'price_raw' strings.
# The first run of digits is captured, optionally containing apostrophes used
# as thousands separators (e.g. "CHF 3'180.—"), so such a price is not cut off
# after the first group. Missing or digit-free entries become NaN instead of
# raising.
price = df['price_raw'].str.extract(r"(\d[\d'’]*)", expand=False)

# Drop the thousands separators before the numeric conversion
price = price.str.replace(r"['’]", '', regex=True)

# Save as new variable in the pandas data frame (nullable integer; the Series
# keeps df's index, so rows stay aligned)
df['price'] = pd.to_numeric(price, errors='coerce').astype('Int64')

# Print first 5 values
print(df['price_raw'].head(), '\n')
print(df['price'].head())

0    CHF 3180.—
1    CHF 3760.—
2    CHF 2860.—
3    CHF 2165.—
4    CHF 6900.—
Name: price_raw, dtype: object 

0    3180
1    3760
2    2860
3    2165
4    6900
Name: price, dtype: Int64


#### Extract postalcode, address, town

In [10]:
# Extract postal code, street address and town from the 'address_raw' strings.
# Expected layout: "<street> <house no.>, <4-digit postal code> <town>, ZH".
postalcode = []
address = []
towns = []

for raw_address in df['address_raw']:
    # Start every row from "missing" so a row without a postal code can never
    # inherit the street/town of the previous row.
    zip_code = None
    address_with_housenumber = None
    town = None

    # Non-string entries (e.g. NaN for a missing address) are left as missing
    if isinstance(raw_address, str):
        # Drop the canton suffix and all remaining commas
        cleaned = (raw_address.replace(', ZH', '')
                              .replace(',ZH', '')
                              .replace(',', ''))

        # The first run of four digits is taken as the postal code
        match = re.search(r'\d\d\d\d', cleaned)
        if match is not None:
            zip_code = match.group()
            # Everything before the postal code is the street part,
            # everything after it is the town
            address_with_housenumber = cleaned[:match.start()].strip()
            town = cleaned[match.end():].strip()

    postalcode.append(zip_code)
    address.append(address_with_housenumber)
    towns.append(town)

# Save as new variables in the pandas data frame; passing df's index keeps the
# values aligned with their rows even if df does not have a 0..n-1 index
df['postalcode'] = pd.Series(postalcode, index=df.index, dtype="Int64")
df['address'] = pd.Series(address, index=df.index, dtype="string")
df['town'] = pd.Series(towns, index=df.index, dtype="string")

#### Extract number of rooms

In [11]:
 # Extract values from 'rooms_area_price_raw' strings
# NOTE(review): this cell repeats the room extraction already performed in an
# earlier cell of this notebook; it recomputes the same column.

# Capture only the number that directly precedes "Zimmer" (e.g. "3,5 Zimmer").
# Series.str.extract yields NaN for missing or non-matching entries instead of
# raising, and anchoring on digits keeps unrelated leading text out of the match.
rooms = df['rooms_area_price_raw'].str.extract(
    r'(\d+(?:[.,]\d+)?)\s*Zimmer', expand=False
)

# Turn the decimal comma into a decimal point; anything unparsable becomes NaN
rooms = pd.to_numeric(rooms.str.replace(',', '.', regex=False), errors='coerce')

# Save as new variable in the pandas data frame. The Series still carries
# df's own index, so every value stays aligned with its row.
df['rooms'] = rooms.astype('float64')

# Print first 5 values
print(df['rooms_area_price_raw'].head(5), '\n')
print(df['rooms'].head(5), '\n')

0    3,5 Zimmer, 122 m², CHF 3180.—
1     2,5 Zimmer, 78 m², CHF 3760.—
2    5,5 Zimmer, 115 m², CHF 2860.—
3     3,5 Zimmer, 74 m², CHF 2165.—
4    5,5 Zimmer, 195 m², CHF 6900.—
Name: rooms_area_price_raw, dtype: object 

0    3.5
1    2.5
2    5.5
3    3.5
4    5.5
Name: rooms, dtype: float64 



#### Extract living area

In [12]:
# NOTE(review): this cell repeats the area extraction already performed in an
# earlier cell of this notebook; it recomputes the same column.

# Extract the living area from the 'rooms_area_price_raw' strings.
# The number in front of "m²" is captured on its own, so a listing that has no
# "… Zimmer, " prefix still yields its area. Missing or non-matching entries
# become NaN instead of raising.
area = df['rooms_area_price_raw'].str.extract(
    r'(\d+(?:[.,]\d+)?)\s*m²', expand=False
)

# Normalise a possible decimal comma; anything unparsable becomes NaN
area = pd.to_numeric(area.str.replace(',', '.', regex=False), errors='coerce')

# Save as new variable in the pandas data frame. Values are rounded to whole
# square metres so the nullable integer cast cannot fail on a fractional area;
# the Series keeps df's index, so rows stay aligned.
df['area'] = area.round().astype('Int64')

# Print first 5 values
print(df['rooms_area_price_raw'].head(5), '\n')
print(df['area'].head(5), '\n')

0    3,5 Zimmer, 122 m², CHF 3180.—
1     2,5 Zimmer, 78 m², CHF 3760.—
2    5,5 Zimmer, 115 m², CHF 2860.—
3     3,5 Zimmer, 74 m², CHF 2165.—
4    5,5 Zimmer, 195 m², CHF 6900.—
Name: rooms_area_price_raw, dtype: object 

0    122
1     78
2    115
3     74
4    195
Name: area, dtype: Int64 



#### Extract rental price

In [13]:
# NOTE(review): this cell repeats the price extraction already performed in an
# earlier cell of this notebook; it recomputes the same column.

# Extract the rental price from the 'price_raw' strings.
# The first run of digits is captured, optionally containing apostrophes used
# as thousands separators (e.g. "CHF 3'180.—"), so such a price is not cut off
# after the first group. Missing or digit-free entries become NaN instead of
# raising.
price = df['price_raw'].str.extract(r"(\d[\d'’]*)", expand=False)

# Drop the thousands separators before the numeric conversion
price = price.str.replace(r"['’]", '', regex=True)

# Save as new variable in the pandas data frame (nullable integer; the Series
# keeps df's index, so rows stay aligned)
df['price'] = pd.to_numeric(price, errors='coerce').astype('Int64')

# Print first 5 values
print(df['price_raw'].head(), '\n')
print(df['price'].head())

0    CHF 3180.—
1    CHF 3760.—
2    CHF 2860.—
3    CHF 2165.—
4    CHF 6900.—
Name: price_raw, dtype: object 

0    3180
1    3760
2    2860
3    2165
4    6900
Name: price, dtype: Int64


#### Extract postalcode, address, town

In [14]:
# NOTE(review): this cell repeats the address extraction already performed in
# an earlier cell of this notebook; it recomputes the same columns.

# Extract postal code, street address and town from the 'address_raw' strings.
# Expected layout: "<street> <house no.>, <4-digit postal code> <town>, ZH".
postalcode = []
address = []
towns = []

for raw_address in df['address_raw']:
    # Start every row from "missing" so a row without a postal code can never
    # inherit the street/town of the previous row.
    zip_code = None
    address_with_housenumber = None
    town = None

    # Non-string entries (e.g. NaN for a missing address) are left as missing
    if isinstance(raw_address, str):
        # Drop the canton suffix and all remaining commas
        cleaned = (raw_address.replace(', ZH', '')
                              .replace(',ZH', '')
                              .replace(',', ''))

        # The first run of four digits is taken as the postal code
        match = re.search(r'\d\d\d\d', cleaned)
        if match is not None:
            zip_code = match.group()
            # Everything before the postal code is the street part,
            # everything after it is the town
            address_with_housenumber = cleaned[:match.start()].strip()
            town = cleaned[match.end():].strip()

    postalcode.append(zip_code)
    address.append(address_with_housenumber)
    towns.append(town)

# Save as new variables in the pandas data frame; passing df's index keeps the
# values aligned with their rows even if df does not have a 0..n-1 index
df['postalcode'] = pd.Series(postalcode, index=df.index, dtype="Int64")
df['address'] = pd.Series(address, index=df.index, dtype="string")
df['town'] = pd.Series(towns, index=df.index, dtype="string")

In [15]:
# List the column labels again, now including the extracted variables
df.columns

Index(['web-scraper-order', 'web-scraper-start-url', 'rooms_area_price_raw',
       'address_raw', 'price_raw', 'description_raw', 'text_raw', 'lat', 'lon',
       'bfs_number', 'bfs_name', 'rooms', 'area', 'price', 'postalcode',
       'address', 'town'],
      dtype='object')

In [16]:
# Write the prepared variables to a new CSV file (without the row index).
# NOTE(review): the introduction of this notebook lists bfs_name, lat and lon
# among the target columns, but they are not exported here, while
# description_raw is - confirm which column set is intended.
df.to_csv(
    'apartments_data_zurich_with_bfs.csv',
    columns=['bfs_number', 'rooms', 'area', 'price', 'postalcode',
             'address', 'town', 'description_raw'],
    sep=",",
    encoding='utf-8',
    index=False,
)

In [17]:
# Preview the first five rows of the exported columns
df.loc[:, ['bfs_number', 'rooms', 'area', 'price', 'postalcode',
           'address', 'town', 'description_raw']].head(n=5)

Unnamed: 0,bfs_number,rooms,area,price,postalcode,address,town,description_raw
0,112,3.5,122,3180,8633,Sunnenbergstrasse 15,Wolfhausen,"«Grosse Galerie, Terrasse mit Pergola, Berg- u..."
1,261,2.5,78,3760,8002,Lavaterstr. 63,Zürich,«Wunderschöne Wohnung im Enge-Quartier»
2,121,5.5,115,2860,8623,Langfurrenstrasse 5c,Wetzikon ZH,«Wohnmomente zum Festhalten»
3,86,3.5,74,2165,8157,Sandbuckweg 5A,Dielsdorf,"«3,5 pièces, 74 m²»"
4,261,5.5,195,6900,8002,Parkring 59,Zürich,«Wohnanlage Im Parkring - Exklusive Wohnung zu...
