# Importing and preparing rental apartments data

## Repetition:

In [1]:
#An emtpy list is a container
rooms = []
rooms

[]

In [2]:
#filled list 
rooms = [1,2,3,4,5,6,7,8,9]
rooms

for i in rooms:
    print(i) 


1
2
3
4
5
6
7
8
9


## Libraries and settings

In [3]:
# Libraries
import os
import re
import fnmatch
import datetime
import numpy as np
import pandas as pd

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

## Importing data

In [4]:
# Get current working directory
print(os.getcwd())

# Show all files in the directory
flist = fnmatch.filter(os.listdir('.'), '*.csv')
for i in flist:
    print(i)

# Read the data to a pandas data frame
df = pd.read_csv('apartments_data_zuerich.csv', sep=',', encoding='utf-8')

# Get number of rows and columns
df.shape

C:\Workspacezhaw\data analytics\Woche 3\Materials for exercises-20221005
apartments_data_enriched.csv
apartments_data_geocoded.csv
apartments_data_prepared.csv
apartments_data_zuerich.csv


(1008, 7)

## Count number of rows and columns in the data frame

In [5]:
# Dimension (rows, columns)
print('Dimension:', df.shape)

# Number of rows
print('Number of rows:', df.shape[0])

# Number of columns
print('Number of columns:', df.shape[1])

Dimension: (1008, 7)
Number of rows: 1008
Number of columns: 7


## Get data types (raw-format from web scraping)

In [6]:
# Get data types (note that in pandas, a string is referred to as 'object')
df.dtypes

web-scraper-order        object
web-scraper-start-url    object
rooms_area_price_raw     object
address_raw              object
price_raw                object
description_raw          object
text_raw                 object
dtype: object

## Extract and save relevant information from raw data using regular expressions (regex)

### Extract number of rooms

In [7]:
# Extract values from 'rooms_area_price_raw' strings
rooms = []
for i in df['rooms_area_price_raw']:
    d1 = re.findall('(.*)Zimmer', i)
    try:
        d2 = d1[0].strip().replace(',', '.')
    except:
        d2 = None
    rooms.append(d2)

# Save as new variable in the pandas data frame
df['rooms'] = pd.Series(rooms, dtype="float64")
    
# Print first 5 values
print(df['rooms_area_price_raw'].head(5), '\n')
print(df['rooms'].head(5), '\n')

0    3,5 Zimmer, 122 m², CHF 3180.—
1     2,5 Zimmer, 78 m², CHF 3760.—
2    5,5 Zimmer, 115 m², CHF 2860.—
3     3,5 Zimmer, 74 m², CHF 2165.—
4    5,5 Zimmer, 195 m², CHF 6900.—
Name: rooms_area_price_raw, dtype: object 

0    3.5
1    2.5
2    5.5
3    3.5
4    5.5
Name: rooms, dtype: float64 



### Extract living area

In [8]:
# Extract values from 'rooms_area_price_raw' strings
area = []
for i in df['rooms_area_price_raw']:
    d1 = re.findall('Zimmer, (.*)m²', i)
    try:
        d2 = d1[0].strip()
    except:
        d2 = None
    area.append(d2)

# Save as new variable in the pandas data frame
df['area'] = pd.Series(area, dtype="Int64")

# Print first 5 values
print(df['rooms_area_price_raw'].head(5), '\n')
print(df['area'].head(5), '\n')

0    3,5 Zimmer, 122 m², CHF 3180.—
1     2,5 Zimmer, 78 m², CHF 3760.—
2    5,5 Zimmer, 115 m², CHF 2860.—
3     3,5 Zimmer, 74 m², CHF 2165.—
4    5,5 Zimmer, 195 m², CHF 6900.—
Name: rooms_area_price_raw, dtype: object 

0    122
1     78
2    115
3     74
4    195
Name: area, dtype: Int64 



### Extract rental price

In [9]:
# Extract values from 'price_raw' strings
price = []
for i in df['price_raw']:
    d1 = re.findall('[0-9]+', i)
    try:
        d2 = d1[0].strip()
    except:
        d2 = None
    price.append(d2)

# Save as new variable in the pandas data frame
df['price'] = pd.Series(price, dtype="Int64")

# Print first 5 values
print(df['price_raw'].head(), '\n')
print(df['price'].head())

0    CHF 3180.—
1    CHF 3760.—
2    CHF 2860.—
3    CHF 2165.—
4    CHF 6900.—
Name: price_raw, dtype: object 

0    3180
1    3760
2    2860
3    2165
4    6900
Name: price, dtype: Int64


## Create additional variables from the apartment's descriptions

### Change strings in 'description_raw' ad 'text_raw' to uppercase 

In [10]:
# Change strings to uppercase 
df['description_raw'] = df['description_raw'].str.upper() #str.upper() == changes all letters of the strings in the df in uppercase letters
print(df['description_raw'].head(10), '\n')

df['text_raw'] = df['text_raw'].str.upper()
print(df['text_raw'].head(10))

0    «GROSSE GALERIE, TERRASSE MIT PERGOLA, BERG- U...
1              «WUNDERSCHÖNE WOHNUNG IM ENGE-QUARTIER»
2                         «WOHNMOMENTE ZUM FESTHALTEN»
3                                  «3,5 PIÈCES, 74 M²»
4    «WOHNANLAGE IM PARKRING - EXKLUSIVE WOHNUNG ZU...
5     «OHNE MIETKAUTION - SUPER RUHIGE LÄNDLICHE LAGE»
6    «ERSTVERMIETUNG AM ZÜRICHBERG: CHARMANTE 2.5-Z...
7    «ERSTVERMIETUNG NACH TOTAL-SANIERUNG: 3-ZIMMER...
8              «NEUES ZUHAUSE FÜR SIE UND IHRE KINDER»
9       «MODERN, HELL MIT PANORAMASICHT ÜBERS GLATTAL»
Name: description_raw, dtype: object 

0    3,5 ZIMMER, 122 M², CHF 3180.—SUNNENBERGSTRASS...
1    2,5 ZIMMER, 78 M², CHF 3760.—LAVATERSTR. 63, 8...
2    5,5 ZIMMER, 115 M², CHF 2860.—LANGFURRENSTRASS...
3    TOP3,5 ZIMMER, 74 M², CHF 2165.—SANDBUCKWEG 5A...
4    5,5 ZIMMER, 195 M², CHF 6900.—PARKRING 59, 800...
5    2 ZIMMER, 47 M², CHF 1400.—IFANGWEG 1, 8610 US...
6    2,5 ZIMMER, 59 M², CHF 2920.—FLOBOTSTRASSE 2, ...
7    3 ZIMMER, 75 M², CHF 

### Calculate lenght of strings in 'description_raw' and 'text_raw'

In [11]:
# Show first item of variable 'description_raw'
print(df['description_raw'][0])

# Lenght of the strings in 'description_raw'
df['description_raw_len'] = df['description_raw'].str.len()
print(df['description_raw_len'], '\n')

# Show first item of variable 'text_raw'
print(df['text_raw'][0])

# Lenght of the strings in 'text_raw'
df['text_raw_len'] = df['text_raw'].str.len()
print(df['text_raw_len'])

«GROSSE GALERIE, TERRASSE MIT PERGOLA, BERG- UND SEESICHT»
0       58
1       39
2       28
3       19
4       57
        ..
1003    33
1004    33
1005    51
1006    41
1007    58
Name: description_raw_len, Length: 1008, dtype: int64 

3,5 ZIMMER, 122 M², CHF 3180.—SUNNENBERGSTRASSE 15, 8633 WOLFHAUSEN, ZH«GROSSE GALERIE, TERRASSE MIT PERGOLA, BERG- UND SEESICHT»IN WOLFHAUSEN, EINGEBETTET IN DIE SANFTEN HÜGEL ÜBER DEM ZÜRICHSEE VERMIETEN WIR PER 1. OKTOBER 2022 DIESE MAISONETTE-DACHWOHNUNG, WELCHE MIT FOLGENDER AUSSTATTUNG ÜBERZEUGT:GROSSE TERRASSE MIT PERGOLA, SEE-/WEITSICHT UND VIEL SONNENSCHEIN GARANTIERTLICHTDURCHFLUTETE GALERIESCHLAFZIMMER MIT ZUGANG INS BAD MIT DUSCHE UND WCEIN WEITERES SCHLAFZIMMER MIT EINEM EINBAUSCHRANKSEPARATE NASSZELLE MIT ECKBADEWANNE UND WCTOPMODERNE KÜCHE MIT SÄMTLICHEN KOMFORTOFFENES WOHN- UND ESSZIMMERDIE NASSZELLEN UND DIE KÜCHE SIND MIT FEINSTEINZEUGPLATTEN UND DER RE
0       679
1       661
2       650
3       635
4       665
       ... 
1003    645


### Create new binary (0/1) variable 'luxurious'

In [12]:
# Create a pattern which can be used to search the variable 'description_raw'
pattern = '(LOFT)|(SEESICHT)|(ATTIKA)|(LUXURIÖS)|(POOL)|(EXKLUSIV)'

# Create new variable 'luxurious' as binary dummy (0/1) variable
df['luxurious'] = df['description_raw'].str.contains(pat = pattern).astype(int)
print(df['luxurious'].sum())

# Create new variable 'furnished' as binary dummy (0/1) variable
df['furnished'] = df['description_raw'].str.contains(pat = pattern).astype(int)
print(df['furnished'].sum())

# Create new variable 'balcony' as binary dummy (0/1) variable
df['balcony'] = df['description_raw'].str.contains(pat = pattern).astype(int)
print(df['balcony'].sum())

# Create new variable 'garden' as binary dummy (0/1) variable
df['garden'] = df['description_raw'].str.contains(pat = pattern).astype(int)
print(df['garden'].sum())

# Create new variable 'quiet' as binary dummy (0/1) variable
df['quiet'] = df['description_raw'].str.contains(pat = pattern).astype(int)
print(df['quiet'].sum())

# Create new variable 'central' as binary dummy (0/1) variable
df['central'] = df['description_raw'].str.contains(pat = pattern).astype(int)
print(df['central'].sum())

# Show values
df[['description_raw','rooms','area','price','luxurious','furnished','balcony','garden','quiet','central']]

71
71
71
71
71
71


Unnamed: 0,description_raw,rooms,area,price,luxurious,furnished,balcony,garden,quiet,central
0,"«GROSSE GALERIE, TERRASSE MIT PERGOLA, BERG- U...",3.5,122,3180,1,1,1,1,1,1
1,«WUNDERSCHÖNE WOHNUNG IM ENGE-QUARTIER»,2.5,78,3760,0,0,0,0,0,0
2,«WOHNMOMENTE ZUM FESTHALTEN»,5.5,115,2860,0,0,0,0,0,0
3,"«3,5 PIÈCES, 74 M²»",3.5,74,2165,0,0,0,0,0,0
4,«WOHNANLAGE IM PARKRING - EXKLUSIVE WOHNUNG ZU...,5.5,195,6900,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...
1003,«TRAUMHAFTE LOFTWOHNUNG GESUCHT?»,1.5,65,2470,1,1,1,1,1,1
1004,"«AN SONNIGER LAGE, RICHTUNG WALD»",3.5,70,1465,0,0,0,0,0,0
1005,«IHRE WOHNUNG AN RUHIGER LAGE - BEFRISTETES WO...,2.5,56,1870,0,0,0,0,0,0
1006,«MODERNE 3.5-ZIMMERWOHNUNG IN ALTSTETTEN»,3.5,70,2190,0,0,0,0,0,0


### Create new categorical variable based on apartment area

#test1
numberLevel = {0 - 49: "low", 50-99: "medium", 100-500: "high"}

 def getNumberAsLevel(number):
    if number == 0-49:
        return "low"
    elif number == 50-99:
        return "medium"
    elif number == 100-500:
        return "high"
    
for i in range(1,500):
    print(getNumberAsLevel(i))

In [19]:
labels = ['0 - 49', '50 - 99', '100 - 500']
labels1 = ['low','medium','high']

df["area_cat"] = pd.cut(df.area, bins=[0, 50, 100, 500], labels=labels) #df.area, bins  == df['area'], bins
df["levels"] = pd.cut(df['area'], bins=[0, 50, 100, 500], labels=labels1)
df[['area', 'area_cat','levels']].head(10)

Unnamed: 0,area,area_cat,levels
0,122,100 - 500,high
1,78,50 - 99,medium
2,115,100 - 500,high
3,74,50 - 99,medium
4,195,100 - 500,high
5,47,0 - 49,low
6,59,50 - 99,medium
7,75,50 - 99,medium
8,97,50 - 99,medium
9,124,100 - 500,high


### Create new numeric variable 'price_per_m2'

In [None]:
# Create the new variable
df['price_per_m2'] = round(df['price'] / df['area'], 2)

# Show values
df[['description_raw','rooms','area','price','luxurious','price_per_m2']]

### Create new numeric variable based on  'price_per_m2' & 'apartment area'

In [None]:
# price per m2 & price categorie (Unterricht - Task1 - Price per m2)

df['price_per_m2'] = df['price'] / df['area']
df['price_per_m2'] #just for printing

labels = ['0 - 14', '15 - 19','20 - 29', '30 - 39', '40 - 49', '50 - 59', '60 - 69', '70 - 79', '80 - 89', '90 - 1000']
df["price_cat"] = pd.cut(df['price_per_m2'], bins = [0,15,20,30,40,50,60,70,80,90,100], labels = labels)
df[['price_per_m2', 'price_cat','area','price']].head(10)

### Including current datetime

In [None]:
# Get and format datetime
df['datetime'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# Show values
df[['description_raw','rooms','area','price','luxurious','price_per_m2', 'datetime']]

## Count, identify and remove missing values

In [None]:
# Count missing values
print('Count missing values per variable')
print(pd.isna(df).sum(), '\n')

# Identify rows with missing values
print('Identify rows with missing values')
print(df.loc[df.isna().any(axis=1)][['rooms', 'area', 'price']], '\n')

# Drop rows where at least one element is missing.
df2 = df.dropna()
df2.head()

## Count, identify & remove duplicated values

In [None]:
# Count duplicated values in the whole data set
print('Sum of missing values:', df.duplicated().sum(), '\n')

# Identify duplicated values in 'rooms', 'area', 'price'
print('Duplicated values')
print(df.loc[df.duplicated(keep = 'last')])

# Drop the rows with duplicated values
df3 = df2.drop_duplicates()

### Save data to file

In [None]:
df3.to_csv('apartments_data_prepared.csv', 
          sep=",", 
          encoding='utf-8',
          index=False)

### Jupyter notebook --footer info-- (please always provide this at the end of each submitted notebook)

In [None]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

print('-----------------------------------')
print(os.name.upper())
print(platform.system(), '|', platform.release())
print('Datetime:', datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
print('Python Version:', python_version())
print('-----------------------------------')