# Data collected from OP's website op-koti.fi. Initially I used webscraping technique to collect data. But using the API was more dynamic and easy to handle

In [1]:
import numpy as np
import pandas as pd
import requests
# import math
import re
from pandas import json_normalize

In [2]:
URL = 'https://op-koti.fi/api/apartments?mode=sale&featureGroup=apartment&orderBy=created&order=desc&offset=0&limit=4500'
# request the URL and parse the JSON
response = requests.get(URL)
response.raise_for_status() # raise exception if invalid response
data = response.json()['listings']

# normalizing the json data into a dataframe
df = json_normalize(data, max_level = 1)

In [3]:
df.head(10)

Unnamed: 0,id,kipinaId,listingType,commission,listingGroup,source,created,featureGroups,floor,year,...,siteArea.size,siteArea.displayUnit,pricing.debtFreePrice,siteArea,virtualPresentationUrl,videoUrl,livingArea,totalArea,newHousing.newHousingId,newHousing.name
0,692440,417138,90,K,82,Kipinä,2021-06-01T09:06:09.931Z,[apartment],0.0,1991.0,...,986.0,m2,,,,,,,,
1,507032,431746,90,K,82,Kipinä,2021-06-01T09:05:46.712Z,[apartment],0.0,1969.0,...,1200.0,m2,,,,,,,,
2,506809,431523,91,O,82,Kipinä,2021-06-01T09:05:44.681Z,[apartment],1.0,1989.0,...,2584.0,m2,137000.0,,,,,,,
3,698329,423026,90,K,82,Kipinä,2021-06-01T08:07:41.137Z,[apartment],0.0,2009.0,...,2130.0,m2,,,,,,,,
4,507990,432702,91,O,82,Kipinä,2021-06-01T08:07:06.259Z,[apartment],1.0,1990.0,...,4056.0,m2,58500.0,,,,,,,
5,508029,432741,91,O,82,Kipinä,2021-06-01T08:07:05.548Z,[apartment],1.0,1989.0,...,5959.0,m2,127000.0,,,,,,,
6,699096,423794,89,O,82,Kipinä,2021-06-01T07:06:31.042Z,[apartment],1.0,1963.0,...,2280.0,m2,69000.0,,,,,,,
7,507879,432591,89,O,82,Kipinä,2021-06-01T07:06:12.453Z,[apartment],2.0,1914.0,...,1355.0,m2,298000.0,,,,,,,
8,507358,432071,90,K,82,Kipinä,2021-06-01T07:06:11.073Z,[apartment],0.0,2006.0,...,2112.0,m2,,,,,,,,
9,507097,431811,89,O,82,Kipinä,2021-06-01T07:06:05.402Z,[apartment],1.0,1974.0,...,10204.0,m2,62000.0,,,,,,,


It can be seen above that the dataframe contains multiple columns that are irrelevant to us. Relevant columns are selected as shown below.

In [5]:
df.columns

Index(['id', 'kipinaId', 'listingType', 'commission', 'listingGroup', 'source',
       'created', 'featureGroups', 'floor', 'year', 'imageUrl', 'showings',
       'rooms', 'numberOfRooms', 'price', 'debtFreePrice', 'rentPerMonth',
       'location.streetAddress', 'location.postalCode', 'location.postOffice',
       'location.city', 'location.cityId', 'location.region',
       'location.regionId', 'location.district', 'location.districtId',
       'location.streetId', 'location.coordinates', 'housing.livingArea',
       'housing.totalArea', 'housing.siteArea', 'housing.rooms',
       'housing.numberOfRooms', 'pricing.price', 'livingArea.size',
       'livingArea.displayUnit', 'totalArea.size', 'totalArea.displayUnit',
       'siteArea.size', 'siteArea.displayUnit', 'pricing.debtFreePrice',
       'siteArea', 'virtualPresentationUrl', 'videoUrl', 'livingArea',
       'totalArea', 'newHousing.newHousingId', 'newHousing.name'],
      dtype='object')

In [6]:
cols = ['id', 'listingType', 'floor', 'year', 'rooms', 'numberOfRooms', 'price',
       'location.city', 'location.region', 'location.district','location.postalCode',
       'livingArea.size', 'totalArea.size']

df = df[cols]

Renaming the cols

In [7]:
df = df.rename(columns = {'location.city':'city','location.region':'region','location.district':'district','location.postalCode':'postalCode','livingArea.size':'livingArea','totalArea.size':'totalArea', 'year':'yearBuilt'})

Mapping the listingType column to the apporpriate type of house (extracted from the website manually)

In [8]:
lType = {89:'Kerrostalo',90:'Omakotitalo',91:'Rivitalo',92:'Paritalo',93:'Erillistalo',112:'Puutalo',113:'Luhtitalo',470:'Kytketty paritalo'}
df['listingType'] = df['listingType'].map(lType)

In [9]:
# some values in df.district are ''. So i decided to fill them with np.nan and then remove them altogether in the next step
df[df.district == ''] = np.nan

In [10]:
def count_NAs(df):
    for col in df:
        print(col," : ",df[col].isnull().sum())
        
# def homeSizeLabels(area):
#     """these ranges were calulated by using pd.cut(). 1=Smallest...4=Largest"""
#     if area in range(0,100):
#         return 1
#     elif area in range(100,175):
#         return 2
#     elif area in range(175,250):
#         return 3
#     else:
#         return 4 

In [11]:
# df = df.dropna(inplace = True)
count_NAs(df)

id  :  221
listingType  :  221
floor  :  611
yearBuilt  :  229
rooms  :  343
numberOfRooms  :  343
price  :  343
city  :  221
region  :  221
district  :  221
postalCode  :  221
livingArea  :  343
totalArea  :  343


In [12]:
# df.dropna(inplace = True)
# count_NAs(df)

Converting the columns to string and integer(from float) in order to remove the .0 at the end of figures

In [13]:
# df['id'] = df['id'].astype(str)
# df['floor'] = df['floor'].astype(int)
# df['numberOfRooms'] = df['numberOfRooms'].astype(int)
# df['yearBuilt'] = df['yearBuilt'].astype(int)
# df['postalCode'] = df['postalCode'].astype(str)

Calculating building age from 'yearBuilt'. Some buildings are supposed to be completed in the future. So any building that gets completed in the future year are also grouped into '1 year old buildings'

In [14]:
df['buildingAge'] = [1 if x > 2021 else (pd.to_datetime('today').year - pd.to_datetime(x, format = "%Y").year) for x in df.yearBuilt]
# df['centrum'] = [1 if x == 'Keskusta' else 0 for x in df.district]
# df['howBig'] = [homeSizeLabels(x) for x in df.livingArea]

In [15]:
# hideRows = ['listingGroup', 'yearBuilt', 'imageUrl', 'streetAddress', 'postalCode', 'region', 'link']
df.drop('yearBuilt', axis = 1, inplace = True)
df.reset_index(drop = True)

Unnamed: 0,id,listingType,floor,rooms,numberOfRooms,price,city,region,district,postalCode,livingArea,totalArea,buildingAge
0,692440,Omakotitalo,0.0,5h+k+th+s+at,5.0,159000.00,Pori,Pori,Leppäkorpi,28660,209.0,255.0,30.0
1,507032,Omakotitalo,0.0,4h+k+psh+s+pkh+wc,4.0,78000.00,Jämsä,Jämsä,Jämsänkoski Koulunmäki,42300,102.7,149.7,52.0
2,506809,Rivitalo,1.0,4h+k+kph+s+wc (kellari),4.0,89468.56,Oulu,Oulu,Iinatti,90240,92.5,92.5,32.0
3,698329,Omakotitalo,0.0,"4h, k, khh, kph, s, wc, vh, et",4.0,139000.00,Pälkäne,Pälkäne,Luopioinen,36760,101.5,101.5,12.0
4,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3839,592934,Rivitalo,1.0,"2h, k",2.0,126000.00,Rauma,Rauma,Ota,26100,81.0,81.0,85.0
3840,,,,,,,,,,,,,
3841,578021,Paritalo,1.0,"2h,s",2.0,84446.92,Imatra,Imatra,Ukonniemi,55420,51.0,51.0,17.0
3842,573385,Omakotitalo,0.0,"4h,k,s+takkahuone/oleskelutila",4.0,95000.00,Sastamala,Sastamala,Keikyän Meskala,32740,119.0,119.0,47.0


The values in the 'rooms' column are separated using , or + or . or * and need to be cleaned

In [16]:
df.dropna(inplace = True)
count_NAs(df)

id  :  0
listingType  :  0
floor  :  0
rooms  :  0
numberOfRooms  :  0
price  :  0
city  :  0
region  :  0
district  :  0
postalCode  :  0
livingArea  :  0
totalArea  :  0
buildingAge  :  0


In [17]:
def clean_rooms(item):
    delimiters = "+", ","
    # >>> example = "stackoverflow (c) is awesome... isn't it?"
    regexPattern = '|'.join(map(re.escape, delimiters))
    return([re.sub('\\+|,|\\.|\\*| ', '', x) for x in re.split(regexPattern, item)])

df.rooms = [clean_rooms(x) for x in df.rooms]

These functions take a single entry from df.rooms and detects the features. These features are then represented with new columns using 1 or 0 denoting the presence or absence of the feature

In [18]:
def detect_centrum(postalCode):
    if postalCode[2:] == '100':
        return 1
    else:
        return 0

df['centrum'] = [detect_centrum(x) for x in df.postalCode]

def detect_sauna(room): #v4
    if 's' in room or str(room).find('sauna') != -1:
        return 1
    else:
        return 0
df['hasSauna'] = [detect_sauna(x) for x in df.rooms]

def detect_balcony(room): #v3
    if 'p' in room or str(room).find('parv') != -1:
        return 1
    else:
        return 0
df['hasBalcony'] = [detect_balcony(x) for x in df.rooms]

def detect_parking(room): #v3
    if 'ak' in room or 'at' in room or str(room).find('auto') != -1:
        return 1
    else:
        return 0
df['hasParking'] = [detect_parking(x) for x in df.rooms]

def detect_walk_in_closet(room): #v3
    if 'v' in room or 'vh' in room or str(room).find('vaate') != -1:
        return 1
    else:
        return 0
df['hasWalkInCloset'] = [detect_walk_in_closet(x) for x in df.rooms]

def detect_storage_room(room):
    if str(room).find('var') != -1:
        return 1
    else:
        return 0
df['hasStorageRoom'] = [detect_storage_room(x) for x in df.rooms]

In [19]:
# df.dropna(inplace = True)
# count_NAs(df)

In [20]:
df.drop('rooms', axis = 1, inplace = True)

In [21]:
df['id'] = df['id'].astype(str)
df['floor'] = df['floor'].astype(int)
df['numberOfRooms'] = df['numberOfRooms'].astype(int)
df['postalCode'] = df['postalCode'].astype(str)

In [22]:
df.head()

Unnamed: 0,id,listingType,floor,numberOfRooms,price,city,region,district,postalCode,livingArea,totalArea,buildingAge,centrum,hasSauna,hasBalcony,hasParking,hasWalkInCloset,hasStorageRoom
0,692440,Omakotitalo,0,5,159000.0,Pori,Pori,Leppäkorpi,28660,209.0,255.0,30.0,0,1,0,1,0,0
1,507032,Omakotitalo,0,4,78000.0,Jämsä,Jämsä,Jämsänkoski Koulunmäki,42300,102.7,149.7,52.0,0,1,0,0,0,0
2,506809,Rivitalo,1,4,89468.56,Oulu,Oulu,Iinatti,90240,92.5,92.5,32.0,0,1,0,0,0,0
3,698329,Omakotitalo,0,4,139000.0,Pälkäne,Pälkäne,Luopioinen,36760,101.5,101.5,12.0,0,1,0,0,1,0
5,508029,Rivitalo,1,2,127000.0,Jyväskylä,Jyväskylä,Heinämäki,40270,59.0,59.0,32.0,0,1,0,0,0,0


In [23]:
set(df.listingType)

{'Erillistalo',
 'Kerrostalo',
 'Kytketty paritalo',
 'Luhtitalo',
 'Omakotitalo',
 'Paritalo',
 'Puutalo',
 'Rivitalo'}

Exporting the dataframe to a csv file

In [24]:
df.to_csv(f'op-koti.csv', sep=',', encoding='utf-8', index=False)