In [86]:
## Install any required packages that are not installed yet.
# lubridate is listed explicitly because this notebook attaches it directly
# with library(lubridate).
list.of.packages <- c("tidyverse", "jsonlite", "magrittr", "lubridate")
# Row names of installed.packages() are the installed package names.
new.packages <- setdiff(list.of.packages, rownames(installed.packages()))
if (length(new.packages) > 0) {
  install.packages(new.packages)
}

In [87]:
suppressPackageStartupMessages({
    library(jsonlite)
    library(tidyverse)
    library(lubridate)
    library(magrittr)
})

In [88]:
# Download for-sale apartment listings from the OP Koti API (newest first,
# at most 4500 rows). flatten = TRUE turns nested objects into dotted columns
# such as location.city.
data <- jsonlite::fromJSON(
  "https://op-koti.fi/api/apartments?mode=sale&featureGroup=apartment&orderBy=created&order=desc&offset=0&limit=4500",
  flatten = TRUE
)

# NOTE(review): the listings table is picked by position (second element of
# the response); selecting it by name would be safer -- confirm the element
# name against the API response.
df <- data[[2]]

In [89]:
# Columns kept from the raw listings table (dotted names come from the
# flattened JSON).
cols <- c(
  "id", "listingType", "floor", "year", "rooms", "numberOfRooms",
  "price", "debtFreePrice",
  "location.city", "location.region", "location.district",
  "location.postalCode",
  "livingArea.size", "totalArea.size", "featureGroups"
)

# Keep only those columns, then give the nested/ambiguous ones short names.
# rename() leaves column positions unchanged.
df <- df[cols] %>%
  rename(
    yearBuilt = year,
    city = location.city,
    region = location.region,
    district = location.district,
    postalCode = location.postalCode,
    livingArea = livingArea.size,
    totalArea = totalArea.size
  )

In [90]:
# Show the column names after selecting and renaming.
colnames(df)

_listingType_ is stored as a code. The cell below recodes it into the corresponding text labels.

In [91]:
# Translate each listing-type code into its text label.
df <- df %>%
  mutate(
    listingType = recode(
      listingType,
      "89" = "Kerrostalo",
      "90" = "Omakotitalo",
      "91" = "Rivitalo",
      "92" = "Paritalo",
      "93" = "Erillistalo",
      "112" = "Puutalo",
      "113" = "Luhtitalo",
      "470" = "Kytketty paritalo"
    )
  )

In [92]:
# Listing counts per building type.
table(df[["listingType"]])


      Erillistalo        Kerrostalo Kytketty paritalo         Luhtitalo 
               12              1418                 1                45 
      Omakotitalo          Paritalo           Puutalo          Rivitalo 
              804                79                 6               565 

Replacing empty strings with NAs, then counting the missing values in each column

In [93]:
# Turn empty strings into NA. Only character columns are touched: "" cannot
# occur in numeric or list columns, and comparing those with "" is a type
# error in na_if() from dplyr 1.1 onwards.
df <- df %>%
  mutate(across(where(is.character), ~ na_if(.x, "")))

# Count the missing values in every column (replaces the deprecated
# summarise_all(funs(...)) form).
df %>%
  summarise(across(everything(), ~ sum(is.na(.x))))

id,listingType,floor,yearBuilt,rooms,numberOfRooms,price,debtFreePrice,city,region,district,postalCode,livingArea,totalArea,featureGroups
<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
0,0,335,10,114,114,114,953,0,0,151,0,114,114,0


#### Next,
* Where _debtFreePrice_ is NA but _price_ is given, the listing price is used as the debtFreePrice. This means there is no outstanding loan payment to be carried over to the new owner

* Also creating a new column _buildingAge_ from _yearBuilt_

In [94]:
# When debtFreePrice is missing, fall back to price; rows where both are
# missing stay NA.
df <- df %>%
  mutate(debtFreePrice = coalesce(debtFreePrice, price))

# Derive buildingAge (in years) from yearBuilt, then drop yearBuilt.
# The year is used as a plain number rather than parsed through as.Date():
# with format "%Y" the missing month and day default to today's date, which
# produces NA when the notebook runs on 29 February for a non-leap build year.
df <- df %>%
  filter(!is.na(yearBuilt)) %>%
  mutate(
    # as.character() first so a character or factor year converts by value.
    yearBuilt = as.numeric(as.character(yearBuilt)),
    buildingAge = case_when(
      # Buildings dated in the future get an age of 1.
      yearBuilt > year(today()) ~ 1,
      TRUE ~ year(today()) - yearBuilt
    )
  ) %>%
  select(-yearBuilt)

Removing the rows that still contain NAs

In [95]:
# Drop every row that still has a missing value.
df <- df %>%
  na.omit()

In [96]:
## CREATED FUNCTIONS
# Returns TRUE when `x` is anything other than NULL, FALSE when it is NULL.
is.not.null <- function(x) {
  !is.null(x)
}

# Split each free-text room description into a vector of trimmed tokens.
#
# For every entry of `rooms`: the first "." is replaced by a space, the text
# is split on punctuation characters, each piece is trimmed of surrounding
# whitespace, and empty pieces are dropped.
#
# rooms: character vector (or list) of room descriptions.
# Returns an unnamed list with one character vector of tokens per entry;
# an empty input gives an empty list.
clean_rooms <- function(rooms) {
  lapply(unname(rooms), function(room) {
    # Only the first dot is replaced, so the two pieces around it stay
    # together as one token; later dots are split like other punctuation.
    room <- str_replace(room, "\\.", " ")
    pieces <- unlist(strsplit(room, split = "[[:punct:]]"), use.names = FALSE)
    pieces <- str_trim(pieces, side = "both")
    pieces[!pieces == ""]
  })
}

# Flag listings that mention a sauna.
#
# rooms: list with one vector of room tokens per listing.
# Returns an integer vector, one value per listing: 1 when any token is
# exactly "s" or contains "sauna", otherwise 0.
detect_sauna <- function(rooms) {
  # vapply builds the result in one pass instead of growing it in a loop.
  found <- vapply(
    rooms,
    function(room) any(str_detect(room, "^s$|sauna")),
    logical(1),
    USE.NAMES = FALSE
  )
  as.integer(found)
}

# Flag listings that mention a balcony.
#
# rooms: list with one vector of room tokens per listing.
# Returns an integer vector, one value per listing: 1 when any token is
# exactly "p" or contains "parv", "lasit p" or "las p", otherwise 0.
detect_balcony <- function(rooms) {
  # vapply builds the result in one pass instead of growing it in a loop.
  found <- vapply(
    rooms,
    function(room) any(str_detect(room, "^p$|parv|lasit p|las p")),
    logical(1),
    USE.NAMES = FALSE
  )
  as.integer(found)
}

# Flag listings that mention parking.
#
# rooms: list with one vector of room tokens per listing.
# Returns an integer vector, one value per listing: 1 when any token is
# exactly "ak" or "at", or contains "auto", otherwise 0.
detect_parking <- function(rooms) {
  # vapply builds the result in one pass instead of growing it in a loop.
  found <- vapply(
    rooms,
    function(room) any(str_detect(room, "^ak$|^at$|auto")),
    logical(1),
    USE.NAMES = FALSE
  )
  as.integer(found)
}

# Flag listings that mention a walk-in closet.
#
# rooms: list with one vector of room tokens per listing.
# Returns an integer vector, one value per listing: 1 when any token is
# exactly "v" or "vh", or contains "vaate", otherwise 0.
detect_walk_in_closet <- function(rooms) {
  # vapply builds the result in one pass instead of growing it in a loop.
  found <- vapply(
    rooms,
    function(room) any(str_detect(room, "^v$|^vh$|vaate")),
    logical(1),
    USE.NAMES = FALSE
  )
  as.integer(found)
}

# Flag listings that mention a storage room.
#
# rooms: list with one vector of room tokens per listing.
# Returns an integer vector, one value per listing: 1 when any token
# contains "var", otherwise 0.
detect_storage <- function(rooms) {
  # vapply builds the result in one pass instead of growing it in a loop.
  found <- vapply(
    rooms,
    function(room) any(str_detect(room, "var")),
    logical(1),
    USE.NAMES = FALSE
  )
  as.integer(found)
}

In [97]:
# Peek at the first six raw room descriptions.
head(df$rooms, n = 6L)

We can see that _rooms_ has information in varying formats. clean_rooms() makes it uniform

In [98]:
# Replace each room description with its list of cleaned tokens, then
# preview the first six entries.
df$rooms <- clean_rooms(df$rooms)
head(df$rooms, n = 6L)

In the next phase we extract features of each listing from the _rooms_ column

In [99]:
# Add the 0/1 feature columns in one step, in the same order as before.
df <- df %>%
  mutate(
    # 1 when the postal code ends in "100", otherwise 0 (also 0 when missing).
    # NOTE(review): presumably marks a city's central postal area -- confirm.
    centrum = if_else(str_detect(postalCode, "100$"), 1L, 0L, missing = 0L),
    # Amenity flags read from the cleaned room tokens.
    hasSauna = detect_sauna(rooms),
    hasBalcony = detect_balcony(rooms),
    hasParking = detect_parking(rooms),
    hasWalkInCloset = detect_walk_in_closet(rooms),
    hasStorage = detect_storage(rooms)
  )

Finally, _rooms_ can be removed, since its information now lives in the feature columns

In [100]:
# The room tokens are no longer needed once the feature flags exist.
df <- select(df, -rooms)

In [101]:
# Add the price per square metre (debt-free price divided by total area) and
# an HTML link to each listing's page on the website.
df <- df %>%
  mutate(
    pricePMsq = debtFreePrice / totalArea,
    link = paste0(
      "<a href='https://op-koti.fi/kohde/", id, "'>",
      "https://op-koti.fi/kohde/", id,
      "</a>"
    )
  )

In [102]:
# Listing counts per building type after cleaning.
table(df[["listingType"]])


      Erillistalo        Kerrostalo Kytketty paritalo         Luhtitalo 
               11              1297                 1                44 
      Omakotitalo          Paritalo           Puutalo          Rivitalo 
              536                67                 5               489 

In [103]:
# Fold the 'Kytketty paritalo' category into 'Paritalo', then recount.
# %in% never yields NA, so missing values cannot break the replacement.
is_kytketty <- df$listingType %in% "Kytketty paritalo"
df$listingType <- replace(df$listingType, is_kytketty, "Paritalo")
table(df$listingType)


Erillistalo  Kerrostalo   Luhtitalo Omakotitalo    Paritalo     Puutalo 
         11        1297          44         536          68           5 
   Rivitalo 
        489 

_Kytketty paritalo_ is now merged into _Paritalo_.
Next is _featureGroups_, which is currently stored as a list inside a list. Only the first value of each entry is kept

In [104]:
# Keep only the first value of every leaf in the nested featureGroups column
# and unlist the result into a plain vector.
df$featureGroups <- rapply(
  df$featureGroups,
  f = function(leaf) head(leaf, n = 1L),
  how = "unlist"
)

Next we move on to arranging the columns

In [105]:
# List the columns with their positions before reordering.
as.list(names(df))

In [106]:
# Put the columns in the order the dashboard expects. Columns are selected by
# name rather than by position, so a change earlier in the pipeline raises an
# error here instead of silently scrambling the output.
df <- df[c(
  "id", "listingType", "featureGroups", "numberOfRooms",
  "price", "debtFreePrice", "livingArea", "totalArea", "pricePMsq",
  "city", "region", "district", "postalCode",
  "floor", "buildingAge", "centrum",
  "hasSauna", "hasBalcony", "hasParking", "hasWalkInCloset", "hasStorage",
  "link"
)]

_The columns are now arranged in the preferred order._ __Now the df is ready for the dashboard. Let's save it to a csv file.__

In [107]:
# Save the cleaned listings for the dashboard, without a row-name column.
# NOTE(review): the output path is absolute and specific to one user's
# machine; adjust it before running elsewhere.
write.csv(
  df,
  file = "/Users/avinashmalla/GitHub/opKotiDashboard/forDash.csv",
  row.names = FALSE
)