In [56]:
## Packages this notebook relies on; any that are missing get installed
list.of.packages <- c("tidyverse", "jsonlite", "magrittr")
new.packages <- setdiff(list.of.packages, installed.packages()[, "Package"])
if (length(new.packages) > 0) {
  install.packages(new.packages)
}

In [57]:
suppressPackageStartupMessages({
    library(jsonlite)
    library(tidyverse)
    library(magrittr)
})

In [58]:
# Download the apartment-for-sale listings from the op-koti.fi API (newest
# first, up to 4500 entries, as set by the query string).
# flatten = TRUE makes jsonlite flatten nested data frames, so nested fields
# arrive as columns such as `location.city`. TRUE is spelled out because `T`
# is an ordinary variable that can be reassigned.
data <- jsonlite::fromJSON("https://op-koti.fi/api/apartments?mode=sale&featureGroup=apartment&orderBy=created&order=desc&offset=0&limit=4500", flatten = TRUE)

# The second element of the parsed response holds the listings table
df <- data[[2]]

In [59]:
# Columns kept from the raw listings table
cols <- c(
  "id", "listingType", "floor", "year", "rooms", "numberOfRooms",
  "price", "debtFreePrice",
  "location.city", "location.region", "location.district",
  "location.postalCode",
  "livingArea.size", "totalArea.size"
)

# Keep only those columns and give the dotted / ambiguous ones shorter names
df <- df[cols] %>%
  rename(
    yearBuilt = year,
    city = location.city,
    region = location.region,
    district = location.district,
    postalCode = location.postalCode,
    livingArea = livingArea.size,
    totalArea = totalArea.size
  )

In [60]:
# Preview the first rows of the trimmed data frame
df %>%
  head()

Unnamed: 0_level_0,id,listingType,floor,yearBuilt,rooms,numberOfRooms,price,debtFreePrice,city,region,district,postalCode,livingArea,totalArea
Unnamed: 0_level_1,<chr>,<int>,<int>,<int>,<chr>,<int>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>
1,521211,89,2,1972,"2h,k",2,46000,46000.0,Mikkeli,Mikkeli,Siekkilä,50120,47.5,47.5
2,521803,91,1,2000,"3h,k,sa,kh,wc,vh,et,tk",3,119000,119000.0,Oulu,Oulu,"Oulunsalo, Pitkäkangas",90460,75.5,75.5
3,521609,89,1,1984,"3h,k,s",3,115000,115000.0,Joensuu,Joensuu,Keskusta,80100,72.5,72.5
4,519998,89,5,1964,"5h, k, erillinen wc, vh, 2 parveketta",5,269000,269000.0,Helsinki,Itä-Helsinki,Vuosaari,960,96.4,96.4
5,520696,92,0,2006,4h+k+s+rt+kph+erill. wc+vh+lämmin varasto,4,449000,,Helsinki,Pohjois-Helsinki,Tapaninvainio,780,120.0,125.0
6,521950,91,0,2002,3h+k+s,3,189700,189700.0,Kalajoki,Kalajoki,Pohjankylä,85100,100.0,100.0


listingType is stored as a numeric code. The code below maps each code to the corresponding text label.

In [61]:
# Map each numeric listingType code to its text label
df$listingType <- recode(
  df$listingType,
  "89" = "Kerrostalo",
  "90" = "Omakotitalo",
  "91" = "Rivitalo",
  "92" = "Paritalo",
  "93" = "Erillistalo",
  "112" = "Puutalo",
  "113" = "Luhtitalo",
  "470" = "Kytketty paritalo"
)

In [62]:
# Number of listings per building type
table(df[["listingType"]])


      Erillistalo        Kerrostalo Kytketty paritalo         Luhtitalo 
               12              1386                 1                32 
      Omakotitalo          Paritalo           Puutalo          Rivitalo 
              802                85                 4               592 

Replacing empty strings with NAs, then counting the NAs in each column

In [63]:
# Turn empty strings into NA. Only character columns can contain "", and
# limiting na_if() to them avoids the type error that dplyr >= 1.1 raises
# when "" is compared against a numeric column.
df <- df %>%
  mutate_if(is.character, ~ na_if(., ""))

# Count the missing values in every column (a lambda replaces the deprecated
# funs() helper)
df %>%
  summarise_all(~ sum(is.na(.)))

id,listingType,floor,yearBuilt,rooms,numberOfRooms,price,debtFreePrice,city,region,district,postalCode,livingArea,totalArea
<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
0,0,397,9,114,114,113,956,0,0,158,0,114,114


#### Next,
* In cases where debtFreePrice is NA but price is given, the listing price is used as the debtFreePrice. This means there is no outstanding loan payment to be carried over to the new owner

* Also creating a new column 'buildingAge' from 'yearBuilt'

In [64]:
# When no debt-free price is given, the asking price is used as the debt-free
# price (the value stays NA when both are missing).
df <- df %>%
  mutate(debtFreePrice = coalesce(debtFreePrice, price))

# Current calendar year from the system clock, using base R only. Kept as a
# double so both case_when() branches below have the same type.
current_year <- as.numeric(format(Sys.Date(), "%Y"))

# Derive the building age in years and drop the construction year.
# Rows without a construction year are removed; a construction year in the
# future is given an age of 1.
# yearBuilt is subtracted directly: parsing it with as.Date(format = "%Y")
# borrows today's month and day, which gives NA for non-leap years when the
# notebook is run on 29 February.
df <- df %>%
  filter(!is.na(yearBuilt)) %>%
  mutate(buildingAge = case_when(yearBuilt > current_year ~ 1,
                                 TRUE ~ current_year - yearBuilt)) %>%
  select(-yearBuilt)

Removing the rows that still contain NAs

In [65]:
# Drop every row that has an NA in any of the remaining columns
df <- na.omit(df)

In [66]:
## CREATED FUNCTIONS
# TRUE for any object other than NULL, FALSE for NULL
is.not.null <- function(x) {
  if (is.null(x)) FALSE else TRUE
}

# Split free-text room descriptions into trimmed tokens.
#
# rooms: character vector with one room description per listing,
#        e.g. "3h,k,s" or "4h+k+erill. wc".
# Returns a list with one character vector of tokens per input element
# (an empty list for empty input).
clean_rooms <- function(rooms) {
  lapply(rooms, function(room) {
    # Periods mark abbreviations ("erill. wc"), so all of them are replaced
    # with a space before splitting on the remaining punctuation. Replacing
    # only the first one would still split at later abbreviations.
    room <- str_replace_all(room, "\\.", " ")
    tokens <- strsplit(room, split = "[[:punct:]]")[[1]]
    tokens <- str_trim(tokens, side = "both")
    # Consecutive separators leave empty tokens behind; drop them
    tokens[!tokens == ""]
  })
}

# Shared worker for the detect_*() functions below.
#
# rooms:   list of character vectors (one vector of room tokens per listing).
# pattern: regular expression identifying the feature.
# Returns an integer vector with one entry per listing: 1 when at least one
# token matches the pattern, 0 otherwise.
.detect_room_feature <- function(rooms, pattern) {
  # vapply gives a type-stable logical vector and avoids growing the result
  # with append() inside a loop
  found <- vapply(
    rooms,
    function(room) any(str_detect(room, pattern)),
    logical(1),
    USE.NAMES = FALSE
  )
  as.integer(found)
}

# takes a list of rooms, checks for sauna and returns binary
detect_sauna <- function(rooms) {
  .detect_room_feature(rooms, '^s$|sauna')
}

# takes a list of rooms, checks for balcony and returns binary
detect_balcony <- function(rooms) {
  .detect_room_feature(rooms, '^p$|parv|lasit p|las p')
}

# takes a list of rooms, checks for parking and returns binary
detect_parking <- function(rooms) {
  .detect_room_feature(rooms, '^ak$|^at$|auto')
}

# takes a list of rooms, checks for walk-in-closet and returns binary
detect_walk_in_closet <- function(rooms) {
  .detect_room_feature(rooms, '^v$|^vh$|vaate')
}

# takes a list of rooms, checks for storage room and returns binary
detect_storage <- function(rooms) {
  .detect_room_feature(rooms, 'var')
}

In [67]:
# Look at the first 20 raw room descriptions
head(df$rooms, n = 20)

We can see that the rooms column has information in varying formats. clean_rooms() makes it uniform

In [68]:
# Replace each raw room description with its list of cleaned tokens
df <- df %>%
  mutate(rooms = clean_rooms(rooms))

In the next phase we extract features of each listing from the 'rooms' column

In [69]:
# Add the binary feature columns:
#   centrum - 1 when the postal code ends in "100", otherwise 0
#   has*    - 1 when the room tokens mention the corresponding feature
df <- df %>%
  mutate(
    centrum = case_when(
      str_detect(postalCode, "100$") ~ 1L,
      TRUE ~ 0L
    ),
    hasSauna = detect_sauna(rooms),
    hasBalcony = detect_balcony(rooms),
    hasParking = detect_parking(rooms),
    hasWalkInCloset = detect_walk_in_closet(rooms),
    hasStorage = detect_storage(rooms)
  )

Finally, the 'rooms' column can be removed

In [70]:
# The room tokens are no longer needed once the feature flags exist
df$rooms <- NULL

In [71]:
# Add the debt-free price per square metre of total area, and an HTML anchor
# pointing to the listing's page on the website
df <- mutate(
  df,
  pricePMsq = debtFreePrice / totalArea,
  link = paste0(
    "<a href='https://op-koti.fi/kohde/", id, "'>",
    "https://op-koti.fi/kohde/", id,
    "</a>"
  )
)

In [72]:
# Listings per building type after the cleaning steps
table(df[["listingType"]])


      Erillistalo        Kerrostalo Kytketty paritalo         Luhtitalo 
               10              1262                 1                30 
      Omakotitalo          Paritalo           Puutalo          Rivitalo 
              498                65                 3               513 

In [73]:
# Fold the 'Kytketty paritalo' category into 'Paritalo', then re-tabulate
is_connected_pair <- df$listingType %in% "Kytketty paritalo"
df$listingType <- replace(df$listingType, is_connected_pair, "Paritalo")
table(df[["listingType"]])


Erillistalo  Kerrostalo   Luhtitalo Omakotitalo    Paritalo     Puutalo 
         10        1262          30         498          66           3 
   Rivitalo 
        513 

'Kytketty paritalo' is now merged into 'Paritalo'

Next we rearrange the columns for convenience

In [74]:
# List the current column names to plan the reordering
as.list(names(df))

In [75]:
# Reorder the columns by name rather than by position. The resulting order is
# the same as the positional c(1,2,4,5,6,11,12,20,7:10,3,13:19,21), but
# selecting by name is safe to re-run (a positional permutation reshuffles the
# columns again on every execution) and fails loudly if a column is missing.
column_order <- c(
  "id", "listingType", "numberOfRooms", "price", "debtFreePrice",
  "livingArea", "totalArea", "pricePMsq",
  "city", "region", "district", "postalCode",
  "floor", "buildingAge", "centrum",
  "hasSauna", "hasBalcony", "hasParking", "hasWalkInCloset", "hasStorage",
  "link"
)
df <- df[column_order]

#### Now the df is ready for the dashboard. Let's save it to a csv file.

In [76]:
# Write the dashboard data set to CSV without the row-name column.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned.
# NOTE(review): the output path is absolute and machine-specific.
write.csv(df, "/Users/avinashmalla/GitHub/opKotiDashboard/forDash.csv", row.names = FALSE)