# Prevalence of ongoing symptoms following coronavirus (COVID-19) infection in the UK
https://www.ons.gov.uk/peoplepopulationandcommunity/healthandsocialcare/conditionsanddiseases/datasets/alldatarelatingtoprevalenceofongoingsymptomsfollowingcoronaviruscovid19infectionintheuk

In [2]:
library(tidyverse)
library(readxl)
library(rvest)

# Fetch the ONS dataset landing page and collect the hrefs of every element
# carrying the CSS class "btn--primary" (presumably the download buttons --
# confirm against the page markup).
dataset_url <- paste0(
  "https://www.ons.gov.uk/peoplepopulationandcommunity/healthandsocialcare/",
  "conditionsanddiseases/datasets/",
  "alldatarelatingtoprevalenceofongoingsymptomsfollowingcoronaviruscovid19infectionintheuk"
)

html <- read_html(dataset_url)
uri_nodes <- html %>%
  html_nodes(".btn--primary") %>%
  html_attr("href")

# Keep only links whose href mentions the dataset section of the site.
# (The original passed a redundant extra `T` condition to filter(); `T` is
# reassignable and added nothing, so it is dropped.)
df <- data.frame(uri = uri_nodes) %>%
  filter(grepl("peoplepopulationandcommunity", uri))

In [3]:
# Download every workbook listed in df$uri (cached under data/), read its
# "Table 9" sheet using the layout that matches the file name, keep the rows
# whose Domain mentions "Duration", and stack the results into df_all.
df_all <- data.frame()

# download.file() does not create the destination directory itself.
dir.create("data", showWarnings = FALSE, recursive = TRUE)

for (uri in df$uri) {
  long_uri <- paste0("https://www.ons.gov.uk", uri)

  # Local cache path: everything after the last "/" of the URL, under data/.
  filename <- paste0("data/", sub(".*/", "", long_uri))

  if (!file.exists(filename)) {
    # mode = "wb": .xlsx is a binary format; the default text mode can
    # corrupt the file on Windows.
    download.file(long_uri, destfile = filename, mode = "wb")
  }

  # The sheet is called either "Table 9" or "Table_9": probe for the former
  # and fall back to the latter if the read fails.
  tableName <- tryCatch(
    {
      read_excel(filename, sheet = "Table 9", range = "A1:A1")
      "Table 9"
    },
    error = function(cond) {
      "Table_9"
    }
  )

  # `&&` binds tighter than `||`; the parentheses only make that explicit.
  if (!grepl("ongoingsymptoms", filename) ||
        (grepl("accessible", filename) &&
           !grepl("20303accessibleupdated", filename))) {
    # Long layout: title in A1, header row after 4 skipped rows, plus a
    # Notes column that is dropped.
    desc <- read_excel(filename, sheet = tableName, range = "A1:A1")
    data <- read_excel(filename, sheet = tableName, skip = 4) %>%
      mutate(info = colnames(desc)[[1]]) %>%
      mutate(filename = filename) %>%
      filter(grepl("Duration", Domain)) %>%
      select(-Notes)
  } else if (!grepl("updated", filename) &&
               !grepl("20210401.xlsx", filename, fixed = TRUE)) {
    # Wide layout: title in A2; columns A:B are the row labels and the three
    # impact levels sit side by side in C:E, F:H and I:K (rows 5-12).
    desc <- read_excel(filename, sheet = tableName, range = "A2:A2")

    # fill(Domain) carries the Domain label down over blank cells.
    a <-
      read_excel(filename, sheet = tableName, range = "A5:E12") %>%
      mutate(`Impact on daily activity` = "Activity not limited") %>%
      fill(Domain)

    b <- cbind(
      read_excel(filename, sheet = tableName, range = "A5:B12"),
      read_excel(filename, sheet = tableName, range = "I5:K12")
    ) %>%
      mutate(`Impact on daily activity` = "Activity limited a lot") %>%
      fill(Domain)

    c <- cbind(
      read_excel(filename, sheet = tableName, range = "A5:B12"),
      read_excel(filename, sheet = tableName, range = "F5:H12")
    ) %>%
      mutate(`Impact on daily activity` = "Activity limited a little") %>%
      fill(Domain)

    data <-
      rbind(a, b) %>%
      rbind(c) %>%
      mutate(info = colnames(desc)[[1]]) %>%
      mutate(filename = filename) %>%
      filter(grepl("Duration", Domain))
  } else if (grepl("updated", filename)) {
    # "updated" workbooks: title in A1, header row after 3 skipped rows.
    desc <- read_excel(filename, sheet = tableName, range = "A1:A1")
    data <- read_excel(filename, sheet = tableName, skip = 3) %>%
      mutate(info = colnames(desc)[[1]]) %>%
      mutate(filename = filename) %>%
      filter(grepl("Duration", Domain))
  } else {
    # Only the 20210401 workbook reaches this branch; it is skipped.
    print(paste("Not using", filename))
    data <- data.frame()
  }

  # rbind() is kept on purpose: unlike bind_rows() it coerces mismatched
  # column types instead of erroring.
  # NOTE(review): confirm whether column types really differ between releases;
  # if not, collecting into a list and binding once would be cleaner.
  df_all <- rbind(df_all, data)
}

# Tidy the stacked table:
#  * Date     - the text after "ending " in the sheet title, parsed as a date
#  * Estimate - cast to integer
#  * labels   - impact and duration-group labels rewritten to one spelling
# Rows of the ">= 52 weeks" group dated 2022-03-05 or later are dropped, and
# the helper column `info` is removed.
# NOTE(review): the gsub() patterns are unanchored, so a label that already
# contains the replacement text (e.g. "0 to <12 weeks") would be rewritten
# again -- confirm the raw labels never do.
df_all <- df_all %>%
  mutate(
    Date = as.Date(gsub(".*ending ", "", info), format = "%d %B %Y"),
    Estimate = as.integer(Estimate),
    `Impact on daily activity` = `Impact on daily activity` %>%
      gsub("A little", "Activity limited a little", .) %>%
      gsub("A lot", "Activity limited a lot", .) %>%
      gsub("Not at all", "Activity not limited", .),
    Group = Group %>%
      gsub("<12 weeks", "0 to <12 weeks", .) %>%
      gsub("≥52 weeks", ">= 52 weeks", .) %>%
      gsub(">=104 weeks", "104+ weeks", .)
  ) %>%
  filter(Date < "2022-03-05" | Group != ">= 52 weeks") %>%
  select(-info)

# Persist the tidy table next to the cached workbooks.
write_csv(df_all, "data/tidy_ons_longcovid.csv")

# Show the most recent rows (by Date) for the ">= 52 weeks" group where
# daily activity is not limited.
df_all %>%
  filter(
    Group == ">= 52 weeks",
    `Impact on daily activity` == "Activity not limited"
  ) %>%
  arrange(Date) %>%
  tail()

[1] "Not using data/ongoingsymptomsfollowingcovid20210401.xlsx"


Impact on daily activity,Domain,Group,Estimate,Lower 95% confidence limit,Upper 95% confidence limit,filename,Date
<chr>,<chr>,<chr>,<int>,<chr>,<chr>,<chr>,<date>
Activity not limited,Duration since first (suspected) coronavirus infection,>= 52 weeks,122,110,134,data/ongoingsymptomsfollowingcovid1920211007.xlsx,2021-09-05
Activity not limited,Duration since first (suspected) coronavirus infection,>= 52 weeks,122,110,134,data/ongoingsymptomsfollowingcovid1920211104.xlsx,2021-10-02
Activity not limited,Duration since first (suspected) coronavirus infection,>= 52 weeks,138,126,149,data/ongoingsymptomsfollowingcovid1920211202.xlsx,2021-10-31
Activity not limited,Duration since first (suspected) coronavirus infection,>= 52 weeks,166,153,179,data/ongoingsymptomsfollowingcovid1920220106.xlsx,2021-12-06
Activity not limited,Duration since first (suspected) coronavirus infection,>= 52 weeks,179,165,194,data/ongoingsymptomsfollowingcovid1920220203.xlsx,2022-01-02
Activity not limited,Duration since first (suspected) coronavirus infection,>= 52 weeks,215,201,229,data/ongoingsymptomsfollowingcovid1920220303accessibleupdated.xlsx,2022-01-31
