In [2]:
#' Mean of one pollutant across a set of monitor files
#'
#' Reads `<directory>/<id>.csv` (the id zero-padded to three digits) for every
#' requested monitor and averages the named column over all of them, ignoring
#' missing values.
#'
#' @param directory Path to the folder holding the monitor CSV files.
#' @param pollutant Name of the column to average, e.g. "sulfate".
#' @param id Vector of monitor IDs to include.
#' @return A single numeric value: the mean of all non-NA readings of
#'   `pollutant` pooled over the requested monitors.
pollutantmean <- function(directory, pollutant, id = 1:332) {
  # sprintf() and file.path() are vectorised, so all paths are built at once.
  paths <- file.path(directory, sprintf("%03d.csv", id))

  # Keep only the requested column from each file. Collecting plain vectors
  # in a list avoids re-copying an ever-growing data frame on each iteration.
  readings <- lapply(paths, function(path) read.csv(path)[[pollutant]])

  # Pool every monitor's readings, then average while dropping NA values.
  mean(unlist(readings), na.rm = TRUE)
}

In [3]:
# Mean sulfate reading pooled over monitors 1 through 10
pollutantmean(directory = "specdata", pollutant = "sulfate", id = 1:10)

In [4]:
# Mean nitrate reading pooled over monitors 70 through 72
pollutantmean(directory = "specdata", pollutant = "nitrate", id = 70:72)

In [5]:
# Mean sulfate reading for the single monitor 34
pollutantmean(directory = "specdata", pollutant = "sulfate", id = 34)

In [6]:
# Mean nitrate reading over every monitor (id left at its default)
pollutantmean(directory = "specdata", pollutant = "nitrate")

In [8]:
#' Count complete observations per monitor
#'
#' For each requested monitor, reads `<directory>/<id>.csv` (the id zero-padded
#' to three digits) and counts the rows that contain no NA in any column.
#'
#' @param directory Path to the folder holding the monitor CSV files.
#' @param id Vector of monitor IDs to report on.
#' @return A data frame with one row per element of `id`, in the same order,
#'   and two columns: `id` (the monitor ID) and `nobs` (its number of complete
#'   rows). An empty `id` yields a zero-row data frame that still has both
#'   columns.
complete <- function(directory, id = 1:332) {
  paths <- file.path(directory, sprintf("%03d.csv", id))

  # vapply() pins the result to one integer per file and returns integer(0)
  # for empty input, so the data frame below always has the same schema.
  nobs <- vapply(
    paths,
    function(path) sum(complete.cases(read.csv(path))),
    integer(1),
    USE.NAMES = FALSE
  )

  # Build the table in one step instead of rbind()-ing a row per monitor.
  # unname() keeps any names on `id` from leaking into the row names.
  data.frame(id = unname(id), nobs = nobs)
}


In [9]:
# Complete-case counts for a hand-picked set of monitors
cc <- complete(directory = "specdata", id = c(6, 10, 20, 34, 100, 200, 310))
print(cc[["nobs"]])

[1] 228 148 124 165 104 460 232


In [10]:
# Complete-case count for monitor 54 alone
cc <- complete(directory = "specdata", id = 54)
print(cc[["nobs"]])

[1] 219


In [11]:
# Use the RNG settings of R 3.5.1 and a fixed seed so the draw is repeatable
RNGversion("3.5.1")
set.seed(42)
# Monitors are requested in descending order, so row i holds monitor 333 - i
cc <- complete(directory = "specdata", id = 332:1)
use <- sample(332, size = 10)
print(cc$nobs[use])

"non-uniform 'Rounding' sampler used"


 [1] 711 135  74 445 178  73  49   0 687 237


In [12]:
#' Sulfate/nitrate correlation for sufficiently complete monitors
#'
#' For each requested monitor, reads `<directory>/<id>.csv` (the id zero-padded
#' to three digits), counts the rows with no NA in any column, and, when that
#' count is strictly greater than `threshold`, computes the correlation
#' between the `sulfate` and `nitrate` columns over their complete pairs.
#'
#' @param directory Path to the folder holding the monitor CSV files.
#' @param threshold Number of complete rows a monitor must exceed for its
#'   correlation to be included.
#' @param id Vector of monitor IDs to examine; defaults to all 332 monitors.
#' @return A numeric vector of correlations, one per qualifying monitor, in
#'   the order of `id`; `numeric(0)` when no monitor qualifies.
corr <- function(directory, threshold = 0, id = 1:332) {
  paths <- file.path(directory, sprintf("%03d.csv", id))

  # One list element per monitor: a correlation, or NULL when it is skipped.
  # A list is used (not a vector with an NA sentinel) because cor() may itself
  # return NA for a qualifying monitor, and that value must be kept.
  per_monitor <- lapply(paths, function(path) {
    monitor_data <- read.csv(path)
    n_complete <- sum(complete.cases(monitor_data))

    # The n_complete > 0 guard matters only for negative thresholds: cor()
    # with use = "complete.obs" raises an error when no complete pair exists.
    if (n_complete > 0 && n_complete > threshold) {
      cor(
        monitor_data[["sulfate"]],
        monitor_data[["nitrate"]],
        use = "complete.obs"
      )
    } else {
      NULL
    }
  })

  # unlist() drops the NULLs; as.numeric() turns an all-NULL result (NULL)
  # into numeric(0) so the return type is stable.
  as.numeric(unlist(per_monitor))
}


In [13]:
# Sorted correlations for every monitor with at least one complete row
cr <- sort(corr(directory = "specdata"))
# Use the RNG settings of R 3.5.1 and a fixed seed so the draw is repeatable
RNGversion("3.5.1")
set.seed(868)
out <- round(cr[sample(length(cr), size = 5)], digits = 4)
print(out)

"non-uniform 'Rounding' sampler used"


[1]  0.2688  0.1127 -0.0085  0.4586  0.0447


In [14]:
# Sorted correlations for monitors with more than 129 complete rows
cr <- sort(corr(directory = "specdata", threshold = 129))
n <- length(cr)
# Use the RNG settings of R 3.5.1 and a fixed seed so the draw is repeatable
RNGversion("3.5.1")
set.seed(197)
# Report how many monitors qualified, followed by five sampled correlations
out <- c(n, round(cr[sample(n, size = 5)], digits = 4))
print(out)

"non-uniform 'Rounding' sampler used"


[1] 243.0000   0.2540   0.0504  -0.1462  -0.1680   0.5969


In [15]:
# Number of monitors with more than 2000 complete rows
n <- length(corr(directory = "specdata", threshold = 2000))
# Sorted correlations for monitors with more than 1000 complete rows
cr <- sort(corr(directory = "specdata", threshold = 1000))
print(c(n, round(cr, digits = 4)))

[1]  0.0000 -0.0190  0.0419  0.1901
