In [1]:
library(dplyr)
library(tidyr)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [2]:
# URL of the dataset on UCI Machine Learning Repository
url <- "https://archive.ics.uci.edu/static/public/360/air+quality.zip"

# Download the zip file and extract it
download.file(url, destfile = "air_quality.zip")
unzip("air_quality.zip")

# Read the dataset into R
air_quality <- read.csv("AirQualityUCI.csv", sep = ";", header = TRUE, na.strings = "-200")

# View the first few rows of the dataset
head(air_quality)


Unnamed: 0_level_0,Date,Time,CO.GT.,PT08.S1.CO.,NMHC.GT.,C6H6.GT.,PT08.S2.NMHC.,NOx.GT.,PT08.S3.NOx.,NO2.GT.,PT08.S4.NO2.,PT08.S5.O3.,T,RH,AH,X,X.1
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<int>,<int>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<chr>,<chr>,<chr>,<lgl>,<lgl>
1,10/03/2004,18.00.00,26,1360,150,119,1046,166,1056,113,1692,1268,136,489,7578,,
2,10/03/2004,19.00.00,2,1292,112,94,955,103,1174,92,1559,972,133,477,7255,,
3,10/03/2004,20.00.00,22,1402,88,90,939,131,1140,114,1555,1074,119,540,7502,,
4,10/03/2004,21.00.00,22,1376,80,92,948,172,1092,122,1584,1203,110,600,7867,,
5,10/03/2004,22.00.00,16,1272,51,65,836,131,1205,116,1490,1110,112,596,7888,,
6,10/03/2004,23.00.00,12,1197,38,47,750,89,1337,96,1393,949,112,592,7848,,


In [3]:
# Clean the data: Convert commas to dots for numeric columns, remove unwanted columns (X, X.1), and remove NA columns
air_quality_cleaned <- air_quality |>
  mutate_at(vars(-Date, -Time), ~as.numeric(gsub(",", ".", .))) |>
  select(-X, -X.1)

# Convert Date and Time columns to proper formats
air_quality_cleaned$Date <- as.Date(air_quality_cleaned$Date, format = "%d/%m/%Y")
air_quality_cleaned$Time <- as.POSIXct(air_quality_cleaned$Time, format = "%H.%M.%S")

head(air_quality_cleaned)


Unnamed: 0_level_0,Date,Time,CO.GT.,PT08.S1.CO.,NMHC.GT.,C6H6.GT.,PT08.S2.NMHC.,NOx.GT.,PT08.S3.NOx.,NO2.GT.,PT08.S4.NO2.,PT08.S5.O3.,T,RH,AH
Unnamed: 0_level_1,<date>,<dttm>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,2004-03-10,2023-07-29 18:00:00,2.6,1360,150,11.9,1046,166,1056,113,1692,1268,13.6,48.9,0.7578
2,2004-03-10,2023-07-29 19:00:00,2.0,1292,112,9.4,955,103,1174,92,1559,972,13.3,47.7,0.7255
3,2004-03-10,2023-07-29 20:00:00,2.2,1402,88,9.0,939,131,1140,114,1555,1074,11.9,54.0,0.7502
4,2004-03-10,2023-07-29 21:00:00,2.2,1376,80,9.2,948,172,1092,122,1584,1203,11.0,60.0,0.7867
5,2004-03-10,2023-07-29 22:00:00,1.6,1272,51,6.5,836,131,1205,116,1490,1110,11.2,59.6,0.7888
6,2004-03-10,2023-07-29 23:00:00,1.2,1197,38,4.7,750,89,1337,96,1393,949,11.2,59.2,0.7848


In [5]:
# Rename the column names to more descriptive names
col_names <- c("Date", "Time", "CO_Concentration", "PT08_S1_CO_Sensor", "NMHC_Concentration",
               "C6H6_Concentration", "PT08_S2_NMHC_Sensor", "NOx_Concentration", "PT08_S3_NOx_Sensor",
               "NO2_Concentration", "PT08_S4_NO2_Sensor", "PT08_S5_O3_Sensor", "Temperature", "Relative_Humidity", "Absolute_Humidity")

colnames(air_quality_cleaned) <- col_names


In [6]:
head(air_quality_cleaned)

Unnamed: 0_level_0,Date,Time,CO_Concentration,PT08_S1_CO_Sensor,NMHC_Concentration,C6H6_Concentration,PT08_S2_NMHC_Sensor,NOx_Concentration,PT08_S3_NOx_Sensor,NO2_Concentration,PT08_S4_NO2_Sensor,PT08_S5_O3_Sensor,Temperature,Relative_Humidity,Absolute_Humidity
Unnamed: 0_level_1,<date>,<dttm>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,2004-03-10,2023-07-29 18:00:00,2.6,1360,150,11.9,1046,166,1056,113,1692,1268,13.6,48.9,0.7578
2,2004-03-10,2023-07-29 19:00:00,2.0,1292,112,9.4,955,103,1174,92,1559,972,13.3,47.7,0.7255
3,2004-03-10,2023-07-29 20:00:00,2.2,1402,88,9.0,939,131,1140,114,1555,1074,11.9,54.0,0.7502
4,2004-03-10,2023-07-29 21:00:00,2.2,1376,80,9.2,948,172,1092,122,1584,1203,11.0,60.0,0.7867
5,2004-03-10,2023-07-29 22:00:00,1.6,1272,51,6.5,836,131,1205,116,1490,1110,11.2,59.6,0.7888
6,2004-03-10,2023-07-29 23:00:00,1.2,1197,38,4.7,750,89,1337,96,1393,949,11.2,59.2,0.7848
