<a href="https://colab.research.google.com/github/aarav2703/Bike-sharing-R/blob/main/Module_2_2_Data_wrangling_with_dplyr.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Load necessary libraries
library(tidyverse)

# Remote location of the raw Seoul bike-sharing CSV
url <- "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-RP0321EN-SkillsNetwork/labs/datasets/raw_seoul_bike_sharing.csv"

# Download and parse the file into a tibble
bike_sharing_df <- url %>%
  read_csv()

# Per-column summary statistics (also reveals which columns contain NA's)
bike_sharing_df %>%
  summary()

# Number of rows and columns
bike_sharing_df %>%
  dim()


[1mRows: [22m[34m8760[39m [1mColumns: [22m[34m14[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (4): Date, SEASONS, HOLIDAY, FUNCTIONING_DAY
[32mdbl[39m (10): RENTED_BIKE_COUNT, Hour, TEMPERATURE, HUMIDITY, WIND_SPEED, Visibi...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


     Date           RENTED_BIKE_COUNT      Hour        TEMPERATURE    
 Length:8760        Min.   :   2.0    Min.   : 0.00   Min.   :-17.80  
 Class :character   1st Qu.: 214.0    1st Qu.: 5.75   1st Qu.:  3.40  
 Mode  :character   Median : 542.0    Median :11.50   Median : 13.70  
                    Mean   : 729.2    Mean   :11.50   Mean   : 12.87  
                    3rd Qu.:1084.0    3rd Qu.:17.25   3rd Qu.: 22.50  
                    Max.   :3556.0    Max.   :23.00   Max.   : 39.40  
                    NA's   :295                       NA's   :11      
    HUMIDITY       WIND_SPEED      Visibility   DEW_POINT_TEMPERATURE
 Min.   : 0.00   Min.   :0.000   Min.   :  27   Min.   :-30.600      
 1st Qu.:42.00   1st Qu.:0.900   1st Qu.: 940   1st Qu.: -4.700      
 Median :57.00   Median :1.500   Median :1698   Median :  5.100      
 Mean   :58.23   Mean   :1.725   Mean   :1437   Mean   :  4.074      
 3rd Qu.:74.00   3rd Qu.:2.300   3rd Qu.:2000   3rd Qu.: 14.800      
 Max.   :98.

In [None]:
# Discard every observation whose rental count is missing
bike_sharing_df <- drop_na(bike_sharing_df, RENTED_BIKE_COUNT)

# Mean temperature over the summer observations, ignoring missing readings
summer_avg_temp <- bike_sharing_df %>%
  filter(SEASONS == "Summer") %>%
  pull(TEMPERATURE) %>%
  mean(na.rm = TRUE)

# Fill missing summer temperatures with that mean; every other value is left
# exactly as it was
bike_sharing_df <- mutate(
  bike_sharing_df,
  TEMPERATURE = replace(
    TEMPERATURE,
    is.na(TEMPERATURE) & SEASONS == "Summer",
    summer_avg_temp
  )
)

# Re-inspect the summary to check whether any NA's remain
bike_sharing_df %>%
  summary()

# Persist the cleaned data
bike_sharing_df %>%
  write_csv("seoul_bike_sharing.csv")


     Date           RENTED_BIKE_COUNT      Hour        TEMPERATURE    
 Length:8465        Min.   :   2.0    Min.   : 0.00   Min.   :-17.80  
 Class :character   1st Qu.: 214.0    1st Qu.: 6.00   1st Qu.:  3.00  
 Mode  :character   Median : 542.0    Median :12.00   Median : 13.50  
                    Mean   : 729.2    Mean   :11.51   Mean   : 12.77  
                    3rd Qu.:1084.0    3rd Qu.:18.00   3rd Qu.: 22.70  
                    Max.   :3556.0    Max.   :23.00   Max.   : 39.40  
    HUMIDITY       WIND_SPEED      Visibility   DEW_POINT_TEMPERATURE
 Min.   : 0.00   Min.   :0.000   Min.   :  27   Min.   :-30.600      
 1st Qu.:42.00   1st Qu.:0.900   1st Qu.: 935   1st Qu.: -5.100      
 Median :57.00   Median :1.500   Median :1690   Median :  4.700      
 Mean   :58.15   Mean   :1.726   Mean   :1434   Mean   :  3.945      
 3rd Qu.:74.00   3rd Qu.:2.300   3rd Qu.:2000   3rd Qu.: 15.200      
 Max.   :98.00   Max.   :7.400   Max.   :2000   Max.   : 27.200      
 SOLAR_RADIAT

In [None]:
# Add a character copy of the hour as a separate HOURS column.
# NOTE(review): this does not convert Hour itself; HOURS travels through the
# pivots below as an ordinary column and ends up in the saved file. It is kept
# so the output schema stays the same -- confirm whether it is actually wanted.
bike_sharing_df <- bike_sharing_df %>%
  mutate(HOURS = as.character(Hour))

# Create indicator variables for SEASONS, HOLIDAY, FUNCTIONING_DAY, and Hour
bike_sharing_df <- bike_sharing_df %>%
  # across() replaces the superseded mutate_at(); same columns, same function
  mutate(across(c(SEASONS, HOLIDAY, FUNCTIONING_DAY, Hour), as.factor)) %>%
  mutate(
    # 1 for public holidays, 0 otherwise
    HOLIDAY = ifelse(HOLIDAY == "Holiday", 1, 0),
    # 1 for functioning days, 0 otherwise.
    # NOTE(review): "Fun" is the label used in the dataset's attribute
    # description, while the CSV appears to store "Yes"/"No"; comparing only
    # against "Fun" made this column 0 for every row. Both spellings are
    # accepted here -- verify with unique(bike_sharing_df$FUNCTIONING_DAY).
    FUNCTIONING_DAY = ifelse(
      FUNCTIONING_DAY == "Yes" | FUNCTIONING_DAY == "Fun", 1, 0
    )
  ) %>%
  # One 0/1 column per season: the row's own season gets 1, the rest are
  # filled with 0
  pivot_wider(
    names_from = SEASONS,
    values_from = SEASONS,
    values_fill = 0,
    values_fn = list(SEASONS = ~1)
  ) %>%
  # One 0/1 column per hour of the day, built the same way
  pivot_wider(
    names_from = Hour,
    values_from = Hour,
    values_fill = 0,
    values_fn = list(Hour = ~1)
  )

# Save the dataset with indicator variables
write_csv(bike_sharing_df, "seoul_bike_sharing_converted.csv")


In [None]:
# Min-max normalization function
#
# Linearly rescales a numeric vector so that its smallest value maps to 0 and
# its largest value maps to 1.
#
# Args:
#   x:     A numeric vector.
#   na.rm: Ignore missing values when computing the range? Defaults to TRUE so
#          that a single NA does not turn the whole result into NA; missing
#          entries themselves stay NA in the output.
#
# Returns:
#   A numeric vector the same length as `x`. When all non-missing values are
#   identical, those positions are returned as 0 instead of NaN (0 / 0).
min_max_norm <- function(x, na.rm = TRUE) {
  rng <- range(x, na.rm = na.rm)
  span <- rng[2] - rng[1]
  if (!is.na(span) && span == 0) {
    # Constant input: skip the division by zero; NA entries remain NA
    return(x - rng[1])
  }
  (x - rng[1]) / span
}

# Rescale each listed numeric column with min_max_norm(); all other columns
# pass through unchanged
bike_sharing_df <- bike_sharing_df %>%
  mutate(
    across(
      c(
        RENTED_BIKE_COUNT, TEMPERATURE, HUMIDITY, WIND_SPEED, Visibility,
        DEW_POINT_TEMPERATURE, SOLAR_RADIATION, RAINFALL, Snowfall
      ),
      min_max_norm
    )
  )

# Write the rescaled data to disk
bike_sharing_df %>%
  write_csv("seoul_bike_sharing_converted_normalized.csv")


In [None]:
# Files whose headers get standardized in place
dataset_list <- c(
  "seoul_bike_sharing.csv",
  "seoul_bike_sharing_converted.csv",
  "seoul_bike_sharing_converted_normalized.csv"
)

for (dataset_name in dataset_list) {
  # Load the file as currently stored on disk
  dataset <- read_csv(dataset_name)
  # Upper-case every column name, then turn spaces into underscores
  names(dataset) <- names(dataset) %>%
    toupper() %>%
    str_replace_all(" ", "_")
  # Overwrite the file with the renamed columns
  write_csv(dataset, dataset_name)
}


[1mRows: [22m[34m8465[39m [1mColumns: [22m[34m14[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (4): Date, SEASONS, HOLIDAY, FUNCTIONING_DAY
[32mdbl[39m (10): RENTED_BIKE_COUNT, Hour, TEMPERATURE, HUMIDITY, WIND_SPEED, Visibi...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m8465[39m [1mColumns: [22m[34m41[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): Date
[32mdbl[39m (40): RENTED_BIKE_COUNT, TEMPERATURE, HUMIDITY, WIND_SPEED, Visibility, ...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this mess