# Data Preprocessing

## Task 1: Import the Modules

In [None]:
library(ggplot2)
library(ggthemes)
library(lubridate)
library(dplyr)
library(tidyr)
library(DT)
library(scales)
library(patchwork)
# A negative `warn` value makes R ignore every warning from this point on.
# NOTE(review): this presumably also hides warnings raised by the parsing and
# plotting cells further down -- confirm that silencing them is intended.
options(warn = -1)

## Task 2: Load the Data

In [None]:
# Read the six monthly CSV exports (April to September 2014), keeping each
# month available under its own name.
apr <- read.csv("uber-raw-data-apr14.csv")
may <- read.csv("uber-raw-data-may14.csv")
june <- read.csv("uber-raw-data-jun14.csv")
july <- read.csv("uber-raw-data-jul14.csv")
aug <- read.csv("uber-raw-data-aug14.csv")
sep <- read.csv("uber-raw-data-sep14.csv")

# Stack the monthly frames, in calendar order, into one data frame.
monthly_frames <- list(apr, may, june, july, aug, sep)
dataset <- do.call(rbind, monthly_frames)

In [None]:
# Show the first six rows of the combined data.
head(dataset, n = 6L)

## Task 3: Format the Data

In [None]:
# Parse the raw "month/day/year hour:minute:second" strings exactly once, with
# an explicit time zone.
#
# Why not the previous as.POSIXct() -> ymd_hms() round trip:
#   * without `tz`, as.POSIXct() interprets the strings in the session's local
#     time zone, so the result depends on the machine running the notebook;
#   * ymd_hms() on a POSIXct column converts it back to text first, and on
#     recent R versions that text omits the time part for midnight values, so
#     those rows fail to parse and become NA (silently, since warnings are
#     switched off above).
# "UTC" keeps the wall-clock values exactly as written in the file and matches
# the time zone the column ended up with before.
dataset$Date.Time <- as.POSIXct(
  dataset$Date.Time,
  format = "%m/%d/%Y %H:%M:%S",
  tz = "UTC"
)

# Time of day as an "HH:MM:SS" string.
dataset$Time <- format(dataset$Date.Time, format = "%H:%M:%S")

# Calendar components, stored as factors for grouping and plotting.
dataset$day <- factor(day(dataset$Date.Time))
dataset$month <- factor(month(dataset$Date.Time, label = TRUE))
dataset$year <- factor(year(dataset$Date.Time))
dataset$dayofweek <- factor(wday(dataset$Date.Time, label = TRUE))

# Clock components are read straight from the parsed timestamp instead of
# re-parsing the `Time` strings three times.
dataset$hour <- factor(hour(dataset$Date.Time))
dataset$minute <- factor(minute(dataset$Date.Time))
dataset$second <- factor(second(dataset$Date.Time))

In [None]:
# Show the first six rows again, now including the derived date/time columns.
head(dataset, n = 6L)

# Monthly Data Analysis

## Task 4: Get the Monthly Data

In [None]:
# Number of trips per month, shown as an interactive table.
month_data <- dataset %>%
  group_by(month) %>%
  dplyr::summarize(trip_count = n())
datatable(month_data)

In [None]:
# Bar chart of the monthly trip totals.
ggplot(data = month_data, mapping = aes(x = month, y = trip_count)) +
  geom_bar(stat = "identity")

## Task 5: Get the Trips Data of the Week Days

In [None]:
# Number of trips for every month / day-of-week combination.
day_month_data <- dataset %>%
  group_by(month, dayofweek) %>%
  dplyr::summarize(trip_count = n())
datatable(day_month_data)

In [None]:
# Monthly trip counts, one side-by-side bar per day of the week.
ggplot(
  data = day_month_data,
  mapping = aes(x = month, y = trip_count, fill = dayofweek)
) +
  geom_bar(stat = "identity", position = "dodge")

## Task 6: Add Colors and Title

In [None]:
# Seven fill colours, reused by the later plots via scale_fill_manual().
colors <- c(
  "#CC1011", "#665555", "#05a399", "#cfcaca",
  "#f5e840", "#0683c9", "#e075b0"
)

# Same dodged bar chart as before, now with a title and the manual palette.
ggplot(
  data = day_month_data,
  mapping = aes(x = month, y = trip_count, fill = dayofweek)
) +
  geom_bar(stat = "identity", position = "dodge") +
  ggtitle("Trips by Day and Month") +
  scale_fill_manual(values = colors)

## Task 7: Get the Trips by Base and Month

In [None]:
# Row count for each value of `Base`.
ggplot(data = dataset, mapping = aes(x = Base)) +
  geom_bar(fill = "darkred") +
  ggtitle("Trips by Bases")

In [None]:
# Row count for each value of `Base`, split into one bar per month.
ggplot(data = dataset, mapping = aes(x = Base, fill = month)) +
  geom_bar(position = "dodge") +
  ggtitle("Trips by Bases and Month") +
  scale_fill_manual(values = colors)

## Task 8: Plot the Trips by Base and Day of Week

In [None]:
# Row count for each value of `Base`, split into one bar per day of the week.
ggplot(data = dataset, mapping = aes(x = Base, fill = dayofweek)) +
  geom_bar(position = "dodge") +
  ggtitle("Trips by Bases and Day of Week") +
  scale_fill_manual(values = colors)

# Daily Data Analysis

## Task 9: Get the Hourly Trips

In [None]:
# Number of trips per hour of the day, shown as an interactive table.
hourly_data <- dataset %>%
  group_by(hour) %>%
  dplyr::summarize(trip_count = n())
datatable(hourly_data)

In [None]:
# Bar chart of the trip totals per hour of the day.
ggplot(data = hourly_data, mapping = aes(x = hour, y = trip_count)) +
  geom_bar(stat = "identity") +
  ggtitle("Trips by Hour")

## Task 10: Get the Hourly Trips with Months and Days of Week

In [None]:
# Trips per hour of the day, broken down by month.
hourly_monthly_data <- dataset %>%
  group_by(hour, month) %>%
  dplyr::summarize(trip_count = n())

# Trips per hour of the day, broken down by day of the week. The task and the
# plot title below are about week days, so the grouping uses `dayofweek`
# rather than `day` (the day of the month).
hourly_day_data <- dataset %>%
  group_by(hour, dayofweek) %>%
  dplyr::summarize(trip_count = n())

In [None]:
# Stacked bars: hourly trip counts, coloured by month.
ggplot(
  data = hourly_monthly_data,
  mapping = aes(x = hour, y = trip_count, fill = month)
) +
  geom_bar(stat = "identity") +
  ggtitle("Trips by Hour and Month")

# Stacked bars: hourly trip counts, coloured by day of the week.
ggplot(
  data = hourly_day_data,
  mapping = aes(x = hour, y = trip_count, fill = dayofweek)
) +
  geom_bar(stat = "identity") +
  ggtitle("Trips by Hour and Week Day")

## Task 11: Get the Trips on Each Day of Month

In [None]:
# Number of trips per day of the month, shown as an interactive table.
day_data <- dataset %>%
  group_by(day) %>%
  dplyr::summarize(trip_count = n())
datatable(day_data)

In [None]:
# Bar chart of the trip totals per day of the month.
ggplot(data = day_data, mapping = aes(x = day, y = trip_count)) +
  geom_bar(stat = "identity") +
  ggtitle("Trips by Day")

## Task 12: Plot the Trips on Each Day with Months

In [None]:
# Number of trips for every month / day-of-month combination.
month_day_data <- dataset %>%
  group_by(month, day) %>%
  dplyr::summarize(trip_count = n())
datatable(month_day_data)

In [None]:
# Stacked bars: trips per day of the month, coloured by month.
ggplot(
  data = month_day_data,
  mapping = aes(x = day, y = trip_count, fill = month)
) +
  geom_bar(stat = "identity") +
  ggtitle("Trips by Day and Month")

# Data Plotting

## Task 13: Plot the Heatmap

In [None]:
# Trip counts per (day of month, hour) pair, used by the first heat map.
day_and_hour <- dataset %>%
  group_by(day, hour) %>%
  dplyr::summarize(trip_count = n())

# Trip counts per (Base, month) pair, used by the third heat map.
month_base <- dataset %>%
  group_by(Base, month) %>%
  dplyr::summarize(Total = n())

In [None]:
# Heat map: day of month (x) against hour of day (y).
g1 <- ggplot(
  data = day_and_hour,
  mapping = aes(x = day, y = hour, fill = trip_count)
) +
  geom_tile(color = "white") +
  ggtitle("Heat Map by Hour and Day")

# Heat map: day of month (x) against month (y).
g2 <- ggplot(
  data = month_day_data,
  mapping = aes(x = day, y = month, fill = trip_count)
) +
  geom_tile(color = "white") +
  ggtitle("Heat Map by Month and Day")

# Heat map: Base (x) against month (y).
g3 <- ggplot(
  data = month_base,
  mapping = aes(x = Base, y = month, fill = Total)
) +
  geom_tile(color = "white") +
  ggtitle("Heat Map by Month and Bases")

In [None]:
# Combine the three heat maps with the `/` operator (patchwork is loaded above).
g1 / g2 / g3

## Task 14: Visualize the Rides in New York

In [None]:
# Smallest and largest values of the `Lat` and `Lon` columns; these become the
# axis limits of the map plot.
lat_bounds <- range(dataset$Lat)
lon_bounds <- range(dataset$Lon)

min_lat <- lat_bounds[1]
max_lat <- lat_bounds[2]
min_long <- lon_bounds[1]
max_long <- lon_bounds[2]

In [None]:
# Scatter plot of every row at its Lon/Lat position, coloured by `Base`, with
# the axes limited to the observed coordinate range.
ggplot(data = dataset, mapping = aes(x = Lon, y = Lat, color = Base)) +
  geom_point(size = 1) +
  scale_x_continuous(limits = c(min_long, max_long)) +
  scale_y_continuous(limits = c(min_lat, max_lat)) +
  theme_map() +
  ggtitle("NYC MAP BASED ON UBER RIDES DURING 2014 (APR-SEP) by BASE")