<a href="https://colab.research.google.com/github/alhasanmolla/R-programming/blob/main/R_programming.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Bitcoin (BTC) price prediction**

In [None]:
# Install the packages used by the BTC price-prediction cells, one at a time
# and in the same order as before.
btc_packages <- c("quantmod", "caret", "rpart", "rpart.plot", "Metrics", "zoo")
for (pkg in btc_packages) {
  install.packages(pkg)
}

In [None]:
# Load necessary libraries
library(quantmod)
library(caret)
library(rpart)
library(rpart.plot)
library(Metrics)
library(zoo)  # For rollmean function

# Step 1: Load and prepare the BTC data
getSymbols("BTC-USD", src = "yahoo", from = "2020-01-01", to = Sys.Date())
btc_data <- na.omit(Cl(`BTC-USD`))  # Using closing prices and removing NAs

# Convert to data frame and create features.
# Every feature for day t is built only from closes up to day t - 1, so the
# model never sees the value it is asked to predict. (stats::lag() on a plain
# numeric vector only adjusts its time attribute and leaves the values in
# place, so it cannot be used to shift the column.)
btc_data <- data.frame(Date = index(btc_data), Close = as.numeric(btc_data$`BTC-USD.Close`))
shift_one_day <- function(x) c(NA, head(x, -1))
btc_data$Lag1 <- shift_one_day(btc_data$Close)
# The rolling means are computed on the complete series first and shifted
# afterwards, because rollmean() does not handle NAs in its input.
btc_data$Lag5 <- shift_one_day(rollmean(btc_data$Close, k = 5, fill = NA, align = "right"))
btc_data$Lag10 <- shift_one_day(rollmean(btc_data$Close, k = 10, fill = NA, align = "right"))
btc_data <- na.omit(btc_data)  # Remove the warm-up rows that have no full window

# Step 2: Train-Test Split
# NOTE(review): this is a random split; for time-series data a chronological
# hold-out is usually a more honest estimate of forecast error.
set.seed(123)  # For reproducibility
trainIndex <- createDataPartition(btc_data$Close, p = 0.8, list = FALSE)
train_data <- btc_data[trainIndex, ]
test_data <- btc_data[-trainIndex, ]

# Step 3: Train the Decision Tree Model
formula <- Close ~ Lag1 + Lag5 + Lag10
tree_model <- rpart(formula, data = train_data, method = "anova")
rpart.plot(tree_model)

# Step 4: Make Predictions and Evaluate
predictions <- predict(tree_model, newdata = test_data)
rmse_val <- rmse(test_data$Close, predictions)
print(paste("RMSE: ", rmse_val))

# Step 5: Future 7 Days Prediction
# Roll the forecast forward one day at a time. `close_history` holds the
# observed closes followed by the predictions made so far, so the lag features
# for each new day are recomputed exactly as they were for training.
horizon <- 7
close_history <- c(btc_data$Close, numeric(horizon))
n_known <- nrow(btc_data)
future_predictions <- numeric(horizon)  # Store future predictions

for (i in seq_len(horizon)) {
  known <- close_history[seq_len(n_known + i - 1)]
  future_data <- data.frame(
    Lag1 = tail(known, 1),
    Lag5 = mean(tail(known, 5)),
    Lag10 = mean(tail(known, 10))
  )
  future_pred <- unname(predict(tree_model, newdata = future_data))
  future_predictions[i] <- future_pred
  close_history[n_known + i] <- future_pred
}

# Step 6: Visualize Results
future_dates <- seq(tail(btc_data$Date, 1) + 1, by = "day", length.out = horizon)

# Extend both axes so the forecast segment, which lies after the last observed
# date, is inside the plotting region.
plot(btc_data$Date, btc_data$Close, type = "l", col = "blue", lwd = 2,
     xlim = range(c(btc_data$Date, future_dates)),
     ylim = range(c(btc_data$Close, future_predictions)),
     main = "BTC Price Prediction - Next 7 Days",
     xlab = "Date", ylab = "Price (USD)")
lines(future_dates, future_predictions, col = "red", lwd = 2, lty = 2)
legend("topright", legend = c("Actual", "Future Prediction"), col = c("blue", "red"), lty = 1:2, lwd = 2)


In [None]:
# Define parameters and assumptions
# Discounted cash flow (DCF) valuation. All monetary amounts are in millions.
initial_fcf <- 6000  # Starting Free Cash Flow (e.g., $6 billion) - adjust based on actual data
growth_rate <- 0.15  # Estimated FCF growth rate of 15% per year
wacc <- 0.09         # Discount rate (WACC) at 9%
terminal_growth <- 0.03  # Terminal growth rate at 3%
projection_years <- 10  # Number of years for explicit FCF projections

# The Gordon Growth terminal value divides by (wacc - terminal_growth), so the
# discount rate must exceed the terminal growth rate.
stopifnot(projection_years >= 1, wacc > terminal_growth)

# Step 1: Project Future Free Cash Flows (compound growth, years 1..N)
years <- seq_len(projection_years)
future_fcfs <- initial_fcf * (1 + growth_rate)^years

# Step 2: Calculate Terminal Value (Using Gordon Growth Model)
terminal_value <- future_fcfs[projection_years] * (1 + terminal_growth) / (wacc - terminal_growth)

# Step 3: Discount Cash Flows to Present Value
discounted_fcfs <- future_fcfs / (1 + wacc)^years
discounted_terminal_value <- terminal_value / (1 + wacc)^projection_years

# Step 4: Calculate Enterprise Value
enterprise_value <- sum(discounted_fcfs) + discounted_terminal_value

# Step 5: Adjust for Net Debt to Get Equity Value
# Equity value = enterprise value - net debt. A negative net debt (net cash)
# therefore increases equity value.
net_debt <- -5000  # Assume Tesla has net cash (negative net debt)
equity_value <- enterprise_value - net_debt

# Step 6: Calculate Fair Value per Share
shares_outstanding <- 3167  # Approximate outstanding shares (in millions)
fair_value_per_share <- equity_value / shares_outstanding

# Output results
cat("Projected Future FCFs (in millions):", future_fcfs, "\n")
cat("Discounted FCFs (in millions):", discounted_fcfs, "\n")
cat("Enterprise Value (in millions):", enterprise_value, "\n")
cat("Equity Value (in millions):", equity_value, "\n")
cat("Fair Value per Share:", fair_value_per_share, "\n")


In [None]:
# Load necessary libraries
library(quantmod)
library(caret)
library(rpart)
library(rpart.plot)
library(Metrics)
library(zoo)  # For rollmean function

# Step 1: Load and prepare the BTC data
getSymbols("BTC-USD", src = "yahoo", from = "2020-01-01", to = Sys.Date())
btc_data <- na.omit(Cl(`BTC-USD`))  # Using closing prices and removing NAs

# Convert to data frame and create features.
# Each feature for day t uses closes up to day t - 1 only: the rolling means
# are shifted by one day as well, otherwise they would contain the very Close
# value the model is trained to predict.
btc_data <- data.frame(Date = index(btc_data), Close = as.numeric(btc_data$`BTC-USD.Close`))
btc_data$Lag1 <- dplyr::lag(btc_data$Close, 1)   # Use positive 1 for lagging one day back
btc_data$Lag5 <- dplyr::lag(rollmean(btc_data$Close, k = 5, fill = NA, align = "right"), 1)
btc_data$Lag10 <- dplyr::lag(rollmean(btc_data$Close, k = 10, fill = NA, align = "right"), 1)
btc_data <- na.omit(btc_data)  # Remove the warm-up rows that have no full window

# Step 2: Train-Test Split
# NOTE(review): this is a random split; for time-series data a chronological
# hold-out is usually a more honest estimate of forecast error.
set.seed(123)  # For reproducibility
trainIndex <- createDataPartition(btc_data$Close, p = 0.8, list = FALSE)
train_data <- btc_data[trainIndex, ]
test_data <- btc_data[-trainIndex, ]

# Step 3: Train the Decision Tree Model
formula <- Close ~ Lag1 + Lag5 + Lag10
tree_model <- rpart(formula, data = train_data, method = "anova")
rpart.plot(tree_model)

# Step 4: Make Predictions and Evaluate
predictions <- predict(tree_model, newdata = test_data)
rmse_val <- rmse(test_data$Close, predictions)
print(paste("RMSE: ", rmse_val))

# Step 5: Future 7 Days Prediction
# Roll the forecast forward one day at a time. `close_history` holds the
# observed closes followed by the predictions made so far, so the lag features
# for each new day are recomputed exactly as they were for training.
horizon <- 7
close_history <- c(btc_data$Close, numeric(horizon))
n_known <- nrow(btc_data)
future_predictions <- numeric(horizon)  # Store future predictions

for (i in seq_len(horizon)) {
  known <- close_history[seq_len(n_known + i - 1)]
  future_data <- data.frame(
    Lag1 = tail(known, 1),
    Lag5 = mean(tail(known, 5)),
    Lag10 = mean(tail(known, 10))
  )
  future_pred <- unname(predict(tree_model, newdata = future_data))
  future_predictions[i] <- future_pred
  close_history[n_known + i] <- future_pred
}

# Step 6: Visualize Results
future_dates <- seq(tail(btc_data$Date, 1) + 1, by = "day", length.out = horizon)

# Extend both axes so the forecast segment, which lies after the last observed
# date, is inside the plotting region.
plot(btc_data$Date, btc_data$Close, type = "l", col = "blue", lwd = 2,
     xlim = range(c(btc_data$Date, future_dates)),
     ylim = range(c(btc_data$Close, future_predictions)),
     main = "BTC Price Prediction - Next 7 Days",
     xlab = "Date", ylab = "Price (USD)")
lines(future_dates, future_predictions, col = "red", lwd = 2, lty = 2)
legend("topright", legend = c("Actual", "Future Prediction"), col = c("blue", "red"), lty = 1:2, lwd = 2)


In [None]:
# How to activate repositories
# How to install packages
install.packages("tidyverse", dependencies = TRUE)

# Activate the packages
library(readxl)
library(tidyverse)




In [None]:
x <- 3

In [None]:
x

In [None]:
x <- 10
if (x > 5) {
  print("x is greater than 5")
} else {
  print("x is less than or equal to 5")
}

In [None]:
for (i in 1:5) {
  print(i)
}

In [None]:
i <- 1
while (i <= 5) {
  print(i)
  i <- i + 1
}

In [None]:
x <- 5

In [None]:
# Square a numeric value; vectorised, so a vector input is squared elementwise.
square <- function(x) {
  x^2
}

In [None]:
# Square `x`, then add `add_num` to the result.
square_and_add <- function(x, add_num) {
  squared <- x^2
  squared + add_num
}

In [None]:
result <- square_and_add(5, 3)
print(result)  # Output: 28

In [None]:
square

In [None]:
x <- NULL  # NULL is the empty object (length 0); it is not a missing value
y <- NA     # NA is R's missing-value marker
z <- Inf    # Represents infinity
w <- NaN    # Represents a Not-a-Number value

In [None]:
x

In [None]:
my_vector <- c(1, 2, 3, 4, 5)

In [None]:
my_vector

In [None]:
my_list <- list("apple", 10, TRUE)

In [None]:
print(my_list)

In [None]:
my_matrix <- matrix(c(1, 2, 3, 4,5,6,7,8,9,10), nrow = 2, ncol = 5)

In [None]:
my_matrix

In [None]:
my_df <- data.frame(name = c("Alice", "Bob"), age = c(25, 30))

In [None]:
my_df

In [None]:
my_factor <- factor(c("male", "female", "male"))

In [None]:
my_factor

In [None]:
# Read the CSV file
data <- read.csv("/content/sample_data/TSLA_cash_flow.csv", header = TRUE)

# Transpose the data
transposed_data <- t(data)

# Convert to a data frame
df_transposed <- as.data.frame(transposed_data)

# View the transposed data frame
print(df_transposed)

In [None]:
df_transposed

In [None]:
write.csv(df_transposed, "output.csv")

In [None]:
# Install the package if not already installed
install.packages("readxl")

# Load the library
library(readxl)

# Read the XLSX file
datas <- read_excel("/content/output.xlsx")

# Print the data
print(datas)

In [None]:
datas

In [None]:
#######################################################################################################

In [None]:
# Read the CSV file
df <- read.csv("/content/sample_data/salaries.csv", header = TRUE)

In [None]:

df

In [None]:
# Load necessary packages
install.packages("tidyverse", dependencies = TRUE)
library(readxl)
library(tidyverse)

# Load your dataset
df <- read.csv("/content/sample_data/salaries.csv")

# Display the first few rows and column names to verify
head(df)
colnames(df)



In [None]:
# Use the correct column name for color (replace 'department' with the actual column name if needed)
ggplot(df, aes(x = salary, y = job_title, color = experience_level)) +
  geom_point() +
  labs(title = "Salaries by Job Title", x = "salary", y = "job_title") +
  theme_minimal()


In [None]:
# Load necessary packages
install.packages("tidyverse", dependencies = TRUE)
library(readxl)
library(tidyverse)

# Load your dataset
df <- read.csv("/content/sample_data/salaries.csv")

# Display the first few rows and column names to verify
#head(df)
#colnames(df)

# Check the structure of the data frame
#str(df)

# Scatter plot of salary against job title. experience_level is mapped to
# colour so that the legend title and the Brewer palette below actually apply
# (without a colour mapping both are silently ignored).
ggplot(df, aes(x = salary, y = job_title, color = experience_level)) +
  geom_point(size = 3, alpha = 0.7) +  # Add points with specified size and transparency
  labs(title = "Salaries by Job Title and Experience Level",
       x = "salary",
       y = "job_title",
       color = "experience_level") +  # Add labels for axes and legend
  theme_minimal() +  # Apply a minimal theme for a clean look
  scale_color_brewer(palette = "Set1")  # Qualitative palette for the colour legend



In [None]:
install.packages("ggplot2")
library(ggplot2)

data()
data("diamonds")

d_sample <- diamonds[seq(1, 53000, by=100),]
nrow(d_sample)

p <- ggplot(data=d_sample, aes(x=color, y=price, fill=color)) + geom_violin()

p + geom_boxplot(width=0.1) + geom_jitter(size=0.5)

In [None]:
d_sample

In [None]:
# Packages: ggplot2 is installed on every run of this cell, then attached.
install.packages("ggplot2")
library(ggplot2)

# Salary data
df <- read.csv("/content/sample_data/salaries.csv")

# Optional checks of the loaded data
#head(df)
#colnames(df)

# Optional down-sampling of the rows
# df <- df[seq(1, 130, by=11),]
# nrow(df)

# Violin + box + jitter layers showing the salary distribution for each
# experience level.
p <- ggplot(
  data = df,
  aes(x = experience_level, y = salary_in_usd, fill = experience_level)
) +
  geom_violin(trim = FALSE) +
  geom_boxplot(width = 0.1, position = position_dodge(0.9)) +
  geom_jitter(size = 0.5, width = 0.2, height = 0) +
  labs(
    title = "Salary Distribution by Experience Level",
    x = "Experience Level",
    y = "Salary in USD"
  ) +
  theme_minimal()

# Render the plot
print(p)


In [None]:
df

In [None]:
qplot(experience_level, salary_in_usd, data = df, colour = company_location)

In [None]:
# Three histograms of salary_in_usd with progressively finer bins.
# NOTE(review): binwidths of 1 / 0.1 / 0.01 and xlim = c(0, 3) look tuned for
# a small-valued variable; if salary_in_usd is in whole dollars these limits
# would exclude nearly every row - confirm against summary(df$salary_in_usd).
qplot(salary_in_usd, data = df, geom = "histogram", binwidth = 1,
xlim = c(0,3))
qplot(salary_in_usd, data = df, geom = "histogram", binwidth = 0.1,
xlim = c(0,3))
qplot(salary_in_usd, data = df, geom = "histogram", binwidth = 0.01,
xlim = c(0,3))


In [None]:
# Load necessary packages
install.packages("ggplot2")  # Comment this line out once ggplot2 is installed
library(ggplot2)

# Load your dataset
df <- read.csv("/content/sample_data/salaries.csv")  # Update with actual path

# Display the salary range to decide on appropriate binwidth and x-axis limits
summary(df$salary_in_usd)

# Build one salary histogram. qplot() is deprecated in current ggplot2, so the
# plot is composed from ggplot() + geom_histogram() instead.
#   binwidth: width of each bin in USD
#   title:    plot title
salary_histogram <- function(binwidth, title) {
  ggplot(df, aes(x = salary_in_usd)) +
    geom_histogram(binwidth = binwidth) +
    xlim(0, 800000) +
    labs(title = title, x = "Salary in USD", y = "Count")
}

# Adjusted binwidths for the salary range
salary_histogram(10000, "Histogram of Salaries (Binwidth = 10,000)")
salary_histogram(5000, "Histogram of Salaries (Binwidth = 5,000)")
salary_histogram(2000, "Histogram of Salaries (Binwidth = 2,000)")


In [None]:
# Load necessary packages
install.packages("ggplot2")  # Comment this line out once ggplot2 is installed
library(ggplot2)

# Load your dataset
df <- read.csv("/content/sample_data/salaries.csv")  # Update with the actual path

# Display the salary range
salary_range <- range(df$salary_in_usd, na.rm = TRUE)
summary(df$salary_in_usd)

# Histogram with binwidth of 10000 and annotated salary range.
# qplot() is deprecated in current ggplot2, so the plot is composed from
# ggplot() + geom_histogram() instead.
p <- ggplot(df, aes(x = salary_in_usd)) +
  geom_histogram(binwidth = 10000) +
  xlim(0, 800000) +
  labs(title = "Histogram of Salaries (Binwidth = 10,000)",
       x = "Salary in USD", y = "Count") +
  annotate("text", x = 600000, y = 30,
           label = paste("Salary Range:", salary_range[1], "to", salary_range[2]),
           color = "blue", size = 4, hjust = 0)

print(p)


In [None]:
p <- ggplot(df, aes(x = salary_in_usd, y = employment_type))
p + geom_point()

In [None]:
boysbox <- ggplot(df, aes(employment_type , salary_in_usd)) + geom_boxplot()

In [None]:
boysbox

In [None]:
boysbox + geom_line(aes(group = work_year), colour = "#3366FF")

In [None]:
# Three renderings of the same stat_bin() computation: as an area, as points
# sized by density, and as tiles filled by count.
# NOTE(review): `..count..` / `..density..` is the older notation for computed
# variables; newer ggplot2 spells this after_stat(count) / after_stat(density).
# NOTE(review): stat_bin() bins a continuous x; employment_type is used as a
# category elsewhere in this file, so confirm this cell runs on this data.
d <- ggplot(df, aes(employment_type)) + xlim(0, 3)
d + stat_bin(aes(ymax = ..count..), binwidth = 0.1, geom = "area")
d + stat_bin(
aes(size = ..density..), binwidth = 0.1,
geom = "point", position="identity")
d + stat_bin(
aes(y = 1, fill = ..count..), binwidth = 0.1,
geom = "tile", position="identity"
)


In [None]:
############################################################################################

In [None]:
# Sample data with large financial values
datas <- data.frame(
  Company = c('Company A', 'Company B', 'Company C'),
  Revenue = c(123456789, 987654321.34, 56473829100000000),
  Profit = c(23456789, 876543211500000, 34738291.678)
)

# Display the data frame
datas

In [None]:
# Install scales package if not already installed
# install.packages("scales")

library(scales)

# Create a copy of the dataframe (the numeric original stays in `datas`)
df_solution <- datas

# Format the Revenue and Profit columns with commas.
# comma() returns formatted text, so these two columns are display strings
# from here on and are no longer suitable for arithmetic or numeric axes.
df_solution$Revenue <- comma(df_solution$Revenue, accuracy = 1)
df_solution$Profit <- comma(df_solution$Profit, accuracy = 1)

# View the formatted dataframe
df_solution


In [None]:
install.packages("ggplot2")


In [None]:
# Create the scatter plot with enhancements.
# The numeric data frame `datas` is plotted rather than `df_solution`, whose
# Revenue/Profit columns were converted to comma-formatted text; the comma
# formatting is applied to the axis labels instead.
p <- ggplot(data = datas, aes(x = Revenue, y = Profit)) +
  geom_point(aes(color = Company), size = 3, alpha = 0.7) +  # Add scatter points
  # Colour is mapped on the points only, so the trend line is fitted once
  # across all companies instead of once per (single-point) company.
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE,
              linetype = "dashed", color = "black") +
  scale_x_continuous(labels = scales::comma) +
  scale_y_continuous(labels = scales::comma) +
  labs(title = "Revenue vs Profit by Company",
       x = "Revenue",
       y = "Profit",
       color = "Company") +  # Add labels
  theme_minimal()  # Use a minimal theme

# Display the plot
print(p)



In [None]:
install.packages("openxlsx")
library(openxlsx)

df <- read.xlsx("/content/output.xlsx")  # Update with actual path & xlsx extension if file is xlsx

In [None]:
df

In [None]:
# Create a copy of the dataframe
df_solution <- df

# Specify columns to format
columns_to_format <- c(
  "operatingCashflow", "paymentsForOperatingActivities",
  "proceedsFromOperatingActivities", "netIncome"
)

# Convert each specified column to numeric, if not already, and then apply comma formatting
for (col in columns_to_format) {
  if (!is.numeric(df_solution[[col]])) {
    df_solution[[col]] <- as.numeric(df_solution[[col]])
  }
  df_solution[[col]] <- comma(df_solution[[col]], accuracy = 1)
}

# View the formatted dataframe
df_solution

In [None]:
# Scatter plot of operating cash flow against net income.
# The values are taken from `df` (the data as read from the workbook) and
# coerced to numeric, because the same columns in `df_solution` were turned
# into comma-formatted text and cannot be placed on a numeric axis or fitted.
plot_data <- df
plot_data$netIncome <- as.numeric(plot_data$netIncome)
plot_data$operatingCashflow <- as.numeric(plot_data$operatingCashflow)

p <- ggplot(data = plot_data, aes(x = netIncome, y = operatingCashflow)) +
  geom_point(aes(color = fiscalDateEnding), size = 3, alpha = 0.7) +  # Add scatter points
  # Colour is mapped on the points only so a single trend line is fitted.
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE,
              linetype = "dashed", color = "black") +
  scale_x_continuous(labels = scales::comma) +
  scale_y_continuous(labels = scales::comma) +
  labs(title = "Operating Cash Flow vs Net Income",
       x = "Net Income",
       y = "Operating Cash Flow",
       color = "Fiscal Date Ending") +  # Labels match the plotted columns
  theme_minimal()  # Use a minimal theme

# Display the plot
print(p)

In [None]:
#####################################################################################################

In [None]:
# Load ggplot2 library
library(ggplot2)

# Sample data for demonstration
df_solution <- data.frame(
  fiscalDateEnding = as.Date(c("2024-01-01", "2024-04-01", "2024-07-01")),
  netIncome = c(3456789, 12345678, 9876543)
)

# Plot net income over time with a log10-scaled y axis (the x axis is a plain
# date axis) and a linear regression line fitted by geom_smooth()
p <- ggplot(df_solution, aes(x = fiscalDateEnding, y = netIncome)) +
  geom_point() +
  geom_smooth(method = "lm") +
  scale_y_log10() +
  labs(
    title = "Log-Transformed Plot of Fiscal Date vs. Net Income with Linear Trend",
    x = "Fiscal Date Ending",
    y = "Net Income (Log Scale)"
  )

# Display the plot
print(p)


In [None]:
# ggplot2 provides every layer and coordinate system used in this cell.
library(ggplot2)

# 1. Stacked bar: one bar (constant x) split by cylinder count.
stacked_bar <- ggplot(mtcars, aes(x = factor(1), fill = factor(cyl))) +
  geom_bar(width = 1) +
  labs(title = "Stacked Bar Chart", fill = "Cylinder Count") +
  xlab("") +
  ylab("Count")
print(stacked_bar)

# 2. Pie: the same bar in polar coordinates with the count mapped to the angle.
pie_chart <- stacked_bar +
  coord_polar(theta = "y") +
  labs(title = "Pie Chart of Cylinder Count in mtcars")
print(pie_chart)

# 3. Bullseye: polar coordinates with the default angle mapping, which turns
#    the stacked segments into concentric rings.
bullseye_chart <- stacked_bar +
  coord_polar() +
  labs(title = "Bullseye Chart of Cylinder Count")
print(bullseye_chart)


In [None]:
# Load the iris dataset
data("iris")

# View the first few rows of the dataset
head(iris)

# Get a summary of the dataset
summary(iris)

# Load ggplot2 library for plotting
library(ggplot2)

# Scatter plot of Sepal.Length vs Sepal.Width, colored by Species
plot_iris <- ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width, color = Species)) +
  geom_point(size = 3) +
  labs(
    title = "Sepal Dimensions in Iris Dataset",
    x = "Sepal Length",
    y = "Sepal Width"
  ) +
  theme_minimal()  # Use a minimal theme

# Display the plot
print(plot_iris)



In [None]:
View(iris)

In [None]:
ggplot(iris , mapping = aes(x = Sepal.Length , y = Sepal.Width)) + geom_point()

In [None]:
ggplot(iris , mapping = aes(x = Sepal.Length , y = Sepal.Width)) + geom_density_2d()

In [None]:
ggplot(iris , mapping = aes(x = Sepal.Length , y = Sepal.Width)) + geom_bin2d()

In [None]:
# Load ggplot2 library
library(ggplot2)

# Create a 2D bin plot for Sepal.Length vs Sepal.Width
ggplot(data = iris, mapping = aes(x = Sepal.Length, y = Sepal.Width)) +
  geom_bin_2d(bins = 15) +  # Specify the number of bins
  scale_fill_gradient(low = "lightblue", high = "darkblue") +  # Customize color gradient
  labs(
    title = "2D Binned Heatmap of Sepal Dimensions",
    x = "Sepal Length",
    y = "Sepal Width",
    fill = "Count"  # Label for the color legend
  ) +
  theme_minimal()  # Apply a minimal theme


In [None]:
# Load necessary libraries
library(ggplot2)
library(MASS)  # For density estimation

# Calculate a 2D density estimate based on Sepal.Length and Sepal.Width
density_data <- kde2d(iris$Sepal.Length, iris$Sepal.Width, n = 100)

# Convert density data to a data frame for ggplot2
density_df <- expand.grid(x = density_data$x, y = density_data$y)
density_df$z <- as.vector(density_data$z)

# Plot the filled contour plot using density estimates
ggplot(data = density_df, aes(x = x, y = y, z = z)) +
  geom_contour_filled() +
  scale_fill_viridis_d(option = "plasma") +
  labs(
    title = "Filled Contour Plot of Sepal Dimensions Density",
    x = "Sepal Length",
    y = "Sepal Width",
    fill = "Density"
  ) +
  theme_minimal()


In [None]:
# Load necessary libraries
library(ggplot2)
library(MASS)  # For kernel density estimation

# Calculate a 2D density estimate for Sepal.Length and Sepal.Width
density_data <- kde2d(iris$Sepal.Length, iris$Sepal.Width, n = 100)

# Convert the density data to a data frame
density_df <- expand.grid(x = density_data$x, y = density_data$y)
density_df$z <- as.vector(density_data$z)

# Create the contour plot using the density estimate.
# geom_contour() draws lines and has no fill, so a fill scale would have no
# effect; the line colour is mapped to the contour level instead so that the
# viridis scale and its legend are actually used.
ggplot(data = density_df, aes(x = x, y = y, z = z)) +
  geom_contour(aes(colour = after_stat(level))) +
  scale_colour_viridis_c(option = "plasma") +  # Colour gradient for the contour lines
  labs(
    title = "Density Contour Plot of Sepal Dimensions",
    x = "Sepal Length",
    y = "Sepal Width",
    colour = "Density"  # Label for the colour legend
  ) +
  theme_minimal()


In [None]:
# Load ggplot2 library
library(ggplot2)

# Create a scatterplot for Sepal.Length vs Sepal.Width
ggplot(iris, mapping = aes(x = Sepal.Length, y = Sepal.Width)) +
  geom_point(color = "blue", size = 2) +  # Scatter plot points
  labs(
    title = "Scatterplot of Sepal Dimensions",
    x = "Sepal Length",
    y = "Sepal Width"
  ) +
  theme_minimal()  # Apply a minimal theme


In [None]:
# Install quantreg (used by geom_quantile) only when it is missing.
# requireNamespace() checks availability without attaching the package;
# library() below does the attaching and stops with an error if it still
# cannot be loaded.
if (!requireNamespace("quantreg", quietly = TRUE)) {
  install.packages("quantreg")
}

# Load ggplot2 and quantreg libraries
library(ggplot2)
library(quantreg)

# Create a scatterplot with quantile regression lines
ggplot(iris, mapping = aes(x = Sepal.Length, y = Sepal.Width)) +
  geom_point(color = "blue", size = 2) +  # Scatter plot points
  geom_quantile(quantiles = c(0.25, 0.5, 0.75), color = "red", linewidth = 1) +  # Add quantile regression lines
  labs(
    title = "Scatterplot of Sepal Dimensions with Quantile Regression Lines",
    x = "Sepal Length",
    y = "Sepal Width"
  ) +
  theme_minimal()  # Apply a minimal theme


In [None]:
# Install quantreg (used by geom_quantile) only when it is missing.
# requireNamespace() checks availability without attaching the package;
# library() below does the attaching and stops with an error if it still
# cannot be loaded.
if (!requireNamespace("quantreg", quietly = TRUE)) {
  install.packages("quantreg")
}

# Load ggplot2 and quantreg libraries
library(ggplot2)
library(quantreg)

# Create a scatterplot with quantile regression lines, colored by species
ggplot(iris, mapping = aes(x = Sepal.Length, y = Sepal.Width, col = Species)) +
  geom_point(size = 2) +  # Scatter plot points colored by species
  geom_quantile(quantiles = c(0.25, 0.5, 0.75), color = "red", linewidth = 1) +  # Add quantile regression lines
  labs(
    title = "Scatterplot of Sepal Dimensions with Quantile Regression Lines",
    x = "Sepal Length",
    y = "Sepal Width",
    color = "Species"  # Legend label for species
  ) +
  theme_minimal()  # Apply a minimal theme


In [None]:
ggplot(iris , mapping = aes(x = Sepal.Length , y = Sepal.Width)) + geom_jitter()

In [None]:
# Load ggplot2 library
library(ggplot2)

# Create a Q-Q plot for Sepal.Length to check normality
ggplot(iris, mapping = aes(sample = Sepal.Length)) +
  geom_qq_line(color = "blue") +  # Add Q-Q line
  geom_qq(color = "red") +        # Add Q-Q points
  labs(
    title = "Q-Q Plot of Sepal Length",
    x = "Theoretical Quantiles",
    y = "Sample Quantiles"
  ) +
  theme_minimal()  # Apply a minimal theme


In [None]:
# Install quantreg (used by geom_quantile) only when it is missing.
# requireNamespace() checks availability without attaching the package;
# library() below does the attaching and stops with an error if it still
# cannot be loaded.
if (!requireNamespace("quantreg", quietly = TRUE)) {
  install.packages("quantreg")
}

# Load ggplot2 and quantreg libraries
library(ggplot2)
library(quantreg)

# Create a scatterplot with quantile regression lines; species is encoded by
# point shape
ggplot(iris, mapping = aes(x = Sepal.Length, y = Sepal.Width, shape = Species)) +
  geom_point(size = 2) +  # Scatter plot points, one shape per species
  geom_quantile(quantiles = c(0.25, 0.5, 0.75), color = "red", linewidth = 1) +  # Add quantile regression lines
  labs(
    title = "Scatterplot of Sepal Dimensions with Quantile Regression Lines",
    x = "Sepal Length",
    y = "Sepal Width",
    shape = "Species"  # Legend label for the mapped aesthetic (shape)
  ) +
  theme_minimal()  # Apply a minimal theme


In [None]:
# Install quantreg (used by geom_quantile) only when it is missing.
# requireNamespace() checks availability without attaching the package;
# library() below does the attaching and stops with an error if it still
# cannot be loaded.
if (!requireNamespace("quantreg", quietly = TRUE)) {
  install.packages("quantreg")
}

# Load ggplot2 and quantreg libraries
library(ggplot2)
library(quantreg)

# Create a scatterplot with quantile regression lines; species is encoded by
# point size
ggplot(iris, mapping = aes(x = Sepal.Length, y = Sepal.Width, size = Species)) +
  # No constant `size` here: a fixed size on the layer would override the
  # size = Species mapping and make it invisible.
  geom_point() +
  geom_quantile(quantiles = c(0.25, 0.5, 0.75), color = "red", linewidth = 1) +  # Add quantile regression lines
  labs(
    title = "Scatterplot of Sepal Dimensions with Quantile Regression Lines",
    x = "Sepal Length",
    y = "Sepal Width",
    size = "Species"  # Legend label for the mapped aesthetic (size)
  ) +
  theme_minimal()  # Apply a minimal theme


In [None]:
# Install quantreg (used by geom_quantile) only when it is missing.
# requireNamespace() checks availability without attaching the package;
# library() below does the attaching and stops with an error if it still
# cannot be loaded.
if (!requireNamespace("quantreg", quietly = TRUE)) {
  install.packages("quantreg")
}

# Load ggplot2 and quantreg libraries
library(ggplot2)
library(quantreg)

# Create a scatterplot with quantile regression lines; species is encoded by
# both point shape and colour
ggplot(iris, mapping = aes(x = Sepal.Length, y = Sepal.Width, shape = Species, col = Species)) +
  geom_point(size = 2) +  # Scatter plot points, shape and colour by species
  geom_quantile(quantiles = c(0.25, 0.5, 0.75), color = "red", linewidth = 1) +  # Add quantile regression lines
  labs(
    title = "Scatterplot of Sepal Dimensions with Quantile Regression Lines",
    x = "Sepal Length",
    y = "Sepal Width",
    color = "Species"  # Legend label for species
  ) +
  theme_minimal()  # Apply a minimal theme


In [None]:
# Install quantreg (used by geom_quantile) only when it is missing.
# requireNamespace() checks availability without attaching the package;
# library() below does the attaching and stops with an error if it still
# cannot be loaded.
if (!requireNamespace("quantreg", quietly = TRUE)) {
  install.packages("quantreg")
}

# Load ggplot2 and quantreg libraries
library(ggplot2)
library(quantreg)

# Build (but do not yet draw) the scatterplot with quantile regression lines;
# species is encoded by both point shape and colour. The plot object is kept
# in `p1` so it can be printed and saved later.
p1 <- ggplot(iris, mapping = aes(x = Sepal.Length, y = Sepal.Width, shape = Species, col = Species)) +
  geom_point(size = 5) +  # Scatter plot points, shape and colour by species
  geom_quantile(quantiles = c(0.25, 0.5, 0.75), color = "red", linewidth = 1) +  # Add quantile regression lines
  labs(
    title = "Scatterplot of Sepal Dimensions with Quantile Regression Lines",
    x = "Sepal Length",
    y = "Sepal Width",
    color = "Species"  # Legend label for species
  ) +
  theme_minimal()  # Apply a minimal theme


In [None]:
print(p1)

In [None]:
# saving pdf
ggsave("mpg.pdf" , plot = p1 , height = 5 , width = 6 , units = "in")

In [None]:
# One panel per species. The facet formula names the column directly;
# `iris$Species` would bypass the plot's data and label the strips
# "iris$Species".
ggplot(iris, mapping = aes(x = Sepal.Length, y = Sepal.Width, col = Species)) +
  geom_point() +
  facet_wrap(~Species)

In [None]:
# One panel per species in a two-column layout, black-and-white theme.
# The facet formula names the column directly rather than `iris$Species`,
# which would bypass the plot's data.
ggplot(iris, mapping = aes(x = Sepal.Length, y = Sepal.Width, col = Species)) +
  geom_point() +
  facet_wrap(~Species, ncol = 2) +
  theme_bw()

In [None]:
# One panel per species in a two-column layout, dark theme.
# The facet formula names the column directly rather than `iris$Species`,
# which would bypass the plot's data.
ggplot(iris, mapping = aes(x = Sepal.Length, y = Sepal.Width, col = Species)) +
  geom_point() +
  facet_wrap(~Species, ncol = 2) +
  theme_dark()

In [None]:
# One panel per species in a two-column layout, classic theme.
# The facet formula names the column directly rather than `iris$Species`,
# which would bypass the plot's data.
ggplot(iris, mapping = aes(x = Sepal.Length, y = Sepal.Width, col = Species)) +
  geom_point() +
  facet_wrap(~Species, ncol = 2) +
  theme_classic()

In [None]:
# Faceted scatterplot (one panel per species, two columns) with custom title,
# axis labels and legend title. The facet formula names the column directly
# rather than `iris$Species`, which would bypass the plot's data.
# NOTE(review): the title mentions quantile regression lines but this plot
# has no geom_quantile() layer - confirm the intended title.
ggplot(iris, mapping = aes(x = Sepal.Length, y = Sepal.Width, col = Species)) +
  geom_point() +
  facet_wrap(~Species, ncol = 2) +
  theme_classic() +
  labs(
    title = "Scatterplot of Sepal Dimensions with Quantile Regression Lines",
    x = "Sepal Is Length",
    y = "Sepal Is Width",
    color = "Species"  # Legend label for species
  )

# **Data Transformation in R**

In [None]:
# Install and attach the flights data set, attach the tidyverse verbs used in
# the cells below, and keep the flights table under the short name `df`.
install.packages("nycflights13", dependencies = TRUE)
library(nycflights13)
library(tidyverse)

df <- nycflights13::flights

In [None]:
summary(df)

In [None]:
filter(df , month==1 , day==1)

In [None]:
filter(df , month==1 , day==1 , carrier=="UA")

In [None]:
# Flights from January or February.
# `month == 1:2` would recycle c(1, 2) along the column and compare row by
# row (row 1 against 1, row 2 against 2, ...), silently dropping matching
# rows; %in% tests set membership instead.
filter(df, month %in% 1:2)

In [None]:
filter(df , month %in% c(1,2))

In [None]:
filter(df, month == 1 | month == 2)


In [None]:
filter(df, arr_delay > 120)


In [None]:
filter(df, arr_delay > 180)


In [None]:
filter(df, arr_delay > 120 & arr_delay < 300)


In [None]:
# Month number (1-12) of every flight
data <- df$month

# Convert month numbers to month names by indexing base R's built-in
# `month.name` vector ("January" ... "December"); this replaces an
# eleven-level nested ifelse() chain. Assumes every non-missing value is a
# whole number from 1 to 12; NA stays NA.
month_name <- month.name[data]

# Create a data frame with the original month numbers and month names
data <- data.frame(month = df$month, month_name = month_name)



In [None]:
# Assuming both `df` and `data$month_name` have the same number of rows
combined_data <- df %>%
  mutate(month_name = data$month_name)

print(combined_data)


In [None]:
view(combined_data)

In [None]:
filter(combined_data , month==4 , day==1)

# **The Desired Data Type**

In [None]:
combined_data$year <- as.numeric(combined_data$year)


In [None]:
combined_data

In [None]:
# Convert a column to integer type
combined_data$year <- as.integer(combined_data$year)


In [None]:
combined_data

In [None]:
# df$column_name <- as.character(df$column_name)


In [None]:
# df$column_name <- as.factor(df$column_name)


In [None]:
# df$column_name <- as.Date(df$column_name, format = "%Y-%m-%d")


In [None]:
# df$column_name <- as.Date(df$column_name, format = "%m/%d/%Y")


In [None]:
# df$column_name <- as.POSIXct(df$column_name, format = "%Y-%m-%d %H:%M:%S")


# **Missing Values with all**

In [None]:
sum(is.na(combined_data))


In [None]:
colSums(is.na(combined_data))


In [None]:
colMeans(is.na(combined_data)) * 100


In [None]:
# Install and attach VIM, which provides aggr() for plotting missing data
install.packages("VIM")
library(VIM)

# Missing-data overview of all columns, variables sorted by missingness
aggr(
  combined_data,
  col = c("navyblue", "yellow"),
  numbers = TRUE,
  sortVars = TRUE,
  labels = names(combined_data),
  cex.axis = 0.7,
  gap = 3,
  ylab = c("Missing Data", "Pattern")
)

In [None]:
# Install naniar only when it is not available yet.
# requireNamespace() checks for the package without attaching it, so the
# library() call below is the single place where it is loaded - and that call
# stops with an error if the installation did not succeed.
if (!requireNamespace("naniar", quietly = TRUE)) {
  install.packages("naniar")
}

# Load the package
library(naniar)

# Now you can use the functions from naniar
gg_miss_var(combined_data)    # Shows missing data per variable
gg_miss_upset(combined_data)  # Shows combinations of missing data

In [None]:
# Show only the rows that contain at least one missing value
combined_data[!complete.cases(combined_data), ]


In [None]:
# Per-column overview of missing values: absolute count and percentage.
# The NA mask is computed once and reused for both statistics.
na_mask <- is.na(combined_data)

missing_summary <- data.frame(
  Column = names(combined_data),
  MissingCount = colSums(na_mask),
  MissingPercentage = colMeans(na_mask) * 100
)

print(missing_summary)

# **Removing Missing Data**

In [None]:
# Option 1: keep only complete rows (drop every row with an NA in any column)
cleaned_data <- na.omit(combined_data)


In [None]:
# Option 2: keep only rows whose `arr_delay` is present
# (NAs in other columns are kept)
cleaned_data <- combined_data[!is.na(combined_data$arr_delay), ]


In [None]:
# Option 3: keep only the columns in which less than 50% of values are missing
cleaned_data <- combined_data[, colMeans(is.na(combined_data)) < 0.5]


In [None]:
cleaned_data

# **Imputation of Missing Data**

In [None]:
# Replace missing `arr_delay` values with the mean of the observed values
cleaned_data$arr_delay[is.na(cleaned_data$arr_delay)] <- mean(cleaned_data$arr_delay, na.rm = TRUE)


In [None]:
# Replace missing `arr_delay` values with the median of the observed values
# (an alternative to mean imputation; it only changes values that are still NA)
cleaned_data$arr_delay[is.na(cleaned_data$arr_delay)] <- median(cleaned_data$arr_delay, na.rm = TRUE)


In [None]:
# Return the most frequent value (statistical mode) of a vector.
#
# v      Vector of values.
# na.rm  If TRUE, missing values are dropped before counting, so NA can never
#        be returned as the mode. Defaults to FALSE, which keeps the previous
#        behaviour of counting NA like any other value.
#
# Ties are resolved in favour of the value that appears first in `v`.
# An empty input yields an empty result.
get_mode <- function(v, na.rm = FALSE) {
  if (na.rm) {
    v <- v[!is.na(v)]
  }
  uniqv <- unique(v)
  uniqv[which.max(tabulate(match(v, uniqv)))]
}

# Impute missing arrival delays with the mode of the observed values.
# NAs are excluded when computing the mode: if NA were the most frequent
# value, the imputation would otherwise replace NA with NA.
daf <- get_mode(cleaned_data$arr_delay, na.rm = TRUE)

# Only impute when a mode exists (i.e. the column has an observed value)
if (length(daf) == 1L) {
  cleaned_data$arr_delay[is.na(cleaned_data$arr_delay)] <- daf
}


In [None]:
daf

# **END With Values**

In [None]:
# Actual vs. scheduled departure time, coloured by carrier.
# Columns are referenced by bare name inside aes() so ggplot2 looks them up in
# `combined_data`; `combined_data$...` bypasses the data argument and produces
# axis/legend titles such as "combined_data$dep_time".
ggplot(combined_data, aes(x = dep_time, y = sched_dep_time, colour = carrier)) +
  geom_point()

In [None]:
# Actual vs. scheduled departure time, one panel per carrier.
# Bare column names are used in aes() and facet_wrap(): with
# `combined_data$carrier` the facet variable is not taken from the plot data,
# which is unsafe because each panel only holds a subset of the rows.
ggplot(combined_data, aes(x = dep_time, y = sched_dep_time, colour = carrier)) +
  geom_point() +
  facet_wrap(~ carrier, ncol = 4) +
  theme_classic()

In [None]:
# Actual vs. scheduled departure time, one panel per month.
# Bare column names are used in aes() and facet_wrap(): with
# `combined_data$month_name` the facet variable is not taken from the plot
# data, which is unsafe because each panel only holds a subset of the rows.
ggplot(combined_data, aes(x = dep_time, y = sched_dep_time, colour = month_name)) +
  geom_point() +
  facet_wrap(~ month_name, ncol = 4) +
  theme_classic()

In [None]:
library(ggplot2)

# Actual vs. scheduled departure time, one panel per carrier, with a red
# facet strip and explicit titles.
ggplot(combined_data, mapping = aes(x = dep_time, y = sched_dep_time, color = carrier)) +
  geom_point() +
  facet_wrap(~ carrier, ncol = 4) +
  theme_classic() +
  theme(
    strip.background = element_rect(fill = "red"),
    strip.text = element_text(color = "white")
  ) +
  labs(
    title = "Scheduled vs. Actual Departure Time",
    x = "Departure Time",
    y = "Scheduled Departure Time",
    # The colour aesthetic is mapped to `carrier`, so the legend is titled
    # accordingly (it was labelled "Month" before).
    color = "Carrier"
  )


In [None]:
combined_data

In [None]:
# Install (including optional dependencies) and attach the flights data
install.packages("nycflights13", dependencies = TRUE)
library(nycflights13)
library(tidyverse)

# Working copy of the flights table
df <- nycflights13::flights

In [None]:
summary(df)

In [None]:
glimpse(df)

In [None]:
data ("iris")

In [None]:


print(iris)

In [None]:
# Average sepal width per species, sorted from smallest to largest.
# The bare column name is evaluated inside each group; `iris$Sepal.Width`
# always refers to the full column and gave every species the same overall
# mean.
iris %>%
  group_by(Species) %>%
  summarise(avg = mean(Sepal.Width)) %>%
  arrange(avg)

In [None]:
# This is wrong code (kept as a counter-example):
# `df$hour` ignores the grouping and always refers to the whole column, so
# every carrier gets the same overall mean instead of its own group mean.
# Use the bare column name `hour` inside summarise() instead.
df %>%
 group_by(carrier) %>%
 summarise(mean(df$hour))

In [None]:
# This is wrong code (kept as a counter-example):
# `df$air_time` ignores the grouping and always refers to the whole column, so
# every carrier gets the same overall mean instead of its own group mean.
# Use the bare column name `air_time` inside summarise() instead.
df %>%
  group_by(carrier) %>%
  summarise(mean_air_time = mean(df$air_time, na.rm = TRUE))


In [None]:
# Correct version: mean of `hour` per carrier (bare column name, NAs ignored)
df %>%
  group_by(carrier) %>%
  summarise(mean_hours = mean(hour, na.rm = TRUE))


In [None]:
# Number of flights per carrier
df %>%
  count(carrier, name = "flight_count")


In [None]:
# Correct version: mean air time per carrier (bare column name, NAs ignored)
df %>%
  group_by(carrier) %>%
  summarise(mean_air_time = mean(air_time, na.rm = TRUE))


In [None]:
df

In [None]:
# Keep only the date columns, selected by bare column name
df_1 <- df %>% dplyr::select(year,month,day)
df_1

In [None]:
# Keep only the date columns, this time selected by quoted column name
df_1 <- df %>% dplyr::select("year","month","day")

# View result
print(df_1)

In [None]:
# Load the dplyr package
library(dplyr)

# Check column names
print(colnames(df))

# Select columns using dplyr::select and backticks for column names.
# Backticks are only required for names that are not syntactically valid
# (e.g. containing spaces); for `year`, `month` and `day` they are optional.
df_1 <- df %>% dplyr::select(`year`, `month`, `day`)

# View result
print(df_1)

In [None]:
# January flights only, reduced to the three date columns
df_2 <- df %>%
  dplyr::select(year, month, day) %>%
  filter(month == 1)
df_2

In [None]:
# rename() syntax: rename(data, new_name = old_name)
# Rename `dep_time` to `departure_time` ...
df <- rename(df, departure_time = dep_time)
# ... and rename it back so code that refers to `dep_time` keeps working.
# (Running only the second call on the original flights table fails, because
# a `departure_time` column does not exist yet.)
df <- rename(df, dep_time = departure_time)

In [None]:
df

In [None]:
# Attach dplyr for arrange()
library(dplyr)

# Sort the flights by arrival time (ascending)
arrange_df <- dplyr::arrange(df, arr_time)


In [None]:
arrange_df

In [None]:
# Sample data frame
library(dplyr)

# Two groups ("A" and "B") with three numeric values each
dfs <- data.frame(
  group = c("A", "A", "A", "B", "B", "B"),
  value = c(10, 20, 15, 25, 30, 20)
)


In [None]:
dfs

In [None]:


# Compare each value with the value in the following row.
# lead() shifts the column up by one, so the last row gets NA.
df22 <- mutate(
  dfs,
  next_value = lead(value),
  diff_from_next = value - next_value
)
df22

In [None]:
# Compare each value with the value in the preceding row.
# lag() shifts the column down by one, so the first row gets NA.
df11 <- mutate(
  dfs,
  previous_value = lag(value),
  diff_from_previous = value - previous_value
)
df11


In [None]:
df22

In [None]:
# Running total: cumulative sum of `value` from the first row downwards
df33 <- mutate(dfs, running_total = cumsum(value))


In [None]:
df33

In [None]:
# Running average: mean of all values from the first row up to the current one
df44 <- mutate(dfs, running_avg = cummean(value))


In [None]:
df44

In [None]:

# Rank values within each group (1 = smallest value of the group).
# ungroup() removes the grouping again so later steps work on the whole table.
df55 <- dfs %>%
  group_by(group) %>%
  mutate(rank_within_group = rank(value)) %>%
  ungroup()


In [None]:
df55

In [None]:
# To add a column with row numbers based on the order of the 'arr_time' column
# (row_number(x) ranks the rows by x):
df_with_row_numbers <- df %>%
  mutate(row_num = row_number(arr_time))

# To simply add a sequential row number column.
# Note: this assigns to the same variable, so it replaces the result above.
df_with_row_numbers <- df %>%
  mutate(row_num = row_number())

In [None]:
df_with_row_numbers

In [None]:
library(dplyr)

# Toy data: two groups with three values each
cs <- data.frame(
  group = rep(c("A", "B"), each = 3),
  value = c(10, 20, 15, 25, 30, 20)
)

# Step 1: window calculations across the whole table (ignoring the groups)
cs_windowed <- mutate(
  cs,
  previous_value = lag(value),
  diff_from_previous = value - previous_value,
  next_value = lead(value),
  diff_from_next = value - next_value,
  running_total = cumsum(value),
  running_avg = cummean(value)
)

# Step 2: the same kind of calculations, restarted inside every group
df1 <- cs_windowed %>%
  group_by(group) %>%
  mutate(
    rank_within_group = rank(value),
    row_number_within_group = row_number(),
    running_total_within_group = cumsum(value)
  ) %>%
  ungroup()

print(df1)


In [None]:
df1

In [None]:
# Arrival delay against flight distance, coloured by carrier
ggplot(df, aes(x = distance, y = arr_delay, colour = carrier)) +
  geom_point()

In [None]:
# Install the plotting and data packages.
# Note: install.packages() runs unconditionally, so the packages are
# re-installed every time this cell is executed.
install.packages("ggplot2")
install.packages("nycflights13")

# Load libraries
library(ggplot2)
library(nycflights13)
library(dplyr)

# Load data: `df` is (re)set to the flights table
df <- nycflights13::flights


In [None]:
# Prepare data: average departure delay for every calendar day.
# `.groups = "drop"` returns an ungrouped result directly; this avoids the
# regrouping message summarise() prints for multiple grouping variables and
# makes a separate ungroup() step unnecessary.
heatmap_data <- df %>%
  group_by(year, month, day) %>%
  summarise(avg_dep_delay = mean(dep_delay, na.rm = TRUE), .groups = "drop")


In [None]:
# Heatmap with one tile per calendar day: month on the x-axis, day of month
# on the y-axis, tile colour = average departure delay
ggplot(
  heatmap_data,
  aes(x = factor(month), y = factor(day), fill = avg_dep_delay)
) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "lightblue", high = "darkred", na.value = "gray") +
  ggtitle("Average Departure Delay by Day and Month") +
  labs(x = "Month", y = "Day", fill = "Avg Delay (min)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))


In [None]:
# Load libraries
library(ggplot2)
library(lubridate)
library(dplyr)

# Uses the `flights` table from nycflights13 (attached earlier in this
# notebook), which has the columns year, month, day and hour.

# Convert relevant columns to appropriate formats.
# Assigning to `flights$...` creates a modified copy in the workspace.
flights$date <- as.Date(paste(flights$year, flights$month, flights$day, sep = "-"))
flights$time_hour <- as.POSIXct(paste(flights$date, flights$hour), format = "%Y-%m-%d %H")

# Number of flights for every (hour, day of month) combination and the share
# of all flights that falls into each combination.
# `.groups = "drop"` returns an ungrouped result directly; this avoids the
# regrouping message of summarise() and a separate ungroup() step, and makes
# sure sum(flight_count) below is taken over the whole table.
flight_counts <- flights %>%
  mutate(day_of_month = day(date)) %>%
  group_by(hour, day_of_month) %>%
  summarise(flight_count = n(), .groups = "drop") %>%
  mutate(prop = flight_count / sum(flight_count))  # Calculate proportion

# Create a heatmap: hour of day on the x-axis, day of month on the y-axis
ggplot(flight_counts, aes(x = hour, y = day_of_month)) +
  geom_tile(aes(fill = prop), color = "white") +
  scale_fill_gradient(low = "white", high = "darkblue") +
  labs(x = "Hour of Day", y = "Day of Month", fill = "Proportion of Flights") +
  ggtitle("Flight Frequency Heatmap") +
  theme_minimal()


# **Normal distribution**

In [None]:
# Normal distribution check: histogram of the flight distances.
# histogram() is a lattice function; the namespace is spelled out so the call
# does not depend on lattice having been attached by another package earlier.
lattice::histogram(df$distance)


In [None]:
# Normal Q-Q plot of the flight distances
qqnorm(df$distance)

In [None]:
# Draw 200 random values from a standard normal distribution
# (no seed is set, so the values differ on every run)
normal_data <- rnorm(200)

In [None]:
# Histogram of the simulated normal sample.
# histogram() is a lattice function; the namespace is spelled out so the call
# does not depend on lattice having been attached by another package earlier.
lattice::histogram(normal_data, col = "blue")

In [None]:
# Shapiro-Wilk normality test on the simulated sample
shapiro.test(normal_data)

In [None]:
# Install and attach corrplot, which draws correlation-matrix plots
install.packages("corrplot") # Install the corrplot package
library(corrplot) # Load the corrplot package

# Note: from here on `df` refers to mtcars, no longer to the flights table
df <- mtcars # Load the mtcars dataset
m <- cor(df) # Calculate the correlation matrix

corrplot(m) # Create the correlation plot using corrplot

In [None]:
corrplot(m , method = "number") # Show the correlation coefficients as numbers

In [None]:
corrplot(m , method = "number" , order = "alphabet") # Numbers, variables in alphabetical order

In [None]:
corrplot(m , method = "shade" , order = "alphabet") # Shaded squares, variables in alphabetical order

In [None]:
corrplot(m , method = "number" , order = "alphabet" , type = "upper") # Numbers, upper triangle only

In [None]:
corrplot.mixed(m , lower = "number" , upper = "pie") # Numbers in the lower triangle, pies in the upper triangle

In [None]:
# Order the variables by hierarchical clustering and outline 2 clusters.
# The `order` value is spelled out in full ("hclust"); the abbreviated
# "hclus" only worked through partial matching of the argument value.
corrplot(m, order = "hclust", addrect = 2)

In [None]:
df

In [None]:
# Install (including optional dependencies) and attach the flights data
install.packages("nycflights13", dependencies = TRUE)
library(nycflights13)
library(tidyverse)

# Point `df` back at the flights table
df <- nycflights13::flights

In [None]:
df

In [None]:
colnames(df)

In [None]:
df <- mtcars
head(df)

In [None]:
cor(df)

In [None]:
# heatmap() needs a numeric matrix, so convert the data frame first
df <- as.matrix(df)
# Heatmap with the default settings
heatmap(df)

In [None]:
# Convert to a matrix (has no effect if `df` already is one)
df <- as.matrix(df)
# Scale each column so variables measured in different units are comparable
heatmap(df , scale = "column")

In [None]:
# Column-scaled heatmap with the 256-step cyan-magenta palette
heatmap(df, scale = "column", col = cm.colors(256))

In [None]:
# Column-scaled heatmap with the 256-step terrain palette
heatmap(df, scale = "column", col = terrain.colors(256))

In [None]:
library(RColorBrewer)
# Build a 25-step palette by interpolating the 8-colour "BuGn" Brewer scheme.
# Palette names are case-sensitive (e.g. "PiYG", not "piyG").
coul <- colorRampPalette(brewer.pal(8, "BuGn"))(25)
heatmap(df, scale = "column", col = coul)

In [None]:
volcano

In [None]:
library(tidyverse)

# Reshape the volcano matrix to long format: X = row index, Y = column index,
# z = cell value.
# as_tibble() names the unnamed matrix columns "V1", "V2", ... (capital V), so
# the prefix must be stripped with "V"; the previous lower-case "v" matched
# nothing and as.numeric() turned every Y into NA.
# pivot_longer() replaces the superseded gather().
volcano %>%
  as_tibble() %>%
  rowid_to_column(var = "X") %>%
  pivot_longer(cols = -X, names_to = "Y", values_to = "z") %>%
  mutate(Y = as.numeric(gsub("V", "", Y)))

In [None]:
# Attach the tidyverse (tibble, tidyr and dplyr verbs are used below)
library(tidyverse)

# Long-format copy of the volcano matrix: one row per grid cell with the row
# index X, the numeric column index Y (the "V" prefix is stripped) and the
# cell value z
volcano_tidy <- volcano %>%
  as_tibble() %>%
  rowid_to_column(var = "X") %>%
  pivot_longer(cols = -X, names_to = "Y", values_to = "z") %>%
  mutate(Y = as.numeric(gsub("V", "", Y)))

# Show the reshaped data
print(volcano_tidy)


In [None]:
library(tidyverse)

# volcano as a data frame with explicit row ids (1..n) and column names
# V1..Vk
volcano_df <- as.data.frame(volcano)
rownames(volcano_df) <- seq_len(nrow(volcano_df))
colnames(volcano_df) <- paste0("V", seq_len(ncol(volcano_df)))

# Long format: one row per grid cell with numeric X (row), Y (column) and z
volcano_df_long <- volcano_df %>%
  rownames_to_column("X") %>%
  pivot_longer(cols = -X, names_to = "Y", values_to = "z") %>%
  mutate(
    X = as.numeric(X),
    Y = as.numeric(gsub("V", "", Y))
  )

# Tile plot of the grid: columns on the x-axis, rows on the y-axis,
# fill = cell value
ggplot(volcano_df_long, aes(Y, X, fill = z)) +
  geom_tile()

In [None]:
# execute this command only the first time you use ggpubfigs
# (installs the package from GitHub; requires the devtools package)
devtools::install_github("JLSteenwyk/ggpubfigs")
# load ggpubfigs
library(ggpubfigs)

In [None]:
mtcars

In [None]:
# Number of cars per carburettor count, stacked by cylinder count, drawn with
# a ggpubfigs colour palette and theme
ggplot(mtcars, aes(factor(carb), fill = factor(cyl))) +
  geom_bar() +
  scale_fill_manual(values = friendly_pal("ito_seven")) +
  theme_big_simple()

In [None]:
# Histogram of sepal length, filled by species, with the ggpubfigs black theme
ggplot(iris, aes(Sepal.Length, fill = Species)) +
  geom_histogram() +
  theme_black()

In [None]:
# Sample data: one categorical column with three distinct colours
# ('Red' appears twice)
data <- data.frame(Color = c('Red', 'Blue', 'Green', 'Red'))

In [None]:
# One-hot encode using model.matrix.
# `- 1` removes the intercept, so every level of `Color` gets its own 0/1
# column instead of one level being absorbed into the intercept.
encoded_data <- model.matrix(~ Color - 1, data = data)

# Display the encoded data
print(encoded_data)

In [None]:

# Load the dplyr package
library(dplyr)

# Keep only the `close` and `volume` columns; every other column (for example
# a `timestamp` column) is dropped.
# NOTE(review): this looks like a template snippet - the `df` objects created
# in the surrounding cells (flights, mtcars) do not appear to have these
# columns; confirm which data set it is meant for before running it.
df <- df %>% select(close, volume)

In [None]:
# Start from a fresh copy of the built-in dataset
data("mtcars")

# Label the transmission codes: 0 -> "Automatic", 1 -> "Manual"
transmission_labels <- c("Automatic", "Manual")
mtcars$am <- factor(mtcars$am, levels = c(0, 1), labels = transmission_labels)

# Two-sample t-test comparing fuel economy (mpg) between the transmission
# types; t.test() does not assume equal variances by default (Welch test)
t_test_result <- t.test(mpg ~ am, data = mtcars)

# Show the test statistic, degrees of freedom, p-value and group means
print(t_test_result)


In [None]:
# Start from a fresh copy of the built-in dataset
data("mtcars")

# Treat the number of cylinders as a categorical variable (levels 4, 6, 8)
mtcars$cyl <- factor(mtcars$cyl)

# One-way ANOVA: does mean fuel economy (mpg) differ between cylinder groups?
anova_result <- aov(mpg ~ cyl, data = mtcars)

# ANOVA table (degrees of freedom, sums of squares, F value, p-value)
summary(anova_result)


In [None]:
# One-sample z-test: is the mean mpg of mtcars different from a hypothesised
# population mean?

# Load the data
data("mtcars")

# Hypothesised population mean
population_mean <- 20

# Sample mean and standard deviation.
# Note: the standard deviation is estimated from the sample, so the normal
# distribution is only an approximation here; t.test() gives the exact
# small-sample version of this test.
sample_mean <- mean(mtcars$mpg)
sample_sd <- sd(mtcars$mpg)

# Sample size
n <- length(mtcars$mpg)

# z-score: distance of the sample mean from the hypothesised mean, measured
# in standard errors
z_score <- (sample_mean - population_mean) / (sample_sd / sqrt(n))

# Two-tailed p-value. The upper tail is requested directly with
# lower.tail = FALSE; `1 - pnorm(...)` loses precision through cancellation
# and collapses to exactly 0 for large |z|.
p_value <- 2 * pnorm(abs(z_score), lower.tail = FALSE)

# Display the z-score and p-value
cat("Z-score:", z_score, "\nP-value:", p_value, "\n")
