Violin Plots

Create one violin plot per feature ID. Each row of the data is identified by an ID of the form `GeneSymbol.UniqueIdentifier`.


# Function to create paired violin plots


In [None]:
# Draw a paired violin plot (preAML vs. AML) for one gene and save it to disk.
#
# Arguments:
#   gene_data      Data frame with the columns used below: State, Expression
#                  and Individual.
#   gene_name      Label shown in the plot title.
#   filename       Output path handed to ggsave(); its parent directory is
#                  created when it does not exist yet.
#   t_test_mean    Test result whose $p.value is printed as "Mean p-value".
#   t_test_median  Test result whose $p.value is printed as "Median p-value".
#
# Returns the ggplot object.
create_paired_violin_plot <- function(gene_data, gene_name, filename, t_test_mean, t_test_median) {
  gene_data$State <- factor(gene_data$State, levels = c("preAML", "AML"))

  pre_aml_values <- gene_data$Expression[gene_data$State == "preAML"]
  aml_values <- gene_data$Expression[gene_data$State == "AML"]

  # One row per state; feeds the two lines that join the means and the medians.
  line_data <- data.frame(
    State = c("preAML", "AML"),
    Mean = c(mean(pre_aml_values), mean(aml_values)),
    Median = c(median(pre_aml_values), median(aml_values))
  )

  plot_title <- paste("Paired Violin Plot for", gene_name, "\nMean p-value: ", signif(t_test_mean$p.value, 3), " Median p-value: ", signif(t_test_median$p.value, 3))

  p <- ggplot(gene_data, aes(x = State, y = Expression, fill = State)) +
    geom_violin(trim = FALSE) +
    geom_jitter(width = 0.2, size = 1.5, alpha = 0.6) + # Individual expression values as dots
    stat_summary(fun = mean, geom = "point", color = "red", size = 3) + # Mean of each state as a red point
    geom_line(data = line_data, aes(x = State, y = Mean, group = 1), color = "blue", size = 1) + # Line connecting means
    geom_line(data = line_data, aes(x = State, y = Median, group = 1), color = "green", size = 1) + # Line connecting medians
    geom_line(aes(group = Individual), color = "grey", alpha = 0.5) + # One line per Individual across the two states
    ggtitle(plot_title) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
    scale_fill_manual(values = c("preAML" = "skyblue", "AML" = "salmon")) # Custom colors for states

  # Make sure the target folder exists so that saving does not fail on a
  # fresh checkout; this is a no-op when the folder is already there.
  dir.create(dirname(filename), recursive = TRUE, showWarnings = FALSE)
  ggsave(filename, plot = p, width = 10, height = 8)
  p
}

# Function to create comparison violin plots


In [None]:
# Draw a two-group violin plot for one gene and save it to disk.
#
# Arguments:
#   gene_data      Data frame with the columns used below: State and Expression.
#   gene_name      Label shown in the plot title.
#   filename       Output path handed to ggsave(); its parent directory is
#                  created when it does not exist yet.
#   left_state     Value of State drawn on the left (filled "skyblue").
#   right_state    Value of State drawn on the right (filled "salmon").
#   t_test_mean    Test result whose $p.value is printed as "Mean p-value".
#   t_test_median  Test result whose $p.value is printed as "Median p-value".
#
# Returns the ggplot object.
create_comparison_violin_plot <- function(gene_data, gene_name, filename, left_state, right_state, t_test_mean, t_test_median) {
  gene_data$State <- factor(gene_data$State, levels = c(left_state, right_state))

  left_values <- gene_data$Expression[gene_data$State == left_state]
  right_values <- gene_data$Expression[gene_data$State == right_state]

  # One row per state; feeds the two lines that join the means and the medians.
  line_data <- data.frame(
    State = c(left_state, right_state),
    Mean = c(mean(left_values), mean(right_values)),
    Median = c(median(left_values), median(right_values))
  )

  # The colour vector must be named with the *values* of left_state and
  # right_state. Writing c(left_state = ..., right_state = ...) would name the
  # entries literally "left_state"/"right_state", which never match the
  # factor levels, so the manual fill colours would not be applied.
  fill_colors <- setNames(c("skyblue", "salmon"), c(left_state, right_state))

  plot_title <- paste("Violin Plot for", gene_name, "\nMean p-value: ", signif(t_test_mean$p.value, 3), " Median p-value: ", signif(t_test_median$p.value, 3))

  p <- ggplot(gene_data, aes(x = State, y = Expression, fill = State)) +
    geom_violin(trim = FALSE) +
    geom_jitter(width = 0.2, size = 1.5, alpha = 0.6) + # Individual expression values as dots
    stat_summary(fun = mean, geom = "point", color = "red", size = 3) + # Mean of each state as a red point
    geom_line(data = line_data, aes(x = State, y = Mean, group = 1), color = "blue", size = 1) + # Line connecting means
    geom_line(data = line_data, aes(x = State, y = Median, group = 1), color = "green", size = 1) + # Line connecting medians
    ggtitle(plot_title) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
    scale_fill_manual(values = fill_colors) # Custom colors for states

  # Make sure the target folder exists so that saving does not fail on a
  # fresh checkout; this is a no-op when the folder is already there.
  dir.create(dirname(filename), recursive = TRUE, showWarnings = FALSE)
  ggsave(filename, plot = p, width = 10, height = 8)
  p
}

# Initialize lists to store statistical results


In [None]:
# Containers for the per-ID test results of the two unpaired comparisons
# (filled by the "consistent" and "opposite" loops).
stat_results_consistent <- list()
stat_results_opposite <- list()

# One preAML-vs-AML violin plot per ID found in paired_data_long$X.
for (gene_id in unique(paired_data_long$X)) {
  gene_data <- filter(paired_data_long, X == gene_id)
  gene_name <- featureData[gene_id, 2]

  pre_aml_expression <- gene_data$Expression[gene_data$State == "preAML"]
  aml_expression <- gene_data$Expression[gene_data$State == "AML"]

  # Both tests run with their defaults, i.e. the two groups are treated as
  # independent samples.
  # NOTE(review): the plot links samples per Individual, which suggests paired
  # data -- confirm whether `paired = TRUE` was intended here.
  t_test_mean <- t.test(pre_aml_expression, aml_expression)
  t_test_median <- wilcox.test(pre_aml_expression, aml_expression)

  # NOTE(review): the file name is built from gene_name only; if several IDs
  # map to the same gene name, later plots overwrite earlier ones -- confirm.
  output_file <- file.path(
    "paired_violin_plots",
    paste0(gene_name, "_paired_violin_plot.pdf")
  )
  create_paired_violin_plot(gene_data, gene_name, output_file, t_test_mean, t_test_median)
}

# Plot consistent test: Paired AML with unpaired preAML


In [None]:
# "Consistent" comparison: AML rows from the paired data against preAML rows
# from the unpaired data, one plot and one pair of tests per ID.
consistent_plots <- list()
for (gene_id in unique(paired_data_long$X)) {
  gene_name <- featureData[gene_id, 2]

  paired_gene_data <- paired_data_long %>%
    filter(X == gene_id, State == "AML") %>%
    mutate(Source = "Paired")
  # Relabel the unpaired preAML rows so they form their own group.
  unpaired_gene_data <- unpaired_data_long %>%
    filter(X == gene_id, State == "preAML") %>%
    mutate(State = "preAML_unpaired", Source = "Unpaired")

  # Nothing to compare against for this ID.
  if (nrow(unpaired_gene_data) == 0) {
    next
  }

  # Keep only the columns both tables share so that rbind() lines up.
  common_cols <- intersect(names(paired_gene_data), names(unpaired_gene_data))
  gene_data <- rbind(paired_gene_data[, common_cols], unpaired_gene_data[, common_cols])

  unpaired_expression <- gene_data$Expression[gene_data$State == "preAML_unpaired"]
  aml_expression <- gene_data$Expression[gene_data$State == "AML"]

  # Two-sample t-test and Wilcoxon rank-sum test on the same two groups.
  t_test_mean <- t.test(unpaired_expression, aml_expression)
  t_test_median <- wilcox.test(unpaired_expression, aml_expression)

  p <- create_comparison_violin_plot(
    gene_data,
    gene_name,
    paste0("comparison_violin_plots/consistent/", gene_name, "_consistent_violin_plot.pdf"),
    "preAML_unpaired",
    "AML",
    t_test_mean,
    t_test_median
  )
  consistent_plots[[gene_id]] <- p

  # Keep the two p-values, keyed by ID, for the summary tables.
  stat_results_consistent[[gene_id]] <- list(
    gene_name = gene_name,
    mean_p_value = t_test_mean$p.value,
    median_p_value = t_test_median$p.value
  )
}


# Plot opposite test: Paired AML with unpaired noAML


In [None]:

# "Opposite" comparison: AML rows from the paired data against noAML rows
# from the unpaired data, one plot and one pair of tests per ID.
opposite_plots <- list()
for (gene_id in unique(paired_data_long$X)) {
  gene_name <- featureData[gene_id, 2]

  paired_gene_data <- paired_data_long %>%
    filter(X == gene_id, State == "AML") %>%
    mutate(Source = "Paired")
  unpaired_gene_data <- unpaired_data_long %>%
    filter(X == gene_id, State == "noAML") %>%
    mutate(State = "noAML", Source = "Unpaired")

  # Nothing to compare against for this ID.
  if (nrow(unpaired_gene_data) == 0) {
    next
  }

  # Keep only the columns both tables share so that rbind() lines up.
  common_cols <- intersect(names(paired_gene_data), names(unpaired_gene_data))
  gene_data <- rbind(paired_gene_data[, common_cols], unpaired_gene_data[, common_cols])

  no_aml_expression <- gene_data$Expression[gene_data$State == "noAML"]
  aml_expression <- gene_data$Expression[gene_data$State == "AML"]

  # Two-sample t-test and Wilcoxon rank-sum test on the same two groups.
  t_test_mean <- t.test(no_aml_expression, aml_expression)
  t_test_median <- wilcox.test(no_aml_expression, aml_expression)

  p <- create_comparison_violin_plot(
    gene_data,
    gene_name,
    paste0("comparison_violin_plots/opposite/", gene_name, "_opposite_violin_plot.pdf"),
    "noAML",
    "AML",
    t_test_mean,
    t_test_median
  )
  opposite_plots[[gene_id]] <- p

  # Keep the two p-values, keyed by ID, for the summary tables.
  stat_results_opposite[[gene_id]] <- list(
    gene_name = gene_name,
    mean_p_value = t_test_mean$p.value,
    median_p_value = t_test_median$p.value
  )
}


# Combine statistical results into data frames


In [None]:
# Turn every per-ID result into a one-row data frame and stack the rows.
stat_results_consistent_df <- do.call(
  rbind,
  lapply(stat_results_consistent, function(res) {
    data.frame(
      gene_name = res$gene_name,
      mean_p_value = res$mean_p_value,
      median_p_value = res$median_p_value
    )
  })
)
stat_results_opposite_df <- do.call(
  rbind,
  lapply(stat_results_opposite, function(res) {
    data.frame(
      gene_name = res$gene_name,
      mean_p_value = res$mean_p_value,
      median_p_value = res$median_p_value
    )
  })
)

# Save statistical results to CSV files

In [None]:
# Export both result tables; row names are not written to the files.
write.csv(
  x = stat_results_consistent_df,
  file = "stat_results_consistent.csv",
  row.names = FALSE
)
write.csv(
  x = stat_results_opposite_df,
  file = "stat_results_opposite.csv",
  row.names = FALSE
)


# Display the plots (optional, can be removed if running in a non-interactive environment)

In [None]:
# Render every stored plot: the consistent ones first, then the opposite ones.
for (p in c(consistent_plots, opposite_plots)) {
  print(p)
}

# Combine individual PDF files into a single PDF for consistent and opposite plots

In [None]:
# Merge the per-gene PDFs of each comparison into a single file.
# NOTE(review): list.files() returns every file in the folder, not only PDFs.
consistent_pdf_files <- list.files("comparison_violin_plots/consistent", full.names = TRUE)
pdf_combine(input = consistent_pdf_files, output = "consistent_violin_plots_combined.pdf")

opposite_pdf_files <- list.files("comparison_violin_plots/opposite", full.names = TRUE)
pdf_combine(input = opposite_pdf_files, output = "opposite_violin_plots_combined.pdf")

In [None]:
# Load necessary libraries
library(ggplot2)
library(dplyr)

# Flag every row of the opposite-test results whose t-test p-value is below
# 0.05 (raw p-values; no multiple-testing correction is applied here).
opposite_p_values <- mutate(
  stat_results_opposite_df,
  Significant = ifelse(mean_p_value < 0.05, "Yes", "No")
)

# Mean, median and standard deviation of both p-value columns.
summary_stats_opposite <- summarize(
  opposite_p_values,
  mean_p_value_mean = mean(mean_p_value),
  median_p_value_mean = median(mean_p_value),
  sd_p_value_mean = sd(mean_p_value),
  mean_p_value_median = mean(median_p_value),
  median_p_value_median = median(median_p_value),
  sd_p_value_median = sd(median_p_value)
)

print(summary_stats_opposite)

# Histogram of the t-test p-values; the dashed red line marks 0.05.
ggplot(data = opposite_p_values, mapping = aes(x = mean_p_value)) +
  geom_histogram(binwidth = 0.01, color = "black", fill = "skyblue") +
  geom_vline(xintercept = 0.05, color = "red", linetype = "dashed") +
  labs(
    title = "Histogram of Mean P-values for Opposite Test",
    x = "Mean P-value",
    y = "Frequency"
  )

# Histogram of the Wilcoxon p-values; the dashed red line marks 0.05.
ggplot(data = opposite_p_values, mapping = aes(x = median_p_value)) +
  geom_histogram(binwidth = 0.01, color = "black", fill = "lightgreen") +
  geom_vline(xintercept = 0.05, color = "red", linetype = "dashed") +
  labs(
    title = "Histogram of Median P-values for Opposite Test",
    x = "Median P-value",
    y = "Frequency"
  )

# QQ plot of the t-test p-values.
ggplot(data = opposite_p_values, mapping = aes(sample = mean_p_value)) +
  geom_qq() +
  geom_qq_line() +
  labs(title = "QQ Plot of Mean P-values for Opposite Test")

# QQ plot of the Wilcoxon p-values.
ggplot(data = opposite_p_values, mapping = aes(sample = median_p_value)) +
  geom_qq() +
  geom_qq_line() +
  labs(title = "QQ Plot of Median P-values for Opposite Test")

# Rows below the 0.05 threshold, by t-test p-value ...
significant_genes_mean_opposite <- select(
  filter(opposite_p_values, mean_p_value < 0.05),
  gene_name, mean_p_value
)

# ... and by Wilcoxon p-value.
significant_genes_median_opposite <- select(
  filter(opposite_p_values, median_p_value < 0.05),
  gene_name, median_p_value
)

# Report how many rows passed each threshold.
cat("Number of significant genes based on mean p-value (opposite test):", nrow(significant_genes_mean_opposite), "\n")
cat("Number of significant genes based on median p-value (opposite test):", nrow(significant_genes_median_opposite), "\n")


# Export both selections.
write.csv(x = significant_genes_mean_opposite, file = "significant_genes_mean_opposite.csv", row.names = FALSE)
write.csv(x = significant_genes_median_opposite, file = "significant_genes_median_opposite.csv", row.names = FALSE)

In [None]:
# Load necessary libraries
library(ggplot2)
library(dplyr)

# Calculate summary statistics for consistent test p-values.
# Rows are flagged when the raw t-test p-value is below 0.05 (no
# multiple-testing correction is applied here).
consistent_p_values <- stat_results_consistent_df %>%
  mutate(Significant = ifelse(mean_p_value < 0.05, "Yes", "No"))

# Mean, median and standard deviation of both p-value columns.
summary_stats_consistent <- consistent_p_values %>%
  summarize(
    mean_p_value_mean = mean(mean_p_value),
    median_p_value_mean = median(mean_p_value),
    sd_p_value_mean = sd(mean_p_value),
    mean_p_value_median = mean(median_p_value),
    median_p_value_median = median(median_p_value),
    sd_p_value_median = sd(median_p_value)
  )

print(summary_stats_consistent)

# Visualize the distribution of mean p-values (dashed line marks 0.05)
ggplot(consistent_p_values, aes(x = mean_p_value)) +
  geom_histogram(binwidth = 0.01, fill = "skyblue", color = "black") +
  geom_vline(xintercept = 0.05, linetype = "dashed", color = "red") +
  labs(title = "Histogram of Mean P-values for Consistent Test", x = "Mean P-value", y = "Frequency")

# Visualize the distribution of median p-values (dashed line marks 0.05)
ggplot(consistent_p_values, aes(x = median_p_value)) +
  geom_histogram(binwidth = 0.01, fill = "lightgreen", color = "black") +
  geom_vline(xintercept = 0.05, linetype = "dashed", color = "red") +
  labs(title = "Histogram of Median P-values for Consistent Test", x = "Median P-value", y = "Frequency")

# QQ plot for mean p-values.
# The title previously said "Opposite Test" although the data are the
# consistent-test results.
# NOTE(review): geom_qq() compares against a normal distribution by default;
# confirm that this is the intended reference for p-values.
ggplot(consistent_p_values, aes(sample = mean_p_value)) +
  geom_qq() +
  geom_qq_line() +
  labs(title = "QQ Plot of Mean P-values for Consistent Test")

# QQ plot for median p-values (title corrected in the same way)
ggplot(consistent_p_values, aes(sample = median_p_value)) +
  geom_qq() +
  geom_qq_line() +
  labs(title = "QQ Plot of Median P-values for Consistent Test")

# Identify significant genes based on mean p-values
significant_genes_mean_consistent <- consistent_p_values %>%
  filter(mean_p_value < 0.05) %>%
  select(gene_name, mean_p_value)

# Identify significant genes based on median p-values.
# This must be an assignment (`<-`); a bare `<` is a comparison against an
# object that does not exist yet, so nothing would be stored.
significant_genes_median_consistent <- consistent_p_values %>%
  filter(median_p_value < 0.05) %>%
  select(gene_name, median_p_value)

# Print the number of significant genes
cat("Number of significant genes based on mean p-value (consistent test):", nrow(significant_genes_mean_consistent), "\n")
cat("Number of significant genes based on median p-value (consistent test):", nrow(significant_genes_median_consistent), "\n")


# Save significant genes to CSV files
write.csv(significant_genes_mean_consistent, "significant_genes_mean_consistent.csv", row.names = FALSE)
write.csv(significant_genes_median_consistent, "significant_genes_median_consistent.csv", row.names = FALSE)

In [None]:
# Load necessary libraries
library(ggplot2)
library(dplyr)

# Calculate summary statistics for opposite test p-values.
# The flagged table is stored as `opposite_p_values`, the name every later
# statement in this cell reads, so the cell no longer depends on a value left
# over from another cell.
opposite_p_values <- stat_results_opposite_df %>%
  mutate(Significant = ifelse(mean_p_value < 0.05, "Yes", "No"))
# Also kept under the name this cell used before, in case other code refers
# to it.
significant_p_values <- opposite_p_values

# Mean, median and standard deviation of both p-value columns.
summary_stats_opposite <- opposite_p_values %>%
  summarize(
    mean_p_value_mean = mean(mean_p_value),
    median_p_value_mean = median(mean_p_value),
    sd_p_value_mean = sd(mean_p_value),
    mean_p_value_median = mean(median_p_value),
    median_p_value_median = median(median_p_value),
    sd_p_value_median = sd(median_p_value)
  )

print(summary_stats_opposite)

# Visualize the distribution of mean p-values (dashed line marks 0.05)
ggplot(opposite_p_values, aes(x = mean_p_value)) +
  geom_histogram(binwidth = 0.01, fill = "skyblue", color = "black") +
  geom_vline(xintercept = 0.05, linetype = "dashed", color = "red") +
  labs(title = "Histogram of Mean P-values for Opposite Test", x = "Mean P-value", y = "Frequency")

# Visualize the distribution of median p-values (dashed line marks 0.05)
ggplot(opposite_p_values, aes(x = median_p_value)) +
  geom_histogram(binwidth = 0.01, fill = "lightgreen", color = "black") +
  geom_vline(xintercept = 0.05, linetype = "dashed", color = "red") +
  labs(title = "Histogram of Median P-values for Opposite Test", x = "Median P-value", y = "Frequency")

# QQ plot for mean p-values
ggplot(opposite_p_values, aes(sample = mean_p_value)) +
  geom_qq() +
  geom_qq_line() +
  labs(title = "QQ Plot of Mean P-values for Opposite Test")

# QQ plot for median p-values
ggplot(opposite_p_values, aes(sample = median_p_value)) +
  geom_qq() +
  geom_qq_line() +
  labs(title = "QQ Plot of Median P-values for Opposite Test")

# Identify significant genes based on mean p-values
significant_genes_mean_opposite <- opposite_p_values %>%
  filter(mean_p_value < 0.05) %>%
  select(gene_name, mean_p_value)

# Identify significant genes based on median p-values
significant_genes_median_opposite <- opposite_p_values %>%
  filter(median_p_value < 0.05) %>%
  select(gene_name, median_p_value)

# Print the number of significant genes
cat("Number of significant genes based on mean p-value (opposite test):", nrow(significant_genes_mean_opposite), "\n")
cat("Number of significant genes based on median p-value (opposite test):", nrow(significant_genes_median_opposite), "\n")


# Save significant genes to CSV files
write.csv(significant_genes_mean_opposite, "significant_genes_mean_opposite.csv", row.names = FALSE)
write.csv(significant_genes_median_opposite, "significant_genes_median_opposite.csv", row.names = FALSE)

In [None]:
# Attach the unpaired preAML slope to every paired slope (matched on X), keep
# rows where both slopes are present, and flag rows whose slopes share a sign.
slope_data_consistent <- paired_slopes %>%
  left_join(select(unpaired_slopes, X, Unpaired_Slope_preAML), by = "X") %>%
  filter(!is.na(Paired_Slope), !is.na(Unpaired_Slope_preAML)) %>%
  mutate(Consistent = sign(Paired_Slope) == sign(Unpaired_Slope_preAML))

# Same construction against the unpaired noAML slope, flagging rows whose
# slopes differ in sign.
slope_data_opposite <- paired_slopes %>%
  left_join(select(unpaired_slopes, X, Unpaired_Slope_noAML), by = "X") %>%
  filter(!is.na(Paired_Slope), !is.na(Unpaired_Slope_noAML)) %>%
  mutate(Opposite = sign(Paired_Slope) != sign(Unpaired_Slope_noAML))

# IDs of the flagged rows in each set.
consistent_junctions <- select(filter(slope_data_consistent, Consistent), X)
opposite_junctions <- select(filter(slope_data_opposite, Opposite), X)

# For every ID that appears in either set, count its occurrences in each set.
junctions <- unique(c(consistent_junctions$X, opposite_junctions$X))
consistent_counts <- sapply(junctions, function(id) sum(consistent_junctions$X == id))
opposite_counts <- sapply(junctions, function(id) sum(opposite_junctions$X == id))

# 2 x 2 table: rows = consistent / opposite set, columns = present / absent.
contingency_table <- rbind(
  c(sum(consistent_counts > 0), sum(consistent_counts == 0)),
  c(sum(opposite_counts > 0), sum(opposite_counts == 0))
)

# Chi-square test of independence on that table.
chi_square_test_result <- chisq.test(contingency_table)

# Report the p-value.
cat("Chi-Square Test p-value:", chi_square_test_result$p.value, "\n")


In [None]:
# Table of the consistent junctions with their gene names.
# NOTE(review): `mean_p_value` is a placeholder here -- it carries the
# Paired_Slope values, not p-values ("for now" in the original note).
consistent_p_values <- select(
  mutate(
    filter(slope_data_consistent, Consistent == TRUE),
    gene_name = featureData[X, 2]
  ),
  X, gene_name, mean_p_value = Paired_Slope
)
head(consistent_p_values)

In [None]:

# Table of the opposite junctions with their gene names.
# NOTE(review): `mean_p_value` is a placeholder here -- it carries the
# Paired_Slope values, not p-values ("for now" in the original note).
opposite_p_values <- select(
  mutate(
    filter(slope_data_opposite, Opposite == TRUE),
    gene_name = featureData[X, 2]
  ),
  X, gene_name, mean_p_value = Paired_Slope
)
head(opposite_p_values)

In [None]:
# Load necessary libraries
library(ggplot2)
library(dplyr)

# Rebuild the consistent-test table from the stored test results and flag the
# rows whose t-test p-value is below 0.05.
consistent_p_values <- mutate(
  stat_results_consistent_df,
  Significant = ifelse(mean_p_value < 0.05, "Yes", "No")
)

# Mean, median and standard deviation of both p-value columns.
summary_stats_consistent <- summarize(
  consistent_p_values,
  mean_p_value_mean = mean(mean_p_value),
  median_p_value_mean = median(mean_p_value),
  sd_p_value_mean = sd(mean_p_value),
  mean_p_value_median = mean(median_p_value),
  median_p_value_median = median(median_p_value),
  sd_p_value_median = sd(median_p_value)
)

print(summary_stats_consistent)

In [None]:
# Visualize the distribution of mean p-values (dashed line marks 0.05)
ggplot(consistent_p_values, aes(x = mean_p_value)) +
  geom_histogram(binwidth = 0.01, fill = "skyblue", color = "black") +
  geom_vline(xintercept = 0.05, linetype = "dashed", color = "red") +
  labs(title = "Histogram of Mean P-values for Consistent Test", x = "Mean P-value", y = "Frequency")

# Visualize the distribution of median p-values (dashed line marks 0.05)
ggplot(consistent_p_values, aes(x = median_p_value)) +
  geom_histogram(binwidth = 0.01, fill = "lightgreen", color = "black") +
  geom_vline(xintercept = 0.05, linetype = "dashed", color = "red") +
  labs(title = "Histogram of Median P-values for Consistent Test", x = "Median P-value", y = "Frequency")

# QQ plot for mean p-values
ggplot(consistent_p_values, aes(sample = mean_p_value)) +
  geom_qq() +
  geom_qq_line() +
  labs(title = "QQ Plot of Mean P-values for Consistent Test")

# QQ plot for median p-values
ggplot(consistent_p_values, aes(sample = median_p_value)) +
  geom_qq() +
  geom_qq_line() +
  labs(title = "QQ Plot of Median P-values for Consistent Test")

# Identify significant genes based on mean p-values.
# A table built from stat_results_consistent_df has no `X` column, so a plain
# select(X, ...) stops with an error. any_of("X") keeps the ID column when the
# table has one and skips it otherwise.
significant_genes_mean_consistent <- consistent_p_values %>%
  filter(mean_p_value < 0.05) %>%
  select(any_of("X"), gene_name, mean_p_value)



In [None]:
# Number of rows in the consistent slope table, then a tabulation of the
# Consistent flag and the number of distinct values it takes.
length(slope_data_consistent[["Consistent"]])
consistent_table <- table(slope_data_consistent[["Consistent"]])
dim(consistent_table)

In [None]:

# Table of the opposite junctions with their gene names.
# NOTE(review): `mean_p_value` is a placeholder -- it carries the Paired_Slope
# values, so the 0.05 threshold below is applied to slopes, not p-values.
opposite_p_values <- select(
  mutate(
    filter(slope_data_opposite, Opposite == TRUE),
    gene_name = featureData[X, 2]
  ),
  X, gene_name, mean_p_value = Paired_Slope
)

# Rows whose placeholder value is below 0.05.
significant_genes_mean_opposite <- select(
  filter(opposite_p_values, mean_p_value < 0.05),
  X, gene_name, mean_p_value
)

# Report how many rows passed the threshold.
cat("Number of significant junctions based on mean p-value (opposite test):", nrow(significant_genes_mean_opposite), "\n")

# Export the selection.
write.csv(x = significant_genes_mean_opposite, file = "significant_genes_mean_opposite.csv", row.names = FALSE)

# 2 x 2 table: rows = consistent / opposite, columns = significant / rest.
# NOTE(review): the "rest" counts assume the significant tables are subsets of
# the junction tables; confirm, since a negative count would make
# chisq.test() fail.
contingency_table <- rbind(
  c(
    nrow(significant_genes_mean_consistent),
    nrow(consistent_junctions) - nrow(significant_genes_mean_consistent)
  ),
  c(
    nrow(significant_genes_mean_opposite),
    nrow(opposite_junctions) - nrow(significant_genes_mean_opposite)
  )
)

# Chi-square test of independence on that table.
chi_square_test_result <- chisq.test(contingency_table)

# Report the p-value.
cat("Chi-Square Test p-value:", chi_square_test_result$p.value, "\n")

In [None]:
# Table of the consistent junctions with their gene names.
# NOTE(review): `mean_p_value` is a placeholder -- it carries the Paired_Slope
# values, so the 0.05 threshold below is applied to slopes, not p-values.
consistent_p_values <- select(
  mutate(
    filter(slope_data_consistent, Consistent == TRUE),
    gene_name = featureData[X, 2]
  ),
  X, gene_name, mean_p_value = Paired_Slope
)

# Consistent rows whose placeholder value is below 0.05 (gene_name is looked
# up once more after filtering).
significant_genes_mean_consistent <- select(
  mutate(
    filter(consistent_p_values, mean_p_value < 0.05),
    gene_name = featureData[X, 2]
  ),
  X, gene_name, mean_p_value
)

# Opposite rows whose value is below 0.05, from the current opposite_p_values.
significant_genes_mean_opposite <- select(
  filter(opposite_p_values, mean_p_value < 0.05),
  X, gene_name, mean_p_value
)

# Report how many consistent rows passed the threshold.
cat("Number of significant junctions based on mean p-value (consistent test):", nrow(significant_genes_mean_consistent), "\n")

# Export the consistent selection.
write.csv(x = significant_genes_mean_consistent, file = "significant_genes_mean_consistent.csv", row.names = FALSE)


In [None]:
# Table of the opposite junctions with their gene names.
# NOTE(review): `mean_p_value` is a placeholder -- it carries the Paired_Slope
# values, so the 0.05 threshold below is applied to slopes, not p-values.
opposite_p_values <- select(
  mutate(
    filter(slope_data_opposite, Opposite == TRUE),
    gene_name = featureData[X, 2]
  ),
  X, gene_name, mean_p_value = Paired_Slope
)

# Rows whose placeholder value is below 0.05.
significant_genes_mean_opposite <- select(
  filter(opposite_p_values, mean_p_value < 0.05),
  X, gene_name, mean_p_value
)

# Report how many rows passed the threshold.
cat("Number of significant junctions based on mean p-value (opposite test):", nrow(significant_genes_mean_opposite), "\n")

# Export the selection.
write.csv(x = significant_genes_mean_opposite, file = "significant_genes_mean_opposite.csv", row.names = FALSE)

# 2 x 2 table: rows = consistent / opposite, columns = significant / rest.
contingency_table <- rbind(
  c(
    nrow(significant_genes_mean_consistent),
    nrow(consistent_junctions) - nrow(significant_genes_mean_consistent)
  ),
  c(
    nrow(significant_genes_mean_opposite),
    nrow(opposite_junctions) - nrow(significant_genes_mean_opposite)
  )
)

# Chi-square test of independence on that table.
chi_square_test_result <- chisq.test(contingency_table)

# Report the p-value.
cat("Chi-Square Test p-value:", chi_square_test_result$p.value, "\n")

In [None]:
# Table of the opposite junctions with their gene names.
# NOTE(review): `mean_p_value` is a placeholder -- it carries the Paired_Slope
# values, so the 0.05 threshold below is applied to slopes, not p-values.
opposite_p_values <- select(
  mutate(
    filter(slope_data_opposite, Opposite == TRUE),
    gene_name = featureData[X, 2]
  ),
  X, gene_name, mean_p_value = Paired_Slope
)

# Rows whose placeholder value is below 0.05.
significant_genes_mean_opposite <- select(
  filter(opposite_p_values, mean_p_value < 0.05),
  X, gene_name, mean_p_value
)

# Report how many rows passed the threshold.
cat("Number of significant junctions based on mean p-value (opposite test):", nrow(significant_genes_mean_opposite), "\n")

# Export the selection.
write.csv(x = significant_genes_mean_opposite, file = "significant_genes_mean_opposite.csv", row.names = FALSE)

# 2 x 2 table: rows = consistent / opposite, columns = significant / rest.
contingency_table <- rbind(
  c(
    nrow(significant_genes_mean_consistent),
    nrow(consistent_junctions) - nrow(significant_genes_mean_consistent)
  ),
  c(
    nrow(significant_genes_mean_opposite),
    nrow(opposite_junctions) - nrow(significant_genes_mean_opposite)
  )
)

# Chi-square test of independence on that table.
chi_square_test_result <- chisq.test(contingency_table)

# Report the p-value.
cat("Chi-Square Test p-value:", chi_square_test_result$p.value, "\n")

In [None]:
# IDs of the rows flagged as consistent / opposite in the slope tables.
consistent_junctions <- select(filter(slope_data_consistent, Consistent == TRUE), X)
opposite_junctions <- select(filter(slope_data_opposite, Opposite == TRUE), X)

# Consistent junctions together with the gene name looked up in featureData.
# NOTE(review): `mean_p_value` is a placeholder -- it carries the Paired_Slope
# values, not p-values ("for now" in the original note).
consistent_p_values <- select(
  mutate(
    filter(slope_data_consistent, Consistent == TRUE),
    gene_name = featureData[X, 2]
  ),
  X, gene_name, mean_p_value = Paired_Slope
)

In [None]:

# NOTE(review): throughout this cell `mean_p_value` is a placeholder that
# carries Paired_Slope values, and every "median" plot or table reuses that
# same column, so "mean" and "median" outputs are identical by construction.

# Histogram of the `mean_p_value` column; the dashed red line marks 0.05.
ggplot(data = consistent_p_values, mapping = aes(x = mean_p_value)) +
  geom_histogram(binwidth = 0.01, color = "black", fill = "skyblue") +
  geom_vline(xintercept = 0.05, color = "red", linetype = "dashed") +
  labs(
    title = "Histogram of Mean P-values for Consistent Test",
    x = "Mean P-value",
    y = "Frequency"
  )

# Same column again, labelled as the median p-values.
ggplot(data = consistent_p_values, mapping = aes(x = mean_p_value)) +
  geom_histogram(binwidth = 0.01, color = "black", fill = "lightgreen") +
  geom_vline(xintercept = 0.05, color = "red", linetype = "dashed") +
  labs(
    title = "Histogram of Median P-values for Consistent Test",
    x = "Median P-value",
    y = "Frequency"
  )

# QQ plot of the `mean_p_value` column.
ggplot(data = consistent_p_values, mapping = aes(sample = mean_p_value)) +
  geom_qq() +
  geom_qq_line() +
  labs(title = "QQ Plot of Mean P-values for Consistent Test")

# Same column again, labelled as the median p-values.
ggplot(data = consistent_p_values, mapping = aes(sample = mean_p_value)) +
  geom_qq() +
  geom_qq_line() +
  labs(title = "QQ Plot of Median P-values for Consistent Test")

# Consistent rows whose value is below 0.05 ...
significant_genes_mean_consistent <- select(
  filter(consistent_p_values, mean_p_value < 0.05),
  X, gene_name, mean_p_value
)

# ... and the same rows with the column renamed to median_p_value.
significant_genes_median_consistent <- select(
  filter(consistent_p_values, mean_p_value < 0.05),
  X, gene_name, median_p_value = mean_p_value
)

# Report how many consistent rows passed the threshold.
cat("Number of significant junctions based on mean p-value (consistent test):", nrow(significant_genes_mean_consistent), "\n")
cat("Number of significant junctions based on median p-value (consistent test):", nrow(significant_genes_median_consistent), "\n")

# Export the consistent selections.
write.csv(x = significant_genes_mean_consistent, file = "significant_genes_mean_consistent.csv", row.names = FALSE)
write.csv(x = significant_genes_median_consistent, file = "significant_genes_median_consistent.csv", row.names = FALSE)

# Opposite junctions with their gene names (same placeholder column).
opposite_p_values <- select(
  mutate(
    filter(slope_data_opposite, Opposite == TRUE),
    gene_name = featureData[X, 2]
  ),
  X, gene_name, mean_p_value = Paired_Slope
)

# Opposite rows whose value is below 0.05 ...
significant_genes_mean_opposite <- select(
  filter(opposite_p_values, mean_p_value < 0.05),
  X, gene_name, mean_p_value
)

# ... and the same rows with the column renamed to median_p_value.
significant_genes_median_opposite <- select(
  filter(opposite_p_values, mean_p_value < 0.05),
  X, gene_name, median_p_value = mean_p_value
)

# Report how many opposite rows passed the threshold.
cat("Number of significant junctions based on mean p-value (opposite test):", nrow(significant_genes_mean_opposite), "\n")
cat("Number of significant junctions based on median p-value (opposite test):", nrow(significant_genes_median_opposite), "\n")

# Export the opposite selections.
write.csv(x = significant_genes_mean_opposite, file = "significant_genes_mean_opposite.csv", row.names = FALSE)
write.csv(x = significant_genes_median_opposite, file = "significant_genes_median_opposite.csv", row.names = FALSE)

# 2 x 2 table: rows = consistent / opposite, columns = significant / rest.
contingency_table <- rbind(
  c(
    nrow(significant_genes_mean_consistent),
    nrow(consistent_junctions) - nrow(significant_genes_mean_consistent)
  ),
  c(
    nrow(significant_genes_mean_opposite),
    nrow(opposite_junctions) - nrow(significant_genes_mean_opposite)
  )
)

# Chi-square test of independence on that table.
chi_square_test_result <- chisq.test(contingency_table)

# Report the p-value.
cat("Chi-Square Test p-value:", chi_square_test_result$p.value, "\n")
