In [1]:
# Read the matrix list and replace double tabs with a single tab so every
# line can be split on a single "\t" separator.
file_content <- readLines("/home/ylee/RBPDB/PFMs/matrix_list.txt")
file_content_clean <- gsub("\t{2}", "\t", file_content)

# Parse the cleaned lines straight from memory. This avoids the temporary
# file round trip, so nothing is left on disk if parsing fails part-way.
file_info <- read.table(
    text = file_content_clean,
    header = FALSE,
    sep = "\t"
)

In [2]:
# Discard the fifth column first, then label the four remaining fields.
file_info <- file_info[, -5]
names(file_info) <- c("motif_id", "num", "RBP_name", "RBP_domain")
head(file_info)

Unnamed: 0_level_0,motif_id,num,RBP_name,RBP_domain
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<chr>
1,1216_19457263,14.192382,KHDRBS3,KHx1
2,149_16041388,12.412947,QKI,KHx1
3,1176_19561594,8.20637,Vts1,SAMx1
4,662_1717938,11.815652,SNRPA,RRMx2
5,950_7908267,8.432399,PABPC1,RRMx4
6,1173_19561594,6.607919,SFRS1,RRMx2


In [4]:
library(AnnotationDbi)
library(org.Hs.eg.db) # Human gene database
library(org.Mm.eg.db) # Mouse gene database

In [5]:
# Build the list of accepted identifiers per organism package:
# every SYMBOL key followed by every ENSEMBL key.
valid_human_keys <- c(
    keys(org.Hs.eg.db, keytype = "SYMBOL"),
    keys(org.Hs.eg.db, keytype = "ENSEMBL")
)
valid_mouse_keys <- c(
    keys(org.Mm.eg.db, keytype = "SYMBOL"),
    keys(org.Mm.eg.db, keytype = "ENSEMBL")
)

# Flag each RBP_name that appears verbatim in the human / mouse key lists.
# NOTE(review): the comparison is an exact, case-sensitive string match.
file_info[["valid_human"]] <- is.element(file_info[["RBP_name"]], valid_human_keys)
file_info[["valid_mouse"]] <- is.element(file_info[["RBP_name"]], valid_mouse_keys)

head(file_info)

Unnamed: 0_level_0,motif_id,num,RBP_name,RBP_domain,valid_human,valid_mouse
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<chr>,<lgl>,<lgl>
1,1216_19457263,14.192382,KHDRBS3,KHx1,True,False
2,149_16041388,12.412947,QKI,KHx1,True,False
3,1176_19561594,8.20637,Vts1,SAMx1,False,False
4,662_1717938,11.815652,SNRPA,RRMx2,True,False
5,950_7908267,8.432399,PABPC1,RRMx4,True,False
6,1173_19561594,6.607919,SFRS1,RRMx2,False,False


In [6]:
# Transpose the PFM of every RBP flagged as a valid human or mouse key and
# write it, tab-separated, into the species-filtered output directory.
input_dir <- "/home/ylee/RBPDB/PFMs/"
output_dir <- "/home/ylee/RBPDB/RBPDB_PFM_species_filtered/"

# Pick the rows to export up front. Iterating over this index vector (rather
# than 1:nrow(file_info)) means an empty table yields zero iterations instead
# of visiting the non-existent rows 1 and 0.
keep_rows <- which(file_info$valid_human | file_info$valid_mouse)

for (i in keep_rows) {
    # Both directory strings already end in "/", so plain concatenation
    # produces the full path.
    input_file <- paste0(input_dir, file_info[i, "motif_id"], ".pfm")
    output_file <- paste0(output_dir, file_info[i, "RBP_name"], "_PWM_", file_info[i, "motif_id"], ".txt")

    motif_data <- read.table(input_file, header = FALSE, sep = "", stringsAsFactors = FALSE)
    matrix_data <- as.data.frame(t(motif_data))          # transpose data

    write.table(matrix_data, output_file, sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE)
    print(paste("Saved:", output_file))
}

[1] "Saved: /home/ylee/RBPDB/RBPDB_PFM_species_filtered/KHDRBS3_PWM_1216_19457263.txt"
[1] "Saved: /home/ylee/RBPDB/RBPDB_PFM_species_filtered/QKI_PWM_149_16041388.txt"
[1] "Saved: /home/ylee/RBPDB/RBPDB_PFM_species_filtered/SNRPA_PWM_662_1717938.txt"
[1] "Saved: /home/ylee/RBPDB/RBPDB_PFM_species_filtered/PABPC1_PWM_950_7908267.txt"
[1] "Saved: /home/ylee/RBPDB/RBPDB_PFM_species_filtered/ZFP36_PWM_221_12324455.txt"
[1] "Saved: /home/ylee/RBPDB/RBPDB_PFM_species_filtered/EIF4B_PWM_350_8846295.txt"
[1] "Saved: /home/ylee/RBPDB/RBPDB_PFM_species_filtered/SNRPA_PWM_1175_19561594.txt"
[1] "Saved: /home/ylee/RBPDB/RBPDB_PFM_species_filtered/RBMY1A1_PWM_1052_17318228.txt"
[1] "Saved: /home/ylee/RBPDB/RBPDB_PFM_species_filtered/NOVA2_PWM_682_10811881.txt"
[1] "Saved: /home/ylee/RBPDB/RBPDB_PFM_species_filtered/ELAVL1_PWM_1170_19561594.txt"
[1] "Saved: /home/ylee/RBPDB/RBPDB_PFM_species_filtered/RBMY1A1_PWM_1053_17318228.txt"
[1] "Saved: /home/ylee/RBPDB/RBPDB_PFM_species_filtered/PTBP1_PWM_11

## Summary_original_info
##### Summary that includes the original database information

In [16]:
# Directory whose files are listed (list.files) to build this summary.
base_dir <- "/home/ylee/RBPDB/RBPDB_meme"

In [17]:
# List every file in base_dir and keep both its full path and its file name.
files <- list.files(base_dir, full.names = TRUE)

file_info <- data.frame(
    full_path = files,
    base_name = basename(files),
    stringsAsFactors = FALSE
)

# RBP name: everything before "_PWM" in the file name.
file_info$RBP_name <- sub("_PWM.*", "", file_info$base_name)
# Motif ID: what follows "PWM_", minus the ".meme" extension. The dot is
# escaped and the pattern anchored to the end so only a literal trailing
# ".meme" is removed (an unescaped "." would match any character).
file_info$motif_id <- sub("\\.meme$", "", sub(".*PWM_", "", file_info$base_name))

In [18]:
# Read the motif name from each file: take the line starting with "MOTIF" and
# remove the leading "MOTIF <number> " to keep the name.
# vapply over the paths handles an empty table (no iterations) and guarantees
# exactly one string per file.
file_info$motif_alt_id <- vapply(
    file_info$full_path,
    function(meme_path) {
        motif_line <- grep("^MOTIF", readLines(meme_path), value = TRUE)
        # Fail with a clear message instead of an opaque replacement-length
        # error when a file has no MOTIF line or more than one.
        if (length(motif_line) != 1) {
            stop(
                "Expected exactly one MOTIF line in ", meme_path,
                ", found ", length(motif_line),
                call. = FALSE
            )
        }
        sub("^MOTIF \\d+\\s+", "", motif_line)
    },
    character(1),
    USE.NAMES = FALSE
)

In [19]:
head(file_info)

Unnamed: 0_level_0,full_path,base_name,RBP_name,motif_id,motif_alt_id
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>
1,/home/ylee/RBPDB/RBPDB_meme/ACO1_PWM_1213_8021254.meme,ACO1_PWM_1213_8021254.meme,ACO1,1213_8021254,YAGDRH
2,/home/ylee/RBPDB/RBPDB_meme/EIF4B_PWM_350_8846295.meme,EIF4B_PWM_350_8846295.meme,EIF4B,350_8846295,BTHGGAM
3,/home/ylee/RBPDB/RBPDB_meme/EIF4B_PWM_351_8846295.meme,EIF4B_PWM_351_8846295.meme,EIF4B,351_8846295,GYBGRAM
4,/home/ylee/RBPDB/RBPDB_meme/EIF4B_PWM_352_8846295.meme,EIF4B_PWM_352_8846295.meme,EIF4B,352_8846295,GGAM
5,/home/ylee/RBPDB/RBPDB_meme/ELAVL1_PWM_1170_19561594.meme,ELAVL1_PWM_1170_19561594.meme,ELAVL1,1170_19561594,RTTW
6,/home/ylee/RBPDB/RBPDB_meme/ELAVL2_PWM_782_8497264.meme,ELAVL2_PWM_782_8497264.meme,ELAVL2,782_8497264,HTYMTTTDYWTTHN


In [20]:
# Drop the two path helper columns by name rather than by position, so that
# re-running this cell does not remove RBP_name and motif_id as well.
file_info <- file_info[, !(names(file_info) %in% c("full_path", "base_name")), drop = FALSE]
head(file_info)

Unnamed: 0_level_0,RBP_name,motif_id,motif_alt_id
Unnamed: 0_level_1,<chr>,<chr>,<chr>
1,ACO1,1213_8021254,YAGDRH
2,EIF4B,350_8846295,BTHGGAM
3,EIF4B,351_8846295,GYBGRAM
4,EIF4B,352_8846295,GGAM
5,ELAVL1,1170_19561594,RTTW
6,ELAVL2,782_8497264,HTYMTTTDYWTTHN


In [21]:
# Number of distinct RBP names in the table.
length(unique(file_info$RBP_name))

In [22]:
# Number of rows in the table.
nrow(file_info)

In [23]:
# Save the table as CSV without the row-name column.
write.csv(file_info, "/home/ylee/RBPDB/RBPDB_RBP_motifs_original_db_info.csv", row.names = FALSE)

## Summary
##### Summary that includes the FIMO annotation

In [28]:
library(dplyr)
library(tidyr)

In [29]:
# Directory whose files are listed below; also read as a global by
# extract_motifs() when it builds the path of each MEME file.
base_dir <- "/home/ylee/RBPDB/RBPDB_meme_combined"

In [30]:
# List every file in base_dir, then derive one RBP name per file by cutting
# the file name at "_combined".
files <- list.files(base_dir, full.names = TRUE)
rbp_list <- sub("_combined.*", "", basename(files))
# Number of entries in rbp_list (one per listed file).
length(rbp_list)

In [31]:
# Extract RBP related motifs from the MEME file
#
# For every entry of `rbp_list`, reads
# "<meme_dir>/<rbp>_combined_<database>.meme" and emits one row per line
# starting with "letter-probability matrix", labelled with the identifiers of
# the closest preceding line starting with "MOTIF".
#
# Arguments:
#   rbp_list  Character vector of RBP names (file name prefixes).
#   database  Database tag used in the file name.
#   meme_dir  Directory containing the files. Defaults to the global
#             `base_dir`, which is what the function always read before.
#
# Returns a data frame with character columns RBP_name, motif_id and
# motif_alt_id. An identifier missing from the MOTIF line is reported as NA.
extract_motifs <- function(rbp_list, database, meme_dir = base_dir) {
    # Zero-row template: fixes the column layout, also for an empty rbp_list.
    empty_result <- data.frame(
        RBP_name = character(),
        motif_id = character(),
        motif_alt_id = character(),
        stringsAsFactors = FALSE
    )

    parse_meme_file <- function(rbp) {
        meme_file <- file.path(meme_dir, paste0(rbp, "_combined_", database, ".meme"))
        lines <- readLines(meme_file)

        motif_at <- grep("^MOTIF", lines)
        matrix_at <- grep("^letter-probability matrix", lines)

        # Split each "MOTIF <id> [<alt id>]" line on whitespace so that a
        # missing alternative ID or irregular spacing yields NA / the right
        # token instead of the whole unparsed line.
        fields <- strsplit(trimws(lines[motif_at]), "\\s+")
        field_at <- function(k) {
            vapply(
                fields,
                function(f) if (length(f) >= k) f[k] else NA_character_,
                character(1)
            )
        }
        motif_ids <- field_at(2L)
        motif_alt_ids <- field_at(3L)

        # For each matrix line, index of the last MOTIF line above it
        # (0 = none seen yet, reported as NA).
        owner <- findInterval(matrix_at, motif_at)
        owner[owner == 0L] <- NA_integer_

        data.frame(
            RBP_name = rep(as.character(rbp), length(matrix_at)),
            motif_id = motif_ids[owner],
            motif_alt_id = motif_alt_ids[owner],
            stringsAsFactors = FALSE
        )
    }

    # Build all per-file tables first and bind once, instead of growing the
    # result row by row inside the loop.
    per_rbp <- lapply(rbp_list, parse_meme_file)
    RBP_motifs <- do.call(rbind, c(list(empty_result), per_rbp))
    rownames(RBP_motifs) <- NULL

    RBP_motifs
}

In [33]:
# Collect the motifs of every RBP in rbp_list from its "RBPDB" combined file.
RBP_motifs <- extract_motifs(rbp_list, "RBPDB")
head(RBP_motifs)

Unnamed: 0_level_0,RBP_name,motif_id,motif_alt_id
Unnamed: 0_level_1,<chr>,<chr>,<chr>
1,ACO1,1.0,YAGDRH
2,EIF4B,1.1,BTHGGAM
3,EIF4B,1.2,GYBGRAM
4,EIF4B,1.3,GGAM
5,ELAVL1,1.0,RTTW
6,ELAVL2,1.1,HTYMTTTDYWTTHN


In [34]:
# Number of distinct RBP names, then total number of rows.
length(unique(RBP_motifs$RBP_name))
nrow(RBP_motifs)

In [35]:
# Save the table as CSV without the row-name column.
write.csv(RBP_motifs, "/home/ylee/RBPDB/RBPDB_RBP_motifs.csv", row.names = FALSE)