In [24]:
input_dir <- "/home/ylee/mCross/eCLIP_PWM/"
output_dir <- "/home/ylee/mCross/eCLIP_PFM_processed/"
dir.create(output_dir, showWarnings = FALSE)

files <- list.files(input_dir, pattern = "*.mat$", full.names = TRUE)
files <- gsub("//", "/", files)

## Extract count matrix from mCross result

In [30]:
# function to extract PFM
extract_pfm <- function(file) {
    lines <- readLines(file)
    
    start_idx <- grep("^P0", lines)
    
    if (length(start_idx) > 0) {
        end_idx <- grep("^XX", lines[(start_idx + 1):length(lines)])

        if (length(end_idx) > 0) {
            end_idx <- start_idx + end_idx[1] 
            
            # Extract lines within the PFM section
            pfm_lines <- lines[(start_idx + 1):(end_idx - 1)]
            
            # Convert lines to a matrix
            pfm_matrix <- do.call(rbind, lapply(pfm_lines, function(line) {
                counts <- strsplit(line, "\\s+")[[1]][-1]   # Remove position number
                as.numeric(counts[1:4])                     # Keep only the four numeric columns
            }))
            
            return(pfm_matrix)
        }
    }
    warning(paste("No valid PFM section found in", file))
    return(NULL)
    
}

In [31]:
for (file in files) {
  pfm_matrix <- extract_pfm(file)
  
  if (!is.null(pfm_matrix)) {
    # Define output file name based on the input file name
    output_file <- file.path(output_dir, paste0(basename(file), "_PFM.txt"))
    
    # Write the matrix to a text file, space-delimited
    write.table(pfm_matrix, file = output_file, sep = " ", row.names = FALSE, col.names = FALSE, quote = FALSE)
  }
}

cat("Processing complete. PFM matrices saved in:", output_dir, "\n")

“No valid PFM section found in /home/ylee/mCross/eCLIP_PWM/HepG2.DHX30.top10.cluster.m1.00.mat”
“No valid PFM section found in /home/ylee/mCross/eCLIP_PWM/HepG2.EFTUD2.top10.cluster.m1.00.mat”
“No valid PFM section found in /home/ylee/mCross/eCLIP_PWM/HepG2.FKBP4.top10.cluster.m1.00.mat”
“No valid PFM section found in /home/ylee/mCross/eCLIP_PWM/HepG2.FUBP3.top10.cluster.m1.03.mat”
“No valid PFM section found in /home/ylee/mCross/eCLIP_PWM/HepG2.IGF2BP3.top10.cluster.m1.00.mat”
“No valid PFM section found in /home/ylee/mCross/eCLIP_PWM/HepG2.RBM15.top10.cluster.m1.00.mat”
“No valid PFM section found in /home/ylee/mCross/eCLIP_PWM/K562.AKAP8L.top10.cluster.m1.00.mat”
“No valid PFM section found in /home/ylee/mCross/eCLIP_PWM/K562.DDX42.top10.cluster.m1.00.mat”
“No valid PFM section found in /home/ylee/mCross/eCLIP_PWM/K562.DROSHA.top10.cluster.m1.00.mat”
“No valid PFM section found in /home/ylee/mCross/eCLIP_PWM/K562.FASTKD2.top10.cluster.m1.00.mat”
“No valid PFM section found in /home/

Processing complete. PFM matrices saved in: /home/ylee/mCross/eCLIP_PFM_processed/ 


## Summary

In [211]:
library(dplyr)
library(tidyr)

In [153]:
base_dir <- "/home/ylee/mCross/eCLIP_meme_cell_combined"

In [154]:
files <- list.files(base_dir, full.names = TRUE)
rbp_list <- sub("_combined.*", "", basename(files))
length(rbp_list)

In [155]:
# Extract RBP related motifs from the MEME file
extract_motifs <- function(rbp_list, database) { 
    RBP_motifs <- data.frame(
        RBP_name = character(),
        motif_id = character(),
        motif_alt_id = character()    
    )
    
    for (rbp in rbp_list){
        RBP_name <- rbp
        motif_id <- NULL
        motif_alt_id <- NULL
        
        meme_file <- paste0(base_dir, "/", rbp, "_combined_", database, ".meme")
        lines <- readLines(meme_file)

        for (line in lines) {
            if (grepl("^MOTIF", line)) {
              motif_id <- sub("MOTIF (\\S+) (\\S+)", "\\1", line)  # extract motif ID
              motif_alt_id <- sub("MOTIF (\\S+) (\\S+)", "\\2", line)  # extract motif alternative ID
            }
            if (grepl("^letter-probability matrix", line)) {
                new_row <- data.frame(
                    RBP_name = rbp,
                    motif_id = motif_id,
                    motif_alt_id = motif_alt_id
                )
                RBP_motifs <- bind_rows(RBP_motifs, new_row)
            }
        }
    }
  
  return(RBP_motifs)
}

In [156]:
RBP_motifs <- extract_motifs(rbp_list, "mCross")
head(RBP_motifs)

Unnamed: 0_level_0,RBP_name,motif_id,motif_alt_id
Unnamed: 0_level_1,<chr>,<chr>,<chr>
1,AARS,1.1,YWTDGCTAGKB
2,AARS,1.1,WSARYTCACYK
3,AARS,1.2,YNGWCATAGDW
4,AARS,1.3,DNGATCATANN
5,AARS,1.4,BHVTTWCCAVW
6,AARS,1.5,NNAAGSCCTNN


In [157]:
length(unique(RBP_motifs$RBP_name))
nrow(RBP_motifs)

In [158]:
write.csv(RBP_motifs, "/home/ylee/mCross/mCross_RBP_motifs.csv", row.names = FALSE)

## All Summary_original_db_info

In [202]:
mCross_info <- read.csv("/home/ylee/mCross/mCross_RBP_motifs.csv", header = TRUE)
CisBP_RNA_info <- read.csv("/home/ylee/CisBP-RNA/CisBP-RNA_RBP_motifs.csv", header = TRUE)
oRNAment_info <- read.csv("/home/ylee/oRNAment/oRNAment_RBP_motifs.csv", header = TRUE)
RBPDB_info <- read.csv("/home/ylee/RBPDB/RBPDB_RBP_motifs.csv", header = TRUE)

In [203]:
mCross_info <- data.frame(database = "mCross", mCross_info)     # mCross_info$database <- "mCross"
head(mCross_info, 3)
nrow(mCross_info)

Unnamed: 0_level_0,database,RBP_name,motif_id,motif_alt_id
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>
1,mCross,AARS,1.1,YWTDGCTAGKB
2,mCross,AARS,1.1,WSARYTCACYK
3,mCross,AARS,1.2,YNGWCATAGDW


In [204]:
CisBP_RNA_info <- data.frame(database = "CisBP_RNA", CisBP_RNA_info)
head(CisBP_RNA_info, 3)
nrow(CisBP_RNA_info)

Unnamed: 0_level_0,database,RBP_name,motif_id,motif_alt_id
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<chr>
1,CisBP_RNA,A1CF,1.1,WTAATTR
2,CisBP_RNA,A1CF,1.2,DTAATTV
3,CisBP_RNA,A2BP1,1.1,WGCATGM


In [205]:
oRNAment_info <- data.frame(database = "oRNAment", oRNAment_info)
head(oRNAment_info, 3)
nrow(oRNAment_info)

Unnamed: 0_level_0,database,RBP_name,motif_id,motif_alt_id
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<chr>
1,oRNAment,A1CF,1.1,NTAATTA
2,oRNAment,A1CF,1.2,VAATCAN
3,oRNAment,A1CF,1.3,SGGRCTG


In [206]:
RBPDB_info <- data.frame(database = "RBPDB", RBPDB_info)
head(RBPDB_info, 3)
nrow(RBPDB_info)

Unnamed: 0_level_0,database,RBP_name,motif_id,motif_alt_id
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<chr>
1,RBPDB,ACO1,1.0,YAGDRH
2,RBPDB,EIF4B,1.1,BTHGGAM
3,RBPDB,EIF4B,1.2,GYBGRAM


In [207]:
All_info <- rbind(mCross_info, CisBP_RNA_info, oRNAment_info, RBPDB_info)

In [208]:
nrow(All_info)

In [209]:
length(unique(All_info$RBP_name))

## All Summary

In [212]:
base_dir <- "/home/ylee/AllmotifDB/AllmotifDB_meme_combined"

In [223]:
files <- list.files(base_dir, full.names = TRUE)
rbp_list <- sub("_combined.*", "", basename(files))
length(rbp_list)

In [228]:
# Extract RBP related motifs from the MEME file
extract_motifs <- function(rbp_list) { 
    RBP_motifs <- data.frame(
        RBP_name = character(),
        motif_id = character(),
        motif_alt_id = character()    
    )
    
    for (rbp in rbp_list){
        RBP_name <- rbp
        motif_id <- NULL
        motif_alt_id <- NULL
        
        meme_file <- paste0(base_dir, "/", rbp, "_combined.meme")
        lines <- readLines(meme_file)

        for (line in lines) {
            if (grepl("^MOTIF", line)) {
              motif_id <- sub("MOTIF (\\S+) (\\S+)", "\\1", line)  # extract motif ID
              motif_alt_id <- sub("MOTIF (\\S+) (\\S+)", "\\2", line)  # extract motif alternative ID
            }
            if (grepl("^letter-probability matrix", line)) {
                new_row <- data.frame(
                    RBP_name = rbp,
                    motif_id = motif_id,
                    motif_alt_id = motif_alt_id
                )
                RBP_motifs <- bind_rows(RBP_motifs, new_row)
            }
        }
    }
  
  return(RBP_motifs)
}

In [229]:
RBP_motifs <- extract_motifs(rbp_list)
head(RBP_motifs)

Unnamed: 0_level_0,RBP_name,motif_id,motif_alt_id
Unnamed: 0_level_1,<chr>,<chr>,<chr>
1,A1CF,1.1.1,WTAATTR
2,A1CF,1.1.2,NTAATTA
3,A1CF,1.2.1,DTAATTV
4,A1CF,1.2.2,VAATCAN
5,A1CF,1.3,SGGRCTG
6,A1CF,1.4,AGHTCGG


In [220]:
nrow(RBP_motifs)

In [221]:
length(unique(RBP_motifs$RBP_name))

## Merged information of RBP motifs

In [230]:
All_info_sorted <- All_info[order(All_info$RBP_name, All_info$motif_alt_id), ]
RBP_motifs_sorted <- RBP_motifs[order(RBP_motifs$RBP_name, RBP_motifs$motif_alt_id), ]

In [236]:
head(All_info_sorted)
head(RBP_motifs_sorted)

Unnamed: 0_level_0,database,RBP_name,motif_id,motif_alt_id
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>
1308,oRNAment,A1CF,1.5,AATAWAV
1307,oRNAment,A1CF,1.4,AGHTCGG
1073,CisBP_RNA,A1CF,1.2,DTAATTV
1304,oRNAment,A1CF,1.1,NTAATTA
1306,oRNAment,A1CF,1.3,SGGRCTG
1305,oRNAment,A1CF,1.2,VAATCAN


Unnamed: 0_level_0,RBP_name,motif_id,motif_alt_id
Unnamed: 0_level_1,<chr>,<chr>,<chr>
7,A1CF,1.5,AATAWAV
6,A1CF,1.4,AGHTCGG
3,A1CF,1.2.1,DTAATTV
2,A1CF,1.1.2,NTAATTA
5,A1CF,1.3,SGGRCTG
4,A1CF,1.2.2,VAATCAN


In [234]:
merged_df <- cbind(All_info_sorted[ , -c(3, 4)], RBP_motifs_sorted[ , -1])

In [235]:
merged_df

Unnamed: 0_level_0,database,RBP_name,motif_id,motif_alt_id
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>
1308,oRNAment,A1CF,1.5,AATAWAV
1307,oRNAment,A1CF,1.4,AGHTCGG
1073,CisBP_RNA,A1CF,1.2.1,DTAATTV
1304,oRNAment,A1CF,1.1.2,NTAATTA
1306,oRNAment,A1CF,1.3,SGGRCTG
1305,oRNAment,A1CF,1.2.2,VAATCAN
1072,CisBP_RNA,A1CF,1.1.1,WTAATTR
1309,oRNAment,A1CF,1.6,WTAATTR
1075,CisBP_RNA,A2BP1,1.2,TGCATG
1074,CisBP_RNA,A2BP1,1.1,WGCATGM


In [237]:
write.csv(merged_df, "/home/ylee/AllmotifDB/All_RBP_motifs.csv", row.names = FALSE)