## Cluster-Buster on SOX10-KD based library

In [7]:
# Root directory of the Cluster-Buster analysis for the SOX10-KD library.
cbust_root_dir='/staging/leuven/stg_00002/lcb/lcb_projects/CSE/SOX10-KD_library/analysis/cbust'
# Directory for the motif files, located below the analysis root.
tf_motifs_dir="${cbust_root_dir}"/data/motifs

### Make fasta file of all human sequences

In [25]:
# Concatenate all "*.noNC.fa" FASTA files of the HOMER analysis into one FASTA
# file. The output path is defined once and reused (quoted) as the redirect
# target, so the file that is written is always the file the variable names.
amplified_cbust_seq_names_fasta_filename="${cbust_root_dir}/data/human.SOX10KD.sequences.fa"
cat /staging/leuven/stg_00002/lcb/lcb_projects/CSE/SOX10-KD_library/analysis/HOMER/data/*.noNC.fa > "${amplified_cbust_seq_names_fasta_filename}"

In [26]:
# Print the number of lines of the combined FASTA file. Reading through a
# redirection avoids the extra "cat" process and still prints the count only.
wc -l < "${cbust_root_dir}/data/human.SOX10KD.sequences.fa"

6768


### Download motifs

In [None]:
# Download the motif files (".cb" files) into the motif directory.
# "wget -nc" (no-clobber) does not download a file again when it already
# exists locally, so this cell can be re-run safely.
mkdir -p "${tf_motifs_dir:?tf_motifs_dir is not set}"

# Only download when the directory change succeeded; otherwise the files
# would silently end up in whatever the current directory happens to be.
if cd "${tf_motifs_dir}" ; then
    # SOX10 dimers:
    wget -nc http://motifcollections.aertslab.org/v8/singletons/tfdimers__MD00293.cb

    # SOX10 super dimer:
    wget -nc http://motifcollections.aertslab.org/v8/singletons/transfac_pro__M08838.cb

    # Combine both SOX10 dimer motifs into one motif file, but only when both
    # source files are present and non-empty. The files are tested directly
    # instead of relying on the exit status of "wget -nc".
    if [ -s tfdimers__MD00293.cb ] && [ -s transfac_pro__M08838.cb ] ; then
        cat tfdimers__MD00293.cb transfac_pro__M08838.cb > SOX10_dimers.cb
    else
        printf 'Error: SOX10 dimer motif files are missing or empty; "SOX10_dimers.cb" was not created.\n' >&2
    fi

    # MITF super motif:
    wget -nc http://motifcollections.aertslab.org/v8/singletons/homer__RTCATGTGAC_MITF.cb
else
    printf 'Error: could not change to motif directory "%s".\n' "${tf_motifs_dir}" >&2
fi

## Score sequences with Cluster-Buster

__Create Function__

In [4]:
# Load the Cluster-Buster environment module (module version
# "20200507-GCCcore-6.4.0").
# NOTE(review): presumably this is what puts the "cbust" command used by the
# function below on the PATH — confirm on the target cluster.
module load Cluster-Buster/20200507-GCCcore-6.4.0

# Score the sequences of the amplified regions FASTA file with Cluster-Buster
# for one motif file and write the output of cbust to a file.
#
# Globals:
#   amplified_cbust_seq_names_fasta_filename (read): FASTA file to score.
# Arguments:
#   $1  cluster_buster_motif_filename: motif file passed to cbust.
#   $2  cbust_BED_output_file: file to which stdout of cbust is written.
#   $3  cluster_threshold: value passed to cbust as "-c".
#   $4  motif_threshold: value passed to cbust as "-m".
# Outputs:
#   Progress messages on stdout; usage and error messages on stderr.
# Returns:
#   0 on success; 1 on wrong number of arguments, when the FASTA filename
#   variable is not set, or when cbust fails.
score_full_amplified_regions_with_cluster_buster () {
    # Validate the call before touching any argument.
    if [ "$#" -ne 4 ] ; then
        {
            printf 'Usage:  score_full_amplified_regions_with_cluster_buster \\\n'
            printf '            cluster_buster_motif_filename \\\n'
            printf '            cbust_BED_output_file \\\n'
            printf '            cluster_threshold \\\n'
            printf '            motif_threshold\n'
        } >&2
        return 1;
    fi

    local cluster_buster_motif_filename="${1}"
    local cbust_BED_output_file="${2}"
    local cluster_threshold="${3}"
    local motif_threshold="${4}"

    # Derive the motif name from the motif filename (basename without ".cb")
    # instead of depending on a "motif_name" variable set by the caller.
    local motif_name="${cluster_buster_motif_filename##*/}"
    motif_name="${motif_name%.cb}"

    if [ -z "${amplified_cbust_seq_names_fasta_filename:-}" ] ; then
        printf 'Error: variable "amplified_cbust_seq_names_fasta_filename" is not set.\n' >&2
        return 1
    fi

    printf 'Score amplified regions with "%s": c=%s m=%s.\n' \
        "${motif_name}" \
        "${cluster_threshold}" \
        "${motif_threshold}"

    # NOTE(review): "-f 5" presumably selects the BED output format this
    # function is named after — confirm against the cbust help text.
    if ! cbust \
            -c "${cluster_threshold}" \
            -m "${motif_threshold}" \
            -r 1000000 \
            -G 1 \
            -f 5 \
            "${cluster_buster_motif_filename}" \
            "${amplified_cbust_seq_names_fasta_filename}" \
          > "${cbust_BED_output_file}" ; then
        # Do not report an output file as finished when cbust did not succeed.
        printf 'Error: Cluster-Buster failed for motif file "%s".\n' \
            "${cluster_buster_motif_filename}" >&2
        return 1
    fi

    printf 'Output Cluster-Buster BED file: %s \n' \
        "${cbust_BED_output_file}"
}

In [27]:
# Motifs to use for running cbust (one run per .cb file)
cluster_buster_motif_filenames="
   ${tf_motifs_dir}/SOX10_dimers.cb
   ${tf_motifs_dir}/homer__RTCATGTGAC_MITF.cb
"

# Define thresholds used for cluster score and motif score
# (whitespace-separated lists; every combination is run for every motif file).
cluster_score_thresholds='0.0'
motif_score_thresholds='0.0'
# Keep the previously used (misspelled) variable name as an alias so that
# anything still referring to it keeps working.
motif_score_threholds="${motif_score_thresholds}"

# Make sure the output directory exists before cbust output is redirected
# into it.
mkdir -p "${cbust_root_dir:?cbust_root_dir is not set}/results"

# Run cbust custom function
# (the list variables are intentionally unquoted: they are split on whitespace).
for cluster_buster_motif_filename in ${cluster_buster_motif_filenames} ; do
    # Get motif name from Cluster-Buster filename:
    #   - Keep basename only.
    #   - Remove ".cb" extension.
    motif_name="${cluster_buster_motif_filename##*/}"
    motif_name="${motif_name%.cb}"

    for cluster_score_threshold in ${cluster_score_thresholds} ; do
        for motif_score_threshold in ${motif_score_thresholds} ; do
            # Define Cluster-Buster output filename.
            cbust_BED_output_file="${cbust_root_dir}/results/${motif_name}.c_${cluster_score_threshold}__m_${motif_score_threshold}.bed"

            # Score sequences with Cluster-Buster with the current thresholds.
            score_full_amplified_regions_with_cluster_buster \
                "${cluster_buster_motif_filename}" \
                "${cbust_BED_output_file}" \
                "${cluster_score_threshold}" \
                "${motif_score_threshold}"
        done
    done
done

Score amplified regions with "SOX10_dimers": c=0.0 m=0.0.
Output Cluster-Buster BED file: /staging/leuven/stg_00002/lcb/lcb_projects/CSE/SOX10-KD_library/analysis/cbust/results/SOX10_dimers.c_0.0__m_0.0.bed
Score amplified regions with "homer__RTCATGTGAC_MITF": c=0.0 m=0.0.
Output Cluster-Buster BED file: /staging/leuven/stg_00002/lcb/lcb_projects/CSE/SOX10-KD_library/analysis/cbust/results/homer__RTCATGTGAC_MITF.c_0.0__m_0.0.bed

In [None]:
# Summarise every BED result file: count the lines whose 11th tab-separated
# column is "cluster" (CRMs) and those where it is "motif" (motif matches).
# The directory part is stripped from the file names and the report is sorted
# by file name (version sort).
awk \
    -F '\t' \
    '
    $11 == "cluster" { n_clusters[FILENAME]++ }
    $11 == "motif"   { n_motifs[FILENAME]++ }

    END {
        for (bed in n_clusters) {
            print bed "\tnumber of cluster\t" n_clusters[bed];
        }

        for (bed in n_motifs) {
            print bed "\tnumber of motifs\t" n_motifs[bed];
        }
    }' "${cbust_root_dir}/results"/*.bed \
  | sed "s@${cbust_root_dir}/results/@@" \
  | sort -k 1,1V

__Create file with sequence name and cluster score__

In [28]:
# For every Cluster-Buster BED result file, write a two-column file with the
# sequence name (column 7) and the cluster score (column 5), separated by a
# tab, for all lines that do not contain "motif". The output goes to
# results/cluster_score_df/ under the same file name as the input.
mkdir -p "${cbust_root_dir:?cbust_root_dir is not set}/results/cluster_score_df"

for file in "${cbust_root_dir}/results"/*.bed ; do
    # Skip the literal glob pattern when no BED file matched.
    [ -e "${file}" ] || continue

    # One awk call both filters (same as "grep -v motif") and selects columns.
    awk '!/motif/ {print $7 "\t" $5}' "${file}" \
        > "${cbust_root_dir}/results/cluster_score_df/${file##*/}"
done

chr1:19393640-19393830@@Human_restTopic11_346::chr1:19393640-19393830	8.27
chr13:54801156-54801346@@Human_restTopic11_57::chr13:54801156-54801346	7.16
chr15:68972395-68972585@@Human_restTopic11_1002::chr15:68972395-68972585	7.3
chr2:206563716-206563906@@Human_247::chr2:206563716-206563906	7.51
chr2:206563783-206563973@@Human_247::chr2:206563783-206563973	7.56
chr4:72461607-72461797@@Human_restTopic11_537::chr4:72461607-72461797	8.11
chr6:8814749-8814939@@Human_restTopic11_90::chr6:8814749-8814939	23.3
chr17:38934360-38934550@@Human_restTopic11_925::chr17:38934360-38934550	7.11
chr2:7151586-7151776@@Human_restTopic11_877::chr2:7151586-7151776	7.23
