# Bash 

***
###  

## 1. mRNA annotation

+ File1: gencode.v30.annotation.gtf
+ URL1: http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_30/gencode.v30.annotation.gtf.gz

+ File2: gencode.v30.pc_transcripts.fa
+ URL2: http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_30/gencode.v30.pc_transcripts.fa.gz

In [2]:
# Obtain the major isoform (transcript name ending in "-201") of each
# protein-coding mRNA, together with its cDNA sequence.

# Keep only GTF records whose gene_type AND transcript_type are both
# "protein_coding". This filters OUT nonsense_mediated_decay, non_stop_decay,
# IG_*_gene, TR_*_gene and polymorphic_pseudogene entries.
grep -w 'gene_type "protein_coding"' ./References/gencode.v30.annotation.gtf \
| grep -w 'transcript_type "protein_coding"' > ./References/gencode.v30.pc_mRNA_annotation.gtf

# Convert to genePred
gtfToGenePred ./References/gencode.v30.pc_mRNA_annotation.gtf ./References/gencode.v30.pc_mRNA_annotation.refseq -genePredExt # 83645

# Convert FASTA to a tabular file (one record per line: header <TAB> sequence)
seqkit fx2tab  ./References/gencode.v30.pc_transcripts.fa > ./References/gencode.v30.pc_transcripts.tab

# Get mRNA sequences: column 1 of the genePred file holds the transcript IDs,
# which are used as fixed-string patterns against the tabular FASTA.
cut -f1 ./References/gencode.v30.pc_mRNA_annotation.refseq > ./References/mRNA.temp
grep -F -f ./References/mRNA.temp ./References/gencode.v30.pc_transcripts.tab  > ./References/gencode.v30.pc_mRNA_transcripts.tab

# Get the major isoform of each mRNA.
# The header is '|'-delimited; this assumes the GENCODE layout in which field 5
# is the transcript name (e.g. GENE-201) and field 6 the gene name.
#   1. sort by transcript name so the lowest-numbered isoform of a gene is first
#   2. keep only the first record seen per gene name
#   3. keep that record only if its transcript NAME ends in "-201"
# Step 3 used to be `grep "201"`, which matched "201" anywhere in the line
# (IDs, lengths, UTR/CDS coordinates) and so let non-201 isoforms through.
# NOTE(review): the original count with the looser filter was 18607; re-check
# the count after this stricter filter.
sort -t '|' -k5 ./References/gencode.v30.pc_mRNA_transcripts.tab \
| awk -F '|' '!seen[$6]++ && $5 ~ /-201$/' \
> ./References/gencode.v30.pc_mRNA_transcripts_major.txt

# Compact table keyed by Ensembl gene ID.
# Tabs are stripped so the sequence directly follows the header's trailing '|'
# and becomes the last '|'-separated field ($NF). `tr` is used instead of
# `sed "s:\t::g"` because `\t` is only a tab in GNU sed.
tr -d '\t' < ./References/gencode.v30.pc_mRNA_transcripts_major.txt \
| awk -F '|' -v OFS='\t' 'BEGIN{print "ensembl_gene_id\tname\tcdna"} {print $2,$6,$NF}' \
> ./References/gencode.v30.pc_mRNA_transcripts_major_compact.txt

# Same table keyed by Ensembl transcript ID.
tr -d '\t' < ./References/gencode.v30.pc_mRNA_transcripts_major.txt \
| awk -F '|' -v OFS='\t' 'BEGIN{print "ensembl_transcript_id\tname\tcdna"} {print $1,$6,$NF}' \
> ./References/gencode.v30.pc_mRNA_transcripts_major_compact_trans_id.txt

# Remove the temporary transcript-ID list
rm ./References/mRNA.temp

In [6]:
# Step 1: full GENCODE annotation (GTF) -> extended genePred, which carries the gene name
gtfToGenePred -genePredExt \
    ./References/gencode.v30.annotation.gtf \
    ./References/gencode.v30.annotation.refseq

# Step 2: extended genePred -> BED12
genePredToBed \
    ./References/gencode.v30.annotation.refseq \
    ./References/gencode.v30.annotation.bed12

***
###  

## 2. mRNA localization resources

### 2.1 CeFra-seq
- File: 2018_CeFra_Seq_polyA_plus.tsv
- URL: https://rnajournal.cshlp.org/content/suppl/2017/10/27/rna.063172.117.DC1/Supplemental_File_3.tsv

### 2.2 APEX-seq
- File: 2019_CELL_APEXSeq.tsv
- URL: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6786773/bin/NIHMS1531987-supplement-3.xlsx

*** 