# Drop entries duplicated across the nuc and cyto classes of WMR_Datasets

In [5]:
import pandas as pd

In [6]:
# Input tables (comma-separated), one per localisation class.
nuc_f = "../Original_data/lncRNA_info_nuc_woRNALocate.csv"
cyto_f = "../Original_data/lncRNA_info_cyto_woRNALocate.csv"

# Read both tables; index_col=False keeps the first column as ordinary data
# instead of promoting it to the index.
dataset_nuc = pd.read_csv(nuc_f, sep=",", index_col=False)    # 1986
dataset_cyto = pd.read_csv(cyto_f, sep=",", index_col=False)  # 1806

# Class label: nuclear = 1, cytosol = 0.
dataset_nuc["tag"] = 1
dataset_cyto["tag"] = 0

# Stack the nuclear rows above the cytosolic rows, then keep only the first
# occurrence of every (gene id, name, cDNA) triple. Because the nuclear rows
# come first, a triple present in both classes keeps its nuclear row.
dataset = pd.concat([dataset_nuc, dataset_cyto])  # 3792
dataset = dataset.drop_duplicates(
    subset=["ensembl_gene_id", "name", "cdna"],
    keep="first",
)  # 3792

# Export the first three columns of each class as tab-separated temp files;
# the %%bash cells further down read these files.
is_nuclear = dataset["tag"] == 1
nuc_df = dataset.loc[is_nuclear].iloc[:, 0:3]
nuc_df.to_csv("./lncRNA_info_nuc_woRNALocate.tsv.temp", sep="\t", index=False)  # 1986

is_cytosolic = dataset["tag"] == 0
cyto_df = dataset.loc[is_cytosolic].iloc[:, 0:3]
cyto_df.to_csv("./lncRNA_info_cyto_woRNALocate.tsv.temp", sep="\t", index=False)  # 1806

In [7]:
# Sequence length = number of characters in each cDNA string.
dataset["length"] = dataset["cdna"].apply(len)

# Write the combined table (including the new "length" column) before showing
# the summary: a notebook cell only displays the value of its LAST expression,
# so describe() has to come last for its result to be rendered.
dataset.to_csv("lncRNA_info_total_len.tsv.temp", sep="\t", index=False)

# Distribution of sequence lengths, with the 5th/25th/75th/95th percentiles
# requested explicitly (pandas adds the median on its own).
dataset["length"].describe(percentiles=[.05, .25, .75, .95])

count      3792.000000
mean       1506.691983
std        3894.911318
min          68.000000
5%          382.550000
25%         562.000000
50%         852.000000
75%        1937.000000
95%        3825.250000
max      205012.000000
Name: length, dtype: float64

# Convert gene id to transcript id

In [8]:
%%bash

# Replace the gene-id column of the nuc / cyto / total tables with a transcript
# id looked up in the major-transcript list, and write the final .tsv files.

# file path
Nuc="./lncRNA_info_nuc_woRNALocate.tsv.temp"
Cyto="./lncRNA_info_cyto_woRNALocate.tsv.temp"
Total="./lncRNA_info_total_len.tsv.temp"
Major_trans="../Original_data/gencode.v30.lncRNA_transcripts_major.txt"

# lookup_transcripts TABLE OUT
#   Writes OUT as the header "ensembl_transcript_id" followed by exactly one
#   line per data row of TABLE (TABLE's own header line is skipped).
#   Each line is the text before the first '.' of the first line of
#   $Major_trans that contains the row's gene id (column 1 of TABLE), or an
#   empty line when nothing matches.
#   One line per row is essential: OUT is later joined to TABLE with `paste`,
#   which pairs lines purely by position.
lookup_transcripts() {
    local table="$1" out="$2" line gene_id temp
    echo "ensembl_transcript_id" > "$out"
    # IFS= / -r: read each row verbatim (no whitespace trimming, no backslash
    # interpretation).
    sed '1d' "$table" | while IFS= read -r line
    do
        gene_id=$(printf '%s\n' "$line" | cut -f1)
        # sed -n '1p' keeps only the first hit so several matches cannot
        # produce several output lines for a single input row.
        temp=$(grep -- "$gene_id" "$Major_trans" | cut -d. -f1 | sed -n '1p')
        printf '%s\n' "$temp" >> "$out"
    done
}

# get transcript id for nuc-lncRNA
lookup_transcripts "$Nuc" Nuc_trans.temp

# lncRNA_nuc_woRNALocate_transcript.txt
# -F'\t': split on tabs only. With awk's default whitespace splitting an empty
# transcript id (leading tab) would be swallowed and every column would shift.
paste Nuc_trans.temp "$Nuc" | awk -F'\t' -v OFS='\t' '{print $1,$3,$4}' > lncRNA_info_nuc_woRNALocate.tsv

# get transcript id for cyto-lncRNA
lookup_transcripts "$Cyto" Cyto_trans.temp

# lncRNA_cyto_woRNALocate_transcript.txt
paste Cyto_trans.temp "$Cyto" | awk -F'\t' -v OFS='\t' '{print $1,$3,$4}' > lncRNA_info_cyto_woRNALocate.tsv

# Total table: the transcript ids in nuc-then-cyto order (the row order of
# $Total), pasted in front of columns 2-5 of $Total.
echo "ensembl_transcript_id" > Total_trans.temp
sed '1d' Nuc_trans.temp >> Total_trans.temp
sed '1d' Cyto_trans.temp >> Total_trans.temp
cut -f2-5 "$Total" | paste Total_trans.temp - > lncRNA_info_total_len.tsv

# Remove only the scratch files created by this cell. The *.tsv.temp inputs
# are left in place because the following cell reads the same three files;
# a blanket `rm *temp` would delete them (and any unrelated file whose name
# ends in "temp").
rm -f Nuc_trans.temp Cyto_trans.temp Total_trans.temp

In [4]:
%%bash

# Replace the gene-id column of the nuc / cyto / total tables with the id of
# the GENCODE transcript that belongs to the same gene AND carries the same
# sequence, and write the final .tsv files.

# file path
Nuc="./lncRNA_info_nuc_woRNALocate.tsv.temp"
Cyto="./lncRNA_info_cyto_woRNALocate.tsv.temp"
Total="./lncRNA_info_total_len.tsv.temp"
Gencode_trans_fa="./gencode.v30.lncRNA_transcripts.fa"
Gencode_trans_tab="./gencode.v30.lncRNA_transcripts.tab"
# Gene matched by id only (see lookup_transcripts).
Long_gene="ENSG00000281344"

#convert fasta to tab
seqkit fx2tab "$Gencode_trans_fa" > "$Gencode_trans_tab"

# lookup_transcripts TABLE OUT
#   Writes OUT as the header "ensembl_transcript_id" followed by exactly one
#   line per data row of TABLE (TABLE's own header line is skipped).
#   For each row, the lines of $Gencode_trans_tab containing the gene id
#   (column 1) as a whole word are narrowed to those that also contain the
#   sequence (column 3) as a whole word; the text before the first '.' of the
#   first surviving line is written, or an empty line when nothing matches.
#   One line per row is essential: OUT is later joined to TABLE with `paste`,
#   which pairs lines purely by position.
lookup_transcripts() {
    local table="$1" out="$2" line gene_id seq temp
    echo "ensembl_transcript_id" > "$out"
    # IFS= / -r: read each row verbatim (no whitespace trimming, no backslash
    # interpretation).
    sed '1d' "$table" | while IFS= read -r line
    do
        gene_id=$(printf '%s\n' "$line" | cut -f1)
        seq=$(printf '%s\n' "$line" | cut -f3)
        # The operands and '=' must be separate words: written without spaces
        # the test is a single non-empty string and is therefore always true.
        if [ "$gene_id" = "$Long_gene" ]; then    # this lncRNA is too long(200000nt)
            # Presumably the sequence is too long to be passed to grep as a
            # command-line pattern, so match on the gene id alone -- TODO confirm.
            temp=$(grep -w -- "$gene_id" "$Gencode_trans_tab" | cut -d. -f1 | sed -n '1p')
        else
            # -F: the sequence is a literal string, not a regular expression.
            # sed -n '1p': keep the first hit only -- duplications of PAR
            # (pseudoautosomal regions) can yield more than one match.
            temp=$(grep -w -- "$gene_id" "$Gencode_trans_tab" | grep -F -w -- "$seq" | cut -d. -f1 | sed -n '1p')
        fi
        printf '%s\n' "$temp" >> "$out"
    done
}

# get transcript id for nuc-lncRNA
lookup_transcripts "$Nuc" Nuc_trans.temp

# lncRNA_nuc_woRNALocate_transcript.txt
# -F'\t': split on tabs only. With awk's default whitespace splitting an empty
# transcript id (leading tab) would be swallowed and every column would shift.
paste Nuc_trans.temp "$Nuc" | awk -F'\t' -v OFS='\t' '{print $1,$3,$4}' > lncRNA_info_nuc_woRNALocate.tsv

# get transcript id for cyto-lncRNA
# Same lookup as for the nuclear table, so a cytosolic row with zero or several
# matches still contributes exactly one line.
lookup_transcripts "$Cyto" Cyto_trans.temp

# lncRNA_cyto_woRNALocate_transcript.txt
paste Cyto_trans.temp "$Cyto" | awk -F'\t' -v OFS='\t' '{print $1,$3,$4}' > lncRNA_info_cyto_woRNALocate.tsv

# Total table: the transcript ids in nuc-then-cyto order (the row order of
# $Total), pasted in front of columns 2-5 of $Total.
echo "ensembl_transcript_id" > Total_trans.temp
sed '1d' Nuc_trans.temp >> Total_trans.temp
sed '1d' Cyto_trans.temp >> Total_trans.temp
cut -f2-5 "$Total" | paste Total_trans.temp - > lncRNA_info_total_len.tsv

# Final cleanup: scratch files, the three *.tsv.temp inputs and the converted
# FASTA table. Files are named explicitly so that unrelated files whose names
# happen to end in "temp" are never deleted.
rm -f Nuc_trans.temp Cyto_trans.temp Total_trans.temp "$Nuc" "$Cyto" "$Total" "$Gencode_trans_tab"