# Drop duplicate entries that appear in both the nuc and cyto classes

In [1]:
import pandas as pd

In [2]:
# Input tables, one CSV per localisation class.
cyto_f = "mRNA_info_cyto_cefra_apex.csv"
nuc_f = "mRNA_info_nuc_cefra_apex.csv"

# Read both tables; trailing numbers are the counts noted in the original run.
dataset_cyto = pd.read_csv(cyto_f, sep=",", index_col=False)  # 2924
dataset_nuc = pd.read_csv(nuc_f, sep=",", index_col=False)  # 2256

# Class label: nuclear = 1, cytosol = 0.
dataset_nuc["tag"] = 1
dataset_cyto["tag"] = 0

# Stack nuclear rows ahead of cytosolic rows, then keep only the first
# occurrence of each (gene id, name, cdna) triple -- so when an entry is
# present in both classes, the nuclear copy is the one that survives.
dataset = pd.concat([dataset_nuc, dataset_cyto])  # 5180
dataset = dataset.drop_duplicates(
    subset=["ensembl_gene_id", "name", "cdna"],
    keep="first",
)  # 5180

# Split back by class, keeping the first three columns only.
nuc_df = dataset.loc[dataset["tag"] == 1].iloc[:, :3]
cyto_df = dataset.loc[dataset["tag"] == 0].iloc[:, :3]

# Tab-separated intermediates consumed by the later bash cell.
nuc_df.to_csv("mRNA_info_nuc_cefra_apex.tsv.temp", sep="\t", index=False)  # 2256
cyto_df.to_csv("mRNA_info_cyto_cefra_apex.tsv.temp", sep="\t", index=False)  # 2924

In [3]:
# Length of every cDNA sequence, stored as an extra column of the full table.
dataset["length"] = dataset["cdna"].apply(len)

# Write the full table (including the new length column) before summarising:
# to_csv returns None, so leaving it as the final statement would make the
# cell render nothing under IPython's default display mode.
dataset.to_csv("mRNA_info_total_len_cefra_apex.tsv.temp", sep="\t", index=False)

# Last expression of the cell, so the length distribution is rendered as the
# cell output.
dataset["length"].describe(percentiles=[.05, .25, .75, .95])

count     5180.000000
mean      3434.105985
std       2590.048903
min        207.000000
5%         695.950000
25%       1623.000000
50%       2742.000000
75%       4566.250000
95%       8469.850000
max      34626.000000
Name: length, dtype: float64

# Convert gene id to transcript id

In [4]:
%%bash
# Swap the gene-id column (column 1) of each *.tsv.temp table for the
# transcript id found in the reference list, write the final *.tsv tables,
# then delete the intermediates created by this notebook.
set -eu

reference="../Original_data/gencode.v30.pc_mRNA_transcripts_major.txt"

if [ ! -r "$reference" ]; then
    echo "ERROR: cannot read reference file: $reference" >&2
    exit 1
fi

# lookup_transcript_ids INFO_TSV ID_LIST
# Writes an "ensembl_transcript_id" header to ID_LIST, then, for every gene id
# in column 1 of INFO_TSV (header row skipped), appends the text before the
# first "." of each reference line containing that id.
lookup_transcript_ids() {
    local info_tsv="$1" id_list="$2" gene_id
    echo "ensembl_transcript_id" > "$id_list"
    sed '1d' "$info_tsv" | cut -f1 | while read -r gene_id
    do
        # Quoted and matched literally (-F); "--" keeps an odd id from being
        # parsed as an option. A gene with no hit adds no line here, which the
        # row-count check below turns into an error.
        grep -F -- "$gene_id" "$reference" | cut -d. -f1 >> "$id_list"
    done
}

# require_same_rows ID_LIST INFO_TSV
# paste joins files line by line, so a gene with zero or several reference
# hits would shift every following id against its name/cdna. Abort instead of
# writing a misaligned table; the *.temp files are left in place for inspection.
require_same_rows() {
    local id_list="$1" info_tsv="$2" n_ids n_rows
    n_ids=$(wc -l < "$id_list")
    n_rows=$(wc -l < "$info_tsv")
    if (( n_ids != n_rows )); then
        echo "ERROR: $id_list has $n_ids lines but $info_tsv has $n_rows; not every gene id matched exactly one reference line" >&2
        exit 1
    fi
}

# nuc_mRNA
lookup_transcript_ids mRNA_info_nuc_cefra_apex.tsv.temp nuc_transcript_id.temp
require_same_rows nuc_transcript_id.temp mRNA_info_nuc_cefra_apex.tsv.temp
cut -f2-3 mRNA_info_nuc_cefra_apex.tsv.temp | paste nuc_transcript_id.temp - > mRNA_info_nuc_cefra_apex.tsv

# cyto_mRNA
lookup_transcript_ids mRNA_info_cyto_cefra_apex.tsv.temp cyto_transcript_id.temp
require_same_rows cyto_transcript_id.temp mRNA_info_cyto_cefra_apex.tsv.temp
cut -f2-3 mRNA_info_cyto_cefra_apex.tsv.temp | paste cyto_transcript_id.temp - > mRNA_info_cyto_cefra_apex.tsv

# total_mRNA: ids are appended nuc first, then cyto, which is the order in
# which the Python cell concatenated the two tables before writing the total.
echo "ensembl_transcript_id" > total_transcript_id.temp
sed '1d' nuc_transcript_id.temp >> total_transcript_id.temp
sed '1d' cyto_transcript_id.temp >> total_transcript_id.temp
require_same_rows total_transcript_id.temp mRNA_info_total_len_cefra_apex.tsv.temp
cut -f2-5 mRNA_info_total_len_cefra_apex.tsv.temp | paste total_transcript_id.temp - > mRNA_info_total_len_cefra_apex.tsv

# Remove only the intermediates this notebook produced, rather than every
# file in the directory whose name happens to end in "temp".
rm -f nuc_transcript_id.temp cyto_transcript_id.temp total_transcript_id.temp \
      mRNA_info_nuc_cefra_apex.tsv.temp mRNA_info_cyto_cefra_apex.tsv.temp \
      mRNA_info_total_len_cefra_apex.tsv.temp