In [1]:
import pandas as pd
from tqdm.auto import tqdm

# Get data for the paper

In [2]:
# Input files: the TargetScan CWCS table, two miRDB result tables and the
# UTR table that is used further down to map Refseq IDs to gene symbols.
(
    path_to_ts,
    path_to_mirdb,
    path_to_mirdb_new,
    path_to_mapper,
) = (
    '/home/Mount_sda1/alexmakh/TargetScan/result_CWCS.tsv',
    '/home/Mount_sda1/alexmakh/isomiRNA/mirdb_custom_isores.tsv',
    '/home/Mount_sda1/alexmakh/isomiRNA/mirdb_custom_isores_errs.tsv',
    '/home/Mount_sda1/alexmakh/TargetScan/input/UTR_sequences_human.txt',
)

In [3]:
# Column labels shared by the tables handled in this notebook.
genes, refseq, isomir = 'Gene Symbol', 'Refseq ID', 'isomiRNA'

In [4]:
# The TargetScan file is read without a header row; its first three columns
# are labelled by position.
ts = pd.read_csv(path_to_ts, sep='\t', header=None)
ts = ts.rename(columns={0: refseq, 1: isomir, 2: 'CWCS'})

# The second column has the form '<pre-miRNA>.<isomiRNA>': split it on '.' and
# store the two parts in separate columns (isomir is overwritten with the second part).
ts[['pre-miRNA', isomir]] = ts[isomir].str.split('.').tolist()

# For each (Refseq ID, isomiRNA) pair keep only the row with the lowest CWCS.
ts = ts.sort_values(by='CWCS').drop_duplicates(subset=[refseq, isomir], keep='first')

In [5]:
# Both miRDB tables go through the same pipeline: read, order by descending
# Target Score, and keep the best-scoring row per (gene, isomiRNA) pair.
mirdb, mirdb_new = (
    pd.read_csv(tsv_path, sep='\t')
    .sort_values('Target Score', ascending=False)
    .drop_duplicates([genes, isomir], keep='first')
    for tsv_path in (path_to_mirdb, path_to_mirdb_new)
)

In [6]:
def pshape(df, name):
    """Print ``name`` followed by the ``shape`` tuple of ``df``.

    Args:
        df: any object exposing a ``shape`` attribute (a DataFrame here).
        name: label printed in front of the shape.
    """
    print(name, df.shape, sep=': ')

In [7]:
# Report the size of every table after de-duplication.
for label, frame in (('ts', ts), ('mirdb', mirdb), ('mirdb_new', mirdb_new)):
    pshape(frame, label)

ts: (101618911, 4)
mirdb: (6695459, 4)
mirdb_new: (147575, 4)


In [8]:
# Stack the two miRDB tables and, where a (gene, isomiRNA) pair occurs in both,
# keep the row with the higher Target Score.
mirdb = (
    pd.concat([mirdb, mirdb_new], axis=0)
    .sort_values('Target Score', ascending=False)
    .drop_duplicates(subset=[genes, isomir], keep='first')
)
pshape(mirdb, 'mirdb_concat')

mirdb_concat: (6843034, 4)


In [9]:
# Refseq ID -> Gene Symbol lookup: only these two columns of the UTR table are
# kept, with repeated pairs collapsed.
mapper = (
    pd.read_csv(path_to_mapper, sep='\t')
    .loc[:, [refseq, genes]]
    .drop_duplicates()
)
pshape(mapper, 'mapper')
mapper.head()

mapper: (28352, 2)


Unnamed: 0,Refseq ID,Gene Symbol
0,CDR1as,CDR1as
1,ENST00000000233.5,ARF5
2,ENST00000000412.3,M6PR
3,ENST00000001008.4,FKBP4
4,ENST00000001146.2,CYP26B1


In [10]:
# Attach a gene symbol to every TargetScan row by joining on Refseq ID (left
# join on the index), then discard the Refseq ID itself.
# NOTE(review): a Refseq ID that is absent from `mapper` yields a NaN gene
# symbol and such rows are not filtered out here - confirm this is intended.
ts_ = ts.set_index(refseq).join(mapper.set_index(refseq)).reset_index(drop=True)
pshape(ts_, 'new ts')

# The same (gene, isomiRNA) pair can now occur on several rows; keep the one
# with the lowest CWCS. The key uses the `isomir` constant, like every other
# cell, instead of a hard-coded 'isomiRNA' literal.
ts_ = (
    ts_[[isomir, 'CWCS', genes, 'pre-miRNA']]
    .sort_values('CWCS')
    .drop_duplicates([genes, isomir], keep='first')
)
pshape(ts_, 'new ts no duplicates')

ts_.head()

new ts: (101618911, 4)
new ts no duplicates: (83478977, 4)


Unnamed: 0,isomiRNA,CWCS,Gene Symbol,pre-miRNA
77892837,hsa-miR-6795-3p|+2,-94.434,FAM230A,hsa-mir-6795
77892838,hsa-miR-3196|+1,-88.142,FAM230A,hsa-mir-3196
66440011,hsa-miR-1253|+2,-87.869,ANKDD1A,hsa-mir-1253
81813054,hsa-miR-6732-3p|+2,-86.129,MUC19,hsa-mir-6732
66440012,hsa-miR-3185|0,-86.126,ANKDD1A,hsa-mir-3185


In [11]:
# Rebind instead of deep-copying: `ts_` is already a fresh frame produced by
# drop_duplicates, and nothing mutates it before the namespace is reset, so a
# second in-memory copy of the ~83M-row table is unnecessary.
ts = ts_

In [12]:
# Output directory for the tables written below. File names are appended by
# plain string concatenation, so the trailing slash is required.
path_to_save = '/home/alexmakh/huge/alexmakh/data_to_paper/'

In [13]:
# Persist the de-duplicated tables. `index=False` (the documented boolean,
# rather than a falsy None) keeps the row index out of the files.
ts.to_csv(path_to_save + 'ts_res_full.tsv', sep='\t', index=False)
mirdb.to_csv(path_to_save + 'mirdb_res_full.tsv', sep='\t', index=False)

# Subset of miRDB rows whose Target Score is at least 80.
mirdb[mirdb['Target Score'] >= 80].to_csv(path_to_save + 'mirdb_res_80_upper.tsv', sep='\t', index=False)

# How many isomiRNAs are present in one table but not in the other.
mirdb_mirs = set(mirdb[isomir])
ts_mirs = set(ts[isomir])
print('mirdb - ts mirnas:', len(mirdb_mirs - ts_mirs), 'ts - mirdb mirnas', len(ts_mirs - mirdb_mirs))

mirdb - ts mirnas: 0 ts - mirdb mirnas 117


In [14]:
# Clear the interactive namespace without asking for confirmation; the large
# frames built above are dropped and the next cells re-import and reload what
# they need from the files just written.
%reset -f

In [15]:
import pandas as pd
from tqdm.auto import tqdm
# Re-declare the constants wiped by the namespace reset.
path_to_save = '/home/alexmakh/huge/alexmakh/data_to_paper/'
genes, refseq, isomir = 'Gene Symbol', 'Refseq ID', 'isomiRNA'

In [16]:
# Reload the saved tables. The files were written without a row index, so
# index_col=0 promotes their first data column to the index (for the
# TargetScan file that column is isomiRNA).
ts, mirdb = (
    pd.read_csv(path_to_save + file_name, sep='\t', index_col=0)
    for file_name in ('ts_res_full.tsv', 'mirdb_res_80_upper.tsv')
)
ts.shape, mirdb.shape

((83478977, 3), (1751327, 3))

In [17]:
%%time
# Group both tables by isomiRNA; `counts` holds the number of miRDB rows
# available for each isomiRNA.
groupby_ts, groupby_mirdb = (table.groupby(isomir) for table in (ts, mirdb))
counts = groupby_mirdb.size()

# Empty result table, indexed by isomiRNA, with one column per overlap measure.
overlap_columns = [isomir, 'TargetScan full', 'TargetScan to mirdb size']
comparasion = pd.DataFrame({}, columns=overlap_columns).set_index(isomir)

CPU times: user 332 ms, sys: 40 ms, total: 372 ms
Wall time: 368 ms


In [18]:
# Per-isomiRNA comparison of TargetScan and miRDB targets.
#
# For every isomiRNA that has rows in both tables:
#   * take as many TargetScan rows with the lowest CWCS as miRDB has rows for
#     that isomiRNA and write them to 'ts_res_80_upper.tsv' (no header line,
#     frame index written as the first field);
#   * record the percentage of the miRDB genes that also occur among all
#     TargetScan genes ('TargetScan full') and among the size-matched
#     TargetScan genes ('TargetScan to mirdb size').
top_hits_path = path_to_save + 'ts_res_80_upper.tsv'
overlap_rows = []

# The file is opened once in 'w' mode so that re-running the cell replaces the
# previous output; appending on every call (mode='a') duplicated all rows on
# each re-run. newline='' leaves line endings to pandas.
with open(top_hits_path, 'w', newline='') as top_hits_file:
    for mir, ts_hits in tqdm(groupby_ts):
        # isomiRNAs without any row in the loaded miRDB table are skipped. An
        # explicit membership test replaces the try/except KeyError, which also
        # swallowed a KeyError raised by nsmallest (e.g. a missing 'CWCS' column).
        if mir not in counts.index:
            continue

        ts_top = ts_hits.nsmallest(counts[mir], 'CWCS')
        ts_top.to_csv(top_hits_file, sep='\t', header=False)

        mirdb_genes = set(groupby_mirdb.get_group(mir)[genes])
        ts_genes = set(ts_hits[genes])
        ts_top_genes = set(ts_top[genes])
        overlap_rows.append((
            mir,
            len(ts_genes & mirdb_genes) / len(mirdb_genes) * 100,
            len(ts_top_genes & mirdb_genes) / len(mirdb_genes) * 100,
        ))

# Build the result table in one step instead of enlarging it row by row via .loc.
comparasion = pd.DataFrame(
    overlap_rows,
    columns=[isomir, 'TargetScan full', 'TargetScan to mirdb size'],
).set_index(isomir)
comparasion.to_csv(path_to_save + 'comparasion.tsv', sep='\t')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=12258.0), HTML(value='')))




#### Prepend the header line to `ts_res_80_upper.tsv`:
|isomiRNA|CWCS|Gene Symbol|pre-miRNA|
|-|-|-|-|

In [19]:
# Insert the column names as a new first line of ts_res_80_upper.tsv, editing
# the file in place. Every execution inserts one more header line, so run this
# cell once per freshly written file.
!sed -i '1 iisomiRNA\tCWCS\tGene Symbol\tpre-miRNA' /home/alexmakh/huge/alexmakh/data_to_paper/ts_res_80_upper.tsv