http://mirwalk.umm.uni-heidelberg.de/resources/

The binding site is determined using the RNAduplex programme from the ViennaRNA software package. RNAduplex forms intermolecular pairs and neglects the competition between intramolecular folding and hybridization. It is used as a pre-filter in the TarPmiR prediction software.

More information:
https://www.tbi.univie.ac.at/RNA/tutorial/#sec6_3

In [2]:
import pandas as pd
import numpy as np

In [3]:
db = pd.read_csv('/Users/Anna/Study/python/biopython/mirna/mirna db/hsa_miRWalk_3UTR.txt', delimiter = '\t')
db.head()

Unnamed: 0,miRNA,mRNA,Genesymbol,binding_site,duplex,binding_probability
0,hsa-let-7a-5p,NM_001256426,PDLIM5,42334272,TGAGGTAGTAGGTTGTATAGTT#AACTGAATACTTCTCCAATAGCA...,0.923077
1,hsa-let-7a-2-3p,NM_001256426,PDLIM5,32673297,CTGTACAGCCTCCTAGCTTTCC#AGTCCCATGTACTTGGGAGGCTG...,0.846154
2,hsa-let-7b-3p,NM_001256426,PDLIM5,26952717,CTATACAACCTACTGCCTTCCC#GAACCCGGGAGGGAGAGGTTGC#...,0.846154
3,hsa-let-7c-5p,NM_001256426,PDLIM5,29232946,TGAGGTAGTAGGTTGTATGGTT#ACCATATAGCTTATAAGTCTCAA...,0.923077
4,hsa-let-7c-3p,NM_001256426,PDLIM5,37163747,CTGTACAACCTTCTAGCTTTCC#GGAAAGCTTTTATTCACAGAGGT...,0.923077


In [4]:
db[db['Genesymbol']=='SPANXD']

Unnamed: 0,miRNA,mRNA,Genesymbol,binding_site,duplex,binding_probability
6709225,hsa-miR-320a-3p,NM_032417,SPANXD,634657,AAAAGCTGGGTTGAGAGGGCGA#GCTACATCTCTCAACCTTGGGCA...,0.846154
6709226,hsa-miR-377-5p,NM_032417,SPANXD,644663,AGAGGTTGCCCTTGGTGAATTC#TCAACCTTGGGCAATGACA#.(....,0.846154
6709227,hsa-miR-550a-5p,NM_032417,SPANXD,634658,AGTGCCTGAGGGAGTAAGAGCCC#GCTACATCTCTCAACCTTGGGC...,0.923077
6709228,hsa-miR-550a-3-5p,NM_032417,SPANXD,640658,AGTGCCTGAGGGAGTAAGAG#TCTCTCAACCTTGGGCAA#..((((...,0.923077
6709229,hsa-miR-320c,NM_032417,SPANXD,637657,AAAAGCTGGGTTGAGAGGGT#ACATCTCTCAACCTTGGGCA#.......,0.871795
6709230,hsa-miR-1285-5p,NM_032417,SPANXD,649677,GATCTCACTTTGTTGCCCAGG#CTTGGGCAATGACAATAAAGTTTG...,1.0
6709231,hsa-miR-320d,NM_032417,SPANXD,640657,AAAAGCTGGGTTGAGAGGA#TCTCTCAACCTTGGGCA#....((((...,0.923077
6709232,hsa-miR-711,NM_032417,SPANXD,636656,GGGACCCAGGGAGAGACGTAAG#TACATCTCTCAACCTTGGGC#.....,1.0
6709233,hsa-miR-3135a,NM_032417,SPANXD,634658,TGCCTAGGCTGAGACTGCAGTG#GCTACATCTCTCAACCTTGGGCA...,0.846154
6709234,hsa-miR-3192-5p,NM_032417,SPANXD,634648,TCTGGGAGGTTGTAGCAGTGGAA#GCTACATCTCTCAA#..(((((...,1.0


In [6]:
db_mod = db.copy()

In [7]:
db_mod['duplex'] = db_mod['duplex'].apply(lambda x: x.split('#'))

In [8]:
db_mod['d1_miRNAseq'] = db_mod['duplex'].apply(lambda x: x[0])
db_mod['d2_mRNA_bind_site'] = db_mod['duplex'].apply(lambda x: x[1])

d1 - miRNA seq

UGAGGUAGUAGGUUGUAUAGUU
TGAGGTAGTAGGTTGTATAGTT

"&" character as separator.
”.” denotes bases that are essentially unpaired
”,” weakly paired
”|”strongly paired without preference
”{},()” weakly ( >33%) upstream (downstream) paired or strongly ( >66%) up-/downstream paired bases, respectively.

In [9]:
db_mod.drop('duplex', axis=1, inplace=True)

In [10]:
df_lib = db_mod.groupby('miRNA', as_index=False)['d1_miRNAseq'].value_counts()
df_lib.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2656 entries, 0 to 2655
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   miRNA        2656 non-null   object
 1   d1_miRNAseq  2656 non-null   object
 2   count        2656 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 62.4+ KB


In [26]:
df_lib.nunique()

miRNA          2656
d1_miRNAseq    2632
count          2518
dtype: int64

In [27]:
duplicates = df_lib[df_lib.duplicated(subset='d1_miRNAseq', keep=False)==True].sort_values(by='d1_miRNAseq')
display(duplicates)
display(duplicates.info())

Unnamed: 0,miRNA,d1_miRNAseq,count
1776,hsa-miR-548t-3p,AAAAACCACAATTACTTTTGCACCA,3586
1706,hsa-miR-548aa,AAAAACCACAATTACTTTTGCACCA,3586
1772,hsa-miR-548o-5p,AAAAGTAATTGCGGTTTTTGCC,2265
1753,hsa-miR-548c-5p,AAAAGTAATTGCGGTTTTTGCC,2265
1722,hsa-miR-548am-5p,AAAAGTAATTGCGGTTTTTGCC,2265
1712,hsa-miR-548ae-5p,AAAAGTAATTGTGGTTTTTG,378
1710,hsa-miR-548ad-5p,AAAAGTAATTGTGGTTTTTG,378
1866,hsa-miR-570-5p,AAAGGTAATTGCAGTTTTTCCC,5628
1716,hsa-miR-548ai,AAAGGTAATTGCAGTTTTTCCC,5628
414,hsa-miR-199a-3p,ACAGTAGTCTGCACATTGGTTA,10394


<class 'pandas.core.frame.DataFrame'>
Int64Index: 40 entries, 1776 to 846
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   miRNA        40 non-null     object
 1   d1_miRNAseq  40 non-null     object
 2   count        40 non-null     int64 
dtypes: int64(1), object(2)
memory usage: 1.2+ KB


None

In [28]:
mirna_seq_clean = df_lib.drop_duplicates(subset='d1_miRNAseq', keep=False, ignore_index=True)
display(mirna_seq_clean)
display(mirna_seq_clean.nunique())

Unnamed: 0,miRNA,d1_miRNAseq,count
0,hsa-let-7a-2-3p,CTGTACAGCCTCCTAGCTTTCC,20958
1,hsa-let-7a-3p,CTATACAATCTACTGTCTTTC,1353
2,hsa-let-7a-5p,TGAGGTAGTAGGTTGTATAGTT,16373
3,hsa-let-7b-3p,CTATACAACCTACTGCCTTCCC,14363
4,hsa-let-7b-5p,TGAGGTAGTAGGTTGTGTGGTT,23817
...,...,...,...
2611,hsa-miR-9986,TGTGAGGTTGTCATGCCTGC,19479
2612,hsa-miR-99a-3p,CAAGCTCGCTTCTATGGGTCTG,17268
2613,hsa-miR-99a-5p,AACCCGTAGATCCGATCTTGTG,6074
2614,hsa-miR-99b-3p,CAAGCTCGTGTCTGTGGGTCCG,19293


miRNA          2616
d1_miRNAseq    2616
count          2503
dtype: int64

In [29]:
mirna_seq_clean['n'] = 8


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mirna_seq_clean['n'] = 8


In [30]:
from random import choice
def string(length):
    DNA=""
    for count in range(length):
        DNA+=choice("CGTA")
    return DNA

string(8)

'TCTCTCAA'

In [31]:
seq = []
a = string(8)
seq.append(a)

for el in range(len(mirna_seq_clean['n'].index)):
    b = string(mirna_seq_clean.loc[el]['n'])
    while b in seq:
        b = string(mirna_seq_clean.loc[el]['n']) 
    seq.append(b)

print(len(seq))

2617


In [32]:
seq_df = pd.DataFrame(seq, columns = ['barcode'])

seq_df.nunique()

barcode    2617
dtype: int64

In [33]:
final_df = pd.concat([mirna_seq_clean,seq_df], axis = 1)
display(final_df.head())
display(final_df.nunique())
display(final_df.info())

Unnamed: 0,miRNA,d1_miRNAseq,count,n,barcode
0,hsa-let-7a-2-3p,CTGTACAGCCTCCTAGCTTTCC,20958.0,8.0,CCTGGGAA
1,hsa-let-7a-3p,CTATACAATCTACTGTCTTTC,1353.0,8.0,CGGAGCAA
2,hsa-let-7a-5p,TGAGGTAGTAGGTTGTATAGTT,16373.0,8.0,GATGTGGG
3,hsa-let-7b-3p,CTATACAACCTACTGCCTTCCC,14363.0,8.0,GATCTCCT
4,hsa-let-7b-5p,TGAGGTAGTAGGTTGTGTGGTT,23817.0,8.0,ACAGGCGT


miRNA          2616
d1_miRNAseq    2616
count          2503
n                 1
barcode        2617
dtype: int64

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2617 entries, 0 to 2616
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   miRNA        2616 non-null   object 
 1   d1_miRNAseq  2616 non-null   object 
 2   count        2616 non-null   float64
 3   n            2616 non-null   float64
 4   barcode      2617 non-null   object 
dtypes: float64(2), object(3)
memory usage: 122.7+ KB


None

In [34]:
final_df.drop([2616], axis=0, inplace=True)
final_df.drop(['n'], axis=1, inplace=True)

In [35]:
final_df['len_miRNA'] = final_df['d1_miRNAseq'].apply(lambda x: len(x))

In [36]:
display(final_df.head())
display(final_df.nunique())

Unnamed: 0,miRNA,d1_miRNAseq,count,barcode,len_miRNA
0,hsa-let-7a-2-3p,CTGTACAGCCTCCTAGCTTTCC,20958.0,CCTGGGAA,22
1,hsa-let-7a-3p,CTATACAATCTACTGTCTTTC,1353.0,CGGAGCAA,21
2,hsa-let-7a-5p,TGAGGTAGTAGGTTGTATAGTT,16373.0,GATGTGGG,22
3,hsa-let-7b-3p,CTATACAACCTACTGCCTTCCC,14363.0,GATCTCCT,22
4,hsa-let-7b-5p,TGAGGTAGTAGGTTGTGTGGTT,23817.0,ACAGGCGT,22


miRNA          2616
d1_miRNAseq    2616
count          2503
barcode        2616
len_miRNA        13
dtype: int64

In [40]:
mask = ['count', 'barcode', 'len_miRNA']
mirna_seq = final_df.drop(mask, axis = 1)
mirna_seq['start_utr_spanxd'] = ('caagaagc').upper()
mirna_seq['barcode'] = 'NNNNNNNN'
mirna_seq['end_utr_spanxd'] = ('aatgacaataaagtttgagaagctga').upper()
mirna_seq['link'] = string(20)

In [41]:
mirna_seq = mirna_seq[['miRNA', 'start_utr_spanxd', 'd1_miRNAseq', 'barcode', 'end_utr_spanxd', 'link']]

In [42]:
mirna_seq

Unnamed: 0,miRNA,start_utr_spanxd,d1_miRNAseq,barcode,end_utr_spanxd,link
0,hsa-let-7a-2-3p,CAAGAAGC,CTGTACAGCCTCCTAGCTTTCC,NNNNNNNN,AATGACAATAAAGTTTGAGAAGCTGA,ACCAGTACAAGCAAATGCTA
1,hsa-let-7a-3p,CAAGAAGC,CTATACAATCTACTGTCTTTC,NNNNNNNN,AATGACAATAAAGTTTGAGAAGCTGA,ACCAGTACAAGCAAATGCTA
2,hsa-let-7a-5p,CAAGAAGC,TGAGGTAGTAGGTTGTATAGTT,NNNNNNNN,AATGACAATAAAGTTTGAGAAGCTGA,ACCAGTACAAGCAAATGCTA
3,hsa-let-7b-3p,CAAGAAGC,CTATACAACCTACTGCCTTCCC,NNNNNNNN,AATGACAATAAAGTTTGAGAAGCTGA,ACCAGTACAAGCAAATGCTA
4,hsa-let-7b-5p,CAAGAAGC,TGAGGTAGTAGGTTGTGTGGTT,NNNNNNNN,AATGACAATAAAGTTTGAGAAGCTGA,ACCAGTACAAGCAAATGCTA
...,...,...,...,...,...,...
2611,hsa-miR-9986,CAAGAAGC,TGTGAGGTTGTCATGCCTGC,NNNNNNNN,AATGACAATAAAGTTTGAGAAGCTGA,ACCAGTACAAGCAAATGCTA
2612,hsa-miR-99a-3p,CAAGAAGC,CAAGCTCGCTTCTATGGGTCTG,NNNNNNNN,AATGACAATAAAGTTTGAGAAGCTGA,ACCAGTACAAGCAAATGCTA
2613,hsa-miR-99a-5p,CAAGAAGC,AACCCGTAGATCCGATCTTGTG,NNNNNNNN,AATGACAATAAAGTTTGAGAAGCTGA,ACCAGTACAAGCAAATGCTA
2614,hsa-miR-99b-3p,CAAGAAGC,CAAGCTCGTGTCTGTGGGTCCG,NNNNNNNN,AATGACAATAAAGTTTGAGAAGCTGA,ACCAGTACAAGCAAATGCTA


In [47]:
#скачать после уточнения utr
mirna_seq.to_excel('/Users/Anna/Study/python/biopython/mirna/mirna db/final.mirna_seq.xlsx')