In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import warnings

In [4]:
# Move the working directory one level up (presumably to the project root so the
# `mofiwo` imports in a later cell resolve — TODO confirm).
# NOTE(review): this is not idempotent; re-running the cell climbs one more directory each time.
os.chdir('..')
# Suppress every warning for the rest of the session; this also hides deprecation notices.
warnings.filterwarnings('ignore')

In [68]:
import re
import pickle
import pandas as pd
import numpy as np

from Bio import SeqIO
from mofiwo.connector.ensembl import read_ensembl
from mofiwo.utility.rna_handler import load_rna_fasta_zipfile
from mofiwo.utility.rna_handler import generate_3utr_from_cdna_cds

In [7]:
# Locations of the zipped FASTA inputs passed to load_rna_fasta_zipfile in a later cell.
# NOTE(review): these are absolute, machine-specific Windows paths; the notebook will not
# run elsewhere until they are made configurable.
fasta_cds_zipfile = r'D:\workspace\rnamotif\samples\s1_cds.zip'
fasta_cdna_zipfile = r'D:\workspace\rnamotif\samples\s1_cdna.zip'

In [9]:
# List the entries in the user's Downloads folder whose names end with 'pickle'.
downloads_dir = os.path.expanduser(r'~\Downloads')
pickle_files = [fname for fname in os.listdir(downloads_dir) if fname.endswith('pickle')]

In [25]:
# Load the pickled apical and basal count tables into DataFrames.
# The files are opened with `with` so the handles are closed as soon as unpickling
# finishes (the previous one-liners left both files open).
# NOTE(review): pickle.load can execute arbitrary code; only load files you produced yourself.
downloads_dir = os.path.expanduser(r'~\Downloads')
with open(os.path.join(downloads_dir, 'apical_bp_3.pickle'), 'rb') as fh:
    df_apical = pd.DataFrame(pickle.load(fh))
with open(os.path.join(downloads_dir, 'basal_bp_3.pickle'), 'rb') as fh:
    df_basal = pd.DataFrame(pickle.load(fh))

In [None]:
# Read the CDS and cDNA FASTA archives, then derive the 3'UTR records from the pair.
# NOTE(review): the helpers come from mofiwo.utility.rna_handler and are not visible here.
# Later cells use `dic_utr3` as a mapping of transcript ID -> record exposing `.seq`.
dic_cds = load_rna_fasta_zipfile(fasta_cds_zipfile)
dic_cdna = load_rna_fasta_zipfile(fasta_cdna_zipfile)
dic_utr3 = generate_3utr_from_cdna_cds(dic_cdna, dic_cds)

In [32]:
# (rows, columns) of the basal table; other cells index its columns by transcript ID.
df_basal.shape

(64, 87)

In [31]:
# Restrict the basal table to columns that also have a 3'UTR entry and show the shape;
# an unchanged shape means every column has a matching sequence.
shared_transcripts = [tid for tid in df_basal.columns if tid in dic_utr3.keys()]
df_basal[shared_transcripts].shape

(64, 87)

In [19]:
# Print every non-overlapping 'AAAAAA' match in one transcript's 3'UTR as
# "start:end - matched text" (re.finditer never reports overlapping hits).
seq_str = str(dic_utr3['ENSMUST00000087600'].seq)
for hit in re.finditer('AAAAAA', seq_str):
    begin, stop = hit.span()
    print(f'{begin}:{stop} - {hit.group()}')

88:94 - AAAAAA
2646:2652 - AAAAAA
2905:2911 - AAAAAA
2911:2917 - AAAAAA
3317:3323 - AAAAAA
3849:3855 - AAAAAA


In [24]:
# Summary statistics of the sequence length of every record in dic_utr3.
utr3_lengths = pd.Series([len(record.seq) for record in dic_utr3.values()])
utr3_lengths.describe()

count     7224.000000
mean      1717.109635
std       1798.167506
min          1.000000
25%        511.000000
50%       1187.500000
75%       2364.000000
max      39397.000000
dtype: float64

In [49]:
def _per_base_frequency(column):
    """Divide one transcript column by the length of that transcript's 3'UTR sequence."""
    return column / len(dic_utr3[column.name].seq)


# Column-wise normalisation of the basal counts by 3'UTR length.
foo = df_basal.apply(_per_base_frequency)

In [83]:
# Length of the 3'UTR sequence for one transcript, to compare with its total count below.
len(dic_utr3['ENSMUST00000087600'].seq)

3881

In [82]:
# Total of all counts recorded for the same transcript in the basal table.
df_basal['ENSMUST00000087600'].sum()

3661

In [None]:
# `itertools` was never imported in the notebook, so this cell raised NameError on a
# fresh kernel; import it here so the cell runs on its own.
import itertools

# Every 3-letter combination of the four bases (4**3 = 64 strings), in product order.
bp_all = [''.join(bases) for bases in itertools.product(['A','T','C','G'], repeat=3)]
bp_all

In [None]:
import concurrent.futures
import urllib.request

def calculate_bp_count(seq_str, bp_all, overlapping=False):
    """Count how often each motif in ``bp_all`` occurs in ``seq_str``.

    Parameters
    ----------
    seq_str : str
        Sequence to scan.
    bp_all : iterable of str
        Motifs to count. Each motif is used as a regular-expression pattern,
        exactly as before. A motif listed more than once is counted once
        (previously its count was added again for every repeat).
    overlapping : bool, default False
        When False (the original behaviour) matches are non-overlapping, as
        returned by ``re.findall`` — e.g. 'AA' occurs twice in 'AAAA'.
        When True every start position is counted — 'AA' occurs three times
        in 'AAAA'.

    Returns
    -------
    dict
        ``{motif: count}`` in first-seen order; motifs with no match map to 0.
    """
    bp_cnt = {}
    for bp in bp_all:
        if bp in bp_cnt:
            # Already counted; do not add the same occurrences a second time.
            continue
        # A zero-width lookahead matches at every start position without
        # consuming characters, which is what yields overlapping counts.
        pattern = f'(?={bp})' if overlapping else bp
        bp_cnt[bp] = len(re.findall(pattern, seq_str))
    return bp_cnt

In [None]:
# Plain-string copy of one transcript's 3'UTR sequence (re-assigns the `seq_str` used earlier).
seq_str = str(dic_utr3['ENSMUST00000087600'].seq)

In [74]:
# `bar` was not defined by any cell in the notebook, so this cell failed on a fresh kernel.
# The recorded output (e.g. 18 / 246 -> 0.07, 61 / 733 -> 0.08) shows it is the 'AAA' row of
# the length-normalised table `foo`, so it is rebuilt here.
# NOTE(review): confirm this matches the original intent of `bar`.
bar = foo.loc['AAA']
# For transcripts whose 'AAA' frequency exceeds 0.05, show
# [3'UTR length, rounded frequency, raw 'AAA' count].
{x: [len(dic_utr3[x].seq), round(bar[x], 2), df_basal.loc['AAA', x]] for x in bar[bar > 0.05].index}

{'ENSMUST00000113402': [246, 0.07, 18],
 'ENSMUST00000081840': [54, 0.06, 3],
 'ENSMUST00000160662': [733, 0.08, 61],
 'ENSMUST00000078804': [68, 0.06, 4],
 'ENSMUST00000142643': [50, 0.06, 3],
 'ENSMUST00000000756': [42, 0.07, 3],
 'ENSMUST00000106588': [36, 0.08, 3],
 'ENSMUST00000027193': [159, 0.05, 8],
 'ENSMUST00000188524': [44, 0.07, 3]}

In [84]:
# Per transcript: 3'UTR length minus three times the column total of the basal table.
# Negative values mean 3 * total exceeds the sequence length.
# NOTE(review): this dumps one value per column; consider summarising it (e.g. with .describe()).
[len(dic_utr3[x].seq) - df_basal[x].sum() * 3 for x in df_basal.columns]

[-7102,
 -3711,
 -1950,
 -1773,
 -3748,
 -23789,
 -4180,
 -10684,
 -4172,
 -3589,
 -5777,
 -5993,
 -3345,
 -4853,
 -380,
 -5043,
 -4938,
 -3970,
 -2152,
 -2638,
 -2556,
 -3815,
 -6611,
 -1501,
 -15978,
 -13928,
 -4980,
 -5453,
 -1023,
 -187,
 -80,
 -118,
 -5572,
 -4482,
 -490,
 -1208,
 -6319,
 -12341,
 -3831,
 -3375,
 -7877,
 -2019,
 -147,
 -390,
 -14712,
 -87,
 -5197,
 -1572,
 -10616,
 -3571,
 -30464,
 -72998,
 -135,
 -7501,
 -4243,
 -5205,
 -1355,
 -112,
 -3809,
 -6106,
 -328,
 -79,
 -4355,
 -2425,
 -66,
 -4793,
 -6011,
 -1177,
 -3742,
 -3070,
 -117,
 -71,
 -1314,
 -60,
 -96,
 -3365,
 -2970,
 -1815,
 -738,
 -9192,
 -1240,
 -1108,
 -396,
 -2275,
 -264,
 -79,
 -2782]

246

In [78]:
# Counts for a single transcript column, indexed by motif.
df_basal['ENSMUST00000113402']

AAA    18
AAT     6
AAC     2
AAG     6
ATA     6
       ..
GCG     1
GGA     7
GGT     5
GGC     7
GGG    11
Name: ENSMUST00000113402, Length: 64, dtype: int64

In [135]:
def generate_ranking_series(df, col_name):
    """Rank the rows of ``df`` by their row totals, largest total first.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose rows are summed across all columns.
    col_name : str
        Name given to the returned Series.

    Returns
    -------
    pandas.Series
        Integer ranks 1..n, indexed by the row labels of ``df`` in descending
        order of row total and named ``col_name``. Tied totals still receive
        distinct consecutive ranks.

    Notes
    -----
    A stable sort is requested so that the order of tied rows does not depend
    on the internals of the default sort algorithm.
    NOTE(review): with many tied totals the ranks among the ties carry no
    information; consider ``Series.rank(method='average')`` if that matters.
    """
    totals = df.sum(axis=1).sort_values(ascending=False, kind='mergesort')
    # range() yields the 1-based positions directly, without a Python-level loop.
    return pd.Series(range(1, len(totals) + 1), index=totals.index, name=col_name)

In [140]:
# Rank motifs separately in the apical and basal tables, align the two rankings on
# the shared index, and show their correlation matrix formatted to two decimals.
# NOTE(review): DataFrame.applymap is deprecated in recent pandas releases.
rank_apical = generate_ranking_series(df_apical, 'apical')
rank_basal = generate_ranking_series(df_basal, 'basal')
df_rank = pd.merge(rank_apical, rank_basal, left_index=True, right_index=True)
df_rank.corr().applymap(lambda value: '{:,.2f}'.format(value))

Unnamed: 0,apical,basal
apical,1.0,0.33
basal,0.33,1.0


In [146]:
# Split the apical columns into even- and odd-positioned halves, rank motifs in each
# half, and correlate the two rankings.
df_apical_even = df_apical[df_apical.columns[0::2]]
df_apical_odd = df_apical[df_apical.columns[1::2]]
rank_apical1 = generate_ranking_series(df_apical_even, 'apical1')
rank_apical2 = generate_ranking_series(df_apical_odd, 'apical2')
df_rank2 = pd.merge(rank_apical1, rank_apical2, left_index=True, right_index=True)
df_rank2.corr().applymap(lambda value: '{:,.2f}'.format(value))

Unnamed: 0,apical1,apical2
apical1,1.0,0.33
apical2,0.33,1.0


In [157]:
# Same split-half comparison for the basal table: even- vs odd-positioned columns.
df_basal_even = df_basal[df_basal.columns[0::2]]
df_basal_odd = df_basal[df_basal.columns[1::2]]
rank_basal1 = generate_ranking_series(df_basal_even, 'basal1')
rank_basal2 = generate_ranking_series(df_basal_odd, 'basal2')
df_rank3 = pd.merge(rank_basal1, rank_basal2, left_index=True, right_index=True)
df_rank3.corr().applymap(lambda value: '{:,.2f}'.format(value))

Unnamed: 0,basal1,basal2
basal1,1.0,0.62
basal2,0.62,1.0


In [156]:
# Cross comparison: even-positioned basal columns against odd-positioned apical columns.
rank_left = generate_ranking_series(df_basal_even, 'basal1')
rank_right = generate_ranking_series(df_apical_odd, 'apical2')
df_rank4 = pd.merge(rank_left, rank_right, left_index=True, right_index=True)
df_rank4.corr().applymap(lambda value: '{:,.2f}'.format(value))

Unnamed: 0,basal1,apical2
basal1,1.0,0.3
apical2,0.3,1.0


In [162]:
# Cross comparison: odd-positioned basal columns against even-positioned apical columns.
rank_left = generate_ranking_series(df_basal_odd, 'basal2')
rank_right = generate_ranking_series(df_apical_even, 'apical1')
df_rank5 = pd.merge(rank_left, rank_right, left_index=True, right_index=True)
df_rank5.corr().applymap(lambda value: '{:,.2f}'.format(value))

Unnamed: 0,basal2,apical1
basal2,1.0,0.46
apical1,0.46,1.0


In [161]:
# Sanity check: a ranking correlated with an identical copy of itself should give 1.00.
rank_left = generate_ranking_series(df_basal_odd, 'basal2')
rank_right = generate_ranking_series(df_basal_odd, 'basal2_dup')
df_rank6 = pd.merge(rank_left, rank_right, left_index=True, right_index=True)
df_rank6.corr().applymap(lambda value: '{:,.2f}'.format(value))

Unnamed: 0,basal2,basal2_dup
basal2,1.0,1.0
basal2_dup,1.0,1.0


In [154]:
# Row totals of the basal table, largest first.
# NOTE(review): the recorded output lists 1,048,576 ten-letter motifs, whereas earlier cells
# show a (64, 87) table of 3-letter motifs, so `df_basal` was evidently reloaded from a
# different file between runs; make that load explicit.
df_basal.sum(axis=1).sort_values(ascending=False)

TTTTTTTTTT    38
AAAAAAAAAA    36
CACACACACA    36
AGAAAGAAAG    33
AAGAAAGAAA    33
              ..
TCGCGGTTGG     0
TCGCGGTCAA     0
TCGCGGTCAT     0
TCGCGGTCAC     0
CAAAAAGAAT     0
Length: 1048576, dtype: int64