## mRNA location classifier in the intestinal epithelium

This notebook provides an mRNA location classifier for intestinal epithelium cells. It trains an SVM (Support Vector Machine) on 392 highly expressed and significantly localised mRNA transcripts.

### STEP 1. mRNA sequence retrieval

Moor et al. found 9905 mRNAs in the intestinal epithelium. The list of mRNAs is in [this supplement](https://science.sciencemag.org/highwire/filestream/697854/field_highwire_adjunct_files/2/aan2399_table_S1.xlsx). We downloaded the mRNA sequences from Ensembl. The sequence data contain the 3'UTR, coding, and 5'UTR sequences of each transcript. The downloaded sequence files (s1_cdna.zip, s1_cds.zip) are available at [this site](https://github.com/byeungchun/rnamotif/tree/main/samples).

In [81]:
import os
import numpy as np
import pandas as pd

from mofiwo.utility import (
    load_rna_fasta_zipfile,
    generate_utr_from_cdna_cds
)

In [88]:
def retrieve_seq(
    downloadloc:str,
    cdna_file:str,
    cds_file:str)->dict:
    """Load cDNA and CDS sequences and split each entry into regions.

    Args:
        downloadloc: Directory that holds both archive files.
        cdna_file: File name of the cDNA archive inside ``downloadloc``.
        cds_file: File name of the CDS archive inside ``downloadloc``.

    Returns:
        A dict keyed by the ids returned from ``generate_utr_from_cdna_cds``.
        Each value is a dict with the keys ``'CDS'``, ``'UT3'`` and ``'UT5'``.
    """
    cdna_path = os.path.join(downloadloc, cdna_file)
    cds_path = os.path.join(downloadloc, cds_file)

    # Read both archives
    seq_cdna = load_rna_fasta_zipfile(cdna_path)
    seq_cds = load_rna_fasta_zipfile(cds_path)

    # Derive the UTR regions, then pair them with the matching CDS sequence
    utr_by_id = generate_utr_from_cdna_cds(seq_cdna, seq_cds)
    return {
        seq_id: {'CDS': seq_cds[seq_id].seq, 'UT3': utr['utr3'], 'UT5': utr['utr5']}
        for seq_id, utr in utr_by_id.items()
    }

### STEP 2. feature data generation

It applies the [RNA-GPS method](https://rnajournal.cshlp.org/content/early/2020/03/27/rna.074161.119.abstract) to generate features as input for the machine learning algorithm. Because it produces 4032 features per sequence, the calculation takes time.

In [8]:
from mofiwo.analysis import generate_feature_by_kmer_loc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [89]:
def generate_feature_kmer(seqs:dict)->pd.DataFrame:
    """Build the k-mer feature table for ``seqs`` with missing values set to 0.0.

    Args:
        seqs: Sequence dict passed straight to ``generate_feature_by_kmer_loc``.

    Returns:
        The frame produced by ``generate_feature_by_kmer_loc`` (called with
        ``need_log=True``), with every NaN replaced in place by 0.0.
    """
    kmer_features = generate_feature_by_kmer_loc(seqs, need_log=True)
    # Cells without a value become 0.0
    kmer_features.fillna(value=0.0, inplace=True)
    return kmer_features

In [158]:
def generate_class_y_val(df_s1:pd.DataFrame)->pd.DataFrame:
    """Add the class label column ``b2`` derived from column ``b``.

    ``b2`` is 1 where ``b`` is greater than 0 and -1 otherwise (this includes
    zero and NaN).  The column is written into ``df_s1`` itself, and that same
    frame is returned.
    """
    def _label_from_b(b_value):
        if b_value > 0:
            return 1
        return -1

    df_s1['b2'] = df_s1['b'].apply(_label_from_b)
    return df_s1

In [153]:
def get_feature_id(
    df_s1:pd.DataFrame,
    min_bval:float=0.5, 
    min_mean_obs:float=3.5, 
    max_qval:float=0.2,
    choice_prob:float=0.8, # Randomly selected
    used_id:list=None,
    random_state=None) -> np.ndarray:
    """Randomly sample target ids from the rows that pass the filters.

    A row qualifies when ``abs(b) > min_bval``, ``mean_obs > min_mean_obs``
    and ``qval < max_qval``.

    Args:
        df_s1: Table with the columns ``target_id``, ``b``, ``mean_obs``
            and ``qval``.
        min_bval: Exclusive lower bound on the absolute value of ``b``.
        min_mean_obs: Exclusive lower bound on ``mean_obs``.
        max_qval: Exclusive upper bound on ``qval``.
        choice_prob: Fraction of the qualifying ids to draw, without
            replacement.  The count is truncated to an integer.
        used_id: Optional ids to exclude before filtering.
        random_state: Optional seed for a reproducible draw.  When ``None``
            the global NumPy random state is used.

    Returns:
        A NumPy array of the sampled ``target_id`` values.  It is empty when
        no row qualifies.
    """
    if used_id is not None:
        # Vectorised exclusion; this also works when df_s1 has no rows.
        df_s1 = df_s1[~df_s1.target_id.isin(list(used_id))]

    qualifies = (
        (df_s1.b.abs() > min_bval)
        & (df_s1.mean_obs > min_mean_obs)
        & (df_s1.qval < max_qval)
    )
    candidates = df_s1.loc[qualifies, 'target_id'].values

    # Some NumPy versions refuse to sample from an empty population.
    if len(candidates) == 0:
        return candidates

    n_pick = int(len(candidates) * choice_prob)
    if random_state is None:
        return np.random.choice(candidates, n_pick, replace=False)
    rng = np.random.RandomState(random_state)
    return rng.choice(candidates, n_pick, replace=False)

In [12]:
def generate_feature_dataframe(
    feature_ids: list,
    df_seq: pd.DataFrame,
    df_s1: pd.DataFrame)->pd.DataFrame:
    """Join the feature columns of the selected ids with their class labels.

    Args:
        feature_ids: Ids to select.  Each must be a column of ``df_seq`` and
            occur exactly once in ``df_s1.target_id``.
        df_seq: Feature table with one column per id.
        df_s1: Table with the columns ``target_id`` and ``b2``.

    Returns:
        A frame with one row per id: the transposed ``df_seq`` columns,
        followed by a final column named ``0`` holding the integer ``b2``
        label.

    Raises:
        KeyError: If an id has no row in ``df_s1``.
        ValueError: If an id occurs more than once in ``df_s1``.
    """
    df_x = df_seq[feature_ids].T

    # Mirror the de-duplication that a dict keyed by id would perform.
    unique_ids = list(dict.fromkeys(feature_ids))

    # One pass over df_s1 instead of a boolean scan per id.
    label_rows = df_s1.loc[df_s1['target_id'].isin(unique_ids), ['target_id', 'b2']]
    if label_rows['target_id'].duplicated().any():
        raise ValueError('df_s1 contains duplicated target_id values for the requested ids')
    label_by_id = label_rows.set_index('target_id')['b2']

    missing = [fid for fid in unique_ids if fid not in label_by_id.index]
    if missing:
        raise KeyError(f'target_id not found in df_s1: {missing}')

    df_y = (
        label_by_id.loc[unique_ids]
        .astype(int)
        .to_frame(name=0)
        .rename_axis(None)
    )

    return pd.merge(df_x, df_y, left_index=True, right_index=True)

In [13]:
def generate_train_test_dataset(
    df_xy:pd.DataFrame,
    test_size: float = 0.2,
    random_state: int = 0
)->dict:
    """Split a feature/label table and standardise the features.

    Args:
        df_xy: Table whose last column is the label; every other column is
            a feature.
        test_size: Passed to ``train_test_split``.  ``0`` skips the split
            and uses every row for training.
        random_state: Seed passed to ``train_test_split``.

    Returns:
        A dict with the keys ``'x_train'``, ``'x_test'``, ``'y_train'`` and
        ``'y_test'``.  The two test entries are ``None`` when
        ``test_size == 0``.
    """
    X = df_xy.iloc[:, :-1].values
    y = df_xy.iloc[:, -1].values

    if test_size == 0:
        X_train, y_train = X, y
        X_test, y_test = None, None
    else:
        X_train, X_test, y_train, y_test = train_test_split(
            X,
            y,
            test_size=test_size,
            random_state=random_state
        )

    # Fit the scaler on the training rows first; calling transform() on an
    # unfitted StandardScaler raises NotFittedError.
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    if X_test is not None:
        # Apply the training statistics to the held-out rows
        X_test = sc.transform(X_test)

    return {'x_train': X_train, 'x_test': X_test, 'y_train': y_train, 'y_test': y_test}

### STEP 3. Machine learning exercise

In [14]:
import tensorflow as tf
from tensorflow import keras
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

In [165]:
def generate_bayes_optimized_randomforest(X_train, y_train, n_iter=32, cv=5, target_score=0.90):
    """Tune a ``RandomForestClassifier`` with ``BayesSearchCV``.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        n_iter: Number of search iterations passed to ``BayesSearchCV``.
        cv: Number of cross-validation folds passed to ``BayesSearchCV``.
        target_score: The search is interrupted once the best
            cross-validated accuracy reaches this value.

    Returns:
        The fitted ``BayesSearchCV`` object.
    """
    search_space = {
        "bootstrap": Categorical([True, False]),  # values for bootstrap can be either True or False
        "max_depth": Integer(6, 20),  # values of max_depth are integers from 6 to 20
        # 'auto' is not offered: scikit-learn 1.3 removed it, and for
        # classifiers it was the same setting as 'sqrt'.
        "max_features": Categorical(['sqrt', 'log2']),
        "min_samples_leaf": Integer(2, 10),
        "min_samples_split": Integer(2, 10),
        "n_estimators": Integer(100, 500),
    }

    forest_clf = RandomForestClassifier()

    best_scores = list()

    def on_step(optim_result):
        # Returning True asks the optimiser to stop early.
        if forest_bayes_search.best_score_ >= target_score:
            print('Interrupting!')
            return True
        else:
            best_scores.append([forest_bayes_search.best_index_, forest_bayes_search.best_score_])
        # Progress line every fifth recorded step
        if len(best_scores) % 5 == 0:
            print(f'# of exercises: {len(best_scores):5}(score: {round(forest_bayes_search.best_score_,3)})')

    forest_bayes_search = BayesSearchCV(
        forest_clf,
        search_space,
        n_iter=n_iter,
        scoring="accuracy",
        n_jobs=-1,
        cv=cv,
    )
    forest_bayes_search.fit(X_train, y_train, callback=on_step)

    return forest_bayes_search

### STEP 4. Evaluate an optimized model with unused dataset

In [174]:
def get_ml_model_test_data(
    df_s1:pd.DataFrame,
    used_id:list,
    df_seq:pd.DataFrame
)->tuple:
    """Build an evaluation set from ids that are not in ``used_id``.

    Args:
        df_s1: Table with the filter columns and the ``b2`` label column.
        used_id: Ids to exclude, e.g. those already used for training.
        df_seq: Feature table with one column per id.

    Returns:
        A tuple ``(test_id, x, y)``: the sampled ids, their standardised
        feature matrix and their labels.

    NOTE(review): ``get_feature_id`` is called with its default
    ``choice_prob``, so only a random subset of the remaining ids is used.
    The features are standardised with a scaler fitted on this evaluation
    set, not the one fitted on the training data - confirm this is intended.
    """
    # Use the used_id argument; the previous code read the notebook-global
    # ``feature_id`` and ignored the parameter.
    test_id = get_feature_id(df_s1, used_id=used_id)
    df_test_xy = generate_feature_dataframe(test_id, df_seq, df_s1)
    # test_size=0: no split, so the data is returned under the *_train keys
    dic_test = generate_train_test_dataset(df_test_xy, test_size=0)

    return test_id, dic_test['x_train'], dic_test['y_train']

## Model test

In [146]:
# Directory with the downloaded sample files. The path parts are joined
# separately so the cell works on both Windows and POSIX systems.
downloadloc = os.path.join(os.path.expanduser('~'), 'workspace', 'rnamotif', 'samples')
# Load the sequence data from the cDNA / CDS archives
seqs = retrieve_seq(downloadloc,'s1_cdna.zip','s1_cds.zip')
# Load the transcript table (supplementary table S1)
df_s1 = pd.read_excel(os.path.join(downloadloc,'aan2399_table_S1.xlsx'))
# Keep only transcripts that have a feature column in df_seq.
# NOTE(review): df_seq is created by generate_feature_kmer(seqs) in a later
# cell; on a fresh kernel that cell has to run before this line.
df_s1 = df_s1[df_s1.target_id.isin(list(df_seq.columns))]

CDNA contains 2074 sequences more than CDS
Can not find coding sequence(CDS) position in ENSMUST00000205363
Can not find coding sequence(CDS) position in ENSMUST00000189670
Can not find coding sequence(CDS) position in ENSMUST00000207980
Can not find coding sequence(CDS) position in ENSMUST00000115747
Can not find coding sequence(CDS) position in ENSMUST00000150000
Can not find coding sequence(CDS) position in ENSMUST00000178641
Can not find coding sequence(CDS) position in ENSMUST00000137526
Can not find coding sequence(CDS) position in ENSMUST00000138548
Can not find coding sequence(CDS) position in ENSMUST00000124831
Can not find coding sequence(CDS) position in ENSMUST00000136936
Can not find coding sequence(CDS) position in ENSMUST00000129691
Can not find coding sequence(CDS) position in ENSMUST00000099051
Can not find coding sequence(CDS) position in ENSMUST00000108407
Can not find coding sequence(CDS) position in ENSMUST00000151408
Can not find coding sequence(CDS) position in E

Can not find coding sequence(CDS) position in ENSMUST00000193446
Can not find coding sequence(CDS) position in ENSMUST00000206915
Can not find coding sequence(CDS) position in ENSMUST00000154381
Can not find coding sequence(CDS) position in ENSMUST00000087332
Can not find coding sequence(CDS) position in ENSMUST00000101432
Can not find coding sequence(CDS) position in ENSMUST00000082418
Can not find coding sequence(CDS) position in ENSMUST00000132442
Can not find coding sequence(CDS) position in ENSMUST00000187939
Can not find coding sequence(CDS) position in ENSMUST00000145664
Can not find coding sequence(CDS) position in ENSMUST00000178404
Can not find coding sequence(CDS) position in ENSMUST00000208477
Can not find coding sequence(CDS) position in ENSMUST00000124737
Can not find coding sequence(CDS) position in ENSMUST00000091852
Can not find coding sequence(CDS) position in ENSMUST00000159516
Can not find coding sequence(CDS) position in ENSMUST00000197336
Can not find coding seque

Can not find coding sequence(CDS) position in ENSMUST00000134844
Can not find coding sequence(CDS) position in ENSMUST00000130578
Can not find coding sequence(CDS) position in ENSMUST00000174830
Can not find coding sequence(CDS) position in ENSMUST00000114464
Can not find coding sequence(CDS) position in ENSMUST00000180021
Can not find coding sequence(CDS) position in ENSMUST00000187619
Can not find coding sequence(CDS) position in ENSMUST00000161885
Can not find coding sequence(CDS) position in ENSMUST00000136835
Can not find coding sequence(CDS) position in ENSMUST00000155638
Can not find coding sequence(CDS) position in ENSMUST00000113426
Can not find coding sequence(CDS) position in ENSMUST00000201002
Can not find coding sequence(CDS) position in ENSMUST00000084986
Can not find coding sequence(CDS) position in ENSMUST00000205823
Can not find coding sequence(CDS) position in ENSMUST00000195151
Can not find coding sequence(CDS) position in ENSMUST00000107024
Can not find coding seque

Can not find coding sequence(CDS) position in ENSMUST00000146044
Can not find coding sequence(CDS) position in ENSMUST00000099046
Can not find coding sequence(CDS) position in ENSMUST00000182085
Can not find coding sequence(CDS) position in ENSMUST00000130328
Can not find coding sequence(CDS) position in ENSMUST00000201739
Can not find coding sequence(CDS) position in ENSMUST00000162898
Can not find coding sequence(CDS) position in ENSMUST00000074557
Can not find coding sequence(CDS) position in ENSMUST00000091436
Can not find coding sequence(CDS) position in ENSMUST00000208322
Can not find coding sequence(CDS) position in ENSMUST00000201198
Can not find coding sequence(CDS) position in ENSMUST00000151747
Can not find coding sequence(CDS) position in ENSMUST00000099047
Can not find coding sequence(CDS) position in ENSMUST00000202697
Can not find coding sequence(CDS) position in ENSMUST00000129979
Can not find coding sequence(CDS) position in ENSMUST00000082407
Can not find coding seque

Can not find coding sequence(CDS) position in ENSMUST00000135934
Can not find coding sequence(CDS) position in ENSMUST00000148759
Can not find coding sequence(CDS) position in ENSMUST00000185533
Can not find coding sequence(CDS) position in ENSMUST00000114653
Can not find coding sequence(CDS) position in ENSMUST00000166353
Can not find coding sequence(CDS) position in ENSMUST00000161758
Can not find coding sequence(CDS) position in ENSMUST00000148643
Can not find coding sequence(CDS) position in ENSMUST00000140178
Can not find coding sequence(CDS) position in ENSMUST00000151243
Can not find coding sequence(CDS) position in ENSMUST00000146307
Can not find coding sequence(CDS) position in ENSMUST00000156110
Can not find coding sequence(CDS) position in ENSMUST00000112417
Can not find coding sequence(CDS) position in ENSMUST00000089485
Can not find coding sequence(CDS) position in ENSMUST00000155712
Can not find coding sequence(CDS) position in ENSMUST00000185692
Can not find coding seque

(9905, 7)


In [92]:
# Build the k-mer feature table for all retrieved sequences (slow step)
df_seq = generate_feature_kmer(seqs)

# proceed: 100
# proceed: 200


KeyboardInterrupt: 

In [179]:
# Randomly draw 80% of the ids that pass the default b / mean_obs / qval filters
feature_id = get_feature_id(df_s1, choice_prob=0.8)
# Add the 'b2' class label column (1 where b > 0, otherwise -1)
df_s1 = generate_class_y_val(df_s1)
# One row per selected id: feature values followed by the label column
df_xy = generate_feature_dataframe(feature_id, df_seq,df_s1)
# test_size=0: no hold-out split here, every row is scaled and used for training
dic_tt = generate_train_test_dataset(df_xy, test_size=0)

In [180]:
# Run the Bayesian hyper-parameter search for the random forest on the training data
ml_model = generate_bayes_optimized_randomforest(dic_tt['x_train'],dic_tt['y_train'])

# of exercises:     5(score: 0.833)
# of exercises:    10(score: 0.833)
# of exercises:    15(score: 0.839)
# of exercises:    20(score: 0.842)
# of exercises:    25(score: 0.842)
# of exercises:    30(score: 0.842)


In [181]:
# Build an evaluation set; feature_id (the ids drawn for training) is passed as used_id
feature_test_id, x_test, y_test = get_ml_model_test_data(df_s1,feature_id,df_seq)

In [182]:
# Number of ids in the evaluation set
len(feature_test_id)

64

In [183]:
# Score the tuned model on the evaluation set returned by
# get_ml_model_test_data. ``dic_test`` only exists inside that function, so
# the arrays unpacked in the previous cell are used instead.
ml_model.score(x_test, y_test)

0.984375