## mRNA location classifier in the intestinal epithelium

This notebook provides an mRNA location classifier for intestinal epithelium cells. It trains an SVM (Support Vector Machine) on 392 highly expressed and significantly localised mRNA transcripts.

### STEP 1. mRNA sequence retrieval

Moor et al. found 9905 mRNAs in the intestinal epithelium. The list of mRNAs is in [this supplement](https://science.sciencemag.org/highwire/filestream/697854/field_highwire_adjunct_files/2/aan2399_table_S1.xlsx). We downloaded the mRNA sequences from Ensembl. The sequence data contain the 3'UTR, coding, and 5'UTR sequences of each transcript. The downloaded sequence files (s1_cdna.zip, s1_cds.zip) are available at [this site](https://github.com/byeungchun/rnamotif/tree/main/samples).

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import glob
import pickle
import numpy as np
import pandas as pd

In [3]:
# Step up to the parent directory; later cells use paths relative to it
# (e.g. './notebook/feature_intestine.gzip.parquet').
os.chdir('..')

In [15]:
from mofiwo.utility import load_cdna_cds_zipfile

### STEP 2. feature data generation

It applies the [RNA-GPS method](https://rnajournal.cshlp.org/content/early/2020/03/27/rna.074161.119.abstract) to generate the features used as input for the machine learning algorithm. Because it produces 4032 features per sequence, the calculation takes time.

In [6]:
from mofiwo.analysis import (
    generate_feature_by_kmer_loc,
    generate_feature_by_kmer_loc_multi
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [7]:
def generate_feature_kmer(seqs:dict, use_parallel:bool=True)->pd.DataFrame:
    """Build the k-mer feature table for ``seqs`` with missing values set to 0.0.

    Args:
        seqs: Sequence data, handed unchanged to the ``mofiwo.analysis``
            feature generators.
        use_parallel: When true, call ``generate_feature_by_kmer_loc_multi``;
            otherwise call ``generate_feature_by_kmer_loc`` with
            ``need_log=True``.

    Returns:
        The generator output wrapped in a DataFrame, with NaN replaced by 0.0.
    """
    if not use_parallel:
        raw_features = generate_feature_by_kmer_loc(seqs, need_log=True)
    else:
        raw_features = generate_feature_by_kmer_loc_multi(seqs)

    return pd.DataFrame(raw_features).fillna(0.0)

In [8]:
def generate_class_y_val(df_s1:pd.DataFrame)->pd.DataFrame:
    """Add the binary class column ``b2`` derived from the sign of ``b``.

    ``b2`` is 1 where ``b > 0`` and -1 otherwise (zero and NaN both map to -1).
    The column is written to a copy, so the caller's frame is left untouched
    and pandas does not emit ``SettingWithCopyWarning`` when a filtered slice
    is passed in.

    Args:
        df_s1: Table with a numeric column ``b``.

    Returns:
        A copy of ``df_s1`` with the additional column ``b2``.
    """
    labelled = df_s1.copy()
    labelled['b2'] = labelled['b'].apply(lambda x: 1 if x > 0 else -1)

    return labelled

In [9]:
def get_feature_id(
    df_s1:pd.DataFrame,
    min_bval:float=0.5, 
    min_mean_obs:float=3.5, 
    max_qval:float=0.2,
    choice_prob:float=0.8, # Randomly selected
    used_id:list=None) -> np.ndarray:
    """Randomly sample the target ids of rows that pass the thresholds.

    A row qualifies when ``abs(b) > min_bval``, ``mean_obs > min_mean_obs``
    and ``qval < max_qval``. From the qualifying ``target_id`` values,
    ``int(len(ids) * choice_prob)`` are drawn without replacement using
    numpy's global random state.

    Args:
        df_s1: Table with columns ``target_id``, ``b``, ``mean_obs``, ``qval``.
        min_bval: Exclusive lower bound on ``abs(b)``.
        min_mean_obs: Exclusive lower bound on ``mean_obs``.
        max_qval: Exclusive upper bound on ``qval``.
        choice_prob: Fraction of the qualifying ids to draw.
        used_id: Optional ids to exclude before the thresholds are applied.

    Returns:
        Array of the sampled ``target_id`` values.
    """
    if used_id is not None:
        # Vectorised membership test instead of rebuilding a list for every row.
        df_s1 = df_s1[~df_s1.target_id.isin(list(used_id))]

    is_candidate = (
        (df_s1.b.abs() > min_bval)
        & (df_s1.mean_obs > min_mean_obs)
        & (df_s1.qval < max_qval)
    )
    candidate_ids = df_s1.loc[is_candidate, 'target_id']
    n_selected = int(len(candidate_ids) * choice_prob)

    return np.random.choice(candidate_ids, n_selected, replace=False)

In [10]:
def generate_feature_dataframe(
    feature_ids: list,
    df_seq: pd.DataFrame,
    df_s1: pd.DataFrame)->pd.DataFrame:
    """Assemble one row per id: its feature values followed by its class label.

    Args:
        feature_ids: Ids to include; each must be a column of ``df_seq`` and
            occur exactly once in ``df_s1.target_id``.
        df_seq: Feature table with one column per id.
        df_s1: Table with columns ``target_id`` and ``b2``.

    Returns:
        DataFrame indexed by id. The columns are the rows of ``df_seq``
        (transposed) followed by a last column named ``0`` holding ``b2``.

    Raises:
        ValueError: If an id is missing from ``df_s1`` or occurs more than once.
    """
    ids = list(feature_ids)
    unique_ids = list(dict.fromkeys(ids))
    df_x = df_seq[ids].T

    # One indexed lookup instead of scanning df_s1 once per id.
    label_by_id = df_s1.set_index('target_id')['b2']
    label_by_id = label_by_id[label_by_id.index.isin(unique_ids)]
    if label_by_id.index.has_duplicates:
        raise ValueError('target_id values must be unique for the requested feature ids')
    missing = [x for x in unique_ids if x not in label_by_id.index]
    if missing:
        raise ValueError(f'feature ids not found in target_id: {missing}')

    # The label column is named 0 because downstream code takes it by position
    # (last column).
    df_y = pd.DataFrame(
        {0: label_by_id.loc[unique_ids].to_numpy().astype('int64')},
        index=unique_ids)

    return pd.merge(df_x, df_y, left_index=True, right_index=True)

In [11]:
def generate_train_test_dataset(
    df_xy:pd.DataFrame,
    test_size: float = 0.2,
    random_state: int = 0
)->dict:
    """Split ``df_xy`` into standardised train/test arrays.

    The last column of ``df_xy`` is the target; all other columns are
    features. The scaler is fitted on the training features only and then
    applied to the test features.

    Args:
        df_xy: Feature columns followed by one label column.
        test_size: Passed to ``train_test_split``; ``0`` skips the split and
            uses every row for training.
        random_state: Passed to ``train_test_split``.

    Returns:
        Dict with keys ``x_train``, ``x_test``, ``y_train``, ``y_test``;
        the two test entries are ``None`` when ``test_size`` is 0.
    """
    X = df_xy.iloc[:, :-1].values
    y = df_xy.iloc[:, -1].values
    sc = StandardScaler()

    if test_size == 0:
        X_train, y_train = X, y
        X_test = None
        y_test = None
    else:
        X_train, X_test, y_train, y_test = train_test_split(
            X,
            y,
            test_size=test_size,
            random_state=random_state
        )

    # Fit before transforming: calling transform() on an unfitted scaler raises
    # NotFittedError, which is what happened for any non-zero test_size.
    X_train = sc.fit_transform(X_train)
    if X_test is not None:
        X_test = sc.transform(X_test)

    return {'x_train': X_train, 'x_test': X_test, 'y_train': y_train, 'y_test': y_test}

## STEP 3. Machine learning exercise

It is in the `mofiwo` module.

### STEP 4. Evaluate an optimized model with unused dataset

In [12]:
def get_ml_model_test_data(
    df_s1:pd.DataFrame,
    used_id:list,
    df_seq:pd.DataFrame,
    selected_features: pd.Index = None
)->tuple:
    """Build a standardised evaluation set from ids not in ``used_id``.

    Args:
        df_s1: Table passed to ``get_feature_id`` and
            ``generate_feature_dataframe``.
        used_id: Ids to exclude from the evaluation sample.
        df_seq: Feature table with one column per id.
        selected_features: Optional columns to keep. The label column must be
            included and come last, because ``generate_train_test_dataset``
            takes the last column as the target.

    Returns:
        Tuple ``(test_id, x, y)``: the sampled ids, the standardised feature
        array and the label array.
    """
    # Exclude the ids given by the caller; the original read the notebook
    # global `feature_id` here and ignored the `used_id` argument.
    test_id = get_feature_id(df_s1, used_id=used_id)
    df_test_xy = generate_feature_dataframe(test_id, df_seq, df_s1)
    if selected_features is not None:
        df_test_xy = df_test_xy.loc[:, selected_features]
    # test_size=0: no split, every row is returned under the *_train keys.
    dic_test = generate_train_test_dataset(df_test_xy, test_size=0)

    return test_id, dic_test['x_train'], dic_test['y_train']

## Model test with feature selection

In [16]:
import time
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [17]:
from mofiwo.analysis import execute_random_forest

In [None]:
# Folder holding the downloaded sample files: <home>/workspace/rnamotif/samples.
# Joined from components so the path is valid on any operating system.
downloadloc = os.path.join(os.path.expanduser('~'), 'workspace', 'rnamotif', 'samples')
# Load the sequence data from the two zip archives, and the supplementary table S1.
seqs = load_cdna_cds_zipfile(downloadloc,'s1_cdna.zip','s1_cds.zip')
df_s1 = pd.read_excel(os.path.join(downloadloc,'aan2399_table_S1.xlsx'))

In [70]:
# Time the feature generation over all loaded sequences.
start = time.time()
df_seq = generate_feature_kmer(seqs, use_parallel=True)
elapsed_sec = time.time() - start

print(f'elaspsed_time(sec): {elapsed_sec}')

elaspsed_time(sec): 783.926221370697


In [14]:
# Load the stored feature table and replace missing values with 0.0.
df_seq = pd.read_parquet('./notebook/feature_intestine.gzip.parquet').fillna(0.0)

In [15]:
# Keep only the transcripts that have a feature column in df_seq. The explicit
# copy makes the filtered frame independent, so adding the label column to it
# does not trigger pandas' SettingWithCopyWarning.
df_s1 = df_s1[df_s1.target_id.isin(list(df_seq.columns))].copy()
df_s1 = generate_class_y_val(df_s1)

In [147]:
# For each feature count, build 100 resampled datasets restricted to the
# features with the highest chi-squared scores and pass them to
# execute_random_forest.
for num_features in [50,100,150,200,300,500,700,1000]:
    bestfeatures = SelectKBest(score_func=chi2, k=num_features)

    res = list()
    for i in range(100):
        # Random 80% sample of the ids that pass the default thresholds
        feature_id = get_feature_id(df_s1, choice_prob=0.8)
        df_xy = generate_feature_dataframe(feature_id, df_seq,df_s1)

        # featurization: score every feature against the label (last column)
        X = df_xy.iloc[:,:-1]
        y = df_xy.iloc[:,-1]
        fit = bestfeatures.fit(X,y)
        score_table = pd.DataFrame(fit.scores_, index = X.columns)
        top_feature_names = score_table.sort_values(0, ascending=False)[0:num_features].index
        df_featured = df_xy.loc[:, top_feature_names]
        # Re-attach the label as the last column
        df_xy = pd.merge(df_featured, df_xy.iloc[:,-1],left_index=True, right_index=True)

        # test_size=0: all sampled rows are used for training
        dic_tt = generate_train_test_dataset(df_xy, test_size=0)
        # Evaluation data restricted to the same columns (selected features + label)
        feature_test_id, x_test, y_test = get_ml_model_test_data(df_s1,feature_id,df_seq, df_xy.columns)
        res.append([{
            '_id':i,
            'x_train':dic_tt['x_train'],
            'y_train':dic_tt['y_train'],
            'x_test':x_test,
            'y_test':y_test,
            # NOTE(review): this stores the TRAINING ids (feature_id) under the
            # key 'feature_test_id'; the ids returned above in `feature_test_id`
            # are never used. Left unchanged -- confirm which one is intended.
            'feature_test_id':feature_id,
            'features':list(df_xy.columns)
        }])

    start = time.time()

    # Raw string: '\D' and '\m' are invalid escape sequences in a normal string
    # literal (the resulting path text is unchanged).
    # Presumably the 2nd/3rd arguments are the output folder and file-name
    # prefix; a later cell globs for f'ml_model_feature{num_features}_*' there.
    execute_random_forest(res, r'~\Downloads\mrna2', f'ml_model_feature{num_features}')

    print(f'elapsed_time(sec): {time.time() - start}')

elapsed_time(sec): 2948.5445413589478
elapsed_time(sec): 3051.4732053279877
elapsed_time(sec): 3101.1912755966187
elapsed_time(sec): 3119.8752188682556
elapsed_time(sec): 3240.6897366046906
elapsed_time(sec): 3455.61479473114
elapsed_time(sec): 3583.795501947403
elapsed_time(sec): 3836.619782447815


In [185]:
# Raw string: '\D' and '\m' are invalid escape sequences in a normal literal.
model_folder = r'~\Downloads\mrna2'
model_dir = os.path.expanduser(model_folder)

# res[num_features][model_file] -> {'score': ..., 'features': ...}
res = dict()
for num_features in [50,100,150,200,300,500,700,1000]:
    scores = dict()
    # glob already returns each path with the folder prefix, so it is opened as-is.
    for x in glob.glob(os.path.join(model_dir, f'ml_model_feature{num_features}_*')):
        # NOTE: pickle can execute arbitrary code on load; only open files
        # written by this pipeline.
        with open(x, 'rb') as model_file:
            _model = pickle.load(model_file)
        ml_model = _model['ml_model']
        x_test = _model['x_test']
        y_test = _model['y_test']
        scores[x] = {'score': ml_model.score(x_test, y_test), 'features': _model['features'] }
    res[num_features] = scores

In [186]:
# One Series of model scores per feature count
scores = {
    num_features: pd.Series([entry['score'] for entry in res[num_features].values()])
    for num_features in [50,100,150,200,300,500,700,1000]
}

In [188]:
# Summary statistics of the scores, one column per feature count
df_features = pd.DataFrame(
    {n_feat: score_series.describe() for n_feat, score_series in scores.items()}
)

In [190]:
# Raw string: '\D' and '\m' are invalid escape sequences in a normal literal.
model_folder = r'~\Downloads\mrna'
model_dir = os.path.expanduser(model_folder)

# scores[model_file] -> {'score': ...} for every stored 'ml_model*' file
scores = dict()
# glob already returns each path with the folder prefix, so it is opened as-is.
for x in glob.glob(os.path.join(model_dir, 'ml_model*')):
    # NOTE: pickle can execute arbitrary code on load; only open files
    # written by this pipeline.
    with open(x, 'rb') as model_file:
        _model = pickle.load(model_file)
    ml_model = _model['ml_model']
    x_test = _model['x_test']
    y_test = _model['y_test']
    scores[x] = {'score': ml_model.score(x_test, y_test)}


In [197]:
# Summary statistics of the 'score' entries collected in `scores`
score_table = pd.DataFrame(scores.values())
df_all_features = score_table['score'].describe()

In [199]:
# Put both summaries side by side, aligned on the statistic name
df_all = df_features.merge(df_all_features, left_index=True, right_index=True)

In [200]:
# Display the combined summary table
df_all

Unnamed: 0,50,100,150,200,300,500,700,1000,score
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,0.794375,0.812813,0.819688,0.815937,0.819531,0.825781,0.816719,0.8375,0.829375
std,0.050581,0.047371,0.046448,0.045815,0.045696,0.054053,0.045668,0.047111,0.040079
min,0.671875,0.6875,0.703125,0.671875,0.71875,0.6875,0.703125,0.71875,0.734375
25%,0.765625,0.78125,0.796875,0.78125,0.78125,0.792969,0.78125,0.8125,0.796875
50%,0.796875,0.8125,0.8125,0.8125,0.8125,0.828125,0.8125,0.84375,0.828125
75%,0.828125,0.84375,0.859375,0.847656,0.84375,0.859375,0.847656,0.863281,0.859375
max,0.921875,0.921875,0.921875,0.90625,0.953125,0.921875,0.921875,0.9375,0.90625
