# Python

***
###  

In [1]:
import os
import copy
import itertools
import numpy as np
import pandas as pd
from sklearn.externals import joblib

In [7]:
# Count how often each k-mer occurs in every RNA sequence.
# Frequencies are normalized by the total k-mer count of each RNA sequence.
def _count_kmer(Dataset, k):
    """Build k-mer raw-count and frequency tables for ``Dataset["cdna"]``.

    Parameters
    ----------
    Dataset
        Table (e.g. a pandas DataFrame) with a ``"cdna"`` column of sequence
        strings and an ``"ensembl_transcript_id"`` column.  It is only read,
        never modified.
    k
        Integer whose decimal digits list the k-mer lengths to combine, in
        that order: ``34`` -> 3-mers + 4-mers, ``45`` -> 4-mers + 5-mers,
        ``345`` -> 3-, 4- and 5-mers.  Any other combination of the digits
        1-9 (e.g. ``3`` or ``56``) is accepted as well.

    Returns
    -------
    (df1, df1_rawcount)
        ``df1`` holds per-sequence k-mer frequencies (each count divided by
        the row's total count) and ``df1_rawcount`` the raw counts.  Both have
        one column per k-mer followed by the ``"ensembl_transcript_id"``
        column.

    Raises
    ------
    ValueError
        If ``k`` contains anything other than the digits 1-9.

    Notes
    -----
    * Counting uses ``str.count``, i.e. NON-overlapping occurrences
      (``"AAAA".count("AAA") == 1``).  This is kept as is because downstream
      features depend on it.
    * Only the upper-case alphabet A/C/G/T is enumerated; k-mers containing
      any other character are never counted.
    * A sequence with no counted k-mer at all has a total of zero, so its
      frequency row is NaN.
    """
    # Alphabet of nucleotides
    nucleotide = ['A', 'C', 'G', 'T']

    # Validate the requested k-mer lengths up front instead of failing later
    # with an UnboundLocalError on an unsupported value.
    lengths = str(k)
    if not lengths or not all(digit in "123456789" for digit in lengths):
        raise ValueError(
            "k must be an integer made of the digits 1-9 "
            "(e.g. 34, 45 or 345), got %r" % (k,)
        )

    # Enumerate all k-mers, one length after another (duplicate digits are
    # ignored); this fixes the column order of the returned tables.
    kmers = []
    for size in dict.fromkeys(lengths):
        kmers.extend(
            "".join(letters)
            for letters in itertools.product(nucleotide, repeat=int(size))
        )

    # Count every k-mer in every sequence
    sequences = Dataset["cdna"]
    rawcount_kmer_df = pd.DataFrame(
        {mer: sequences.apply(lambda seq, mer=mer: seq.count(mer)) for mer in kmers}
    )

    # Normalize each row by its total count (vectorized; a row-wise
    # ``apply`` gives the same values but is far slower on wide tables).
    row_totals = rawcount_kmer_df.sum(axis=1)
    freq_kmer_df = rawcount_kmer_df.div(row_totals, axis=0)

    # Append the transcript ids as the last column of both tables
    transcript_ids = Dataset["ensembl_transcript_id"]
    df1_rawcount = pd.concat([rawcount_kmer_df, transcript_ids], axis=1)
    df1 = pd.concat([freq_kmer_df, transcript_ids], axis=1)

    return df1, df1_rawcount

In [2]:
# Count how often each k-mer occurs in every RNA sequence.
# Frequencies are normalized by the total k-mer count of each RNA sequence.
def _count_kmer(Dataset, k):
    """Build k-mer raw-count and frequency tables for ``Dataset["cdna"]``.

    Parameters
    ----------
    Dataset
        Table (e.g. a pandas DataFrame) with a ``"cdna"`` column of sequence
        strings.  It is only read, never modified.
    k
        Integer whose decimal digits list the k-mer lengths to combine, in
        that order: ``34`` -> 3-mers + 4-mers, ``45`` -> 4-mers + 5-mers,
        ``345`` -> 3-, 4- and 5-mers.  Any other combination of the digits
        1-9 (e.g. ``3`` or ``56``) is accepted as well.

    Returns
    -------
    (df1, df1_rawcount)
        ``df1`` holds per-sequence k-mer frequencies (each count divided by
        the row's total count) and ``df1_rawcount`` the raw counts.  Both have
        exactly one column per k-mer and share the index of ``Dataset["cdna"]``.

    Raises
    ------
    ValueError
        If ``k`` contains anything other than the digits 1-9.

    Notes
    -----
    * Counting uses ``str.count``, i.e. NON-overlapping occurrences
      (``"AAAA".count("AAA") == 1``).  This is kept as is because downstream
      features depend on it.
    * Only the upper-case alphabet A/C/G/T is enumerated; k-mers containing
      any other character are never counted.
    * A sequence with no counted k-mer at all has a total of zero, so its
      frequency row is NaN.
    """
    # Alphabet of nucleotides
    nucleotide = ['A', 'C', 'G', 'T']

    # Reject unsupported values explicitly; previously they surfaced as an
    # UnboundLocalError further down.
    lengths = str(k)
    if not lengths or not all(digit in "123456789" for digit in lengths):
        raise ValueError(
            "k must be an integer made of the digits 1-9 "
            "(e.g. 34, 45 or 345), got %r" % (k,)
        )

    # Enumerate all k-mers length by length (duplicate digits are ignored);
    # the enumeration order is the column order of the returned tables.
    kmers = [
        "".join(letters)
        for size in dict.fromkeys(lengths)
        for letters in itertools.product(nucleotide, repeat=int(size))
    ]

    # Count every k-mer in every sequence
    sequences = Dataset["cdna"]
    counts_by_kmer = {}
    for mer in kmers:
        counts_by_kmer[mer] = sequences.apply(lambda seq, mer=mer: seq.count(mer))
    df1_rawcount = pd.DataFrame(counts_by_kmer)

    # Normalize each row by its total count (vectorized equivalent of the
    # much slower row-wise ``apply``).
    df1 = df1_rawcount.div(df1_rawcount.sum(axis=1), axis=0)

    return df1, df1_rawcount

In [8]:
# Directory that holds the saved machine-learning model outputs
RNALight_input_dir = "../../lncRNA/03_Model_Construction/01_Machine_Learning_Model/01_ML_Model_Output"

# Load the saved LightGBM model from that directory.
# NOTE(review): joblib.load unpickles the file, so only load model files
# that come from a trusted source.
RNALight = joblib.load(
    os.path.join(RNALight_input_dir, "LightGBM/best_LightGBM_model.pkl")
)

***
###   

## 1. lncRNA with intron

In [9]:
# Read the lncRNA-with-intron table (tab-separated, no header row) and
# name its three columns: transcript id, name, and sequence.
lncRNA_with_intron_file = pd.read_csv(
    "./01_RNA_retaining_intron_files/gencode.v30.lncRNA_transcripts_with_intron_major_compact_trans_id.tsv",
    header=None,
    sep="\t",
)
lncRNA_with_intron_file.columns = ["ensembl_transcript_id", "name", "cdna"]

In [10]:
# Compute the combined 3-/4-/5-mer features for the lncRNA transcripts and
# save both the frequency table and the raw-count table.
df_kmer_345, df_kmer_345_rawcount = _count_kmer(lncRNA_with_intron_file, 345)
df_kmer_345.to_csv("lncRNA_with_intron_df_kmer345_freq.tsv", sep='\t')
df_kmer_345_rawcount.to_csv("lncRNA_with_intron_df_kmer345_rawcount.tsv", sep='\t')

# Keep only the numeric k-mer columns as model input.  Not every
# _count_kmer variant in this notebook appends the id column, so drop it
# only if it is present instead of failing with a KeyError.
df_kmer_345 = df_kmer_345.drop(columns=['ensembl_transcript_id'], errors='ignore')
x_kmer = df_kmer_345.values

# Predicted labels, and column 1 of predict_proba as the score
# (presumably the probability of label 1 -- confirm against the training labels).
y_pred = RNALight.predict(x_kmer)
y_prob = RNALight.predict_proba(x_kmer)[:, 1]

KeyboardInterrupt: 

In [None]:
# Attach the model outputs to the lncRNA table as two new columns,
# then write the table out as a tab-separated file without the index.
lncRNA_with_intron_file["RNALight_pred_label"] = y_pred
lncRNA_with_intron_file["RNALight_pred_prob"] = y_prob
lncRNA_with_intron_file.to_csv(
    "lncRNA_with_intron_file_predict_df.tsv",
    index=False,
    sep="\t",
)

***
###   

## 2. mRNA with intron

In [9]:
# Read the mRNA-with-intron table (tab-separated, no header row) and
# name its three columns: transcript id, name, and sequence.
mRNA_with_intron_file = pd.read_csv(
    "./01_RNA_retaining_intron_files/gencode.v30.pc_mRNA_transcripts_with_intron_major_compact_trans_id.tsv",
    header=None,
    sep="\t",
)
mRNA_with_intron_file.columns = ["ensembl_transcript_id", "name", "cdna"]

In [None]:
# Compute the combined 3-/4-/5-mer features for the mRNA transcripts and
# save both the frequency table and the raw-count table.
df_kmer_345, df_kmer_345_rawcount = _count_kmer(mRNA_with_intron_file, 345)
df_kmer_345.to_csv("mRNA_with_intron_df_kmer345_freq.tsv", sep='\t')
df_kmer_345_rawcount.to_csv("mRNA_with_intron_df_kmer345_rawcount.tsv", sep='\t')

# Keep only the numeric k-mer columns as model input.  Not every
# _count_kmer variant in this notebook appends the id column, so drop it
# only if it is present instead of failing with a KeyError.
df_kmer_345 = df_kmer_345.drop(columns=['ensembl_transcript_id'], errors='ignore')
x_kmer = df_kmer_345.values

# Predicted labels, and column 1 of predict_proba as the score
# (presumably the probability of label 1 -- confirm against the training labels).
y_pred = RNALight.predict(x_kmer)
y_prob = RNALight.predict_proba(x_kmer)[:, 1]

In [None]:
# Attach the model outputs to the mRNA table as two new columns,
# then write the table out as a tab-separated file without the index.
mRNA_with_intron_file["RNALight_pred_label"] = y_pred
mRNA_with_intron_file["RNALight_pred_prob"] = y_prob
mRNA_with_intron_file.to_csv(
    "mRNA_with_intron_file_predict_df.tsv",
    index=False,
    sep="\t",
)