# Python

***
### 

In [1]:
import os
import copy
import itertools
import numpy as np
import pandas as pd
from sklearn.externals import joblib

In [2]:
# Count the frequency of each k-mer in every RNA sequence.
# k-mer counts are normalized by the total k-mer count of each RNA sequence.
def _count_kmer(Dataset, k):
    """Count k-mers in every sequence and normalize them per sequence.

    Parameters
    ----------
    Dataset : pandas.DataFrame
        Must provide a ``"cdna"`` column holding the sequence strings and an
        ``"ensembl_transcript_id"`` column. The frame is only read, never
        modified.
    k : int
        k-mer lengths encoded as decimal digits: ``3`` means 3-mers only,
        ``34`` means 3-mers and 4-mers, ``345`` means 3-, 4- and 5-mers.
        Every digit must be between 1 and 9; repeated digits and digit order
        are ignored (lengths are always emitted in ascending order). The
        number of feature columns grows as ``4 ** length``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(df1, df1_rawcount)``. ``df1`` holds the k-mer frequencies (each
        row divided by that row's total count) and ``df1_rawcount`` the raw
        counts. Both have one column per k-mer, ordered by length and then
        in A, C, G, T product order, followed by ``"ensembl_transcript_id"``
        as the last column.

    Raises
    ------
    ValueError
        If ``k`` contains anything other than the digits 1-9.
    """
    # alphabet of nucleotides
    nucleotide = ['A', 'C', 'G', 'T']

    # Decode the requested k-mer lengths, one per decimal digit of ``k``.
    code = str(k)
    if not code or not set(code) <= set("123456789"):
        raise ValueError(
            "k must be made of digits 1-9, one per k-mer length "
            "(e.g. 3, 34, 45 or 345); got {!r}".format(k)
        )
    lengths = sorted({int(digit) for digit in code})

    # Enumerate only the k-mers that were actually requested, shortest first,
    # so the column order for 34 / 45 / 345 stays 3-mers, 4-mers, 5-mers.
    kmers = []
    for length in lengths:
        kmers.extend(
            "".join(letters)
            for letters in itertools.product(nucleotide, repeat=length)
        )

    # count k-mer for each sequence
    # NOTE: str.count counts NON-overlapping occurrences, so "AAAA" contains
    # one "AAA", not two. Kept as-is on purpose: changing it would change the
    # feature values that downstream consumers of this table receive.
    sequences = Dataset["cdna"]
    table_kmer = {
        mer: sequences.apply(lambda seq, mer=mer: seq.count(mer))
        for mer in kmers
    }

    # k-mer raw count without normalization
    rawcount_kmer_df = pd.DataFrame(table_kmer)
    df1_rawcount = pd.concat(
        [rawcount_kmer_df, Dataset["ensembl_transcript_id"]], axis=1
    )

    # k-mer frequency: divide every row by its own total count.
    # Rows whose total is zero yield NaN frequencies (0/0).
    row_totals = rawcount_kmer_df.sum(axis=1)
    freq_kmer_df = rawcount_kmer_df.div(row_totals, axis=0)
    df1 = pd.concat([freq_kmer_df, Dataset["ensembl_transcript_id"]], axis=1)

    return df1, df1_rawcount

In [3]:
# Directory that holds the machine-learning model outputs
lncRNA_input_dir = "../../lncRNA/03_Model_Construction/01_Machine_Learning_Model/01_ML_Model_Output"
# Deserialize the saved LightGBM model from that directory.
# NOTE(review): joblib.load unpickles the file, so only load model files
# that come from a trusted source.
_RNALight_model_path = os.path.join(
    lncRNA_input_dir, "LightGBM/best_LightGBM_model.pkl"
)
RNALight = joblib.load(_RNALight_model_path)

In [10]:
# Load the whole-genome lncRNA table (tab-separated)
total_lncRNAs = pd.read_csv(
    "../../lncRNA/01_Resources/References/gencode.v30.lncRNA_transcripts_major_compact_trans_id.txt",
    sep='\t',
)

# Build the 3/4/5-mer feature tables: normalized frequencies and raw counts
total_df_kmer_345, total_df_kmer_345_rawcount = _count_kmer(total_lncRNAs, 345)
# Remove the transcript-id column in place so only k-mer features remain
total_df_kmer_345.drop(columns=['ensembl_transcript_id'], inplace=True)
total_x_kmer = total_df_kmer_345.values

# Prediction: predicted labels, plus the probability in column index 1
# of predict_proba
RNALight_total_lncRNA_pred = RNALight.predict(total_x_kmer)
RNALight_total_lncRNA_y_prob = RNALight.predict_proba(total_x_kmer)[:, 1]

# Attach both prediction columns to the transcript table
total_lncRNAs["RNALight_pred_label"] = RNALight_total_lncRNA_pred
total_lncRNAs["RNALight_pred_prob"] = RNALight_total_lncRNA_y_prob

# Write the annotated table as TSV, without the index column
total_lncRNAs.to_csv(
    "./Whole_genome_lncRNA_predict_df.tsv",
    sep='\t',
    index=False,
)