In [2]:
import os
import copy
import collections
import itertools
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.externals import joblib

In [3]:
# Count the frequency of k-mer in each RNA sequence
#k-mer was normalized by total k-mer count of each RNA sequence
def _count_kmer(Dataset,k): # k = 3,4,5,
    
    # copy dataset
    dataset = copy.deepcopy(Dataset)
    # alphbet of nucleotide
    nucleotide = ['A','C','G','T']
    
    # generate k-mers
    #  k == 7:
    seven = list(itertools.product(nucleotide,repeat=7))
    sevenmer = []
    for n in seven:
        sevenmer.append("".join(n))
    
    #  k == 6:
    six = list(itertools.product(nucleotide,repeat=6))
    hexamer = []
    for n in six:
        hexamer.append("".join(n))

    #  k == 5:
    five = list(itertools.product(nucleotide,repeat=5))
    pentamer = []
    for n in five:
        pentamer.append("".join(n))
    
    #  k == 4:
    four = list(itertools.product(nucleotide,repeat=4))
    tetramer = []
    for n in four:
        tetramer.append("".join(n))

    # k == 3:
    three = list(itertools.product(nucleotide,repeat=3))
    threemer = []
    for n in three:
        threemer.append("".join(n))
    
    # input features can be combinations of diffrent k values
    if k == 34:
        table_kmer = dict.fromkeys(threemer,0)
        table_kmer.update(dict.fromkeys(tetramer,0))
    if k == 45:
        table_kmer = dict.fromkeys(tetramer,0)
        table_kmer.update(dict.fromkeys(pentamer,0))
    if k == 56:
        table_kmer = dict.fromkeys(pentamer,0)
        table_kmer.update(dict.fromkeys(hexamer,0))
    if k == 345:
        table_kmer = dict.fromkeys(threemer,0)
        table_kmer.update(dict.fromkeys(tetramer,0))
        table_kmer.update(dict.fromkeys(pentamer,0))
    if k == 456:
        table_kmer = dict.fromkeys(tetramer,0)
        table_kmer.update(dict.fromkeys(pentamer,0))
        table_kmer.update(dict.fromkeys(hexamer,0))
    if k == 7:
        table_kmer = dict.fromkeys(sevenmer,0)

    # count k-mer for each sequence
    for mer in table_kmer.keys():
        table_kmer[mer] = dataset["cdna"].apply(lambda x : x.count(mer))
    
    # new series of length
    length = dataset["cdna"].apply(len)
    length.name = "length"
    # for k-mer raw count without normalization, index: nuc:1 or cyto:0
    rawcount_kmer_df = pd.DataFrame(table_kmer)
    df1_rawcount = pd.concat([rawcount_kmer_df,dataset["ensembl_transcript_id"],length],axis = 1)
    #df1_rawcount.index = dataset["tag"]

    # for k-mer frequency with normalization , index: nuc:1 or cyto:0
    freq_kmer_df = rawcount_kmer_df.apply(lambda x: x/x.sum(),axis=1)
    df1 = pd.concat([freq_kmer_df,dataset["ensembl_transcript_id"],length],axis = 1)
    #df1.index = dataset["tag"]

    return df1,df1_rawcount

In [4]:
# Input dir 
snoRNA_input_dir = "../ML_original_lncRNA/20210208_cv_test"

# Output dir
output_dir = "./LncLight_snoRNA_predict"
if not (os.path.exists(output_dir)):
    os.mkdir(output_dir)
    
# Load model
LncLight = joblib.load(os.path.join(snoRNA_input_dir,"LightGBM/best_LightGBM_model.pkl"))

In [7]:
# Load input file
snoRNAs_file = pd.read_csv("../Datasets/gencode.v30.snoRNA_transcripts_major.txt",sep = '\t',header =None)
snoRNAs = snoRNAs_file.iloc[:,[0,5,9]]
snoRNAs.columns = ["ensembl_transcript_id","name","cdna"]

df_kmer_345,df_kmer_345_rawcount = _count_kmer(snoRNAs,345)
df_kmer_345.to_csv(os.path.join(output_dir,"df_kmer345_freq.tsv"),sep='\t')
df_kmer_345_rawcount.to_csv(os.path.join(output_dir,"df_kmer345_rawcount.tsv"),sep='\t')

del df_kmer_345['ensembl_transcript_id']
del df_kmer_345['length']
x_kmer = df_kmer_345.values

y_pred = LncLight.predict(x_kmer)
y_prob = LncLight.predict_proba(x_kmer)[:,1]

In [12]:
snoRNAs["LncLight_pred_label"] =  y_pred
snoRNAs["LncLight_pred_prob"] = y_prob
snoRNAs.to_csv(os.path.join(output_dir,"snoRNAs_predict_df.tsv"),sep = '\t',index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
