# Python

***
###  

In [1]:
import os
import copy
import itertools
import numpy as np
import pandas as pd
from sklearn.externals import joblib

In [2]:
# Count the frequency of k-mer in each RNA sequence
#k-mer was normalized by total k-mer count of each RNA sequence
def _count_kmer(Dataset,k): # k = 3,4,5
    
    # copy dataset
    dataset = copy.deepcopy(Dataset)
    # alphbet of nucleotide
    nucleotide = ['A','C','G','T']
    
    # generate k-mers
    #  k == 5:
    five = list(itertools.product(nucleotide,repeat=5))
    pentamer = []
    for n in five:
        pentamer.append("".join(n))
    
    #  k == 4:
    four = list(itertools.product(nucleotide,repeat=4))
    tetramer = []
    for n in four:
        tetramer.append("".join(n))

    # k == 3:
    three = list(itertools.product(nucleotide,repeat=3))
    threemer = []
    for n in three:
        threemer.append("".join(n))
    
    # input features can be combinations of diffrent k values
    if k == 34:
        table_kmer = dict.fromkeys(threemer,0)
        table_kmer.update(dict.fromkeys(tetramer,0))
    if k == 45:
        table_kmer = dict.fromkeys(tetramer,0)
        table_kmer.update(dict.fromkeys(pentamer,0))
    if k == 345:
        table_kmer = dict.fromkeys(threemer,0)
        table_kmer.update(dict.fromkeys(tetramer,0))
        table_kmer.update(dict.fromkeys(pentamer,0))

    # count k-mer for each sequence
    for mer in table_kmer.keys():
        table_kmer[mer] = dataset["cdna"].apply(lambda x : x.count(mer))
    
    # for k-mer raw count without normalization, index: nuc:1 or cyto:0
    rawcount_kmer_df = pd.DataFrame(table_kmer)
    df1_rawcount = pd.concat([rawcount_kmer_df,dataset["ensembl_transcript_id"]],axis = 1)
    #df1_rawcount.index = dataset["tag"]

    # for k-mer frequency with normalization , index: nuc:1 or cyto:0
    freq_kmer_df = rawcount_kmer_df.apply(lambda x: x/x.sum(),axis=1)
    df1 = pd.concat([freq_kmer_df,dataset["ensembl_transcript_id"]],axis = 1)
    #df1.index = dataset["tag"]

    return df1,df1_rawcount

In [7]:
# Model input dir 
RNALight_input_dir = "../../../lncRNA/03_Model_Construction/01_Machine_Learning_Model/01_ML_Model_Output"
    
# Load model
RNALight = joblib.load(os.path.join(RNALight_input_dir,"LightGBM/best_LightGBM_model.pkl"))

In [8]:
# Load circRNA input file
circRNA_file = pd.read_csv("./03_PA1_high_exp_circ_info_with_seq.tsv",sep = '\t')

circRNAs = circRNA_file.iloc[:,[4,8]]
circRNAs.columns = ["ensembl_transcript_id","cdna"]

df_kmer_345,df_kmer_345_rawcount = _count_kmer(circRNAs,345)
df_kmer_345.to_csv("circRNAs_df_kmer345_freq.tsv",sep='\t')
df_kmer_345_rawcount.to_csv("circRNAs_df_kmer345_rawcount.tsv",sep='\t')

del df_kmer_345['ensembl_transcript_id']
x_kmer = df_kmer_345.values

y_pred = RNALight.predict(x_kmer)
y_prob = RNALight.predict_proba(x_kmer)[:,1]

In [10]:
circRNAs["RNALight_pred_label"] =  y_pred
circRNAs["RNALight_pred_prob"] = y_prob
circRNAs.to_csv("circRNAs_predict_df.tsv",sep = '\t',index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [11]:
# Load ciRNA input file
ciRNA_file = pd.read_csv("./03_PA1_high_exp_ci_info_with_seq.tsv",sep = '\t')

ciRNAs = ciRNA_file.iloc[:,[4,8]]
ciRNAs.columns = ["ensembl_transcript_id","cdna"]

df_kmer_345,df_kmer_345_rawcount = _count_kmer(ciRNAs,345)
df_kmer_345.to_csv("ciRNAs_df_kmer345_freq.tsv",sep='\t')
df_kmer_345_rawcount.to_csv("ciRNAs_df_kmer345_rawcount.tsv",sep='\t')

del df_kmer_345['ensembl_transcript_id']

x_kmer = df_kmer_345.values

y_pred = RNALight.predict(x_kmer)
y_prob = RNALight.predict_proba(x_kmer)[:,1]

In [12]:
ciRNAs["RNALight_pred_label"] =  y_pred
ciRNAs["RNALight_pred_prob"] = y_prob
ciRNAs.to_csv("ciRNAs_predict_df.tsv",sep = '\t',index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
