# Bash
# To get lncRNA_sublocation_TestSet_with_transcript_id.fa
    Instance of lncRNA_sublocation_TestSet_with_transcript_id.fa
    
    >ENST00000376398_1
    TGAAATAGGAGCCAAGGTATGCTATGAGCCAAGGATTATGAGTAATCCAGTTTTGTGCAC
    TTTAAGCCATTTGAAAAACAGAAAAGCAAAACAACAAAATAATTTTTAAGAAATTGAATA
    
    ENST00000376398 and 1 are the ensembl id and tag of lncRNA, respectively.(0:Cytosol, 1:Nuclear)

In [1]:
%%bash

 sed '1d' ../../Datasets/lncRNA_sublocation_TestSet.tsv \
 | awk -v OFS='\t' '{print $1"_"$4,$3}' \
 | seqkit tab2fx > lncRNA_sublocation_TestSet.fa

# Submmit fasta file to the lncLocator
    (http://www.csbio.sjtu.edu.cn/bioinf/lncLocator/#) 
    You can get the prediction (result.fasta).

# Python
# To evaluate performance of lncLocator

In [2]:
import copy
import os
import collections
import numpy as np
import pandas as pd
import sklearn.metrics as metrics
import matplotlib.pyplot as plt

In [3]:
# Evaluate performance of model
def evaluate_performance(y_test, y_pred, y_prob):
    # AUROC
    auroc = metrics.roc_auc_score(y_test,y_prob)
    auroc_curve = metrics.roc_curve(y_test, y_prob)
    # AUPRC
    auprc=metrics.average_precision_score(y_test, y_prob) 
    auprc_curve=metrics.precision_recall_curve(y_test, y_prob)
    #Accuracy
    accuracy=metrics.accuracy_score(y_test,y_pred) 
    #MCC
    mcc=metrics.matthews_corrcoef(y_test,y_pred)
    
    recall=metrics.recall_score(y_test, y_pred)
    precision=metrics.precision_score(y_test, y_pred)
    f1=metrics.f1_score(y_test, y_pred)
    class_report=metrics.classification_report(y_test, y_pred,target_names = ["control","case"])

    model_perf = {"auroc":auroc,"auroc_curve":auroc_curve,
                  "auprc":auprc,"auprc_curve":auprc_curve,
                  "accuracy":accuracy, "mcc": mcc,
                  "recall":recall,"precision":precision,"f1":f1,
                  "class_report":class_report}
        
    return model_perf

In [4]:
# Output result of evaluation
def eval_output(model_perf,path):
    with open(os.path.join(path,"Evaluate_Result_TestSet.txt"),'w') as f:
        f.write("AUROC=%s\tAUPRC=%s\tAccuracy=%s\tMCC=%s\tRecall=%s\tPrecision=%s\tf1_score=%s\n" %
               (model_perf["auroc"],model_perf["auprc"],model_perf["accuracy"],model_perf["mcc"],model_perf["recall"],model_perf["precision"],model_perf["f1"]))
        f.write("\n######NOTE#######\n")
        f.write("#According to help_documentation of sklearn.metrics.classification_report:in binary classification, recall of the positive class is also known as sensitivity; recall of the negative class is specificity#\n\n")
        f.write(model_perf["class_report"])
        roc_auc = model_perf["auroc"]
    # AUROC info
    fpr,tpr,threshold = model_perf["auroc_curve"]
    #return AUROC info
    temp_df = pd.DataFrame({"FPR":fpr,"TPR":tpr})
    temp_df.to_csv(os.path.join(path,"AUROC_info.txt"),header = True,index = False, sep = '\t')

In [5]:
columns = ["ensemble_transcript_id" , "True_Label" , "Pre_Label", "Cyto_Score" , "Nuc_Score"]
df_dict = {"ensemble_transcript_id" : [], "True_Label" : [], "Pre_Label" : [], "Cyto_Score" : [], "Nuc_Score" : []}
input_file = "./result.fasta"
with open(input_file,'r') as f1:
    fasta_result = f1.readlines()
    lnc_num = int(len(fasta_result)/6)
    for i in range(lnc_num):
        index = 6 * i
        #Seq_Name and True_Label
        fasta_name = fasta_result[index].strip().split('\t')[0]
        df_dict["ensemble_transcript_id"].append(fasta_name.split('_')[0][1:])
        df_dict["True_Label"].append(fasta_name.split('_')[1])
        #Nuc_Score
        nuc_score = float(fasta_result[index + 2].strip().split('\t')[1])
        df_dict["Nuc_Score"].append(nuc_score)
        #Cyto_Score
        cyto_score = 1 - nuc_score
        df_dict["Cyto_Score"].append(cyto_score)
        #Pre_Label
        if cyto_score >= nuc_score:
            df_dict["Pre_Label"].append(0)
        else:
            df_dict["Pre_Label"].append(1)
            
outcome = pd.DataFrame(df_dict,columns = columns)
outcome[["True_Label"]] = outcome[["True_Label"]].astype(int)
outcome[["Pre_Label"]] = outcome[["Pre_Label"]].astype(int)

In [6]:
# Output performance of lncLocator
path = "./Evaluation_Result"
if not (os.path.exists(path)):
    os.mkdir(path)
model_perf = evaluate_performance(outcome["True_Label"],outcome["Pre_Label"],outcome["Nuc_Score"])
eval_output(model_perf,path)

In [8]:
outcome_df = outcome.iloc[:,[0,1,2]]
outcome_df.rename(columns={"True_Label":"tag","Pre_Label":"predict_label"},inplace = True)
outcome_df.to_csv(os.path.join(path,"lncRNA_sublocation_TestSet_lncLocator_predict.tsv"),sep = '\t',index = False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if isinstance(loc, (slice, Series, np.ndarray, Index)):
