In [None]:
# This notebook defines some functions that are used to calculate AUROC and AUPR for each scored screen.
# Two benchmarks are also loaded and processed here. 

In [2]:
%matplotlib inline
import pandas as pd
import scipy.stats as stats
import scipy.cluster.hierarchy as clust
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn import metrics
from itertools import combinations


  from pandas.core import (


In [None]:
# Names of the scoring methods, defined once up front
scores = [
    'zdLFC',
    'Orthrus',
    'Gemini(Strong)',
    'Gemini(Sens)',
    'Horlbeck',
    'Parrish',
]

In [None]:
# Canonicalise gene-pair labels: every 'A_B' index label is rewritten with its two gene
# names in alphabetical order, so 'B_A' and 'A_B' both end up as 'A_B'.
def reindex_alphbetically(df):
    """Re-index ``df`` by alphabetically ordered gene-pair labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index labels are strings of the form ``'<gene>_<gene>'``.
        The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now indexed by an index named ``'gene_pair'``.

    Raises
    ------
    ValueError
        If an index label does not split into exactly two parts on ``'_'``.
        The check runs before ``df`` is touched, so a failure leaves it unchanged.
    """
    sorted_pairs = []
    for label in df.index:
        genes = label.split('_')
        if len(genes) != 2:
            raise ValueError(
                f"Expected an index label of the form 'GENE1_GENE2', got {label!r}"
            )
        sorted_pairs.append('_'.join(sorted(genes)))

    # Assign all labels positionally in one step (no per-row writes), then promote the
    # column to the index; a pre-existing 'gene_pair' column is overwritten and consumed.
    df['gene_pair'] = sorted_pairs
    df.set_index('gene_pair', inplace=True)
    return df

In [None]:
# Function to calculate Jaccard similarity
def jaccard_similarity(df, percentile):
    """Pairwise Jaccard similarity between the top-scoring rows of each column.

    For every column, the rows whose value is at or above that column's
    ``percentile`` quantile form the column's "top set" (rows with a NaN value
    are never included). The similarity of two columns is
    ``|A & B| / |A | B|`` computed on the index labels of their top sets.

    Parameters
    ----------
    df : pandas.DataFrame
        One column per score; the index labels identify the rows being compared.
    percentile : float
        Quantile passed to ``Series.quantile`` (e.g. 0.95 keeps values at or
        above the 95th percentile of each column).

    Returns
    -------
    pandas.DataFrame
        Square symmetric matrix labelled by ``df.columns`` on both axes. The
        diagonal is always 1; an off-diagonal entry is NaN when both top sets
        are empty.
    """
    def keep_top_X_percent(series, percentile):
        # Mask everything below the column's own quantile threshold
        threshold = series.quantile(percentile)
        return series.where(series >= threshold, np.nan)

    def CalculateSimilarity(set1, set2):
        set1 = set(set1)
        set2 = set(set2)
        union = len(set1 | set2)
        return len(set1 & set2) / union if union != 0 else np.nan

    top_scores = df.apply(lambda col: keep_top_X_percent(col, percentile), axis=0)

    # Build each top set straight from the index labels. Going through a DataFrame of
    # ragged label lists would pad the shorter columns with NaN, and that padding would
    # then be counted as a set member and distort intersection and union sizes.
    top_sets = {
        col: set(top_scores.index[top_scores[col].notna()])
        for col in top_scores.columns
    }

    columns = top_scores.columns
    n_columns = len(columns)

    # Fill a plain NumPy array and wrap it at the end: writing through
    # ``DataFrame.values`` is not reliable (it can be a copy or read-only).
    similarity = np.full((n_columns, n_columns), np.nan)
    for (i, col1), (j, col2) in combinations(enumerate(columns), 2):
        value = CalculateSimilarity(top_sets[col1], top_sets[col2])
        similarity[i, j] = value
        similarity[j, i] = value

    # Jaccard similarity of a column with itself is defined as 1 here
    np.fill_diagonal(similarity, 1)
    return pd.DataFrame(similarity, index=columns, columns=columns)
    




In [None]:
def ROC_Curve(scores, labels, ground_truth, title = ""):
    """Plot one ROC curve per score vector and return the AUROC summary.

    Parameters
    ----------
    scores : iterable
        One collection of score values per method, paired with ``ground_truth``
        through ``pd.DataFrame`` (presumably equal-length lists; confirm with caller).
    labels : iterable of str
        Legend name for each entry of ``scores``; iteration stops at the shorter
        of ``scores`` and ``labels``.
    ground_truth : list-like
        Binary labels shared by all score vectors.
    title : str, optional
        Figure title.

    Returns
    -------
    list of tuple
        ``(auc, label, n_rows, n_positive)`` per method, where ``n_rows`` is the
        number of rows left after dropping NaNs and ``n_positive`` is the sum of
        ``ground_truth`` over those rows. ``auc`` is NaN when no rows are left.
    """
    summary = []

    # Always draw into figure 0, cleared first, so repeated calls do not pile up figures
    plt.figure(0).clf()
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.plot([0, 1], [0, 1], color='black', lw=2, linestyle='--', label = "random")

    for method_scores, method_name in zip(scores, labels):
        # Pair the scores with the labels and keep only rows where both are present
        paired = pd.DataFrame({'ground_truth': ground_truth, 'score': method_scores}).dropna()
        n_rows = paired.shape[0]

        if n_rows > 0:
            fpr, tpr, _ = metrics.roc_curve(paired['ground_truth'], paired['score'])
            auc = metrics.auc(fpr, tpr)
            plt.plot(fpr, tpr, label=method_name + ", auc=" + str(auc.round(2)) + " n=" + str(n_rows))
        else:
            # Nothing to score for this method: report it and record a NaN AUC
            print("Error - All NA's here")
            auc = np.nan

        summary.append((auc, method_name, n_rows, paired['ground_truth'].sum()))

    plt.legend(loc='lower center', bbox_to_anchor=(0.5, -0.5), shadow=True, ncol=2)
    plt.title(title)
    plt.show()
    return summary

def PR_Curve(scores, labels, ground_truth, title = ""):
    """Plot one precision-recall curve per score vector and return the AUPR summary.

    Parameters
    ----------
    scores : iterable
        One collection of score values per method, paired with ``ground_truth``
        through ``pd.DataFrame`` (presumably equal-length lists; confirm with caller).
    labels : iterable of str
        Legend name for each entry of ``scores``; iteration stops at the shorter
        of ``scores`` and ``labels``.
    ground_truth : list-like
        Binary labels shared by all score vectors.
    title : str, optional
        Figure title.

    Returns
    -------
    list of tuple
        ``(auc, label, n_rows, n_positive)`` per method, where ``n_rows`` is the
        number of rows left after dropping NaNs and ``n_positive`` is the sum of
        ``ground_truth`` over those rows. ``auc`` is NaN when no rows are left.
    """
    results = []

    # Baseline precision = fraction of positives in the full ground truth.
    # With no labels at all there is no baseline; use NaN instead of dividing by
    # zero so this function degrades the same way ROC_Curve does on empty input.
    n_labels = len(ground_truth)
    baseline = sum(ground_truth)/n_labels if n_labels > 0 else np.nan

    # Always draw into figure 0, cleared first, so repeated calls do not pile up figures
    plt.figure(0).clf()
    plt.ylabel('Precision')
    plt.xlabel('Recall')
    plt.plot([0, 1], [baseline, baseline], color='black', lw=2, linestyle='--', label = "baseline")

    for score, label in zip(scores, labels):
        # Pair the scores with the labels and keep only rows where both are present
        df_clean = pd.DataFrame({'ground_truth': ground_truth, 'score': score}).dropna()
        n_rows = df_clean.shape[0]

        if n_rows > 0:
            lr_precision, lr_recall, _ = metrics.precision_recall_curve(df_clean['ground_truth'],  df_clean['score'])
            # Area under the PR curve by trapezoidal integration over recall
            lr_auc = metrics.auc(lr_recall, lr_precision)
            plt.plot(lr_recall,lr_precision, label=label + ", auc="+ str(lr_auc.round(2)) + " n=" + str(n_rows))
        else:
            # Nothing to score for this method: report it and record a NaN AUC
            print("Error - All NA's here")
            lr_auc = np.nan

        results.append((lr_auc, label, n_rows, df_clean['ground_truth'].sum()))

    plt.legend(loc='lower center', bbox_to_anchor=(0.5, -0.5), shadow=True, ncol=2)

    plt.title(title)
    plt.show()
    return results

In [None]:
# For each study/cell line, this function calls the ROC_Curve and PR_Curve functions
def Cell_Line_Analysis(df,cell_line_name, study_name, labels, ground_truth):
    """Score every column of ``df`` against a benchmark for one study / cell line.

    Parameters
    ----------
    df : pandas.DataFrame
        One column per scoring method, indexed so that it can be joined with
        ``ground_truth`` (presumably by sorted gene-pair label; confirm with caller).
    cell_line_name : str
        Cell line name; used in the plot title and returned unchanged.
    study_name : str
        Study name; used in the plot title and returned unchanged.
    labels : list of str
        Legend names, one per score column of ``df`` in column order.
    ground_truth : pandas.DataFrame
        Benchmark with a ``'ground_truth'`` column, left-joined onto ``df``.

    Returns
    -------
    tuple
        ``(roc_auc, pr_auc, cell_line_name, study_name)`` where the first two
        items are the lists returned by ``ROC_Curve`` and ``PR_Curve``.
    """
    # Attach the benchmark labels and keep only rows that have one
    df = df.join(ground_truth, how='left')
    df = df.dropna(subset=['ground_truth'])

    ground_truth = df["ground_truth"]

    title = study_name + ": " + cell_line_name

    # Score columns only. The joined 'ground_truth' column must not be scored against
    # itself; leaving it in the list is harmless only while `labels` is no longer than
    # the number of score columns (zip truncates), so exclude it explicitly.
    scores = [df[column].tolist() for column in df.columns if column != 'ground_truth']

    roc_auc = ROC_Curve(scores, labels, ground_truth, title )
    pr_auc = PR_Curve(scores, labels ,ground_truth, title)

    return roc_auc, pr_auc, cell_line_name, study_name


In [None]:
# Collects the tuples returned by Cell_Line_Analysis into two long-format DataFrames.
def ConvertResultsToDF(list_of_results):
    """Stack per-cell-line ROC and PR results into two DataFrames.

    Parameters
    ----------
    list_of_results : iterable
        Each item is indexable as ``(roc_rows, pr_rows, cell_line, study_name)``,
        where ``roc_rows`` / ``pr_rows`` are sequences of 4-tuples
        ``(value, score name, common samples, positive samples)``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(big_df_roc, big_df_pr)``, each with columns ``'value'``, ``'Score'``,
        ``'Common samples'``, ``'Positive Samples'``, ``'Cell line'`` and
        ``'Study name'``, and a fresh 0..n-1 index.
    """
    metric_columns = ['value', 'Score', 'Common samples', 'Positive Samples']

    def _tabulate(rows, cell_line, study_name):
        # One table per (metric, cell line), tagged with where it came from
        table = pd.DataFrame(rows, columns=metric_columns)
        table['Cell line'] = cell_line
        table['Study name'] = study_name
        return table

    roc_tables = []
    pr_tables = []
    for result in list_of_results:
        cell_line = result[2]
        study_name = result[3]
        roc_tables.append(_tabulate(result[0], cell_line, study_name))
        pr_tables.append(_tabulate(result[1], cell_line, study_name))

    return (
        pd.concat(roc_tables, ignore_index=True),
        pd.concat(pr_tables, ignore_index=True),
    )



In [None]:
# Load Ground truth
# Data from 10.1016/j.cels.2021.08.006
# The third CSV column (index_col = 2) becomes the row index.
barbaras = pd.read_csv('../InputData/Benchmarks/deKegel_output.csv', sep=',', index_col = 2)

# Keep only the 'depmap_hit' flag, recode True/False as 1/0, rename the column to
# 'ground_truth' (the name Cell_Line_Analysis filters on), and drop unlabeled rows.
ground_truth_depmap_hit = (
    barbaras[['depmap_hit']]
    .replace({True: 1, False: 0})
    .rename(columns={'depmap_hit': 'ground_truth'})
    .dropna()
)
ground_truth_depmap_hit

In [3]:
# Ground Truth 2
# Data is downloaded from : DOI: 10.1016/j.celrep.2022.110636

Koferle = pd.read_excel("../InputData/Benchmarks/Koferle.xlsx", sheet_name='PaCT')
Koferle[['gene1', 'gene2']] = Koferle['Pair'].str.split('_', expand=True)

# Rebuild each 'Pair' label with its two gene names in alphabetical order, then index
# the columns needed below by that sorted label.
Koferle['Pair'] = Koferle.apply(lambda x: '_'.join(sorted([x['gene1'], x['gene2']])), axis=1)
Koferle = (
    Koferle
    .rename(columns={'Pair': 'sorted_gene_pair'})
    .loc[:, ['sorted_gene_pair', 'Sub-Dataset', 'Correlation Spearman', 'Screen Type', 'Hit?']]
    .set_index('sorted_gene_pair')
)

# Restrict to the AVANA screen type and the 'expr_paralog' sub-dataset. The copy is taken
# AFTER filtering so the column assignments below write to an independent frame rather
# than to a slice of the unfiltered one.
is_avana = Koferle['Screen Type'] == 'AVANA'
is_expr_paralog = Koferle['Sub-Dataset'] == 'expr_paralog'
Koferle = Koferle[is_avana & is_expr_paralog].copy()

# Label rows: 1 = positive Spearman correlation and marked as a hit,
#             0 = negative Spearman correlation and marked as not a hit.
# Every other combination stays NaN and is dropped.
Koferle['ground_truth'] = np.nan
is_positive = (Koferle['Correlation Spearman'] > 0) & (Koferle['Hit?'] == 'yes')
is_negative = (Koferle['Correlation Spearman'] < 0) & (Koferle['Hit?'] == 'no')
Koferle.loc[is_positive, 'ground_truth'] = 1
Koferle.loc[is_negative, 'ground_truth'] = 0
Koferle = Koferle.dropna(subset=['ground_truth'])

Koferle = Koferle[['ground_truth']]
Koferle

Unnamed: 0_level_0,ground_truth
sorted_gene_pair,Unnamed: 1_level_1
A1BG_GP6,0.0
A1BG_IGSF1,0.0
A1BG_KIR2DL3,0.0
A1BG_KIR3DL1,0.0
A1BG_LILRB2,0.0
...,...
FBLIM1_ZYX,0.0
WTIP_ZYX,0.0
LPP_ZYX,0.0
CUL9_ZZEF1,0.0
