In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import os
import time

import matplotlib.pyplot as plt
from matplotlib_venn import venn3, venn3_circles, venn2

import pickle

import tensorflow as tf

In [2]:
# Query the GPUs once and reuse the list for both the report and the selection.
physical_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(physical_devices))

# pick GPU 0 (only when at least one GPU was found; indexing an empty list
# would raise IndexError and stop the notebook on a CPU-only machine):
if physical_devices:
    tf.config.set_visible_devices(physical_devices[0], 'GPU')

def compute_cosine_distances(a, b):
    """Pairwise cosine distance between the rows of `a` and the rows of `b`.

    Args:
        a: matrix of shape (n_a, dim).
        b: matrix of shape (n_b, dim).

    Returns:
        NumPy array of shape (n_a, n_b) whose entry (i, j) is
        1 - cosine_similarity(a[i], b[j]).
    """
    # Scale every row to unit L2 norm so the dot product below is the cosine.
    unit_a = tf.nn.l2_normalize(a, axis=1)
    unit_b = tf.nn.l2_normalize(b, axis=1)
    similarity = tf.matmul(unit_a, unit_b, transpose_b=True)
    return (1 - similarity).numpy()

def compute_euclidean_distances(A):
    """Pairwise Euclidean distance between the rows of `A`.

    Uses the expansion |x - y|^2 = |x|^2 - 2 x.y + |y|^2 so the whole matrix
    is obtained from a single matrix product.

    Args:
        A: matrix with one sample per row.

    Returns:
        Square NumPy array whose entry (i, j) is the distance between
        rows i and j of `A`.
    """
    # Direct broadcasting alternative, kept for reference:
    #d = tf.math.sqrt( tf.reduce_sum((tf.expand_dims(a, 1)-tf.expand_dims(b, 0))**2,2))
    squared_norms = tf.reshape(tf.math.reduce_sum(A*A, 1), [-1, 1])  # column vector
    gram = tf.matmul(A, tf.transpose(A))
    squared_dist = (squared_norms - 2*gram + tf.transpose(squared_norms)).numpy()
    # Negative entries (e.g. from floating-point rounding) are clamped to zero
    # before taking the square root.
    squared_dist[squared_dist < 0] = 0.0
    return np.sqrt(squared_dist)

Num GPUs Available:  8


In [3]:
def nearest_neighbors(CTL, WT, MUT, K=5):
    """Count cells by the group composition of their K nearest neighbors.

    The three matrices are stacked in the order controls, wild types, mutants,
    and cosine distances are computed between all stacked rows. For every row
    (a cell of any of the three groups) the K closest other rows are looked up
    and the row is counted in exactly one of seven neighborhood types,
    according to which groups its neighbors belong to.

    Args:
        CTL: control cells, one row per cell.
        WT: wild-type cells, one row per cell (same number of columns).
        MUT: mutant cells, one row per cell (same number of columns).
        K: number of nearest neighbors examined per cell. K should be smaller
            than the total number of cells; otherwise already-suppressed
            entries get selected as neighbors.

    Returns:
        A tuple `(results, all_values)`:
        - `results` maps "CTL", "WT", "MUT" (all neighbors from that single
          group), "CTL-WT", "CTL-MUT", "WT-MUT" (neighbors from exactly those
          two groups), and "ANY" (all three groups) to cell counts, plus
          "impact_score" = MUT / (MUT + WT-MUT) and
          "active_mutants" = MUT / number of mutant rows. Both ratios are NaN
          when their denominator is zero.
        - `all_values` is the list [WT, MUT, WT-MUT, CTL, CTL-WT, CTL-MUT, ANY].
    """
    n_ctl = CTL.shape[0]
    wt_end = n_ctl + WT.shape[0]  # first row index that belongs to a mutant

    ALL = np.concatenate([CTL, WT, MUT], axis=0)

    # Compute distance matrix
    dist = compute_cosine_distances(ALL, ALL)
    #dist = compute_euclidean_distances(ALL)

    # Suppress diagonal so that a cell is never reported as its own neighbor
    dist[np.diag_indices(dist.shape[0])] = 10

    # Search the K nearest neighbors: repeatedly take the per-column minimum
    # and overwrite it with a large value so the next pass finds the next one.
    columns = np.arange(dist.shape[0])
    picks = []
    for _ in range(K):
        nearest = np.argmin(dist, axis=0)
        dist[nearest, columns] = 1e6
        picks.append(nearest)
    NN = np.stack(picks, axis=1)  # shape: (number of cells, K)

    # Identify type of neighbor according to the position in the array.
    # Masks are derived from the untouched row indices: writing label codes
    # back into NN would let a code (1, 11, 111) be mistaken for a row index
    # by a later range test when the groups are small.
    is_ctl = NN < n_ctl
    is_mut = NN >= wt_end
    is_wt = ~(is_ctl | is_mut)

    # Types of neighborhoods
    A = np.sum(np.all(is_ctl, axis=1))                      # Controls
    B = np.sum(np.all(is_wt, axis=1))                       # Wild types
    C = np.sum(np.all(is_mut, axis=1))                      # Mutants
    D = np.sum(np.all(is_ctl | is_wt, axis=1)) - A - B      # Control or Wild type
    E = np.sum(np.all(is_ctl | is_mut, axis=1)) - A - C     # Control or Mutant
    F = np.sum(np.all(is_wt | is_mut, axis=1)) - B - C      # Wild type or Mutant
    G = NN.shape[0] - A - B - C - D - E - F                 # All mixed

    # Impact score: fraction of active mutant cells that are different from active wild type cells
    impact_score = C/(C+F) if (C + F) > 0 else float("nan")

    # Fraction of active mutant cells:
    active_mutants = C/MUT.shape[0] if MUT.shape[0] > 0 else float("nan")

    results = {
        "CTL": A, "WT": B, "MUT": C, "CTL-WT": D, "CTL-MUT": E, "WT-MUT": F, "ANY": G,
        "impact_score": impact_score, "active_mutants": active_mutants
    }

    # Compatibility with previous analysis:
    all_values = [B, C, F, A, D, E, G]

    return results, all_values

In [4]:
def _subsample_rows(matrix, N, rng):
    """Return at most N rows of `matrix`, picked in random order using `rng.shuffle`."""
    idx = np.arange(matrix.shape[0])
    rng.shuffle(idx)
    return matrix[idx[0:N],...]


def analyze_variant(wt_id, neighborhood=10, N=2000, features="features", seed=None):
    """Score every mutant allele of one wild type and store the scores in its pickle.

    Reads "outputs/single-cells/<wt_id>.pkl", subsamples up to N wild-type and
    control cells once, then for each allele listed under "mutant_ids"
    subsamples up to N mutant cells and runs `nearest_neighbors`. The impact
    score of each allele is printed, and the file is rewritten with a
    "graph_scores" entry added. Only the keys written below are kept in the
    rewritten file.

    Args:
        wt_id: file name (without ".pkl") of the wild type to analyze.
        neighborhood: number of nearest neighbors passed to `nearest_neighbors`.
        N: maximum number of cells sampled from each group.
        features: key of the feature matrix to read from each group's dict.
        seed: optional seed for the subsampling. When None (default) the
            global NumPy random state is used, as before.
    """
    path = "outputs/single-cells/" + wt_id + ".pkl"
    rng = np.random if seed is None else np.random.default_rng(seed)

    # Load data
    # NOTE: pickle.load can execute arbitrary code; only use on trusted files.
    with open(path, "rb") as file:
        data = pickle.load(file)

    # Prepare matrices (the same wild-type and control samples are reused for every allele)
    WT = _subsample_rows(data["wild_type_data"][features], N, rng)
    CTL = _subsample_rows(data["controls"][features], N, rng)

    graph_scores = {}

    for allele in data["mutant_ids"]:
        MUT = _subsample_rows(data["mutants_data"][allele][features], N, rng)

        rs, av = nearest_neighbors(CTL, WT, MUT, neighborhood)

        print(allele, rs["impact_score"])
        graph_scores[allele] = {"all_values": av, "mut_wt_values": av[0:3], "impact_score": rs["impact_score"]}

    # Save computations
    # Write to a temporary file first and swap it in afterwards, so a failure
    # while pickling cannot leave the input file truncated.
    tmp_path = path + ".tmp"
    with open(tmp_path, "wb") as out:
        pickle.dump({
            "wild_type_data":data["wild_type_data"],
            "mutant_ids": data["mutant_ids"],
            "mutants_data": data["mutants_data"],
            "controls": data["controls"],
            "ctlimgs": data["ctlimgs"],
            "graph_scores": graph_scores,
            "Y": data["Y"]
        }, out)
    os.replace(tmp_path, path)


In [None]:
# Run the analysis on every pickled wild type found in the output folder.
# - Only the trailing ".pkl" is stripped (str.replace would also remove any
#   ".pkl" occurring in the middle of a name).
# - Names are sorted because os.listdir returns entries in arbitrary order.
wild_types = sorted(k[:-len(".pkl")] for k in os.listdir("outputs/single-cells/") if k.endswith(".pkl"))
for wt in wild_types:
    #if wt.find("CCND1") == -1 : continue # Uncomment to test on a single variant
    print(wt)
    analyze_variant(wt, neighborhood=5, N=10000)

5268@SERPINB5_WT.c
5268@SERPINB5_p.D141A 0.18227179336659818
5268@SERPINB5_p.A7T 0.08137638617090671
5268@SERPINB5_p.A42F 0.12837331334332833
5268@SERPINB5_p.T37I 0.09465459065885277
5268@SERPINB5_p.I159S 0.11857142857142858
5268@SERPINB5_p.G142V 0.10231923601637108
5268@SERPINB5_WT.o 0.15433314575126617
5268@SERPINB5_p.A165T 0.1598007281088331
231@AKR1B1_WT.c
231@AKR1B1_p.F252L 0.09147982062780269
231@AKR1B1_WT.o 0.26762995274445656
231@AKR1B1_p.P14R 0.22262959568184132
231@AKR1B1_p.Q27K 0.10024475896562733
673@BRAF_WT.c
673@BRAF_p.G466A 0.7279312343055824
673@BRAF_p.A762E 0.5322080974254887
673@BRAF_p.H574N 0.6151975683890577
673@BRAF_p.G469S 0.9418748219204103
673@BRAF_p.W450L 0.6249187432286024
673@BRAF_p.R682W 0.43767672007540054
673@BRAF_p.D594H 0.3943522417636859
673@BRAF_p.N581S 0.7276067527308838
673@BRAF_p.K601N 0.9469812762705387
673@BRAF_p.L485S 0.9084167593622544
673@BRAF_p.G469V 0.7085048699668641
673@BRAF_p.V600E 0.9451508825204024
673@BRAF_p.G466V 0.4063754427390791
673