Toy data

In [2]:
#!/usr/bin/env python
import pandas as pd
import networkx as nx
from networkx.algorithms.bipartite.matrix import biadjacency_matrix
import numpy as np
from sklearn.metrics import precision_recall_curve, auc
from sklearn.preprocessing import normalize
import random
from sklearn import metrics
import time
from sklearn.decomposition import NMF
from scipy import sparse
# from libnmf.gnmf import GNMF

import numpy.linalg as LA

from scipy import stats

import matplotlib.pyplot as plt
# import math
from scipy.linalg import logm

from sklearn.metrics import confusion_matrix

from sklearn.linear_model import LinearRegression

import seaborn as sns
from scipy import optimize
from sklearn.metrics import r2_score
from scipy.interpolate import make_interp_spline
from sklearn.feature_selection import chi2
from scipy.linalg import inv
# import loess.loess_1d as l1d

# import tensorflow as 

from scipy.spatial.distance import cdist

from sklearn.metrics import roc_auc_score


random.seed(1949) # for dataset split
np.random.seed(1949) # for matrix initialization

In [3]:
def GRNMF(bipart_graph, component, WMK, lmd, max_iter, tolerance=1/1000000):
    np.random.seed(1949)
    random.seed(1949)

    #####
    # bipart_graph: bipartite graph X
    # component: number of latent feature
    # WMK: weight matrix kernel
    # lmd: regulization parameter
    # max_iter: maximum iteration of GNMF

    W = WMK.copy()
    X = bipart_graph.copy()
    m, n = X.shape
    k = component
 
    D = np.matrix(np.diag(np.asarray(W.copy()).sum(axis=1)))
    L = D.copy() - W.copy()

    # Initialize U & V

    U = np.random.random((m, k))
    V = np.random.random((n, k))

    # Updating U V
    eps = 2**-8

    term1 = LA.norm(X - np.dot(U, V.T))**2
    term2 = lmd * np.trace(np.dot(np.dot(V.T, L), V))
    Obj0 = term1 + term2
    Obj1 = Obj0


    for i in range(max_iter):
        XV = np.dot(X, V)
        UVtV = np.dot(np.dot(U, V.T), V) + eps

        U *= XV
        U /= UVtV
        
        XtU_lmdWV = np.dot(X.T, U) + lmd*np.dot(W, V)
        VUtU_lmdDV = np.dot(np.dot(V, U.T), U) + lmd*np.dot(D, V) + eps
        V *= XtU_lmdWV
        V /= VUtU_lmdDV

        # Objective function
        
        term1 = LA.norm(X - np.dot(U, V.T))**2
        term2 = lmd * np.trace(np.dot(np.dot(V.T, L), V))
        Obj2 = term1 + term2    
        ObjDiff = Obj1 - Obj2
        Obj1 = Obj2

        if(ObjDiff < (Obj0 *tolerance)):
            print("Converged in iteration: ", i, "ObjDiff: ", ObjDiff, "Obj: ", Obj2)
            return(U, V, np.dot(U, V.T))
        elif i == max_iter - 1:
            print("Has not converged, reach the maximum iteration")
            return(U, V, np.dot(U, V.T))

In [4]:
def KernelRegression(matrix,feature_matrix,idx_train,idx_test,l,s):
    sigma = s
    lmd = l

    # feature_matrix1 = (feature_matrix1.copy() - feature_matrix1.mean()) / feature_matrix1.std()
    # feature_matrix2 = (feature_matrix2.copy() - feature_matrix2.mean()) / feature_matrix2.std()
        
    X = np.array(feature_matrix[idx_train, :].copy()).tolist()
    X_new = np.array(feature_matrix[idx_test, :].copy()).tolist()

    y = matrix[:, idx_train].copy()
    Y = pd.DataFrame(y.T.copy())
    # y_new = matrix[:, idx_test].copy()
    matrix_new = matrix.copy().astype(float)

    distance = cdist(X_new, X)**2
    kernel = np.exp(-distance/sigma**2)

    similarity = cdist(X, X)**2
    K = pd.DataFrame(np.exp(-similarity/sigma**2))

    n = len(idx_train) # size of known drug


    Lmd = np.diag(np.ones(n)*lmd)
    W = inv(K.dot(K)+Lmd).dot(K.dot(Y))
    y_new = kernel.dot(W)

        

    matrix_new[:, idx_test] = y_new.T
    return matrix_new

In [5]:
def Adaptive(matrix,feature_matrix,idx_train,idx_test,l,s,k):
    sigma = s
    lmd = l

    X = np.array(feature_matrix[idx_train, :].copy()).tolist()
    X_new = np.array(feature_matrix[idx_test, :].copy()).tolist()
    y = matrix[:, idx_train].copy()
    # Y = pd.DataFrame(y.T.copy())
    # y_new = matrix[:, idx_test].copy()
    matrix_new = matrix.copy().astype(float)

    similarity = cdist(X, X)**2
    WMK = pd.DataFrame(np.exp(-similarity/sigma**2))

    m, n = matrix.shape
    Vout = np.zeros((n, k))


    U,V,preds = GRNMF(y, component=k, WMK=WMK, lmd=0, max_iter=10000)
    Vout[idx_train, :] = V


    Vpreds = KernelRegression(Vout.T,feature_matrix,idx_train,idx_test,l,s)


    preds = U.dot(Vpreds)
    
    return preds


In [9]:
# Size of Obs 100 with length 10 features
# Size of test 20
# Size of labels 100 with 5 label
np.random.seed(0)


n = 100
p = 10
q = 5
PN_frac = 0.75
sigma = 10

X = np.random.random((n,p))
similarity = cdist(X, X)**2
K = pd.DataFrame(np.exp(-similarity/sigma**2)) # 100*100
W = np.random.random((n,q))

e = np.random.random((n,q)) * 0.01

GT = K.dot(W)
Y_scores = GT + e
Y = Y_scores.copy()
Y[Y > PN_frac] = 1
Y[Y <= PN_frac] = 0
Y = np.array(Y)

idx_train = np.arange(0, 80, 1)
idx_test = np.arange(80, 100, 1)


# Lmd = np.diag(np.ones(n)*lmd)
# W = inv(K.dot(K)+Lmd).dot(K.dot(Y))



In [12]:
Y_scores


Unnamed: 0,0,1,2,3,4
0,46.859271,47.431787,48.411985,53.669735,55.917152
1,46.547657,47.093865,48.018133,53.304018,55.547027
2,46.622020,47.187578,48.137997,53.404291,55.642216
3,46.781931,47.377046,48.351892,53.623027,55.865862
4,46.844715,47.400309,48.410414,53.684413,55.884841
...,...,...,...,...,...
95,46.699851,47.305666,48.291542,53.538082,55.773319
96,46.734671,47.332924,48.321910,53.598933,55.821589
97,46.759077,47.287513,48.274857,53.538210,55.762856
98,46.643808,47.224370,48.178635,53.405077,55.642755


In [10]:
score_KRNMF = Adaptive(matrix=Y.T,feature_matrix=X,idx_train=idx_train,idx_test=idx_test,l=0.1,s=sigma,k=p)

prec, recall, threshold = precision_recall_curve(Y.T[:, idx_test].ravel(), score_KRNMF[:,idx_test].ravel())
pr_auc = auc(recall, prec)
roc_auc = roc_auc_score(Y.T[:, idx_test].ravel(), score_KRNMF[:, idx_test].ravel())

print("KernelRegression - NMF")
print("-----")
print("AUC-PR:", pr_auc)
print("-----")
print("AUC-ROC:", roc_auc)
print("-----")

Converged in iteration:  68 ObjDiff:  0.000591762135929888 Obj:  0.02162777672722441


ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.

In [450]:
score_KR = KernelRegression(matrix=Y.T,feature_matrix=X,idx_train=idx_train,idx_test=idx_test,l=0.1,s=sigma)

prec, recall, threshold = precision_recall_curve(Y.T[:, idx_test].ravel(), score_KR[:,idx_test].ravel())
pr_auc = auc(recall, prec)
roc_auc = roc_auc_score(Y.T[:, idx_test].ravel(), score_KR[:, idx_test].ravel())

print("KernelRegression")
print("-----")
print("AUC-PR:", pr_auc)
print("-----")
print("AUC-ROC:", roc_auc)
print("-----")

KernelRegression
-----
AUC-PR: 0.2019896150130018
-----
AUC-ROC: 0.5189873417721519
-----


In [448]:
score_KRNMF = Adaptive(matrix=Y.T,feature_matrix=X,idx_train=idx_train,idx_test=idx_test,l=100,s=sigma,k=5)

prec, recall, threshold = precision_recall_curve(GT_labels.T[:, idx_test].ravel(), score_KRNMF[:,idx_test].ravel())
pr_auc = auc(recall, prec)
roc_auc = roc_auc_score(GT_labels.T[:, idx_test].ravel(), score_KRNMF[:, idx_test].ravel())

print("KernelRegression - NMF")
print("-----")
print("AUC-PR:", pr_auc)
print("-----")
print("AUC-ROC:", roc_auc)
print("-----")

Converged in iteration:  81 ObjDiff:  0.0002291208670851582 Obj:  0.009704781477330306
KernelRegression - NMF
-----
AUC-PR: 0.25650690216523286
-----
AUC-ROC: 0.5165762507534659
-----


In [16]:
1/11 + (1 - 1/11) * (1 - 4e-4 - 4e-5)**100

0.9608588104413304

In [15]:
(1 - 4e-4 - 4e-5)**100

0.9569446914854635