In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append('../../')
sys.path.append('/usr/users/fsimone/tejaas')
import numpy as np
import scipy.stats as ss
import random
import os
import pandas as pd
from iotools.readRPKM import ReadRPKM
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.decomposition import PCA

def knn_correction(expr, K, f=1):
    """Subtract from every sample the mean profile of its K nearest neighbours.

    Neighbours are found by Euclidean distance between samples after
    projecting ``expr`` with PCA.

    Parameters
    ----------
    expr : 2-D array-like, samples in rows and features in columns (N x G)
        Matrix to correct.
    K : int
        Number of nearest neighbours to average. If ``K`` exceeds the number
        of other samples, all other samples are used.
    f : float, optional
        Fraction of ``min(N, G)`` used as the number of principal components.

    Returns
    -------
    gx_knn : ndarray, same shape as ``expr``
        ``expr`` minus the neighbour mean of each sample. Floating-point
        input keeps its dtype; any other dtype is returned as float64 so the
        corrected values are not truncated.
    neighbour_comp : ndarray of float64, same shape as ``expr``
        The neighbour mean that was subtracted from each sample.

    Raises
    ------
    ValueError
        If ``K`` is smaller than 1 or fewer than two samples are given
        (both cases would otherwise produce all-NaN output).
    """
    expr = np.asarray(expr)
    nsample = expr.shape[0]
    if K < 1:
        raise ValueError("K must be at least 1, got {}".format(K))
    if nsample < 2:
        raise ValueError("knn_correction needs at least two samples")

    pca = PCA(n_components=int(f * min(expr.shape[0], expr.shape[1])))
    pca.fit(expr) # requires N x G
    expr_pca = pca.transform(expr)

    # Pairwise Euclidean distances in PCA space, one vectorised row at a time
    # (avoids both the O(N^2) Python loop and an N x N x C temporary).
    distance_matrix = np.empty((nsample, nsample))
    for i in range(nsample):
        distance_matrix[i, :] = np.linalg.norm(expr_pca - expr_pca[i, :], axis=1)

    # Exclude each sample from its own neighbourhood explicitly. Dropping the
    # first argsort entry is not reliable when another sample is at distance 0.
    np.fill_diagonal(distance_matrix, np.inf)
    kneighbor = min(K, nsample - 1)

    # zeros_like(expr) would inherit an integer dtype and truncate the result.
    out_dtype = expr.dtype if np.issubdtype(expr.dtype, np.inexact) else np.float64
    gx_knn = np.zeros(expr.shape, dtype=out_dtype)
    neighbour_comp = np.zeros(expr.shape)

    for i in range(nsample):
        # Stable sort makes tie-breaking deterministic (lowest index first).
        neighbors = np.argsort(distance_matrix[i, :], kind="stable")[:kneighbor]
        neighbor_mean = np.mean(expr[neighbors, :], axis=0)
        neighbour_comp[i, :] = neighbor_mean
        gx_knn[i, :] = expr[i, :] - neighbor_mean

    return gx_knn, neighbour_comp

In [2]:
import operator
import json
from utils import utils
# Input locations: GTEx v8 metadata (JSON) and the list of tissues.
json_file = "../../gtex_v8_metadata.json"
tissue_file = "../../plots/tissues.txt"

# The tissue list is read twice: once with plain=True and once with the default.
tshorts, tfulls_plain = utils.read_tissues(tissue_file, plain=True)
_, tfulls = utils.read_tissues(tissue_file)

with open(json_file) as instream:
    gtex_meta = json.load(instream)

# Short tissue code -> plain full name.
tissue_names = dict(zip(tshorts, tfulls_plain))

# Short tissue code -> colour ("#" + hex) and sample count, looked up in the
# metadata under the full name with spaces replaced by underscores.
tissue_colors = {}
tissue_samples = {}
for tshort, tfull in zip(tshorts, tfulls):
    tissue_meta = gtex_meta[tfull.replace(" ", "_")]
    tissue_colors[tshort] = "#" + tissue_meta["colorHex"]
    tissue_samples[tshort] = tissue_meta["rnaSeqAndGenotypeSampleCount"]

# Tissue codes ordered by increasing sample count.
sorted_tissues = sorted(tissue_samples, key=tissue_samples.get)
select_tissues = tshorts

In [17]:
# Neighbourhood size passed to knn_correction and embedded in the output file name.
K = 30

for tissue in tshorts:
    # Only tissues listed in select_tissues are processed.
    if tissue in select_tissues:
        print(tissue)
        gx_file = "/cbscratch/franco/trans-eqtl/new_preprocess_feb2020_freeze/gtex_v8/expression/tpms/{:s}_tpms_qcfilter.txt.protein_coding_lncRNA_filtered".format(tissue)
        gx_knn_file = "/cbscratch/franco/trans-eqtl/new_preprocess_feb2020_freeze/gtex_v8/expression/tpms/{:s}_knn{:d}.txt.protein_coding_lncRNA_filtered".format(tissue, K)

        rpkm = ReadRPKM(gx_file, "gtex")
        expression, expr_donors, gene_names = rpkm.expression, rpkm.donor_ids, rpkm.gene_names

        # The matrix is transposed on the way into knn_correction and transposed
        # back afterwards, so the written table has gene names as the row index
        # and donor ids as the column labels.
        gx_corr, nb_comp = knn_correction(expression.T, K)
        df = pd.DataFrame(gx_corr.T, index=gene_names, columns=expr_donors)
        df.index.name = "gene_id"
        df.to_csv(gx_knn_file, doublequote=False, sep="\t")

as
av
ag
aa
ac
at
bam
ban
bca
bceh
bce
bco
bfr
bhi
bhy
bnu
bpu
bsp
bsu
br
ebv
fib
cols
colt
esog
esom
esomu
haa
hlv
kc
liv
lu
msg
ms
nt
pan
pit
snse
sse
si
spl
sto
thy
wb
ov
pro
tes
ut
va


In [28]:
def normalize_expr(Y):
    """Standardise every row of ``Y`` to zero mean and unit variance.

    Parameters
    ----------
    Y : pandas.DataFrame or 2-D array-like
        Matrix whose rows are centred and scaled independently, using the
        population standard deviation (``np.std`` default, ddof=0).

    Returns
    -------
    pandas.DataFrame or ndarray
        Same container type as the input. A DataFrame result keeps the
        index (including its name) and the columns of ``Y``.

    Notes
    -----
    Rows with zero standard deviation (constant rows) are returned as all
    zeros instead of NaN: their centred values are already 0, so the scale
    is replaced by 1 to avoid a 0/0 division.
    """
    is_frame = isinstance(Y, pd.DataFrame)
    values = Y.values if is_frame else np.asarray(Y)

    row_mean = np.mean(values, axis=1, keepdims=True)
    row_std = np.std(values, axis=1, keepdims=True)
    # Constant rows: divide by 1 rather than 0 (NaN std is left untouched).
    row_std[row_std == 0] = 1

    Y_cent = (values - row_mean) / row_std
    if is_frame:
        # Passing Y.index carries the index name over as well.
        Y_cent = pd.DataFrame(Y_cent, index=Y.index, columns=Y.columns)
    return Y_cent

# K selects which "_knn{K}" tables are read and names the "_knn{K}_norm" outputs.
K = 30
for tissue in tshorts:
    print(tissue)
    gx_knn_file = "/cbscratch/franco/trans-eqtl/new_preprocess_feb2020_freeze/gtex_v8/expression/tpms/{:s}_knn{:d}.txt.protein_coding_lncRNA_filtered".format(tissue, K)
    gx_knn_norm_file = "/cbscratch/franco/trans-eqtl/new_preprocess_feb2020_freeze/gtex_v8/expression/tpms/{:s}_knn{:d}_norm.txt.protein_coding_lncRNA_filtered".format(tissue, K)
    # Read the tab-separated table (first column as index), standardise each
    # row with normalize_expr, and write the result next to the input.
    df = pd.read_csv(gx_knn_file, index_col=0, header=0, sep="\t")
    df_norm = normalize_expr(df)
    df_norm.to_csv(gx_knn_norm_file, doublequote=False, sep="\t")
    

as
av
ag
aa
ac
at
bam
ban
bca
bceh
bce
bco
bfr
bhi
bhy
bnu
bpu
bsp
bsu
br
ebv
fib
cols
colt
esog
esom
esomu
haa
hlv
kc
liv
lu
msg
ms
nt
pan
pit
snse
sse
si
spl
sto
thy
wb
ov
pro
tes
ut
va


In [2]:
import pandas as pd
import numpy as np
# Load one tab-separated expression table for inspection; the first column is the row index.
gx_file = "/cbscratch/franco/trans-eqtl/new_preprocess_feb2020_freeze/gtex_v8/expression/tpms/as_tpms_qcfilter.txt.protein_coding_lncRNA_filtered"
df = pd.read_csv(gx_file, index_col=0, header=0, sep="\t")

In [3]:
# Per-row mean of the loaded table.
df.values.mean(axis=1)

array([ 1.16754951e-16, -6.21992417e-17, -1.57456759e-16, ...,
        6.40336894e-16,  9.97480927e-17,  8.00564434e-16])

In [4]:
# Per-row variance of the loaded table.
df.values.var(axis=1)

array([1., 1., 1., ..., 1., 1., 1.])