In [1]:
import numpy as np
import pandas as pd
import collections
import gzip

from sklearn.decomposition import PCA
from scipy import stats

import matplotlib.pyplot as plt
#plt.switch_backend('agg')
import matplotlib
import sys
sys.path.append('../../')
from utils import mpl_stylesheet
mpl_stylesheet.banskt_presentation(fontfamily = 'latex-clearsans', fontsize = 18, colors = 'banskt', dpi = 72)

In [2]:
# Input files. NOTE(review): these are absolute paths on one machine; adjust before running elsewhere.
# Expression table parsed by read_gtex() (presumably quantile-normalized, per the "_qn" name -- confirm).
f_gx_qn= "/scratch/sbanerj/trans-eqtl/input/gtex/expression/gtex_protein_coding_normalized_esom.txt"
# GCT expression file parsed by read_gct() (presumably RPKM values, per the file name -- confirm).
f_rpkm = "/scratch/sbanerj/trans-eqtl/input/gtex/preprocess_gx/rpkms/esom_rpkm.gct"
# Gzipped VCF parsed by read_vcf() (file name indicates chromosome 5).
f_vcf = "/scratch/sbanerj/trans-eqtl/input/gtex/genotype/all_samples/GTEx_v6_imput_info04_HWEp1E6_PASS_dbSNP135_maf1_noindels_noambig_chr5.vcf.gz"

In [3]:
SNPINFO_FIELDS = [
    'chrom',
    'varid',
    'bp_pos',
    'ref_allele',
    'alt_allele',
    'maf',
]


class SnpInfo(collections.namedtuple('_SnpInfo', SNPINFO_FIELDS)):
    """Immutable record describing one SNP; fields are listed in SNPINFO_FIELDS.

    The empty ``__slots__`` keeps instances as light as a plain tuple.
    """
    __slots__ = ()


def read_gtex(filename): # returns N x G gene expression
    """Read a tab-separated expression table with one gene per row.

    The first line is a header whose first field is skipped; the remaining
    fields are the donor IDs. Every following line holds a gene name followed
    by one numeric value per donor.

    Returns
    -------
    expr : numpy array, donors x genes (the file layout transposed)
    donor_list : list of donor IDs taken from the header
    gene_list : list of gene names in file order
    """
    with open(filename) as handle:
        header = handle.readline()
        donor_list = header.strip().split("\t")[1:]
        rows = [line.strip().split("\t") for line in handle]

    gene_list = [fields[0].strip() for fields in rows]
    expr_list = [np.array([float(value) for value in fields[1:]]) for fields in rows]
    # Rows were genes in the file; transpose so that rows are donors.
    expr = np.transpose(np.array(expr_list))
    return expr, donor_list, gene_list

def read_vcf(filename, startsnp, endsnp):
    """Read dosages for a window of SNP records from a gzipped VCF.

    Parameters
    ----------
    filename : path to a gzip-compressed VCF
    startsnp, endsnp : 0-based half-open window over the data lines
        (lines after the ``#CHROM`` header); records with index
        ``startsnp <= i < endsnp`` are parsed.

    Returns
    -------
    dosage : numpy array, one row per parsed SNP and one column per sample
    snpinfo : list of SnpInfo, one per parsed SNP
    donor_ids : list of sample IDs from the ``#CHROM`` header line

    Raises
    ------
    ValueError : if no ``#CHROM`` header line was seen, or if a record's
        FORMAT column lacks ``DS`` or ``GT``.
    ZeroDivisionError : if every sample of a record has a missing dosage.

    For each record the ``DS`` value is used; where it is ".", a diploid
    ``GT`` call of three characters (e.g. "0|1") is summed instead; samples
    still missing are filled with twice the mean allele frequency.
    """
    dosage = list()
    snpinfo = list()
    donor_ids = None
    linenum = 0
    # Text mode with an explicit encoding replaces the per-line bytes.decode().
    with gzip.open(filename, 'rt', encoding='utf-8') as vcf:
        for line in vcf:
            linestrip = line.strip()
            if linestrip[:2] == '##':
                continue
            if linestrip[:6] == '#CHROM':
                donor_ids = linestrip.split("\t")[9:]
                continue
            if linenum >= endsnp:
                # Nothing past the window is needed; avoid decompressing the rest of the file.
                break
            if linenum >= startsnp:
                linesplit = linestrip.split("\t")
                chrom = int(linesplit[0])
                pos   = int(linesplit[1])
                varid = linesplit[2]
                ref   = linesplit[3]
                alt   = linesplit[4]

                fmt = linesplit[8].split(':')
                dsindx = fmt.index("DS")
                gtindx = fmt.index("GT")

                # ds holds one entry per sample: a number, or "." when still missing.
                ds = list()
                for sample in linesplit[9:]:
                    call = sample.split(':')
                    x = call[dsindx]
                    if x == ".":
                        gt = call[gtindx]
                        if len(gt) == 3 and gt[0] != "." and gt[2] != ".":
                            x = float(int(gt[0]) + int(gt[2]))
                    ds.append(x)

                ds_notna = [float(x) for x in ds if x != "."]
                freq = sum(ds_notna) / 2 / len(ds_notna)
                # NOTE(review): stored under "maf" but this is the mean dosage / 2,
                # i.e. it is not folded to min(freq, 1 - freq).
                maf = freq
                snpdosage = [float(x) if x != '.' else 2 * freq for x in ds]

                this_snp = SnpInfo(chrom      = chrom,
                                   bp_pos     = pos,
                                   varid      = varid,
                                   ref_allele = ref,
                                   alt_allele = alt,
                                   maf        = maf)

                dosage.append(snpdosage)
                snpinfo.append(this_snp)
            linenum += 1

    if donor_ids is None:
        raise ValueError("No #CHROM header line found in {:s}".format(str(filename)))

    return np.array(dosage), snpinfo, donor_ids


def normalize_expr(Y): # requires G x N input expression, returns G x N
    """Standardize every row of a 2-D array to zero mean and unit standard deviation.

    The standard deviation is numpy's default (population, ddof = 0).
    A row with zero variance yields non-finite values.
    """
    row_mean = np.mean(Y, axis = 1, keepdims = True)
    row_std = np.std(Y, axis = 1, keepdims = True)
    return (Y - row_mean) / row_std

def select_donors(vcf_donors, expr_donors):
    ''' Make sure that donors are in the same order for both expression and genotype

    Returns two integer index arrays (vcfmask, exprmask) of equal length.
    Entry k of each array points to the same donor in vcf_donors and
    expr_donors respectively; donors are listed in the order they appear in
    vcf_donors. For a repeated ID the first position is used.
    Donor IDs must be hashable.
    '''
    def first_position(donors):
        # Map each ID to its first index, matching list.index() semantics.
        lookup = dict()
        for i, donor in enumerate(donors):
            lookup.setdefault(donor, i)
        return lookup

    vcf_position = first_position(vcf_donors)
    expr_position = first_position(expr_donors)
    common_donors = [x for x in vcf_donors if x in expr_position]
    # Explicit integer dtype: an empty result must still be usable as an index array.
    vcfmask = np.array([vcf_position[x] for x in common_donors], dtype = int)
    exprmask = np.array([expr_position[x] for x in common_donors], dtype = int)
    return vcfmask, exprmask

def normalize_and_center_dosage(dosage):
    """Subtract each row's mean from a 2-D dosage array (one SNP per row).

    Despite the name, only centering is performed; no scaling is applied.
    """
    snp_mean = np.mean(dosage, axis = 1, keepdims = True)
    return dosage - snp_mean

def read_gct(gct_file, donor_ids, gene_names):
    """
    Load a GCT file as a DataFrame restricted to the requested donors and genes.

    The two leading GCT header lines are skipped, the first column becomes the
    index (named 'gene_id') and the 'Description' column is removed. Column
    labels are shortened to their first two '-'-separated parts before the
    columns are selected in the order of donor_ids and the rows in the order
    of gene_names.
    """
    def donor_of(sample_id):
        # Keep only the first two dash-separated tokens of the sample label.
        return '-'.join(sample_id.split('-')[:2])

    expression = (
        pd.read_csv(gct_file, sep='\t', skiprows=2, index_col=0)
        .drop(columns='Description')
        .rename_axis('gene_id')
        .rename(columns=donor_of)
    )
    return expression[donor_ids].loc[gene_names]

In [4]:
# Read the first three SNP records (data-line indices 0, 1, 2) of the VCF.
gtfull, snp_info, gt_donors = read_vcf(f_vcf, 0, 3)

In [5]:
# Load the expression table together with its donor IDs and gene names.
gx_qn, gx_donors, gene_names = read_gtex(f_gx_qn) # reads N x G gene expression

In [6]:
# Load the GCT expression for the same donors and genes, in the same order as gx_qn.
expression_df = read_gct(f_rpkm, gx_donors, gene_names) # returns G x N gene expression
# Transpose to N x G so that it lines up with gx_qn.
gx_rpkm = np.array(expression_df).T

In [7]:
# Sanity check: both expression matrices should have the same (N, G) shape.
print(gx_qn.shape)
print(gx_rpkm.shape)

(241, 15952)
(241, 15952)


In [8]:
# Number of donors (rows) and genes (columns) in the N x G expression matrix.
nsample, ngene = gx_qn.shape

In [9]:
# Standardize each gene across all nsample donors (G x N), then divide by sqrt(N)
# so that every gene's row has unit sum of squares.
gx_qn_nomask = normalize_expr(gx_qn.T) / np.sqrt(nsample)

In [10]:
# Dosage matrix dimensions: (number of SNPs read, number of samples in the VCF).
gtfull.shape

(3, 450)

In [11]:
# Restrict genotype and expression to the donors present in both, ordered as in the VCF header.
vcfmask, exprmask = select_donors(gt_donors, gx_donors)
# NOTE(review): gx_qn_nomask was standardized over all expression donors before this subsetting.
gx_qn_norm = gx_qn_nomask[:, exprmask] # G x N
gx_rpkm_sort = gx_rpkm[exprmask, :] # N x G
gt_sort = gtfull[:, vcfmask] # SNPs x N
# Row-center the dosages (no scaling is applied).
gt_cent = normalize_and_center_dosage(gt_sort)

# Earlier variant kept for reference (not executed):
# gt = normalize_and_center_dosage(gtfull[:, vcfmask], snp_info)
# sigmax2 = np.var(gt, axis = 1)
# print(sigmax2)

In [12]:
# Project the donors onto the leading 200 principal components of the expression matrix.
print("Original dimension: ", gx_rpkm_sort.shape)
pca = PCA(n_components=200).fit(gx_rpkm_sort) # requires N x G
gx_rpkm_pca = pca.transform(gx_rpkm_sort)
print("Reduced dimension: ", gx_rpkm_pca.shape)

def gene_distance(a, b):
    """Return np.linalg.norm(a - b), i.e. the Euclidean distance for 1-D vectors."""
    difference = a - b
    return np.linalg.norm(difference)

# Symmetric matrix of pairwise distances between donors in PCA space (zero diagonal).
distance_matrix = np.zeros((nsample, nsample))
for i in range(nsample - 1):
    for j in range(i + 1, nsample):
        # Compute each pair once and mirror it across the diagonal.
        distance_matrix[i, j] = distance_matrix[j, i] = gene_distance(gx_rpkm_pca[i,:], gx_rpkm_pca[j,:])

# KNN correction: from every donor subtract the mean of its `kneighbor` nearest
# donors (nearest in PCA space), for expression and for genotype alike.
kneighbor = 30
gx_knn = np.zeros_like(gx_rpkm_sort)
gt_knn = np.zeros_like(gt_sort)

for i in range(nsample):
    # Rank all donors by distance to donor i and drop donor i itself explicitly,
    # rather than assuming it always sorts first (ties at distance 0 could reorder it).
    ranked = np.argsort(distance_matrix[i, :])
    neighbors = ranked[ranked != i][:kneighbor]
    gx_knn[i, :] = gx_rpkm_sort[i, :] - np.mean(gx_rpkm_sort[neighbors, :], axis = 0)
    # Use the same neighbour set as for expression; slicing neighbors[1:] here
    # would drop the nearest neighbour and average over only kneighbor - 1 donors.
    gt_knn[:, i] = gt_sort[:, i] - np.mean(gt_sort[:, neighbors], axis = 1)

Original dimension:  (241, 15952)
Reduced dimension:  (241, 200)


In [13]:
# The KNN-corrected expression keeps the N x G layout.
gx_knn.shape

(241, 15952)

In [14]:
# Reference shape of the original expression matrix, for comparison with gx_knn.
gx_qn.shape

(241, 15952)

In [None]:
# Apply the same processing as for the uncorrected data: standardize each gene
# (G x N) and scale by 1 / sqrt(N); row-center the KNN-corrected dosages.
gx_knn_norm = normalize_expr(gx_knn.T) / np.sqrt(nsample)
gt_knn_cent = normalize_and_center_dosage(gt_knn)

In [None]:
def pvals_perm(GT, R, W):
    """Upper-tail p-value of R under a normal approximation N(muQ, sigmaQ^2).

    Parameters
    ----------
    GT : 2-D array; its second dimension N is the sample count and its
        second / fourth raw moments are obtained from moment_data().
    R : observed statistic (scalar or array).
    W : N x N weight matrix; only sums over its entries, diagonal and rows are used.

    Returns
    -------
    p : P(X >= R) for X ~ N(muQ, sigmaQ^2)
    muQ, sigmaQ : mean and standard deviation of the approximating normal

    The divisions by (N - 1), (N - 2) and (N - 3) require N > 3.
    NOTE(review): muQ / sigmaQ are presumably the permutation moments of the
    quadratic form GT W GT^T -- confirm against the derivation.
    """
    mu2, mu4 = moment_data(GT)
    N = GT.shape[1]
    q11 = np.sum(W)                 # sum of all entries
    q2  = np.sum(np.diag(W))        # trace
    muQ = mu2 * (N * q2 - q11) / (N - 1)

    v31 = - mu4 / (N - 1)
    v22 = v31 + (N * mu2 * mu2 / (N - 1)) #(N*(mu2**2) - mu4)/(N-1)
    v211 = - (v31 + v22) / (N - 2)
    v1111 = - 3 * v211 / (N - 3)

    q31 = np.dot(np.diag(W),np.sum(W,axis = 1))   # diagonal weighted by row sums
    q4 = np.sum(np.square(np.diag(W)))            # sum of squared diagonal entries
    q22 = np.sum(np.square(W))                    # sum of squared entries
    q211 = np.sum(np.square(np.sum(W,axis = 1)))  # sum of squared row sums

    sigma2 = v1111*(q11**2 - 2*q2*q11 - 4*q211 + 8*q31 + 2*q22 + q2**2 - 6*q4) + 2*v211*(q2*q11 + 2*q211 - 6*q31 - 2*q22 - q2**2 + 6*q4) + v22*(q2**2 + 2*q22 - 3*q4) + 4*v31*(q31 - q4) + mu4*q4

    # The expression above is the second moment; subtract the squared mean to get the variance.
    sigma2 = sigma2 - muQ**2
    sigmaQ = np.sqrt(sigma2)
    # Survival function instead of 1 - cdf: 1 - cdf rounds to exactly 0 far in
    # the upper tail, whereas sf keeps small p-values representable.
    p = stats.norm.sf(R, loc=muQ, scale=sigmaQ)
    return p, muQ, sigmaQ

def moment_data(GT):   #GT ixN
    """Return the mean of GT**2 and the mean of GT**4, taken over all entries of GT."""
    squared = GT * GT
    fourth = squared * squared
    return np.mean(squared), np.mean(fourth)

In [None]:
## Shuffle the genotype `niter` times (20000 by default) and get the Qscore of every permutation

def shuffle_and_plot(GX, GT, ax, label, sigmabeta2 = 0.2 * 0.2, niter = 20000, seed = None):
    """Compare the permutation distribution of the Q-score with its normal approximation.

    Parameters
    ----------
    GX : 2-D expression array; it is transposed before the SVD, so its second
        dimension must match the length of GT.
    GT : 1-D genotype vector for one SNP.
    ax : matplotlib axes that receives the histogram and the analytical curve.
    label : title of the panel and of the printed summary.
    sigmabeta2 : prior variance used in the shrinkage term sigmax2 / sigmabeta2.
    niter : number of random permutations of GT.
    seed : optional seed for a private random generator; with None the global
        numpy random state is used, as before.

    Returns
    -------
    Qiter : array of length niter with the Q-score of every permuted genotype.

    Prints a short summary; this needs at least four singular values.
    """
    sigmax2 = np.var(GT)

    Yt = GX.T
    U, S, _ = np.linalg.svd(Yt, full_matrices=False)
    S2 = np.square(S)
    S2mod = S2 + sigmax2 / sigmabeta2
    mrank = int(np.count_nonzero(S2 > 1))

    W = np.dot(U, np.dot(np.diag(S2 / S2mod), U.T)) / sigmax2
    Qscore = np.sum(np.square(np.dot(U.T, GT)) * S2 / S2mod) / sigmax2
    pval, muQ, sigmaQ = pvals_perm(GT.reshape(1, -1), Qscore, W)

    # Both shufflers permute the array in place.
    shuffle = np.random.shuffle if seed is None else np.random.default_rng(seed).shuffle
    Qiter = np.zeros(niter)
    permgt = GT.copy()
    for i in range(niter):
        shuffle(permgt)
        Qiter[i] = np.sum(np.square(np.dot(U.T, permgt)) * S2 / S2mod) / sigmax2

    print("====== {:s} ======".format(label))
    print("Effective rank: {:d}".format(mrank))
    print("Keff: {:g}".format(np.sum(S2 / S2mod)) )
    # Note: the values printed on the next line are the squared singular values (S2).
    print("First 4 singular values: {:g} {:g} {:g} {:g}".format(S2[0], S2[1], S2[2], S2[3]))
    print("Sx2 / Sb2: {:g}".format(sigmax2 / sigmabeta2))
    print("Qscore: {:g}".format(Qscore))
    print("muQ: {:g}".format(muQ))
    print("sigmaQ: {:g}".format(sigmaQ))

    # Histogram window: four analytical standard deviations around the analytical mean.
    xmax = muQ + 4.0 * sigmaQ
    xmin = muQ - 4.0 * sigmaQ
    bins = np.linspace(xmin, xmax, 50)
    xbin = (bins[:-1] + bins[1:]) / 2 # centers of the bins
    x = np.linspace(xbin[0], xbin[-1], 100)
    ax.hist(Qiter, bins = bins, density = True, alpha = 0.3, label = '{:d} iter'.format(niter))

    rv = stats.norm(loc = muQ, scale = sigmaQ)
    ax.plot(x, rv.pdf(x), label = 'analytical')

    ax.set_title(label)

    return Qiter
    
# Two panels side by side: uncorrected expression (left) and KNN-corrected (right), first SNP only.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (14, 6))
Qiter = shuffle_and_plot(gx_qn_norm, gt_cent[0, :], ax1, "Normalized gene expression")
Qiter_knn = shuffle_and_plot(gx_knn_norm, gt_knn_cent[0, :], ax2, "After KNN correction")
ax1.legend()
ax2.legend()
plt.tight_layout()
#plt.savefig('../plots/PCA_correction_random_gene_expression.png')
plt.show()