In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import collections

# Input file locations, written as relative paths.
# Space-separated table read into predicted_df
# (presumably PrediXcan output, judging by the file name -- TODO confirm).
predicted_filepath = "C21orf56_predixcan.txt"
# Tab-separated matrix read into observed_norm_df (gene IDs in the first column).
observed_norm_filepath = "data/input/gtex_wholeblood_normalized_expression_correlated_genes.txt"
# Tab-separated phenotype table read into pheno_df; it carries an 'FID' column
# and is indexed by its second column.
pheno_filepath = "data/input/gtex_wholeblood_normalized_correlated_genes.pheno"

In [2]:
# Read the space-separated prediction table, flip it so the file's columns
# become rows, and label the resulting row index 'gene_id'.
predicted_df = (
    pd.read_csv(predicted_filepath, sep=' ', index_col=0)
    .transpose()
    .rename_axis('gene_id')
)

In [3]:
# Read the tab-separated observed matrix (first column is the row index)
# and label the column axis 'IID'.
observed_norm_df = (
    pd.read_csv(observed_norm_filepath, sep='\t', index_col=0)
    .rename_axis('IID', axis=1)
)

In [4]:
# Read the tab-separated phenotype table indexed by its second column,
# discard the 'FID' column, then flip it so the remaining columns become rows.
pheno_df = (
    pd.read_csv(pheno_filepath, sep='\t', index_col=1)
    .drop('FID', axis=1)
    .transpose()
)
# Label the row index to match the other frames.
pheno_df.index.name = 'gene_id'

In [5]:
# Preview the observed matrix: rows are indexed by gene_id, columns by IID.
observed_norm_df

IID,GTEX-111YS,GTEX-1122O,GTEX-1128S,GTEX-113IC,GTEX-113JC,GTEX-117XS,GTEX-117YW,GTEX-1192W,GTEX-11DXW,GTEX-11DXX,...,GTEX-ZVE2,GTEX-ZVP2,GTEX-ZVT2,GTEX-ZVT3,GTEX-ZVT4,GTEX-ZVTK,GTEX-ZVZP,GTEX-ZVZQ,GTEX-ZXES,GTEX-ZXG5
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000160284.10,-0.690825,-0.129761,-1.185074,-1.009605,-0.070303,2.040526,1.563249,1.200131,1.588849,-0.471662,...,0.31184,-1.407457,-0.949861,0.788031,0.304087,-0.463414,-0.719267,2.176759,0.099988,0.025883
ENSG00000182362.9,0.538879,-0.167144,-2.371853,-1.296828,-0.265583,2.176759,-0.455197,-0.366671,0.18216,-0.556058,...,0.839516,1.23109,0.152166,-0.829044,-1.127368,0.402501,-1.492023,0.521857,-0.926893,-0.882362


In [6]:
# Preview the transposed phenotype table: rows are indexed by gene_id, columns by IID.
pheno_df

IID,GTEX-111YS,GTEX-1122O,GTEX-1128S,GTEX-113IC,GTEX-113JC,GTEX-117XS,GTEX-117YW,GTEX-1192W,GTEX-11DXW,GTEX-11DXX,...,GTEX-ZVE2,GTEX-ZVP2,GTEX-ZVT2,GTEX-ZVT3,GTEX-ZVT4,GTEX-ZVTK,GTEX-ZVZP,GTEX-ZVZQ,GTEX-ZXES,GTEX-ZXG5
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000160284.10,0.59533,0.956132,0.370437,0.490436,1.00638,4.09246,3.49618,2.65277,3.53847,0.74077,...,1.33426,0.27803,0.499423,1.89971,1.33232,0.741246,0.585057,4.65565,1.13407,1.07936
ENSG00000182362.9,3.48878,2.44796,0.796999,1.317,2.28405,8.71436,2.05728,2.21886,2.86735,2.022,...,4.0011,4.73724,2.84224,1.75813,1.47373,3.20217,1.22408,3.45498,1.68445,1.71742


In [7]:
# Individuals (column labels) present in both the observed and the predicted
# matrices, kept in observed_norm_df column order.
# The predicted labels are collected into a set once so each membership test
# is O(1); the previous form rebuilt and linearly scanned
# predicted_df.columns.tolist() for every candidate ID.
predicted_ids = set(predicted_df.columns.tolist())
common_ids = [iid for iid in observed_norm_df.columns.tolist() if iid in predicted_ids]

# Restrict both frames to the shared individuals so their columns line up.
predicted_common_df = predicted_df[common_ids]
# NOTE(review): common_ids is derived from observed_norm_df, but the observed
# values are taken from pheno_df -- confirm this is intentional. Any ID that is
# missing from pheno_df will raise a KeyError here.
observed_common_df = pheno_df[common_ids]

In [8]:
# Regress predicted expression (Y) on observed expression (X) for a single
# gene across the shared individuals, then report R-squared and the p-value.
gene = 'ENSG00000160284.10'

# .as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .to_numpy() yields the same underlying NumPy array.
Y = predicted_common_df.loc[gene].to_numpy()
X = observed_common_df.loc[gene].to_numpy()

slope, intercept, r_value, p_value, std_err = stats.linregress(X, Y)

# linregress returns the correlation coefficient r; square it for R-squared.
rsq = r_value * r_value
print("R-squared: {:g}".format(rsq))
print("p-value: {:g}".format(p_value))

R-squared: 0.189901
p-value: 4.24896e-17


In [None]:
# NOTE(review): this cell was never executed (In [None]) and repeats the
# earlier pheno_df preview -- consider removing it.
pheno_df