### <p style="text-align: right;"> &#9989; **Context Matters** </p>
#### <p style="text-align: right;"> &#9989; Christina, Hao, Yunfei, Erik</p>

# Module 3

## How does gene transcription relate to phenotype?

We will work with maize.

### Preliminaries

First, import the usual libraries
- `math`: basic math operations
- `os`: enable file manipulation with the OS
- `sys`: enable interaction with commandline
- `importlib` : reload libraries if necessary
- `argparse` : parse arguments passed on the command line
- `glob`: find files whose names match a wildcard pattern
- `matplotlib.pyplot`: default plotter (I personally like ggplot waaaaay better. E)
    - `inline`: so that plots are shown in the notebook
- `numpy`: all number crunching done here
- `pandas`: data wrangling

In [1]:
import math
import importlib
import numpy as np
import os
import sys
import argparse
import glob
from matplotlib import pyplot as plt
import matplotlib.cm as cm
%matplotlib inline
from mpl_toolkits.mplot3d import Axes3D
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)

Specify the gene expression raw data file and a directory to save plots and CSVs.

The eQTL file was provided by the (Christina, Fabio, Nolan, Scott) group in Module 1. Supposedly, it is the result of an eQTL analysis performed for every gene against every SNP.

In [2]:
# Root of the project data. Every location below is derived from this single
# directory (with `~` expanded once) instead of mixing `~` with a hard-coded
# home directory, so the notebook can be pointed elsewhere by editing one line.
base_dir = os.path.expanduser('~/documents/css893')

# path to FPKM data
gene_file = os.path.join(base_dir, '942_FPKM_B73_genes_w_feature.txt')

# path to eQTL data
eqtl_file = os.path.join(base_dir, 'Merged_eQTL_Data.tsv')

# path to correlation matrix directory
# (the trailing separator is kept on purpose: file names are appended with `+`)
src = os.path.join(base_dir, 'tpm_corr', '')

# path to save results
# (the trailing separator is kept on purpose: file names are appended with `+`)
dst = os.path.join(base_dir, 'context_matters', 'results', '')

Load the original FPKM data. This time we only care about the metadata. All the contigs are treated as _chromosome 11_.

In [3]:
def _chromosome_number(label):
    """Translate one raw chromosome label into its numeric code.

    Text labels starting with 'B' (the contigs) are all lumped into
    chromosome 11; any other text label is parsed as an integer. Values that
    are not text are returned untouched.
    """
    if not isinstance(label, str):
        return label
    if label.startswith('B'):
        return 11
    return int(label)


fpkm = pd.read_table(gene_file)
metadata_cols = 5

# Take an explicit copy: the metadata frame is modified below, and writing
# into a bare slice of `fpkm` triggers pandas' SettingWithCopy ambiguity.
gene_info = fpkm.iloc[:, :metadata_cols].copy()
foo = gene_info['chromosome'].unique()

# Convert every distinct label exactly once, then recode the whole column in
# a single pass instead of building one boolean mask per label.
chromosome_codes = {label: _chromosome_number(label) for label in foo}
gene_info['chromosome'] = gene_info['chromosome'].map(chromosome_codes)

  interactivity=interactivity, compiler=compiler, result=result)


Load the correlation matrix. The correlation method is _pearson_ and we only consider those genes whose TPM variance was above 10e5.

In [12]:
# Which correlation matrix to load: method and cutoff are both encoded in
# the CSV file name.
corr_meth = 'pearson'
cutoff = 100
corr_csv = '%s_gene_correlation_%s.csv' % (corr_meth, cutoff)

# `src` already ends with a path separator, so the name is simply appended.
corr_path = src + corr_csv
corr = pd.read_csv(corr_path)
corr

Unnamed: 0,7,8,19,20,27,30,33,38,40,49,...,44212,44218,44219,44221,44227,44228,44245,44250,44254,44256
0,1.000000,0.603307,0.617355,0.067536,-0.152953,-0.580836,-0.638142,0.579787,0.688214,-0.597067,...,-0.495718,0.561550,-0.420209,0.198508,0.301550,0.473162,-0.283920,-0.468367,0.686651,-0.191131
1,0.603307,1.000000,0.737379,0.083168,0.016714,-0.508527,-0.633081,0.682938,0.585571,-0.633340,...,-0.499109,0.627078,-0.332796,0.378169,0.279490,0.657190,-0.368799,-0.403331,0.742548,-0.133403
2,0.617355,0.737379,1.000000,0.149760,-0.043953,-0.636658,-0.691886,0.812817,0.686925,-0.688775,...,-0.535990,0.765954,-0.308007,0.382220,0.363331,0.769484,-0.389583,-0.468688,0.803792,-0.062469
3,0.067536,0.083168,0.149760,1.000000,-0.026165,-0.048954,-0.126808,0.224780,0.059721,-0.075750,...,-0.085676,0.127682,0.022826,0.020554,0.131990,0.037556,-0.132367,0.026499,0.052534,0.197024
4,-0.152953,0.016714,-0.043953,-0.026165,1.000000,0.191237,0.105725,-0.126072,-0.180447,0.100474,...,-0.023838,-0.152780,0.287822,0.201723,-0.134249,0.075790,-0.063227,0.052689,-0.030109,0.114066
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7354,0.473162,0.657190,0.769484,0.037556,0.075790,-0.516658,-0.582472,0.677102,0.580988,-0.595140,...,-0.461170,0.664644,-0.245511,0.377184,0.226978,1.000000,-0.322122,-0.396342,0.678366,-0.111052
7355,-0.283920,-0.368799,-0.389583,-0.132367,-0.063227,0.286439,0.371946,-0.387424,-0.297418,0.385382,...,0.310319,-0.332960,0.091233,-0.218509,-0.210024,-0.322122,1.000000,0.280048,-0.368306,-0.032168
7356,-0.468367,-0.403331,-0.468688,0.026499,0.052689,0.543811,0.491412,-0.438147,-0.429403,0.433600,...,0.429828,-0.438580,0.213846,-0.236870,-0.191679,-0.396342,0.280048,1.000000,-0.461572,0.124425
7357,0.686651,0.742548,0.803792,0.052534,-0.030109,-0.544782,-0.652504,0.729326,0.677659,-0.694226,...,-0.526258,0.657663,-0.404594,0.319675,0.346116,0.678366,-0.368306,-0.461572,1.000000,-0.160224


Retrieve the metadata corresponding to the highly varying genes.

In [14]:
# The correlation matrix's column labels are used as row positions into
# `gene_info`. `pd.read_csv` returns header labels as strings, while `.iloc`
# only accepts integer positions, so convert them explicitly here rather than
# relying on a conversion done in some other cell.
gene_rows = pd.Index(corr.columns).astype(int)

meta = gene_info.iloc[gene_rows]
foo = meta.chromosome.unique()
print(len(foo), meta.shape)
meta

11 (7359, 5)


Unnamed: 0,gene,chromosome,feature_type,position_left,position_right
7,Zm00001d033979,1,gene,279975978,279978430
8,Zm00001d018479,5,gene,221521585,221525322
19,Zm00001d016606,5,gene,169941759,169949748
20,Zm00001d047989,9,gene,147322756,147334539
27,Zm00001d009640,8,gene,74045694,74049815
...,...,...,...,...,...
44228,Zm00001d020669,7,gene,127426893,127431481
44245,Zm00001d006882,2,gene,218478424,218478715
44250,Zm00001d002347,2,gene,10657876,10658439
44254,Zm00001d028653,1,gene,41833737,41838204


In [9]:
# The eQTL results are stored as a tab-separated table.
eqtl = pd.read_csv(eqtl_file, sep='\t')
eqtl

Unnamed: 0,R_Gene_Chr,R_Gene_Start,R_Gene_Stop,R_Gene_ID,R_Gene_Class,SNP_Chr,SNP_Start,SNP_Stop,SNP_ID,Associated_Gene_ID,Statistic,PVal,FDR,Beta
0,chr1,44288,49837,Zm00001d027230,gene,chr1,44306,44307,rs1_44306,Zm00001d047861,-15.244080,4.703940e-47,8.784256e-43,-0.012608
1,chr1,44288,49837,Zm00001d027230,gene,chr1,44306,44307,rs1_44306,Zm00001d004808,-10.441780,3.181453e-24,2.339655e-20,-3.733139
2,chr1,44288,49837,Zm00001d027230,gene,chr1,44306,44307,rs1_44306,Zm00001d031538,-10.165090,4.220149e-23,2.880657e-19,-12.332500
3,chr1,44288,49837,Zm00001d027230,gene,chr1,44306,44307,rs1_44306,Zm00001d008253,-10.165090,4.220149e-23,2.880657e-19,-13.845900
4,chr1,44288,49837,Zm00001d027230,gene,chr1,44306,44307,rs1_44306,Zm00001d032862,-10.165090,4.220149e-23,2.880657e-19,-0.000578
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17196037,chr9,159667997,159691083,Zm00001d048577,gene,chr9,159690349,159690350,rs9_159690349,Zm00001d001752,-7.556017,9.846873e-14,3.089405e-10,-0.006062
17196038,chr9,159667997,159691083,Zm00001d048577,gene,chr9,159690349,159690350,rs9_159690349,Zm00001d001652,-7.371548,3.694633e-13,1.089526e-09,-0.042778
17196039,chr9,159667997,159691083,Zm00001d048577,gene,chr9,159690349,159690350,rs9_159690349,Zm00001d045638,-7.322923,5.210988e-13,1.501189e-09,-6.382181
17196040,chr9,159667997,159691083,Zm00001d048577,gene,chr9,159690349,159690350,rs9_159690349,Zm00001d031869,-7.303307,5.983143e-13,1.712528e-09,-0.196164


### Making subsets of the eQTL data

We subsample the eQTL matrix above. 

+ Consider only the eQTLs where the SNP is associated with one of the highly varying genes. 

In [15]:
# Keep the eQTL rows whose associated gene is one of the genes in `meta`.
#
# This is done with a single lookup over the eQTL table instead of one full
# scan per gene followed by `DataFrame.append` (quadratic, and `append` was
# removed in pandas 2.0).

# Genes of interest, in the order in which they appear in `meta`.
wanted_genes = pd.Index(meta['gene'].dropna().unique())

# For every eQTL row: position of its associated gene within `wanted_genes`,
# or -1 when that gene is not one of them.
gene_rank = wanted_genes.get_indexer(eqtl['Associated_Gene_ID'])
hit_rows = np.flatnonzero(gene_rank >= 0)

# A stable sort by gene position groups the rows gene by gene in `meta` order
# while keeping the original eQTL order within each gene, which is the row
# order the gene-by-gene loop produced. The original row labels are preserved.
meta_order = np.argsort(gene_rank[hit_rows], kind='stable')
related = eqtl.iloc[hit_rows[meta_order]]

related

Unnamed: 0,R_Gene_Chr,R_Gene_Start,R_Gene_Stop,R_Gene_ID,R_Gene_Class,SNP_Chr,SNP_Start,SNP_Stop,SNP_ID,Associated_Gene_ID,Statistic,PVal,FDR,Beta
46166,chr1,3095307,3095604,Zm00001d027332,gene,chr1,3095600,3095601,rs1_3095600,Zm00001d032825,-6.583017,7.653515e-11,1.648439e-07,-618.7353
89558,chr1,4494974,4499451,Zm00001d027403,gene,chr1,4495369,4495370,rs1_4495369,Zm00001d032825,-6.583017,7.653515e-11,1.648439e-07,-618.7353
95346,chr1,4891940,4894990,Zm00001d027419,gene,chr1,4892252,4892253,rs1_4892252,Zm00001d032825,-6.583017,7.653515e-11,1.648439e-07,-618.7353
122615,chr1,5819751,5825029,Zm00001d027462,gene,chr1,5824650,5824651,rs1_5824650,Zm00001d032825,-6.583017,7.653515e-11,1.648439e-07,-618.7353
142221,chr1,6407152,6409815,Zm00001d027488,gene,chr1,6409375,6409376,rs1_6409375,Zm00001d032825,-6.583017,7.653515e-11,1.648439e-07,-618.7353
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13559234,chr7,112328679,112337673,Zm00001d020414,gene,chr7,112331952,112331953,rs7_112331952,Zm00001d002364,-8.035461,2.782536e-15,1.027720e-11,-141.7520
13788870,chr7,145562442,145567534,Zm00001d021196,gene,chr7,145562868,145562869,rs7_145562868,Zm00001d002364,-8.035461,2.782536e-15,1.027720e-11,-141.7520
14746701,chr8,23386819,23390680,Zm00001d008871,gene,chr8,23387177,23387178,rs8_23387177,Zm00001d002364,-8.035746,2.776497e-15,1.025549e-11,-141.7636
15677638,chr8,169573762,169577154,Zm00001d012177,gene,chr8,169575896,169575897,rs8_169575896,Zm00001d002364,-8.035461,2.782536e-15,1.027720e-11,-141.7520


In [16]:
# Report how many of the selected genes show up as an associated gene.
n_genes_hit = len(related['Associated_Gene_ID'].unique())
n_genes_total = len(meta['gene'].unique())
print(n_genes_hit, 'genes out of', n_genes_total, 'contain associated SNPs')

# Save the subset without the three dropped columns; the row labels of the
# original eQTL table are written out under 'Original_Index'.
related_csv = dst + 'eqtl_related_{}.csv'.format(cutoff)
related_slim = related.drop(columns=['SNP_Chr', 'SNP_Stop', 'R_Gene_Class'])
related_slim.to_csv(related_csv, index=True, index_label='Original_Index')

1642 genes out of 7359 contain associated SNPs


In [42]:
# Rows that appear in both `located` and `related`.
# NOTE(review): `located` is not defined in any cell visible here -- confirm
# which earlier cell builds it before running this on a fresh kernel.
#
# The column-wise inner join aligns the two frames on their row index, so only
# index labels present in both frames survive.
locrel = pd.concat([located, related], axis=1, join='inner')
# Keep only the leading block of columns, i.e. the ones that came from
# `located`; the columns contributed by `related` are discarded.
locrel = locrel.iloc[:, :located.shape[1]]
locrel

Unnamed: 0,R_Gene_Chr,R_Gene_Start,R_Gene_Stop,R_Gene_ID,R_Gene_Class,SNP_Chr,SNP_Start,SNP_Stop,SNP_ID,Associated_Gene_ID,Statistic,PVal,FDR,Beta
10417261,chr5,69975364,69978349,Zm00001d014957,gene,chr5,69975582,69975583,rs5_69975582,Zm00001d040202,-7.155506,1.677400e-12,4.564204e-09,-580.0772
10417335,chr5,69975364,69978349,Zm00001d014957,gene,chr5,69975588,69975589,rs5_69975588,Zm00001d040202,-7.563082,9.355389e-14,2.941734e-10,-611.2713
10417910,chr5,69975364,69978349,Zm00001d014957,gene,chr5,69978178,69978179,rs5_69978178,Zm00001d040202,-7.563082,9.355389e-14,2.941734e-10,-611.2713
12014337,chr6,92808874,92811320,Zm00001d036567,gene,chr6,92811304,92811305,rs6_92811304,Zm00001d001139,-8.917276,2.446043e-18,1.199951e-14,-1262.5240
14255497,chr7,173953536,173954457,Zm00001d022264,gene,chr7,173954253,173954254,rs7_173954253,Zm00001d044495,-8.451948,1.081581e-16,4.604894e-13,-823.7607
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
616952,chr1,41833737,41838204,Zm00001d028653,gene,chr1,41836775,41836776,rs1_41836775,zma-MIR482,-8.789329,7.050030e-18,3.306002e-14,-288.7677
616999,chr1,41833737,41838204,Zm00001d028653,gene,chr1,41836824,41836825,rs1_41836824,zma-MIR482,-8.789329,7.050030e-18,3.306002e-14,-288.7677
617046,chr1,41833737,41838204,Zm00001d028653,gene,chr1,41836963,41836964,rs1_41836963,zma-MIR482,-8.789329,7.050030e-18,3.306002e-14,-288.7677
617075,chr1,41833737,41838204,Zm00001d028653,gene,chr1,41837097,41837098,rs1_41837097,zma-MIR482,-8.439968,1.189764e-16,5.001608e-13,-268.1414


In [48]:
# Report how many of the selected genes show up as an associated gene.
n_genes_hit = len(locrel['Associated_Gene_ID'].unique())
n_genes_total = len(meta['gene'].unique())
print(n_genes_hit, 'genes out of', n_genes_total, 'contain associated SNPs')

# Save the subset without the three dropped columns; the row labels of the
# original eQTL table are written out under 'Original_Index'.
locrel_csv = dst + 'eqtl_locrel_{}.csv'.format(cutoff)
locrel_slim = locrel.drop(columns=['SNP_Chr', 'SNP_Stop', 'R_Gene_Class'])
locrel_slim.to_csv(locrel_csv, index=True, index_label='Original_Index')

92 genes out of 648 contain associated SNPs
