In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as ss
from google.cloud import storage
from io import BytesIO

# Phenotype preparation and QC

In [None]:
# Load the phenotype table from the Google Cloud Storage bucket.
# Note: phenotype files can be generated by the infer, infer_hidden and explore
# recipes when they are run with the flag '--tsv_style genetics'.
client = storage.Client()
bucket = client.get_bucket('ml4h')  # `bucket` is reused by the later download cells
blob = storage.blob.Blob(name="pdiachil/genetics/phenotype_hrr.tsv", bucket=bucket)
content = blob.download_as_string()
phenotype = (
    pd.read_csv(BytesIO(content), sep='\t')  # the payload is tab-separated
    .dropna()  # discard every row that has a missing value in any column
)

In [None]:
# Display the phenotype table (rows with missing values were already dropped on load).
phenotype

In [None]:
# Phenotype QC figure with three panels: distribution of 'hrr', 'hrr' against
# 'rbint', and distribution of 'rbint'.
# NOTE(review): 'rbint' is presumably a rank-based inverse normal transform of
# 'hrr' — confirm against the recipe that produced the file.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is kept
# here so the figure stays unchanged. Consider histplot(..., kde=True) when upgrading.
f, ax = plt.subplots(1, 3)
f.set_size_inches(16, 5)

# Left panel: distribution of the raw phenotype.
sns.distplot(phenotype['hrr'], ax=ax[0])
ax[0].set_xlim([-5, 80])
ax[0].set_xlabel('hrr')

# Middle panel: raw phenotype against its transformed value.
ax[1].plot(phenotype['hrr'], phenotype['rbint'], 'o')
# Fix: the x-limit used to be applied to ax[0] a second time (copy-paste slip),
# leaving this panel with a different 'hrr' range than the left one.
ax[1].set_xlim([-5, 80])
ax[1].set_xlabel('hrr')
ax[1].set_ylabel('RBINT')

# Right panel: distribution of the transformed phenotype.
sns.distplot(phenotype['rbint'], ax=ax[2])
ax[2].set_xlabel('RBINT')
plt.tight_layout()

# Covariate tables

In [None]:
# Load the covariate table from the Google Cloud Storage bucket.
# Note: covariate files can be generated by the explore recipe when it is run
# with the flag '--tsv_style genetics'.
# NOTE(review): the object is named *.csv but is parsed as tab-separated — confirm.
blob = storage.blob.Blob(name="pdiachil/genetics/covariate_table_fixed.csv", bucket=bucket)
content = blob.download_as_string()
covariates = (
    pd.read_csv(BytesIO(content), sep='\t')
    .dropna()  # discard every row that has a missing value in any column
)

In [None]:
# Display the covariate table (rows with missing values were already dropped on load).
covariates

In [None]:
# Prepare the exclusion file: individuals that appear in the covariate table
# but have no row in the phenotype table.
cov_set = set(covariates['FID'])
pheno_set = set(phenotype['FID'])
# Sorted so the file is identical between runs (set iteration order is arbitrary).
remove_list = sorted(cov_set - pheno_set)
# The same identifier is written to both the FID and the IID column.
remove_df = pd.DataFrame({'FID': remove_list, 'IID': remove_list})
# Headerless, tab-separated, one individual per line.
remove_df.to_csv('remove_nopheno.tsv', header=False, index=False, sep='\t')

# Run GWAS
The files are now ready to be used by genetic association software such as PLINK and BOLT-LMM.

# Manhattan plot

In [None]:
# Result folders in the bucket and, in the same order, the label of each GWAS.
gwases = ['bolt_results_hrr']
labels = ['HRR']
gwas_dic = {}
for gwas, label in zip(gwases, labels):
    blob = storage.blob.Blob(name=f"pdiachil/genetics/{gwas}/bolt_output", bucket=bucket)
    content = blob.download_as_string()
    # index_col=False keeps the first column as data instead of using it as the index.
    df = pd.read_csv(BytesIO(content), sep='\t', index_col=False)
    # Keep only the first row of every SNP identifier.
    df = df.drop_duplicates(subset='SNP')
    gwas_dic[gwas] = {'df': df, 'label': label}

In [None]:
def plot_gwas(df_bolt, fig, ax, significance=7.3, ylim=(1, 20)):
    """Draw a Manhattan plot of association results on ``ax``.

    Parameters
    ----------
    df_bolt : pandas.DataFrame
        Association table with the columns 'CHR', 'BP' and 'P_BOLT_LMM'.
        The frame is not modified; a sorted copy is used internally.
    fig : object
        Unused. Kept so existing calls ``plot_gwas(df, fig, ax)`` keep working.
    ax : matplotlib.axes.Axes
        Axes that receive the scatter points, threshold line, ticks and labels.
    significance : float, optional
        Height, in -log10(p) units, of the horizontal threshold line.
        The default 7.3 is approximately -log10(5e-8).
    ylim : sequence of two floats, optional
        Lower and upper limit of the y axis. Points above the upper limit are
        not visible.

    Returns
    -------
    None
    """
    # sort_values returns a new frame, so the columns added below never touch df_bolt.
    df = df_bolt.sort_values(by=['CHR', 'BP'])
    df['LOGP'] = -np.log10(df['P_BOLT_LMM'])
    df['CHR'] = df['CHR'].astype('category')
    # Running position along the genome: x coordinate of every variant.
    df['ind'] = np.arange(len(df))

    colors = ['black', 'silver']
    x_labels = []
    x_labels_pos = []

    # observed=True iterates only chromosomes present in the data and avoids
    # the pandas FutureWarning about the default for categorical groupers.
    for num, (name, group) in enumerate(df.groupby('CHR', observed=True)):
        # Alternate the two colors so neighbouring chromosomes are distinguishable.
        ax.scatter(group['ind'], group['LOGP'], color=colors[num % len(colors)], s=5)
        first, last = group['ind'].iloc[0], group['ind'].iloc[-1]
        x_labels.append(name)
        # Tick at the midpoint of the chromosome's span.
        x_labels_pos.append(last - (last - first) / 2)

    # axhline does not depend on the loop variable, so an empty table no longer
    # raises NameError here.
    ax.axhline(significance)

    ax.set_xticks(x_labels_pos)
    ax.set_xticklabels(x_labels)
    ax.set_xlim([0, len(df)])
    ax.set_ylim(ylim)
    ax.set_xlabel('Chromosome')
    # Raw string: '\l' is not a valid escape sequence in a normal string literal.
    ax.set_ylabel(r'$-\log_{10}(p-value)$')

In [None]:
# Display the de-duplicated association table stored for the 'bolt_results_hrr' GWAS.
gwas_dic['bolt_results_hrr']['df']

In [None]:
# Draw one Manhattan plot per loaded GWAS and save it as a PNG next to the notebook.
for gwas, gwas_entry in gwas_dic.items():
    f, ax = plt.subplots(figsize=(22, 6))
    plot_gwas(gwas_entry['df'], f, ax)
    f.savefig(f'manhattan_{gwas}.png', dpi=300)