### Preliminaries

In [1]:
import glob
import os
import numpy as np
import pandas as pd
from IPython import embed
import functools

from data.gwas_details import GWAS_dict
# Directory holding the original GWAS files; the keys of GWAS_dict are joined
# onto this path when the files are read in the reduction step.
GWAS_PATH = "data/other_gwas/original_files"

In [2]:
# Path templates, filled in with str.format(prefix=..., phenotype=...).
# Reduced per-GWAS table (read back by the SNP-filtering step).
gwas_harmonized_pattern = "data/other_gwas/preprocessed_files/{prefix}__{phenotype}.tsv"
# Same table restricted to the selected SNPs.
gwas_selected_snps_pattern = "data/other_gwas/preprocessed_files/{prefix}__{phenotype}__selected_snps.tsv"
# Input CSV from which the best SNP per region is chosen (needs at least the
# "region", "P" and "SNP" columns).
COMA_GWAS_SUMMARY = "results/gwas_loci_summary_across_runs.csv"
# Output CSV with -log10(p) for the selected SNPs, one column per GWAS.
LOGP_PATH = "results/log10p_for_selected_snps_across_gwas.csv"

### GWAS files - reduction to the CHR, BP, SNP and P columns

In [None]:
# Reduce every original GWAS file to the four columns used downstream
# (CHR, BP, SNP, P) and store the result as a tab-separated file.
for file, info in GWAS_dict.items():
    prefix = info["prefix"]
    phenotype = info["phenotype"].replace(" ", "_")

    # Write to the shared harmonized-file location rather than a bare filename
    # in the working directory: the SNP-filtering step reads its input from
    # gwas_harmonized_pattern, so the two must agree.
    o_filename = gwas_harmonized_pattern.format(prefix=prefix, phenotype=phenotype)
    os.makedirs(os.path.dirname(o_filename), exist_ok=True)

    # Keep the loop variable intact; use a separate name for the full path.
    in_path = os.path.join(GWAS_PATH, file)

    # info["columns"] is used as a rename mapping; presumably it maps this
    # file's native column names onto CHR/BP/SNP/P (defined in GWAS_dict).
    df = pd.read_csv(in_path, sep="\t").rename(info["columns"], axis=1)
    df = df[["CHR", "BP", "SNP", "P"]]
    df.to_csv(o_filename, sep="\t", header=True, index=False)

### Filter GWAS files for specific SNPs

#### Find significant SNPs across COMA runs.

In [None]:
# Load the loci summary and keep, for every region, the row(s) whose p-value
# equals that region's minimum (ties within a region are all kept).
gwas_loci_summary_across_runs_df = pd.read_csv(COMA_GWAS_SUMMARY)

# Use the string alias "min" instead of the Python builtin: passing the
# builtin to transform() is deprecated in recent pandas (FutureWarning), and
# the alias is the spelling pandas recommends for the same result.
idx = (
    gwas_loci_summary_across_runs_df.groupby("region")["P"].transform("min")
    == gwas_loci_summary_across_runs_df["P"]
)

best_association_per_region = gwas_loci_summary_across_runs_df[idx].sort_values("region")
# Set of SNP identifiers used to filter the other GWAS files.
best_snps = set(best_association_per_region.SNP)

#### Filter GWAS files for selected SNPs

In [None]:
# For each GWAS, keep only the rows whose SNP is in best_snps and write them
# to the corresponding "__selected_snps" file.
for file, info in GWAS_dict.items():

    prefix = info["prefix"]
    phenotype = info["phenotype"].replace(" ", "_")

    o_filename = gwas_harmonized_pattern.format(prefix=prefix, phenotype=phenotype)
    ofile_selected = gwas_selected_snps_pattern.format(prefix=prefix, phenotype=phenotype)

    df = pd.read_csv(o_filename, sep="\t")
    # Vectorized membership test: much faster than a row-wise apply on
    # GWAS-sized tables, and it yields a boolean Series even when df is empty.
    keep = df["SNP"].isin(best_snps)

    # Progress output: path of the file about to be written.
    print(ofile_selected)
    df[keep].to_csv(ofile_selected, sep="\t", index=False, header=True)

In [None]:
# Collect the p-values of the selected SNPs from every GWAS into one table
# (rows: SNPs, columns: "<prefix>__<phenotype>") and save it as -log10(p).
pp = []

for file, info in GWAS_dict.items():

    prefix = info["prefix"]
    phenotype = info["phenotype"].replace(" ", "_")

    ofile_selected_snps = gwas_selected_snps_pattern.format(prefix=prefix, phenotype=phenotype)

    # Read the file belonging to THIS GWAS. The previous code read
    # `ofile_selected`, a variable left over from the filtering cell, so every
    # column ended up holding the p-values of the same (last) GWAS.
    df = pd.read_csv(ofile_selected_snps, sep="\t")
    df = df[["SNP", "P"]]
    # Name the p-value column after the GWAS so columns stay distinct on merge.
    df = df.rename({"P": f"{prefix}__{phenotype}"}, axis=1)

    pp.append(df)


pp = [dd.set_index("SNP") for dd in pp]
# pd.merge defaults to an inner join, so only SNPs present in every GWAS
# remain in the combined table.
snps_across_gwas = functools.reduce(lambda df1, df2: pd.merge(df1, df2, on="SNP"), pp)

# TODO: find out what is happening with this SNP; it is excluded by hand.
# errors="ignore" keeps the exclusion from raising KeyError when the SNP has
# already been removed by the inner join above.
snps_across_gwas = snps_across_gwas.drop("rs533885", errors="ignore")
log10p_gwas_df = (-np.log10(snps_across_gwas))
# index=True writes the SNP index as the first column of the CSV.
log10p_gwas_df.to_csv(LOGP_PATH, index=True)

In [None]:
# Reload the saved -log10(p) table from disk and display it as the cell output.
# No index_col is given, so SNP comes back as an ordinary column.
log10p_gwas_df = pd.read_csv(LOGP_PATH)
log10p_gwas_df