In [None]:
import pandas as pd
import vcf
import numpy as np
from scipy import stats
import statsmodels.formula.api as smf
import os
import subprocess
import polars as pl

In [None]:
def inverse_normal_transform(df, column):
    """Rank-inverse-normal transform a phenotype column and export vGWAS inputs.

    The function works in four steps:

    1. Rank ``df[column]`` and map the ranks through the standard normal
       quantile function (``rank / (n + 1)``), storing the result in
       ``<column>_rint``.
    2. Regress ``<column>_rint`` on ``baseline_age``, ``age2``,
       ``genetic_sex`` and the principal-component columns with OLS.
    3. Store the residuals in ``<column>_rint_resid`` and their squares in
       ``<column>_rint_resid_sq``.
    4. Write ``FID``, ``IID``, the covariates, ``<column>_rint`` and
       ``<column>_rint_resid_sq`` as a tab-separated file named
       ``<column>_vgwas_pheno_cov_v2.tsv`` under the hard-coded ``gs://``
       prefix, after dropping rows with a missing value in any exported
       column.

    Args:
        df: DataFrame holding ``column`` plus ``ID_VUMC``, ``baseline_age``,
            ``age2``, ``genetic_sex``, ``PC1``-``PC3``, ``PCD4`` and
            ``PC5``-``PC10``. It is modified in place: the ``_rank``,
            ``_rint``, ``_rint_resid``, ``_rint_resid_sq``, ``FID`` and
            ``IID`` columns are added or overwritten.
        column: Name of the phenotype column to transform.

    Returns:
        None. The results are the columns added to ``df`` and the exported
        file.
    """
    rank_col = column + '_rank'
    rint_col = column + '_rint'
    resid_col = column + '_rint_resid'
    resid_sq_col = column + '_rint_resid_sq'

    # rank() leaves missing values unranked (NaN), so the largest rank equals
    # the number of observed values. The denominator must use that same
    # count; using the total row count would push every quantile towards
    # zero whenever the phenotype has missing entries.
    df[rank_col] = df[column].rank()
    n_observed = df[column].count()
    df[rint_col] = stats.norm.ppf(df[rank_col] / (n_observed + 1))

    # Define the formula for the model.
    # NOTE(review): 'PCD4' breaks the PC1..PC10 naming pattern; it is used
    # consistently here and in the export list, so it presumably matches the
    # real column name -- confirm against the input table.
    formula = rint_col + ' ~ baseline_age + age2 + genetic_sex + PC1 + PC2 + PC3 + PCD4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10'

    # Fit the model
    results = smf.ols(formula, data=df).fit()

    # Residuals and squared residuals; the squared residual is the
    # variance-GWAS phenotype written out below.
    df[resid_col] = results.resid
    df[resid_sq_col] = df[resid_col] ** 2

    # Family and individual IDs are both taken from the same identifier.
    df['FID'] = df['ID_VUMC']
    df['IID'] = df['ID_VUMC']

    export_columns = [
        'FID', 'IID', 'baseline_age', 'genetic_sex', 'PC1', 'PC2', 'PC3',
        'PCD4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10',
        rint_col, resid_sq_col,
    ]
    output_path = 'gs://bicklab-main-storage/Users/Yash_Pershad/' + column + '_vgwas_pheno_cov_v2.tsv'
    df[export_columns].dropna().to_csv(output_path, sep='\t', index=False)

In [None]:

def analyze_trait(trait, mean_urls, variance_urls):
    """
    Performs a complete analysis for a given trait, including downloading files,
    combining summary statistics, and finding vGWAS hits.

    Side effects: creates ``<trait>_vgwas_results_ukb_mean`` and
    ``<trait>_vgwas_results_ukb_variance`` under the current working
    directory, downloads the given URLs into them with ``wget``, and writes
    one combined, filtered TSV per analysis into the current working
    directory. The working directory itself is never changed.

    Args:
    trait (str): The trait to analyze (e.g., 'plt', 'wbc').
    mean_urls (list): List of URLs for mean GWAS files.
    variance_urls (list): List of URLs for variance GWAS files.

    Returns:
    pd.DataFrame: DataFrame containing vGWAS hits, or None when no
    per-chromosome file could be loaded for the mean or the variance GWAS.

    Raises:
    subprocess.CalledProcessError: If a ``wget`` download exits non-zero.
    """
    def download_files(file_urls, folder_name):
        # Download every URL into folder_name and return its absolute path.
        # The absolute path is resolved up front and handed to wget directly,
        # so a failed download cannot leave the process stranded in another
        # working directory.
        folder_path = os.path.abspath(folder_name)
        os.makedirs(folder_path, exist_ok=True)
        for url in file_urls:
            filename = url.split('/')[-1].strip()
            destination = os.path.join(folder_path, filename)
            subprocess.run(['wget', '-O', destination, url], check=True)
        return folder_path

    def combine_summary_stats(results_dir, gwas_test, pheno, variance):
        # Concatenate the per-chromosome result files (chromosomes 1-22),
        # filter them, and write one TSV. Returns the TSV name, or None when
        # nothing could be read.
        dfs = []
        for chr_num in range(1, 23):
            file_path = f"{results_dir}/{gwas_test}_ukb22828_c{chr_num}_b0_v3_{pheno}.regenie.gz"
            try:
                dfs.append(pl.read_csv(file_path, separator='\t'))
            except Exception as e:
                # Best effort: a missing or unreadable chromosome is reported
                # and skipped so the remaining chromosomes are still combined.
                print(f"Error processing chromosome {chr_num}: {str(e)}")

        if not dfs:
            print("No data was loaded. Please check the folder, gwas_test, and SNP interaction parameters.")
            return None

        combined_file_name = f"{gwas_test}_{pheno}_maf5_p5e8.tsv"

        # Keep variants whose alternate-allele frequency is strictly between
        # 5% and 95%.
        frequency_filter = (pl.col("AAF") > 0.05) & (pl.col("AAF") < 0.95)
        if variance:
            pvalue_filter = pl.col("Pval") < 5e-8
        else:
            # NOTE(review): the mean set keeps variants with Pval > 5e-8
            # (i.e. NOT genome-wide significant), and find_vgwas_hits then
            # removes variance hits that appear in this set. Confirm that
            # this double negation is the intended definition of a vGWAS
            # hit; the behaviour is deliberately left unchanged here.
            pvalue_filter = pl.col("Pval") > 5e-8

        pl.concat(dfs).filter(frequency_filter & pvalue_filter).write_csv(
            combined_file_name, separator='\t'
        )
        return combined_file_name

    def find_vgwas_hits(mean_file, variance_file):
        # Rows of the variance file whose 'Name' is absent from the mean file.
        variance_df = pd.read_csv(variance_file, sep='\t')
        mean_df = pd.read_csv(mean_file, sep='\t')
        return variance_df[~variance_df['Name'].isin(mean_df['Name'])]

    # Download mean GWAS files
    mean_folder_path = download_files(mean_urls, f"{trait}_vgwas_results_ukb_mean")
    print(f"Mean GWAS files downloaded to: {mean_folder_path}")

    # Combine mean summary stats
    mean_combined_file = combine_summary_stats(
        mean_folder_path, f"{trait}_gwas", f"{trait}_rint", False
    )

    # Download variance GWAS files
    variance_folder_path = download_files(variance_urls, f"{trait}_vgwas_results_ukb_variance")
    print(f"Variance GWAS files downloaded to: {variance_folder_path}")

    # Combine variance summary stats
    variance_combined_file = combine_summary_stats(
        variance_folder_path, f"{trait}_gwas_resid", f"{trait}_rint_resid_sq", True
    )

    # Find vGWAS hits
    if mean_combined_file and variance_combined_file:
        return find_vgwas_hits(mean_combined_file, variance_combined_file)

    print("Error: Could not generate combined files for mean or variance GWAS.")
    return None