In [1]:
import pandas as pd
import os

Please note: the index has been removed from every table shown below for illustration only. When actually running the pipeline, each table must keep its subject-ID index so that the tables can be aligned.

In [2]:
def source_path(path, root="/mnt/vast/hpc/bvardarajan_lab/"):
    """Return ``path`` joined onto the shared data root (for tidier illustration).

    Args:
        path: str, location relative to ``root``.
        root: str, base directory. Defaults to the lab share used throughout
            this notebook; pass another directory to run it elsewhere.

    Returns:
        str, the result of ``os.path.join(root, path)``.
    """
    return os.path.join(root, path)

# Encode the output from Coassin pipeline

In [3]:
from lpa_pipeline import encodings
# Show the module docstring, which states the encoding rules applied below.
print(encodings.__doc__)

The pipeline encodes outputs from coassin_pipeline

Encoding Rule:

* If raw/raw.txt file total coverage < ``raw_total_coverage_threshold``,
  encode that position "missing" for that person
* If total coverage >= ``raw_total_coverage_threshold``, in annotated file
    * If position is missing in annotated file, the variant is coded 0
    * If position is present in the annotated, if both:
        1. variant_level value > ``variant_level_threshold``
        2. the total reads supporting (variant_level*total_coverage)
           value >= ``read_supporting_threshold``,
      are met, the variant is coded 1, otherwise 0

Example:

    The class should be initiated as follows::

        eco = encodings.EncodingCoassinOutput(
            input_path = "/some/parent/path/of/bam/output" # or next line
            bam_list = "/paths/to/a/file/recording/bam/path/line/by/line.txt"
            output_path = "output_path"# required
            )

    The encoding process include an individual encodi

Using parent path:

In [4]:
# Option 1: point the encoder at the parent directory holding the pipeline output.
# NOTE: `eco` is rebound by the bam_list example in the next cell.
eco = encodings.EncodingCoassinOutput(
    input_path = source_path("LPA_analysis/coassin_pipeline/pipeline_output"),
    output_path = source_path("LPA_analysis/dataset/tidied_output/illustration_run"),
    verbosity = 0
    )

0it [00:00, ?it/s]

When using a txt file instead, the file should list one path per line.

In [5]:
# Option 2: pass a text file that lists the inputs instead of a parent directory.
eco = encodings.EncodingCoassinOutput(
    bam_list = source_path("LPA_analysis/coassin_pipeline/data_inflow/bam_list_n=3.txt"),
    output_path = source_path("LPA_analysis/dataset/tidied_output/pipeline_test"),
    verbosity = 1 # set to 0 to mute the progress output
    )

  0%|          | 0/3 [00:00<?, ?it/s]

3 valid input detected


In [6]:
# Encode every input individually.
# NOTE(review): saving_step presumably controls how often results are written;
# with 1 the log below reports a save after each input - confirm.
eco.encode_individual(saving_step = 1)

  0%|          | 0/3 [00:00<?, ?it/s]

Step: 0: coverage total saved into /mnt/vast/hpc/bvardarajan_lab/LPA_analysis/dataset/tidied_output/pipeline_test/coverage_totals/washei25748.BQSR.recaled.bam.csv
Step: 0: encoded result saved into /mnt/vast/hpc/bvardarajan_lab/LPA_analysis/dataset/tidied_output/pipeline_test/encoded_results/washei25748.BQSR.recaled.bam.csv
Step: 1: coverage total saved into /mnt/vast/hpc/bvardarajan_lab/LPA_analysis/dataset/tidied_output/pipeline_test/coverage_totals/washei25756.BQSR.recaled.bam.csv
Step: 1: encoded result saved into /mnt/vast/hpc/bvardarajan_lab/LPA_analysis/dataset/tidied_output/pipeline_test/encoded_results/washei25756.BQSR.recaled.bam.csv
Step: 2: coverage total saved into /mnt/vast/hpc/bvardarajan_lab/LPA_analysis/dataset/tidied_output/pipeline_test/coverage_totals/washei25759.BQSR.recaled.bam.csv
Step: 2: encoded result saved into /mnt/vast/hpc/bvardarajan_lab/LPA_analysis/dataset/tidied_output/pipeline_test/encoded_results/washei25759.BQSR.recaled.bam.csv


When encoding, the class will search all the output under output_path

In [7]:
# Build the combined coverage-total and encoded-result tables.
# NOTE(review): save = True presumably also writes them to disk - confirm.
complete_coverage_total = eco.generate_coverage_total(save = True)
complete_encoded_result = eco.generate_encoded_results(save = True)

# Intersect with eigenstrat result 
Any other table that provides ethnicity information works as well.

In [8]:
# Same kind of table as complete_encoded_result from the cell above, loaded from a saved CSV
encoding_result = pd.read_csv(
    source_path("LPA_analysis/dataset/tidied_output/encoded_result_final.csv"), 
    index_col = 0).T # after the transpose: one-hot-encoded table, subjects on the index, SNPs on the header

# An ethnicity information table.
# Any table that provides an "ethnicity" column works; this one also carries the PCA result from eigenstrat
eigen_result = pd.read_csv(
    source_path("LPA_analysis/dataset/ethnicity_from_eigenstrat/eigenstrat_complete.csv"), 
    index_col = 0) 
# Report (rows, columns) of both tables before they are aligned
print(encoding_result.shape, eigen_result.shape)

(3915, 2130) (3819, 4)


In [9]:
# Preview only: the subject-ID index is dropped for display; eigen_result itself is unchanged
eigen_result.head(3).reset_index(drop = True) 

Unnamed: 0,PC1,PC2,PC3,ethnicity
0,-0.0394,0.0118,-0.0001,EU
1,0.028,0.003,0.048,EU
2,-0.0429,0.0254,-0.024,EU


Align the encoding and ethnicity information

In [10]:
# Keep only the subjects present in both tables (inner join on the row index)
encoding_result_aligned, eigen_result_aligned = encoding_result.align(eigen_result, join = "inner", axis = 0)
# Both shapes should now report the same number of rows
print(encoding_result_aligned.shape, eigen_result_aligned.shape)

(3817, 2130) (3817, 4)


# Filter the encoded variants

In [11]:
from lpa_pipeline import snps_filter
# Show the module docstring, which describes filters A and B.
print(snps_filter.__doc__)

Give a first-step filter to all the SNPs encoded from Coassin's output

Apply the following:

    * filter A: drop SNPs with NA ratio higher than ``drop_ratio`` in population
    * filter B: drop SNPs with all 1's or all 0's (NA doesn't count for this)

Example::

    filter_AB = SnpsFilter(drop_ratio = 0.1)
    filtered_result, drop_mask, drop_report = filter_AB.filter_A_and_B(df)

Where df is a pd.DataFrame instance,
with SNPs ID on the header, subject ID at the index,
i.e. This filter is dropping columns.



In [12]:
# Use the filter's default settings (no drop_ratio is passed here)
filter_AB = snps_filter.SnpsFilter()
# Returns the filtered table, a per-SNP drop mask and a summary report
filtered_result, drop_mask, drop_report = filter_AB.filter_A_and_B(encoding_result_aligned)

In [13]:
# Preview the first three rows of the per-SNP drop mask
drop_mask.head(3)

Unnamed: 0_level_0,filtered_A,filtered_B,filtered
snp_pos,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
11-G/A,True,False,False
16-T/C,True,False,False
17-G/A,True,False,False


In [14]:
# Summary of how many SNPs each filter leaves and drops
drop_report

Unnamed: 0,left,drop
filtered_A,1764,366
filtered_B,1533,597
filtered,1421,709


In [15]:
# (subjects, SNPs) remaining after filters A and B
filtered_result.shape

(3817, 1421)

# Locus table illustration

In [16]:
from lpa_pipeline import locus_collector
# Show the module docstring with the usage example.
print(locus_collector.__doc__)

Collecting the locus information of Coassin's output

Example::

    lc = locus_collector.LocusCollector(
        input_path = "/some/parent/path/of/bam/output" #choose this or next line
        bam_list = "/paths/to/a/file/recording/bam/path/line/by/line.txt")

    locus_table = lc.generate_locus_table()




In [17]:
# Collect locus information from the same pipeline output directory used for the encoding
lc = locus_collector.LocusCollector(
    input_path = source_path("LPA_analysis/coassin_pipeline/pipeline_output"))
locus_table = lc.generate_locus_table()

In [18]:
# Preview the first three loci
locus_table.head(3)

Unnamed: 0_level_0,Pos,Ref,Variant,mylocus,coding
pos-ref/var,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
11-G/A,11,G,A,short-I,Intron
16-T/C,16,T,C,short-I,Intron
17-G/A,17,G,A,short-I,Intron


In [19]:
# Keep only the loci whose ID is still a column of filtered_result, i.e. survived the SNP filter
locus_table_filtered = locus_table[locus_table.index.isin(filtered_result.columns)]
locus_table_filtered.shape

(1421, 5)

# New exon variants compared with Coassin Table S9

In [20]:
# Exon variants reported in Coassin's paper
# NOTE(review): judging by the file name this was tidied from an OCR of the Excel table - confirm provenance
coassin_paper_exon = pd.read_csv(
    source_path("LPA_analysis/dataset/Coassin_exon_from_excel_OCR_tidied.csv"), 
    index_col = 0)

In [21]:
# (rows, columns) of the reference exon table
coassin_paper_exon.shape

(208, 1)

In [22]:
# Exon variants present in our filtered locus table
locus_exon = locus_table_filtered[locus_table_filtered["coding"] == "Exon"].index.drop_duplicates()
# pos-ref/var IDs found in our exons but not in Coassin's report.
# Compare against the "pos-ref/var" column values: passing the whole DataFrame to
# Index.isin tests membership in its column labels, which flags every variant as new.
new = pd.DataFrame(locus_exon[~locus_exon.isin(coassin_paper_exon["pos-ref/var"])])
new.head()

Unnamed: 0,pos-ref/var
0,584-C/T
1,584-C/A
2,584-C/G
3,585-G/T
4,586-A/C


# Freq and appearance

In [23]:
from lpa_pipeline import freq_table_generator
# Show the module docstring with the common usage.
print(freq_table_generator.__doc__)

A generator computing relative frequency of SNP carrier by group

Common usage:

Given two pandas.DataFrame

* ``class_info_table`` has one have columns <class_variable> indicate the group
* ``one_hot_table`` has all one-hot-variables(SNPs encoding),

Compute the SNPs frequency in each class defined in <class_variable>::

    ftg = freq_table_generator.FreqTableGenerator(
        threshold = 0.01
        encoding = {0: "Not Detected",
                    1: "Rare",
                    2: "Common"})

    freq_table = ftg.generate_freq_table(
        class_info_table = class_info_table,
        one_hot_table = one_hot_table,
        class_variable = "<class_variable>"
        class_variable_list = ["<class_name_1>","<class_name_2>",...]
        #if only need a part of <class_variable> column
        )

if need a rarity classification as columns as well::

    freq_table_with_rarity = ftg.generate_freq_table_with_rarity(
        class_info_table = class_info_table,
        one_hot_table =

In [24]:
# Use the generator's defaults (no threshold or encoding is passed here)
ftg = freq_table_generator.FreqTableGenerator()

In [25]:
# Preview only: the index is dropped for display; the real table must keep its index for aligning!
eigen_result_aligned.head(3).reset_index(drop = True) 

Unnamed: 0,PC1,PC2,PC3,ethnicity
0,-0.0113,0.017,-0.0402,AF
1,-0.0414,0.02,-0.029,EU
2,0.0344,-0.0184,0.0465,AF


In [26]:
# Per-ethnicity carrier frequency of every filtered SNP, plus a rarity label per class
freq_table_complete = ftg.generate_freq_table_with_rarity(
                        one_hot_table = filtered_result,
                        class_info_table = eigen_result_aligned,
                        class_variable = "ethnicity"
                        )

In [27]:
# Preview the first three SNPs of the frequency table
freq_table_complete.head(3)

Unnamed: 0_level_0,count_AF,total_AF_detected,total_AF_population,freq_AF,count_EU,total_EU_detected,total_EU_population,freq_EU,count_HISP,total_HISP_detected,total_HISP_population,freq_HISP,AF,EU,HISP
snp_pos,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
21-G/A,346.0,988,1120,0.350202,388.0,815,886,0.476074,945.0,1769,1811,0.5342,Common,Common,Common
31-T/C,384.0,1009,1120,0.380575,403.0,824,886,0.489078,1015.0,1777,1811,0.571187,Common,Common,Common
33-T/C,1.0,1010,1120,0.00099,0.0,823,886,0.0,0.0,1779,1811,0.0,Rare,Not Detected,Not Detected


In [28]:
# Flag a SNP as novel when its ID is absent from the "pos-ref/var" column of the Coassin exon list
freq_table_complete["Novelty"] = ~(freq_table_complete.index.isin(coassin_paper_exon["pos-ref/var"]))
# Count novel (True) versus already-reported (False) SNPs
freq_table_complete["Novelty"].value_counts()

True     1318
False     103
Name: Novelty, dtype: int64

# Regression @ N_subject = 27 & N_Var = 1421

Serum result Association

The index is removed from the tables shown below for illustration; the real tables must keep their index so that they can be aligned!

In [29]:
# NOTE(review): this CSV was presumably written earlier with
# snps.to_csv("/mnt/mfs/hgrcgrid/shared/LPA_analysis/dataset/serum_data/serum_data_snps.csv")
# at the old storage location - confirm provenance.
snps = pd.read_csv(
    source_path("LPA_analysis/dataset/serum_data/serum_data_snps.csv"), 
    index_col = 0)
snps.shape

(27, 1421)

In [30]:
# Covariates of the serum subjects.
# NOTE: the name `personal_info` is reused later in this notebook for the phenotype table.
personal_info = pd.read_csv(
    source_path("LPA_analysis/dataset/serum_data/serum_data_personal_info.csv"), 
    index_col = 0)
# Preview only: the index is dropped for display; personal_info itself is unchanged
personal_info.head(3).reset_index(drop = True)

Unnamed: 0,AGE,GENDER,PC1,PC2,PC3
0,93.72,0,0.031,-0.0025,0.0136
1,74.0,1,-0.0435,0.0076,0.0145
2,82.93,1,0.0043,0.0004,0.0135


In [31]:
# Serum measurements; two verbose column headers are renamed to the short names "lpa" and "isoform"
serum_result = pd.read_csv(
    source_path("LPA_analysis/dataset/serum_data/serum_data_serum.csv"),
    index_col = 0).rename(columns = {"LP(a) (nmol/L)": "lpa",
                                     "Isoform 1          (expression)": "isoform"})
serum_result.columns # not all of these columns are required, only the ones you want to analyse

Index(['lpa', 'Isoform 1 (KIV Motifs)', 'Isoform 2 (KIV Motifs)',
       'Isoform 1          (% expression)',
       'Isoform 2          (% expression)', 'isoform',
       'Isoform 2          (expression)', 'wAS'],
      dtype='object')

In [32]:
from lpa_pipeline import association

In [33]:
# Association runner for the serum targets
serum_asso = association.SNPAssociation()

In [34]:
# Show the module docstring, which describes the iterative association strategy
print(association.__doc__)

This pipeline running an iterative association test for each snps.

The iterative strategy is as follows:

* for each the target variable in ``target_strategy``
    #. Extract the column from ``target_dataset``:
    #. run preprocessing, if the value of key ``preprocessing`` is given in ``target_strategy``
    #. group the dataset by columns defined by ``extra_iterate_on`` if provided
    #. for each group, iterate over columns of ``encoded_snp`` (each SNP):
        1. concatenate it with ``other_exogs``, generate the exogenous dataset
        2. if ``one_to_one_exogs`` is provided, use ``one_to_one_strategy``
           finding other columns and concatenate them as well
        3. run regressions specified by ``target_strategy``.engine
        4. combine the results from each SNP
        5. save the regression output

Two APIs provided
 * sklearn style:

     * 3-line style::

        snp_asso = SNPAssociation()
        snp_asso.fit(**kwargs)
        snp_asso.transform()

     * 2-lin

In [35]:
# Show the argument documentation of fit
print(serum_asso.fit.__doc__)

API setting up the regression (not running)

        Args:

            encoded_snp: pd.DataFrame, the dataframe to be looped on columns

            other_exog: pd.DataFrame, the dataframe taking all the other variables

            target_dataset: pd.DataFrame, the dataframe taking all the target variables

            target_strategy: dict[str, dict[str, funcs or models]],
                 The dictionary provide pre-processing to specific column,
                 Only column mentioned in keys will be included in the running.
                 The inner dictionary should have two keys:

                 "engine": statsmodels.api models
                     designed for statsmodels.discrete.discrete_model.Logit or
                     statsmodels.regression.linear_model.OLS,
                     but any model's .fit results provide .params .bse .pvalues will work

                 "preprocessing": funcs
                     the function should take a pd.DataFrame/pd.Series as input
   

In [36]:
# Show the documentation of filter C, which is passed as snps_preprocessing_strategy below
print(association.filter_C.__doc__)

The filter C on existing frequency,

    both 0 and 1 should appear for more than <threshold> times

    Args:
        df: pandas DataFrame, the index should be the Sample ID, and the columns are the snps
        threshold: int, the number threshold that both 0 and 1 should appear more than this number
    


In [37]:
# Run the per-SNP association against the serum targets defined by target_strategy_serum()
serum_asso.fit_transform(
    encoded_snp = snps, 
    other_exogs = personal_info,
    target_dataset = serum_result,
    target_strategy = association.target_strategy_serum(),
    output_path = source_path("LPA_analysis/data_analysis_result/association/illustrate_run/serum"),
    snps_preprocessing_strategy = association.filter_C,
    verbose = 1
)

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1421 [00:00<?, ?it/s]

/mnt/vast/hpc/bvardarajan_lab/LPA_analysis/data_analysis_result/association/illustrate_run/serum/lpa_OLS_N_snp=107.csv/.txt saved


  0%|          | 0/1421 [00:00<?, ?it/s]

/mnt/vast/hpc/bvardarajan_lab/LPA_analysis/data_analysis_result/association/illustrate_run/serum/wAS_OLS_N_snp=107.csv/.txt saved


  0%|          | 0/1421 [00:00<?, ?it/s]

/mnt/vast/hpc/bvardarajan_lab/LPA_analysis/data_analysis_result/association/illustrate_run/serum/isoform_OLS_N_snp=107.csv/.txt saved


# Regression @ N_subject = 3774 & N_Var = 1421

SNP - Phenotype correlation

In [38]:
# Phenotype table keyed by WES_ID, restricted to subjects whose AGE and DEM03
# (renamed to GENDER further down) are both present.
personal_info = (
    pd.read_csv(
        source_path("LPA_analysis/dataset/phenotypes/WHICAP_pheno_lpa_202306022.csv"),
        index_col=0,
    )
    .set_index("WES_ID")
    .dropna(subset=["AGE", "DEM03"])
)

The personal_info table has some subjects with missing (NA) age or gender, so only the subjects with both values present are kept; the tables are then aligned.

In [39]:
# Align the SNP encodings and the ethnicity/PC table with the phenotype table on subject ID
encoding_aligned, personal_info_aligned = filtered_result.align(personal_info, join = "inner", axis = 0)
eigen_aligned, personal_info_aligned = eigen_result_aligned.align(personal_info, join = "inner", axis = 0)
# personal_info_aligned is assigned by both calls above, so the three tables only
# line up when both alignments yield the same index. Fail loudly here instead of
# silently regressing on mismatched rows later.
assert encoding_aligned.index.equals(eigen_aligned.index), (
    "SNP encodings and ethnicity table are not aligned on the same subjects")
print(encoding_aligned.shape, personal_info_aligned.shape, eigen_aligned.shape)

(3774, 1421) (3774, 22) (3774, 4)


In [40]:
# Covariates for the regression: DEM03 (renamed GENDER) and AGE, side by side with the ethnicity/PC columns
other_exogs = pd.concat([personal_info_aligned[["DEM03", "AGE"]], eigen_aligned], 
                        axis = 1).rename(columns = {"DEM03": "GENDER"})
# Preview only: the index is dropped for display
other_exogs.head(3).reset_index(drop = True)

Unnamed: 0,GENDER,AGE,PC1,PC2,PC3,ethnicity
0,0,81.0,-0.0113,0.017,-0.0402,AF
1,1,77.0,-0.0414,0.02,-0.029,EU
2,1,71.0,0.0344,-0.0184,0.0465,AF


In [41]:
# Distribution of the DEMENTIA codes among the aligned subjects
personal_info_aligned.DEMENTIA.value_counts()

1.0    1986
2.0    1738
3.0      50
Name: DEMENTIA, dtype: int64

In [42]:
# Select the phenotype columns named by the keys of association.target_strategy()
phenotypes = personal_info_aligned[list(association.target_strategy().keys())]
# Preview only: last five rows with the index dropped for display
phenotypes.tail(5).reset_index(drop = True)

Unnamed: 0,STROKE,DEMENTIA,DIABETES,HEART,HYPERTENSION,LIP01_B,LIP02_B,LIP03_B,LIP04_2,INSL01_2,INSL02_2,INSL03_2,HBA1C_2
0,0.0,1.0,0,0,1,,,,95.0,83.0,10.3,5.82,6.37
1,0.0,2.0,0,1,1,,,,,,,,
2,0.0,2.0,0,0,1,,,,,,,,
3,0.0,2.0,0,0,1,,,,,,,,
4,0.0,1.0,0,1,1,,,,,,,,


In [43]:
# Association runner for the phenotype targets
snp_asso = association.SNPAssociation()

In [44]:
# Run the per-SNP association for every phenotype target, separately within each ethnicity group
snp_asso.fit_transform(
    encoded_snp = encoding_aligned, 
    other_exogs = other_exogs,
    target_dataset = phenotypes,
    target_strategy = association.target_strategy(),
    output_path = source_path("LPA_analysis/data_analysis_result/association/illustrate_run/association"),
    extra_iterate_on = ["ethnicity"], # runs an additional iteration over the groups of this other_exogs column, like a group-by
    snps_preprocessing_strategy = association.filter_C,
    verbose = 0 # the verbose output is very long
)

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1421 [00:00<?, ?it/s]

  0%|          | 0/1421 [00:00<?, ?it/s]

  0%|          | 0/1421 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1421 [00:00<?, ?it/s]

  0%|          | 0/1421 [00:00<?, ?it/s]

  0%|          | 0/1421 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1421 [00:00<?, ?it/s]

  0%|          | 0/1421 [00:00<?, ?it/s]

  0%|          | 0/1421 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1421 [00:00<?, ?it/s]

  0%|          | 0/1421 [00:00<?, ?it/s]

  0%|          | 0/1421 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1421 [00:00<?, ?it/s]

  0%|          | 0/1421 [00:00<?, ?it/s]

  0%|          | 0/1421 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1421 [00:00<?, ?it/s]

  0%|          | 0/1421 [00:00<?, ?it/s]

  0%|          | 0/1421 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1421 [00:00<?, ?it/s]

  0%|          | 0/1421 [00:00<?, ?it/s]

  0%|          | 0/1421 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1421 [00:00<?, ?it/s]

  0%|          | 0/1421 [00:00<?, ?it/s]

  0%|          | 0/1421 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1421 [00:00<?, ?it/s]

  0%|          | 0/1421 [00:00<?, ?it/s]

  0%|          | 0/1421 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1421 [00:00<?, ?it/s]

  0%|          | 0/1421 [00:00<?, ?it/s]

  0%|          | 0/1421 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1421 [00:00<?, ?it/s]

  0%|          | 0/1421 [00:00<?, ?it/s]

  0%|          | 0/1421 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1421 [00:00<?, ?it/s]

  0%|          | 0/1421 [00:00<?, ?it/s]

  0%|          | 0/1421 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1421 [00:00<?, ?it/s]

  0%|          | 0/1421 [00:00<?, ?it/s]

  0%|          | 0/1421 [00:00<?, ?it/s]

# METAL Meta-analysis

In [45]:
from lpa_pipeline import metal_toolkit
# Show the module docstring with the usage example.
print(metal_toolkit.__doc__)

The toolkit running METAL-related pipeline internally

Example:

Initialize::

    mtk = metal_toolkit.METALToolkit(
        ethnicity = ["EU", "AF", "HISP"],
        verbose = 1,
        metal_path = "/mnt/mfs/cluster/bin/METAL/metal"
        )

One-line Pipeline::

    mtk.run_metal(
        path_association: str,
        path_meta: str,
        multi_line_header: bool)

The ``multi_line_header`` only makes difference at the output header,
it will create another header row, just for the visualization pipeline

If need docs for each steps, after Initializing run::

    print(mtk.__doc__)



In [46]:
# Configure the toolkit with the three ethnicity groups and the location of the METAL executable
# NOTE(review): metal_path is an absolute, machine-specific path - adjust when running elsewhere
mtk = metal_toolkit.METALToolkit(
    ethnicity = ["EU", "AF", "HISP"], 
    verbose = 1, 
    metal_path = "/mnt/mfs/cluster/bin/METAL/metal")

In [47]:
# Meta-analyse the per-ethnicity association results written by the previous section;
# path_association is the same directory that fit_transform used as output_path
aggregate_results = mtk.run_metal(
    path_association = source_path("LPA_analysis/data_analysis_result/association/illustrate_run/association"),
    path_meta = source_path("LPA_analysis/data_analysis_result/meta_analysis/illustrate_run/"))

  0%|          | 0/78 [00:00<?, ?it/s]

  0%|          | 0/39 [00:00<?, ?it/s]

METAL scripts are saved to /mnt/vast/hpc/bvardarajan_lab/LPA_analysis/data_analysis_result/meta_analysis/illustrate_run/


  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

In [48]:
def list_files(startpath):
    """Print the directory tree under ``startpath``, indented four spaces per level.

    Adapted from https://stackoverflow.com/a/9728478

    ``startpath`` is normalised first, so a trailing separator neither turns
    the root label into a bare ``/`` nor puts first-level sub-directories at
    the same indent as the root. Entries are listed in sorted order so the
    output is stable between runs.

    Args:
        startpath: str, directory to walk.

    Returns:
        None, the tree is written to stdout.
    """
    startpath = os.path.normpath(startpath)
    for root, dirs, files in os.walk(startpath):
        # Sorting in place makes os.walk descend into sub-directories in a stable order
        dirs.sort()
        # Depth comes from the path relative to the start, not from a substring replace
        relative = os.path.relpath(root, startpath)
        level = 0 if relative == os.curdir else relative.count(os.sep) + 1
        indent = ' ' * 4 * level
        print('{}{}/'.format(indent, os.path.basename(root)))
        subindent = ' ' * 4 * (level + 1)
        for f in sorted(files):
            print('{}{}'.format(subindent, f))

In [49]:
# Show the files written under the meta-analysis output directory
list_files(source_path("LPA_analysis/data_analysis_result/meta_analysis/illustrate_run/"))

/
    DEMENTIA_Logit_ethnicity=AF_N_snp=496.txt
    DEMENTIA_Logit_ethnicity=EU_N_snp=397.txt
    DEMENTIA_Logit_ethnicity=HISP_N_snp=553.txt
    DIABETES_Logit_ethnicity=AF_N_snp=503.txt
    DIABETES_Logit_ethnicity=EU_N_snp=396.txt
    DIABETES_Logit_ethnicity=HISP_N_snp=555.txt
    HBA1C_2_OLS_ethnicity=AF_N_snp=304.txt
    HBA1C_2_OLS_ethnicity=EU_N_snp=271.txt
    HBA1C_2_OLS_ethnicity=HISP_N_snp=318.txt
    HEART_Logit_ethnicity=AF_N_snp=503.txt
    HEART_Logit_ethnicity=EU_N_snp=397.txt
    HEART_Logit_ethnicity=HISP_N_snp=555.txt
    HYPERTENSION_Logit_ethnicity=AF_N_snp=503.txt
    HYPERTENSION_Logit_ethnicity=EU_N_snp=397.txt
    HYPERTENSION_Logit_ethnicity=HISP_N_snp=552.txt
    INSL01_2_OLS_ethnicity=AF_N_snp=377.txt
    INSL01_2_OLS_ethnicity=EU_N_snp=314.txt
    INSL01_2_OLS_ethnicity=HISP_N_snp=365.txt
    INSL02_2_OLS_ethnicity=AF_N_snp=378.txt
    INSL02_2_OLS_ethnicity=EU_N_snp=315.txt
    INSL02_2_OLS_ethnicity=HISP_N_snp=365.txt
    INSL03_2_OLS_ethnicity=AF_N_snp=

In [50]:
# Columns of the aggregated table returned by run_metal
aggregate_results.columns

Index(['MarkerName', 'snp_pos.Beta.EUR', 'snp_pos.SE.EUR',
       'snp_pos.P.value.EUR', 'rel_freqs.EUR', 'abs_freqs.EUR', 'n_sample.EUR',
       'snp_pos.Beta.AA', 'snp_pos.SE.AA', 'snp_pos.P.value.AA',
       'rel_freqs.AA', 'abs_freqs.AA', 'n_sample.AA', 'snp_pos.Beta.HISP',
       'snp_pos.SE.HISP', 'snp_pos.P.value.HISP', 'rel_freqs.HISP',
       'abs_freqs.HISP', 'n_sample.HISP', 'Allele1', 'Allele2', 'Effect',
       'StdErr', 'P.value', 'Direction', 'META.ALLELE.COUNT', 'META.ALLELE.N',
       'trait'],
      dtype='object')

In [51]:
# (rows, columns) of the aggregated table
aggregate_results.shape

(6113, 28)