# Matching

## Purpose:

* Find exact and fuzzy overlaps between significant variants that are (e/p/m)QTLs
* Create organized tables relating variants to relevant biological features

## Import Packages

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

## Load in Data 

In [2]:
eqtl_df =pd.read_csv("/home/jve4pt/resource-files/xiaowei-analysis-files/summary_gwas_eqtl_ColocSusie_13May2022_XH.csv")
eqtl_df.head()

Unnamed: 0,trait,stratum,phenotype.id,gene.name,start_pos,end_pos,strand,nsnps,variant.id1,variant.id2,PP.H0.abf.Exam1,PP.H1.abf.Exam1,PP.H2.abf.Exam1,PP.H3.abf.Exam1,PP.H4.abf.Exam1,LD.R
0,FEV1FVC,All,ENSG00000135074.15,ADAM19,157395534,157575775,R,496,chr5_157508516_C_T_b38,chr5_157517467_G_T_b38,2.1400000000000001e-69,3.94e-66,1.41e-05,0.024113,0.975873,0.99
1,FEV1FVC,All,ENSG00000237541.3,HLA-DQA2,32741391,32747198,F,1760,chr6_31896897_T_C_b38,chr6_32183666_C_T_b38,3.34e-15,5.43e-07,7.74e-10,0.124045,0.875955,0.51
2,FEV1,All,ENSG00000138780.14,GSTCD,105708778,105847726,F,824,chr4_105863123_G_C_b38,chr4_105884676_C_G_b38,0.0,0.0,8.48e-05,0.166543,0.833372,0.98


In [3]:
mqtl_df =pd.read_csv("/home/jve4pt/resource-files/xiaowei-analysis-files/summary_gwas_mqtl_ColocSusie_13May2022_XH.csv")
mqtl_df.head()

Unnamed: 0,trait,stratum,phenotype.id,chr,pos_hg19,Start_hg38,End_hg38,nsnps,variant.id1,variant.id2,...,PP.H4.abf.Exam1,PP.H0.abf.Exam5,PP.H1.abf.Exam5,PP.H2.abf.Exam5,PP.H3.abf.Exam5,PP.H4.abf.Exam5,LD.R,UCSC_RefGene_Name,UCSC_RefGene_Group,Relation_to_UCSC_CpG_Island
0,FEV1FVC,All,cg07318204,4,145566441,144645288,144645290,617,chr4_144541436_A_C_b38,chr4_144548816_G_T_b38,...,0.949431,,,,,,0.99,HHIP,TSS1500,Island
1,FEV1FVC,All,cg10728227,4,145478755,144557602,144557604,617,chr4_144541436_A_C_b38,chr4_144557510_C_T_b38,...,0.963304,3.84e-44,2.2499999999999998e-36,6.62e-10,0.036781,0.963219,0.99,,,
2,FEV1FVC,All,cg13749822,4,145566663,144645510,144645512,617,chr4_144541436_A_C_b38,chr4_144548816_G_T_b38,...,0.975014,,,,,,0.99,HHIP,TSS1500,Island
3,FEV1FVC,All,cg26339943,4,145571234,144650081,144650083,617,chr4_144541436_A_C_b38,chr4_144550093_A_T_b38,...,0.933779,,,,,,0.95,HHIP,Body,S_Shelf
4,FEV1FVC,All,cg01197005,5,147677420,148297856,148297858,1117,chr5_148360432_A_G_b38,chr5_148355813_T_C_b38,...,0.952336,,,,,,1.0,LOC102546294,Body,


In [4]:
pqtl_df =pd.read_csv("/home/jve4pt/resource-files/xiaowei-analysis-files/summary_gwas_pqtl_ColocSusie_27May2022_XH.csv")
pqtl_df.head()

Unnamed: 0,trait,stratum,phenotype.id,EntrezGeneSymbol,rsid1,rsid2,nsnps,variant.id1,variant.id2,PP.H0.abf,PP.H1.abf,PP.H2.abf,PP.H3.abf,PP.H4.abf,LD.R,anno.rsid1,anno.rsid2
0,FEV1FVC,All,SL003680_ENSG00000204305.14,AGER,rs2844456,rs9391855,1477,chr6_31896897_T_C_b38,chr6_32182024_C_T_b38,5.54e-13,0.000287,1.32e-10,0.066289,0.933424,0.58,EHMT2_Intron; C2_2KB Upstream,AGER_Intron


## Extract Unique Variants

In [5]:
mqtl=list(set(mqtl_df["variant.id1"].tolist() + mqtl_df["variant.id2"].tolist()))
pqtl=list(set(pqtl_df["variant.id1"].tolist() + pqtl_df["variant.id2"].tolist()))
eqtl=list(set(eqtl_df["variant.id1"].tolist() + eqtl_df["variant.id2"].tolist()))

eqtl

['chr5_157508516_C_T_b38',
 'chr4_105884676_C_G_b38',
 'chr5_157517467_G_T_b38',
 'chr6_32183666_C_T_b38',
 'chr4_105863123_G_C_b38',
 'chr6_31896897_T_C_b38']

## Fuzzy & Exact Matching

In [6]:
distance_threshold = 1000000

### Expression and Methylation

In [7]:
tmp_df = pd.DataFrame()

for eqtl_variant in eqtl:
    for mqtl_variant in mqtl:
        # if on the same chromosome 
        if eqtl_variant.split("_")[0] == mqtl_variant.split("_")[0]:
            distance = abs(int(eqtl_variant.split("_")[1])-int(mqtl_variant.split("_")[1]))
            # if they're close enough 
            if distance <= distance_threshold:

                tmp_df = pd.concat(
                    [tmp_df,
                         pd.DataFrame(
                             [[eqtl_variant,
                               # get all expression genes for variants in expression matrix and join them by comma
                               ",".join(eqtl_df[(eqtl_df["variant.id1"] == eqtl_variant) | (eqtl_df["variant.id2"] == eqtl_variant)]["gene.name"].tolist()),
                               # same as above but also look at methylation sheet for an eqtl in case it's also an mqtl
                               ",".join(mqtl_df[(mqtl_df["variant.id1"] == eqtl_variant) | (mqtl_df["variant.id2"] == eqtl_variant)]["phenotype.id"].tolist()),
                               mqtl_variant,
                               # mqtl in eqtl sheet 
                               ",".join(eqtl_df[(eqtl_df["variant.id1"] == mqtl_variant) | (eqtl_df["variant.id2"] == mqtl_variant)]["gene.name"].tolist()),
                               # mqtl in  mqtl sheet 
                               ",".join(mqtl_df[(mqtl_df["variant.id1"] == mqtl_variant) | (mqtl_df["variant.id2"] == mqtl_variant)]["phenotype.id"].tolist()),
                               distance
                              ]]
                         )
                    ]
                )
                    

In [8]:
# rename columns
tmp_df.columns=[
    "Expression Variant", 
    "Expression Variant Expression Gene", 
    "Expression Variant CpG Site", 
    "Methylation Variant", 
    "Methylation Variant Expression Gene", 
    "Methylation Variant CpG Site",
    "Distance"
]

tmp_df.sort_values("Distance")

Unnamed: 0,Expression Variant,Expression Variant Expression Gene,Expression Variant CpG Site,Methylation Variant,Methylation Variant Expression Gene,Methylation Variant CpG Site,Distance
0,chr5_157508516_C_T_b38,ADAM19,cg25090510,chr5_157508516_C_T_b38,ADAM19,cg25090510,0
0,chr4_105884676_C_G_b38,GSTCD,cg19338966,chr4_105884676_C_G_b38,GSTCD,cg19338966,0
0,chr6_31896897_T_C_b38,HLA-DQA2,"cg05483184,cg07810190,cg10812186,cg20471413,cg...",chr6_31896897_T_C_b38,HLA-DQA2,"cg05483184,cg07810190,cg10812186,cg20471413,cg...",0
0,chr4_105884676_C_G_b38,GSTCD,cg19338966,chr4_105882525_A_C_b38,,cg19338966,2151
0,chr5_157517467_G_T_b38,ADAM19,,chr5_157508516_C_T_b38,ADAM19,cg25090510,8951
0,chr4_105863123_G_C_b38,GSTCD,,chr4_105882525_A_C_b38,,cg19338966,19402
0,chr4_105863123_G_C_b38,GSTCD,,chr4_105884676_C_G_b38,GSTCD,cg19338966,21553
0,chr6_32183666_C_T_b38,HLA-DQA2,,chr6_32053184_G_C_b38,,"cg07810190,cg26997880,cg07810190,cg26997880",130482
0,chr6_31896897_T_C_b38,HLA-DQA2,"cg05483184,cg07810190,cg10812186,cg20471413,cg...",chr6_32053184_G_C_b38,,"cg07810190,cg26997880,cg07810190,cg26997880",156287
0,chr6_31896897_T_C_b38,HLA-DQA2,"cg05483184,cg07810190,cg10812186,cg20471413,cg...",chr6_31714252_G_A_b38,,"cg05483184,cg22547540,cg04960128,cg05483184,cg...",182645


In [9]:
# net.draw(net.from_pandas_edgelist(tmp_df, "Expression","Methylation"), with_labels=True)


### Expression and Protein

In [10]:
tmp_df = pd.DataFrame()

for eqtl_variant in eqtl:
    for pqtl_variant in pqtl:
        # if on the same chromosome 
        if eqtl_variant.split("_")[0] == pqtl_variant.split("_")[0]:
            distance = abs(int(eqtl_variant.split("_")[1])-int(pqtl_variant.split("_")[1]))
            # if they're close enough 
            if distance <= distance_threshold:

                tmp_df = pd.concat(
                    [tmp_df,
                         pd.DataFrame(
                             [[eqtl_variant,
                               # get all expression genes for variants in expression matrix and join them by comma
                               ",".join(eqtl_df[(eqtl_df["variant.id1"] == eqtl_variant) | (eqtl_df["variant.id2"] == eqtl_variant)]["gene.name"].tolist()),
                               # same as above but also look at methylation sheet for an eqtl in case it's also an pqtl
                               ",".join(pqtl_df[(pqtl_df["variant.id1"] == eqtl_variant) | (pqtl_df["variant.id2"] == eqtl_variant)]["phenotype.id"].tolist()),
                               pqtl_variant,
                               # pqtl in eqtl sheet 
                               ",".join(eqtl_df[(eqtl_df["variant.id1"] == pqtl_variant) | (eqtl_df["variant.id2"] == pqtl_variant)]["gene.name"].tolist()),
                               # pqtl in  pqtl sheet 
                               ",".join(pqtl_df[(pqtl_df["variant.id1"] == pqtl_variant) | (pqtl_df["variant.id2"] == pqtl_variant)]["phenotype.id"].tolist()),
                               distance
                              ]]
                         )
                    ]
                )


In [11]:
# rename columns
tmp_df.columns=[
    "Expression Variant", 
    "Expression Variant Expression Gene", 
    "Expression Variant Protein", 
    "Protein Variant", 
    "Protein Variant Expression Gene", 
    "Protein Variant Protein",
    "Distance"
]

tmp_df.sort_values("Distance")

Unnamed: 0,Expression Variant,Expression Variant Expression Gene,Expression Variant Protein,Protein Variant,Protein Variant Expression Gene,Protein Variant Protein,Distance
0,chr6_31896897_T_C_b38,HLA-DQA2,SL003680_ENSG00000204305.14,chr6_31896897_T_C_b38,HLA-DQA2,SL003680_ENSG00000204305.14,0
0,chr6_32183666_C_T_b38,HLA-DQA2,,chr6_32182024_C_T_b38,,SL003680_ENSG00000204305.14,1642
0,chr6_31896897_T_C_b38,HLA-DQA2,SL003680_ENSG00000204305.14,chr6_32182024_C_T_b38,,SL003680_ENSG00000204305.14,285127
0,chr6_32183666_C_T_b38,HLA-DQA2,,chr6_31896897_T_C_b38,HLA-DQA2,SL003680_ENSG00000204305.14,286769


### Methylation and Protein

In [12]:
tmp_df = pd.DataFrame()

for mqtl_variant in mqtl:
    for pqtl_variant in pqtl:
        # if on the same chromosome 
        if mqtl_variant.split("_")[0] == pqtl_variant.split("_")[0]:
            distance = abs(int(mqtl_variant.split("_")[1])-int(pqtl_variant.split("_")[1]))
            # if they're close enough 
            if distance <= distance_threshold:

                tmp_df = pd.concat(
                    [tmp_df,
                         pd.DataFrame(
                             [[mqtl_variant,
                               # get all expression genes for variants in expression matrix and join them by comma
                               ",".join(mqtl_df[(mqtl_df["variant.id1"] == mqtl_variant) | (mqtl_df["variant.id2"] == mqtl_variant)]["phenotype.id"].tolist()),
                               # same as above but also look at methylation sheet for an mqtl in case it's also an pqtl
                               ",".join(pqtl_df[(pqtl_df["variant.id1"] == mqtl_variant) | (pqtl_df["variant.id2"] == mqtl_variant)]["phenotype.id"].tolist()),
                               pqtl_variant,
                               # pqtl in mqtl sheet 
                               ",".join(mqtl_df[(mqtl_df["variant.id1"] == pqtl_variant) | (mqtl_df["variant.id2"] == pqtl_variant)]["phenotype.id"].tolist()),
                               # pqtl in  pqtl sheet 
                               ",".join(pqtl_df[(pqtl_df["variant.id1"] == pqtl_variant) | (pqtl_df["variant.id2"] == pqtl_variant)]["phenotype.id"].tolist()),
                               distance
                              ]]
                         )
                    ]
                )

In [13]:
# rename columns
tmp_df.columns=[
    "Methylation Variant", 
    "Methylation Variant CpG Site", 
    "Methylation Variant Protein", 
    "Protein Variant", 
    "Protein Variant CpG Site", 
    "Protein Variant Protein",
    "Distance"
]

tmp_df.sort_values("Distance")

Unnamed: 0,Methylation Variant,Methylation Variant CpG Site,Methylation Variant Protein,Protein Variant,Protein Variant CpG Site,Protein Variant Protein,Distance
0,chr6_31896897_T_C_b38,"cg05483184,cg07810190,cg10812186,cg20471413,cg...",SL003680_ENSG00000204305.14,chr6_31896897_T_C_b38,"cg05483184,cg07810190,cg10812186,cg20471413,cg...",SL003680_ENSG00000204305.14,0
0,chr6_32053184_G_C_b38,"cg07810190,cg26997880,cg07810190,cg26997880",,chr6_32182024_C_T_b38,,SL003680_ENSG00000204305.14,128840
0,chr6_32053184_G_C_b38,"cg07810190,cg26997880,cg07810190,cg26997880",,chr6_31896897_T_C_b38,"cg05483184,cg07810190,cg10812186,cg20471413,cg...",SL003680_ENSG00000204305.14,156287
0,chr6_31714252_G_A_b38,"cg05483184,cg22547540,cg04960128,cg05483184,cg...",,chr6_31896897_T_C_b38,"cg05483184,cg07810190,cg10812186,cg20471413,cg...",SL003680_ENSG00000204305.14,182645
0,chr6_31636578_C_A_b38,"cg15316118,cg05483184,cg07810190,cg10812186,cg...",,chr6_31896897_T_C_b38,"cg05483184,cg07810190,cg10812186,cg20471413,cg...",SL003680_ENSG00000204305.14,260319
0,chr6_31616064_C_G_b38,"cg15316118,cg10812186,cg20471413,cg24055029,cg...",,chr6_31896897_T_C_b38,"cg05483184,cg07810190,cg10812186,cg20471413,cg...",SL003680_ENSG00000204305.14,280833
0,chr6_31896897_T_C_b38,"cg05483184,cg07810190,cg10812186,cg20471413,cg...",SL003680_ENSG00000204305.14,chr6_32182024_C_T_b38,,SL003680_ENSG00000204305.14,285127
0,chr6_31714252_G_A_b38,"cg05483184,cg22547540,cg04960128,cg05483184,cg...",,chr6_32182024_C_T_b38,,SL003680_ENSG00000204305.14,467772
0,chr6_31636578_C_A_b38,"cg15316118,cg05483184,cg07810190,cg10812186,cg...",,chr6_32182024_C_T_b38,,SL003680_ENSG00000204305.14,545446
0,chr6_31616064_C_G_b38,"cg15316118,cg10812186,cg20471413,cg24055029,cg...",,chr6_32182024_C_T_b38,,SL003680_ENSG00000204305.14,565960
