# GWAS 

The original data were downloaded from the [GWAS Catalog file downloads page](https://www.ebi.ac.uk/gwas/docs/file-downloads) ("All associations v1.0") on 2024-02-14.

In [218]:
import pandas as pd
import table_cleaning_functions as tcf 

## Preliminary Wrangling

In [219]:
# Read in the raw GWAS Catalog associations export (tab-separated).
# low_memory=False makes pandas infer each column's dtype from the whole file rather than
# chunk by chunk, which avoids mixed-type columns and the DtypeWarning that chunked
# parsing can emit for a file of this size.
# NOTE(review): the warning echo recorded under this cell looks like that DtypeWarning - confirm.
gwas_raw = pd.read_csv('../home/data/raw_data/GlyGen/gwas_raw.tsv', sep='\t', low_memory=False)

  gwas_raw = pd.read_csv('../home/data/raw_data/GlyGen/gwas_raw.tsv', sep='\t')


In [220]:
# Report the (rows, columns) dimensions of the raw table, then preview its first rows.
print(gwas_raw.shape)
gwas_raw.head()

(571148, 34)


Unnamed: 0,DATE ADDED TO CATALOG,PUBMEDID,FIRST AUTHOR,DATE,JOURNAL,LINK,STUDY,DISEASE/TRAIT,INITIAL SAMPLE SIZE,REPLICATION SAMPLE SIZE,...,CONTEXT,INTERGENIC,RISK ALLELE FREQUENCY,P-VALUE,PVALUE_MLOG,P-VALUE (TEXT),OR or BETA,95% CI (TEXT),PLATFORM [SNPS PASSING QC],CNV
0,2017-08-10,28443625,Justice AE,2017-04-26,Nat Commun,www.ncbi.nlm.nih.gov/pubmed/28443625,"Genome-wide meta-analysis of 241,258 adults ac...",BMI in non-smokers,"77,279 European ancestry women, 47,280 Europea...","16,011 European ancestry women, 17,912 Europea...",...,intergenic_variant,1.0,0.847,5e-08,7.30103,,0.023,[0.015-0.031] kg/m2 increase,"Affymetrix, Illumina, Perlegen [up to 2800000]...",N
1,2017-08-10,28443625,Justice AE,2017-04-26,Nat Commun,www.ncbi.nlm.nih.gov/pubmed/28443625,"Genome-wide meta-analysis of 241,258 adults ac...",BMI in non-smokers,"77,279 European ancestry women, 47,280 Europea...","16,011 European ancestry women, 17,912 Europea...",...,intergenic_variant,1.0,0.469,7e-07,6.154902,,0.0192,[0.012-0.027] kg/m2 increase,"Affymetrix, Illumina, Perlegen [up to 2800000]...",N
2,2017-08-10,28443625,Justice AE,2017-04-26,Nat Commun,www.ncbi.nlm.nih.gov/pubmed/28443625,"Genome-wide meta-analysis of 241,258 adults ac...",BMI in non-smokers,"77,279 European ancestry women, 47,280 Europea...","16,011 European ancestry women, 17,912 Europea...",...,intron_variant,0.0,0.2894,7e-09,8.154902,(men),0.0341,[0.023-0.046] kg/m2 decrease,"Affymetrix, Illumina, Perlegen [up to 2800000]...",N
3,2017-08-10,28443625,Justice AE,2017-04-26,Nat Commun,www.ncbi.nlm.nih.gov/pubmed/28443625,"Genome-wide meta-analysis of 241,258 adults ac...",BMI in non-smokers,"77,279 European ancestry women, 47,280 Europea...","16,011 European ancestry women, 17,912 Europea...",...,intron_variant,0.0,0.2894,7e-10,9.154902,,0.0264,[0.018-0.035] kg/m2 decrease,"Affymetrix, Illumina, Perlegen [up to 2800000]...",N
4,2017-08-10,28443625,Justice AE,2017-04-26,Nat Commun,www.ncbi.nlm.nih.gov/pubmed/28443625,"Genome-wide meta-analysis of 241,258 adults ac...",BMI in non-smokers,"77,279 European ancestry women, 47,280 Europea...","16,011 European ancestry women, 17,912 Europea...",...,intergenic_variant,1.0,0.8269,3e-09,8.522879,(women),0.0376,[0.025-0.05] kg/m2 decrease,"Affymetrix, Illumina, Perlegen [up to 2800000]...",N


Drop unnecessary columns. 

In [221]:
# List every column name so the ones to discard can be picked out in the next cell.
gwas_raw.columns

Index(['DATE ADDED TO CATALOG', 'PUBMEDID', 'FIRST AUTHOR', 'DATE', 'JOURNAL',
       'LINK', 'STUDY', 'DISEASE/TRAIT', 'INITIAL SAMPLE SIZE',
       'REPLICATION SAMPLE SIZE', 'REGION', 'CHR_ID', 'CHR_POS',
       'REPORTED GENE(S)', 'MAPPED_GENE', 'UPSTREAM_GENE_ID',
       'DOWNSTREAM_GENE_ID', 'SNP_GENE_IDS', 'UPSTREAM_GENE_DISTANCE',
       'DOWNSTREAM_GENE_DISTANCE', 'STRONGEST SNP-RISK ALLELE', 'SNPS',
       'MERGED', 'SNP_ID_CURRENT', 'CONTEXT', 'INTERGENIC',
       'RISK ALLELE FREQUENCY', 'P-VALUE', 'PVALUE_MLOG', 'P-VALUE (TEXT)',
       'OR or BETA', '95% CI (TEXT)', 'PLATFORM [SNPS PASSING QC]', 'CNV'],
      dtype='object')

In [222]:
# Keep only the three columns used by the rest of the notebook; every other catalog
# column (publication metadata, genomic coordinates, statistics, platform, ...) is discarded.
# Selecting the columns to keep, instead of dropping a long list by name, makes this cell
# safe to re-run (a second drop would raise KeyError because gwas_raw is overwritten here)
# and fails loudly only if one of the columns we actually need is missing.
keep_cols = ['DISEASE/TRAIT', 'MAPPED_GENE', 'SNPS']
gwas_raw = gwas_raw[keep_cols]
gwas_raw.head()

Unnamed: 0,DISEASE/TRAIT,MAPPED_GENE,SNPS
0,BMI in non-smokers,SRRM1P2 - LINC00971,rs6794880
1,BMI in non-smokers,STARP1 - HNRNPA3P5,rs9540493
2,BMI in non-smokers,TCF7L2,rs7903146
3,BMI in non-smokers,TCF7L2,rs7903146
4,BMI in non-smokers,RPS17P5 - FTH1P5,rs2207139


We are currently interested only in cancer-related rows, so filter on cancer-related conditions. 

In [223]:
# Keywords that mark a DISEASE/TRAIT value as cancer related.
cancer_related = ['cancer', 'carcinoma', 'leukemia', 'tumor', 'malignancy', 'glioblastoma',
                'melanoma', 'lymphoma', 'sarcoma']

# Build one alternation pattern and keep rows whose trait contains any keyword.
# The match is a case-insensitive substring search, and rows with a missing trait are
# treated as non-matching (na=False).
cancer_pattern = '|'.join(cancer_related)
is_cancer_row = gwas_raw['DISEASE/TRAIT'].str.contains(cancer_pattern, case=False, na=False)
gwas_raw = gwas_raw.loc[is_cancer_row]

print(gwas_raw.shape)
gwas_raw.head()

(15456, 3)


Unnamed: 0,DISEASE/TRAIT,MAPPED_GENE,SNPS
149,Survival in pancreatic cancer,PAIP2B,rs113988120
150,Survival in pancreatic cancer,"B4GALT4, B4GALT4-AS1",rs4568126
151,Survival in pancreatic cancer,RPL21P119 - LINC02177,rs4780973
152,Survival in pancreatic cancer,LINC00376 - LINC00395,rs1000589
153,Survival in pancreatic cancer,LINC01163 - LINC02667,rs10734079


Drop rows whose mapped gene is null, a range or intersection (`A - B`, `A x B`), or a list of genes. 

In [224]:
# Keep only rows that have a MAPPED_GENE value.
has_mapped_gene = gwas_raw['MAPPED_GENE'].notna()
gwas_raw = gwas_raw[has_mapped_gene]
# Confirm no null MAPPED_GENE values remain (expected: 0).
gwas_raw['MAPPED_GENE'].isna().sum()

0

In [225]:
# Remove rows whose MAPPED_GENE is a range ('A - B') or an intersection ('A x B').
mapped_gene = gwas_raw['MAPPED_GENE']
is_range_or_intersection = mapped_gene.str.contains(' - ') | mapped_gene.str.contains(' x ')
gwas_raw = gwas_raw[~is_range_or_intersection]
gwas_raw.shape

(9354, 3)

In [226]:
# Remove rows whose MAPPED_GENE holds several genes separated by ',' or ';'.
mapped_gene = gwas_raw['MAPPED_GENE']
is_gene_list = mapped_gene.str.contains(',') | mapped_gene.str.contains(';')
gwas_raw = gwas_raw[~is_gene_list]
gwas_raw.shape

(7577, 3)

In [227]:
# Spot-check ten random rows of the filtered table (no random_state is set, so the
# rows shown differ from run to run).
gwas_raw.sample(10)

Unnamed: 0,DISEASE/TRAIT,MAPPED_GENE,SNPS
354963,Gastric cancer,HLA-C,rs1050437
46342,Barrett's esophagus or Esophageal adenocarcinoma,LINC00208,rs10108511
421052,Tumor necrosis factor receptor superfamily mem...,PLAUR,rs4760
57044,Squamous cell carcinoma,BRCA2,rs1460816
278471,Prostate cancer,CHEK2,rs17886163
63817,Alanine aminotransferase (ALT) levels after re...,PNPLA3,rs738409
415522,Colorectal cancer,ARHGAP42,rs55864876
330715,Pancreatic cancer,ANKRD27,rs150101296
280530,Prostate cancer,ANO7,rs76832527
193863,Breast cancer,SLC25A21,rs8003014


In [228]:
# Save the filtered table as a TSV. index=False leaves the row index out of the file,
# so the rows are renumbered from 0 when the file is read back in.
gwas_raw.to_csv('../home/data/processed_data/GlyGen/gwas/gwas_processed.tsv', sep = '\t', index = False)

In [24]:
# Unique DISEASE/TRAIT values left after filtering (first occurrence of each is kept).
conditions = gwas_raw['DISEASE/TRAIT'].drop_duplicates()
# The export below is intentionally left disabled.
# NOTE(review): its path differs from the gwas_conditions.tsv path read in
# "Secondary Wrangling" - confirm which file is the source of truth.
# conditions.to_csv('../home/data/processed_data/gwas_conditions.tsv', sep = '\t', index = False)

## Secondary Wrangling

In [229]:
# Reload the table saved at the end of "Preliminary Wrangling" (same path), then report
# its dimensions and preview the first rows.
gwas_processed = pd.read_csv('../home/data/processed_data/GlyGen/gwas/gwas_processed.tsv', sep='\t')
print(gwas_processed.shape)
gwas_processed.head()

(7577, 3)


Unnamed: 0,DISEASE/TRAIT,MAPPED_GENE,SNPS
0,Survival in pancreatic cancer,PAIP2B,rs113988120
1,Survival in pancreatic cancer,ACOT11,rs10736390
2,Survival in pancreatic cancer,BDNF-AS,rs10767646
3,Survival in pancreatic cancer,WHRN,rs10817611
4,Survival in pancreatic cancer,LIN7C,rs10835188


Clean conditions. 

In [230]:
# Load the three condition lookup tables used below.
# NOTE(review): no cell in this notebook writes these files; presumably they are
# curated outside the notebook - confirm.
# gwas_conditions: maps a raw 'DISEASE/TRAIT' value to a 'condition' name and a 'doid'.
gwas_conditions = pd.read_csv('../home/data/processed_data/GlyGen/gwas/gwas_conditions.tsv', sep = '\t')
# gwas_drop_conditions: its 'condition' column lists the raw traits to discard.
gwas_drop_conditions = pd.read_csv('../home/data/processed_data/GlyGen/gwas/gwas_drop_conditions.tsv', sep = '\t')
# gwas_explode_conditions: maps a 'raw_disease' value to 'list_diseases', a
# comma-separated list of 'name:doid' entries.
gwas_explode_conditions = pd.read_csv('../home/data/processed_data/GlyGen/gwas/gwas_explode_conditions.tsv', sep = '\t')

In [231]:
# Discard every row whose raw trait appears in the drop list.
drop_conditions = set(gwas_drop_conditions['condition'])
is_kept = ~gwas_processed['DISEASE/TRAIT'].isin(drop_conditions)
gwas_processed = gwas_processed.loc[is_kept]

Isolate single-condition and multiple-condition (list) rows. 

In [232]:
# Rows whose raw trait names several diseases (it appears in the explode table).
multi_condition_traits = set(gwas_explode_conditions['raw_disease'])
is_multi_condition = gwas_processed['DISEASE/TRAIT'].isin(multi_condition_traits)
gwas_explode_condition_rows = gwas_processed[is_multi_condition]
print(gwas_explode_condition_rows.shape)
gwas_explode_condition_rows.head()

(198, 3)


Unnamed: 0,DISEASE/TRAIT,MAPPED_GENE,SNPS
38,"Breast cancer, ovarian cancer or prostate canc...",ZBTB7B,rs77548871
39,"Breast cancer, ovarian cancer or prostate canc...",VAMP8,rs13007211
40,"Breast cancer, ovarian cancer or prostate canc...",JAZF1,rs739704
41,"Breast cancer, ovarian cancer or prostate canc...",ARHGEF5,rs720475
42,"Breast cancer, ovarian cancer or prostate canc...",PCAT1,rs17762342


In [233]:
# Rows whose raw trait maps to exactly one condition (it appears in gwas_conditions).
single_condition_traits = set(gwas_conditions['DISEASE/TRAIT'])
is_single_condition = gwas_processed['DISEASE/TRAIT'].isin(single_condition_traits)
gwas_single_condition_rows = gwas_processed[is_single_condition]
print(gwas_single_condition_rows.shape)
gwas_single_condition_rows.head()

(7045, 3)


Unnamed: 0,DISEASE/TRAIT,MAPPED_GENE,SNPS
0,Survival in pancreatic cancer,PAIP2B,rs113988120
1,Survival in pancreatic cancer,ACOT11,rs10736390
2,Survival in pancreatic cancer,BDNF-AS,rs10767646
3,Survival in pancreatic cancer,WHRN,rs10817611
4,Survival in pancreatic cancer,LIN7C,rs10835188


In [234]:
# Sanity check: the single- and multi-condition subsets together account for every
# remaining row.
assert len(gwas_processed) == len(gwas_explode_condition_rows) + len(gwas_single_condition_rows)

Map single condition rows.

In [235]:
# Attach the curated condition name and DOID to each single-condition row.
# NOTE(review): a left merge repeats a row once per matching entry in gwas_conditions,
# so duplicate 'DISEASE/TRAIT' keys there would add rows - confirm that is intended.
gwas_single_condition_rows = pd.merge(
    gwas_single_condition_rows,
    gwas_conditions,
    how='left',
    on='DISEASE/TRAIT',
)
gwas_single_condition_rows.sample(5)

Unnamed: 0,DISEASE/TRAIT,MAPPED_GENE,SNPS,condition,doid
4377,Colorectal cancer,OSBP2,rs926340,colorectal cancer,:9256
5833,Uterine leiomyoma or breast cancer (pleiotropy),ATAD5,rs117662433,breast cancer,:1612
2472,Squamous cell lung carcinoma,CLPTM1L,rs467095,lung squamous cell carcinoma,:3907
1895,Nevus count or cutaneous melanoma,CDH3,rs8046299,skin melanoma,:8923
296,Prostate cancer,RASSF3,rs7968403,prostate cancer,:10283


In [236]:
# Count nulls per column; a non-zero 'condition' or 'doid' count would mean some
# DISEASE/TRAIT value found no match in gwas_conditions during the left merge.
gwas_single_condition_rows.isnull().sum()

DISEASE/TRAIT    0
MAPPED_GENE      0
SNPS             0
condition        0
doid             0
dtype: int64

In [237]:
# The raw trait text is no longer needed once the curated condition is attached.
gwas_single_condition_rows = gwas_single_condition_rows.drop(columns='DISEASE/TRAIT')
gwas_single_condition_rows.sample(5)

Unnamed: 0,MAPPED_GENE,SNPS,condition,doid
1551,HELQ,rs1494961,oral cavity cancer,:8618
5368,CNIH3,rs72761829,breast cancer,:1612
6019,LINC02540,rs116995626,colorectal cancer,:9256
6419,MAST2,rs4420029,prostate cancer,:10283
4274,NAPEPLD,rs56196003,breast cancer,:1612


In [238]:
# DOIDs in gwas_conditions look like ':9256'; keep only the part after the colon.
doid_parts = gwas_single_condition_rows['doid'].str.split(':')
gwas_single_condition_rows['doid'] = doid_parts.str.get(1)
# Preview the multi-condition rows that are processed next.
gwas_explode_condition_rows.sample(5)

Unnamed: 0,DISEASE/TRAIT,MAPPED_GENE,SNPS
3790,Breast cancer or ovarian cancer (pleiotropy),ABHD8,rs4808616
3796,Breast cancer or ovarian cancer (pleiotropy),ADAM29,rs6826366
5520,Endometriosis or endometrial cancer (pleiotropy),PTPRD,rs2475335
3797,Breast cancer or ovarian cancer (pleiotropy),TERT,rs10069690
3737,Breast cancer or lung cancer (pleiotropy),HSPA4,rs13718


Map multiple condition rows.

In [239]:
# Attach the comma-separated 'name:doid' list to each multi-condition row by matching
# the raw trait text against 'raw_disease'.
gwas_explode_condition_rows = pd.merge(
    gwas_explode_condition_rows,
    gwas_explode_conditions,
    how='left',
    left_on='DISEASE/TRAIT',
    right_on='raw_disease',
)
gwas_explode_condition_rows.sample(5)

Unnamed: 0,DISEASE/TRAIT,MAPPED_GENE,SNPS,raw_disease,list_diseases
174,"Malignant lymphoma (Non-Hodgkin lymphoma, mult...",QPCT,rs3770745,"Malignant lymphoma (Non-Hodgkin lymphoma, mult...","non-Hodgkin lymphoma:0060060,multiple myeloma:..."
23,Colorectal or endometrial cancer,LSAMP,rs4378954,Colorectal or endometrial cancer,"colorectal cancer:9256,endometrial cancer:1380"
91,Breast cancer or ovarian cancer (pleiotropy),NRIP1,rs2822991,Breast cancer or ovarian cancer (pleiotropy),"breast cancer:1612,ovarian cancer:2394"
77,Breast cancer or lung cancer (pleiotropy),ABHD8,rs4808616,Breast cancer or lung cancer (pleiotropy),"breast cancer:1612,lung cancer:1324"
127,Barrett's esophagus or esophageal adenocarcino...,DPYSL2,rs17321041,Barrett's esophagus or esophageal adenocarcino...,"Barrett's esophagus:9206,esophagus adenocarcin..."


In [240]:
# Count nulls per column; a non-zero 'list_diseases' count would mean some raw trait
# found no match in gwas_explode_conditions during the left merge.
gwas_explode_condition_rows.isnull().sum()

DISEASE/TRAIT    0
MAPPED_GENE      0
SNPS             0
raw_disease      0
list_diseases    0
dtype: int64

In [241]:
# Both raw-text columns are redundant now that 'list_diseases' is attached.
raw_text_cols = ['DISEASE/TRAIT', 'raw_disease']
gwas_explode_condition_rows = gwas_explode_condition_rows.drop(columns=raw_text_cols)
gwas_explode_condition_rows.head()

Unnamed: 0,MAPPED_GENE,SNPS,list_diseases
0,ZBTB7B,rs77548871,"breast cancer:1612,ovarian cancer:2394,prostat..."
1,VAMP8,rs13007211,"breast cancer:1612,ovarian cancer:2394,prostat..."
2,JAZF1,rs739704,"breast cancer:1612,ovarian cancer:2394,prostat..."
3,ARHGEF5,rs720475,"breast cancer:1612,ovarian cancer:2394,prostat..."
4,PCAT1,rs17762342,"breast cancer:1612,ovarian cancer:2394,prostat..."


In [242]:
# Turn each comma-separated string into a Python list of 'name:doid' entries.
disease_lists = gwas_explode_condition_rows['list_diseases'].str.split(',')
gwas_explode_condition_rows['list_diseases'] = disease_lists
gwas_explode_condition_rows.head()

Unnamed: 0,MAPPED_GENE,SNPS,list_diseases
0,ZBTB7B,rs77548871,"[breast cancer:1612, ovarian cancer:2394, pros..."
1,VAMP8,rs13007211,"[breast cancer:1612, ovarian cancer:2394, pros..."
2,JAZF1,rs739704,"[breast cancer:1612, ovarian cancer:2394, pros..."
3,ARHGEF5,rs720475,"[breast cancer:1612, ovarian cancer:2394, pros..."
4,PCAT1,rs17762342,"[breast cancer:1612, ovarian cancer:2394, pros..."


In [243]:
# Emit one row per 'name:doid' entry; rows that came from the same list keep the same
# index label.
gwas_explode_condition_rows = gwas_explode_condition_rows.explode(column='list_diseases')
print(gwas_explode_condition_rows.shape)
gwas_explode_condition_rows.head()

(430, 3)


Unnamed: 0,MAPPED_GENE,SNPS,list_diseases
0,ZBTB7B,rs77548871,breast cancer:1612
0,ZBTB7B,rs77548871,ovarian cancer:2394
0,ZBTB7B,rs77548871,prostate cancer:10283
1,VAMP8,rs13007211,breast cancer:1612
1,VAMP8,rs13007211,ovarian cancer:2394


Split each entry into separate condition name and DOID columns.

In [244]:
def split_condition_doid(row):
    """Split the 'name:doid' entry in row['list_diseases'] into two fields.

    Parameters
    ----------
    row : mapping-like
        A pandas Series when used with DataFrame.apply(axis=1). It must hold a
        'list_diseases' string of the form '<condition name>:<doid>'.

    Returns
    -------
    The same row object, with 'condition' and 'doid' entries added.

    Raises
    ------
    ValueError
        If the entry contains no ':' separator.
    """
    entry = row['list_diseases']
    # Split once, on the last colon: the DOID is the trailing field, so a colon inside
    # the condition name no longer truncates the name or drops the real identifier.
    condition, separator, doid = entry.rpartition(':')
    if not separator:
        raise ValueError(f"expected '<condition>:<doid>' but got {entry!r}")
    row['condition'] = condition
    row['doid'] = doid
    return row

# Derive 'condition' and 'doid' for every row, then discard the combined source column.
gwas_explode_condition_rows = (
    gwas_explode_condition_rows
    .apply(split_condition_doid, axis=1)
    .drop(columns='list_diseases')
)
print(gwas_explode_condition_rows.shape)
gwas_explode_condition_rows.head()

(430, 4)


Unnamed: 0,MAPPED_GENE,SNPS,condition,doid
0,ZBTB7B,rs77548871,breast cancer,1612
0,ZBTB7B,rs77548871,ovarian cancer,2394
0,ZBTB7B,rs77548871,prostate cancer,10283
1,VAMP8,rs13007211,breast cancer,1612
1,VAMP8,rs13007211,ovarian cancer,2394


Recombine data.

In [245]:
# Stack the single-condition rows on top of the exploded multi-condition rows.
# Index labels are carried over from each input rather than renumbered.
condition_frames = [gwas_single_condition_rows, gwas_explode_condition_rows]
gwas_processed = pd.concat(condition_frames)
gwas_processed.sample(10)

Unnamed: 0,MAPPED_GENE,SNPS,condition,doid
4626,LHPP,rs35837782,acute lymphoblastic leukemia,9952
2468,NEB,rs10174077,lung squamous cell carcinoma,3907
6861,MSMB,rs10993994,prostate cancer,10283
418,LINGO2,rs2891316,endometrial cancer,1380
868,MYO9B,rs7249698,breast cancer,1612
3845,ATM,rs1801516,melanoma,1909
5825,TERT,rs56345976,breast cancer,1612
3295,LPP,rs1464510,skin cancer,4159
1320,PRPSAP1,rs66459581,ovarian cancer,2394
6934,LINC01169,rs8023793,prostate cancer,10283


## Final Mappings

In [260]:
# NOTE(review): the export below is commented out, so this cell replaces the frame built
# above with whatever gwas_final_processed.tsv is already on disk - confirm that file is
# up to date before relying on the results.
# gwas_processed.to_csv('../home/data/processed_data/GlyGen/gwas/gwas_final_processed.tsv', sep = '\t', index = False)
# 'doid' is read as a string so identifiers with leading zeros (e.g. '0060060') survive.
gwas_processed = pd.read_csv('../home/data/processed_data/GlyGen/gwas/gwas_final_processed.tsv', sep = '\t', dtype={'doid': str})

Create an empty DataFrame with the target TSV headers.

In [261]:
# Empty frame whose columns are the output headers defined in
# table_cleaning_functions (tcf.TSV_HEADERS); rows are filled in by the next cell.
df = pd.DataFrame(columns = tcf.TSV_HEADERS)
df.head()

Unnamed: 0,biomarker_id,biomarker,assessed_biomarker_entity,assessed_biomarker_entity_id,assessed_entity_type,condition,condition_id,exposure_agent,exposure_agent_id,best_biomarker_role,specimen,specimen_id,loinc_code,evidence_source,evidence,tag


Final mappings.

In [262]:
# Fill the output columns from gwas_processed. Order matters: the first Series
# assignment gives the empty frame its row index, and the scalar assignments that
# follow are broadcast to every row. Columns not assigned here stay empty.
# Gene symbol the SNP was mapped to.
df['assessed_biomarker_entity'] = gwas_processed['MAPPED_GENE']
# Human-readable description: "presence of <rsID> mutation in <gene>".
df['biomarker'] = 'presence of ' + gwas_processed['SNPS'].astype(str) + ' mutation in ' + df['assessed_biomarker_entity']
# Entity identifier: the SNP rsID with a 'dbSNP:' prefix.
df['assessed_biomarker_entity_id'] = 'dbSNP:' + gwas_processed['SNPS'].astype(str)
df['assessed_entity_type'] = 'gene'
# Condition name and its identifier, with surrounding whitespace removed.
df['condition'] = gwas_processed['condition'].str.strip()
df['condition_id'] = 'DOID:' + gwas_processed['doid'].astype(str).str.strip()
df['best_biomarker_role'] = 'risk'
# Evidence source is recorded as 'GWAS:<gene symbol>'.
df['evidence_source'] = 'GWAS:' + gwas_processed['MAPPED_GENE'].astype(str).str.strip()
# Same tag string for every row: a ';'-separated list of output field names.
df['tag'] = 'biomarker;assessed_biomarker_entity_id;assessed_biomarker_entity;specimen;condition'

print(df.shape)
df.head()

(7510, 16)


Unnamed: 0,biomarker_id,biomarker,assessed_biomarker_entity,assessed_biomarker_entity_id,assessed_entity_type,condition,condition_id,exposure_agent,exposure_agent_id,best_biomarker_role,specimen,specimen_id,loinc_code,evidence_source,evidence,tag
0,,presence of rs113988120 mutation in PAIP2B,PAIP2B,dbSNP:rs113988120,gene,pancreatic cancer,DOID:1793,,,risk,,,,GWAS:PAIP2B,,biomarker;assessed_biomarker_entity_id;assesse...
1,,presence of rs10736390 mutation in ACOT11,ACOT11,dbSNP:rs10736390,gene,pancreatic cancer,DOID:1793,,,risk,,,,GWAS:ACOT11,,biomarker;assessed_biomarker_entity_id;assesse...
2,,presence of rs10767646 mutation in BDNF-AS,BDNF-AS,dbSNP:rs10767646,gene,pancreatic cancer,DOID:1793,,,risk,,,,GWAS:BDNF-AS,,biomarker;assessed_biomarker_entity_id;assesse...
3,,presence of rs10817611 mutation in WHRN,WHRN,dbSNP:rs10817611,gene,pancreatic cancer,DOID:1793,,,risk,,,,GWAS:WHRN,,biomarker;assessed_biomarker_entity_id;assesse...
4,,presence of rs10835188 mutation in LIN7C,LIN7C,dbSNP:rs10835188,gene,pancreatic cancer,DOID:1793,,,risk,,,,GWAS:LIN7C,,biomarker;assessed_biomarker_entity_id;assesse...


In [266]:
# Drop rows that are duplicated across every column, keeping the first occurrence.
# NOTE(review): the recorded output below shows 17 columns, i.e. it was captured after
# 'tmp_id' had been inserted by the following cell (execution counts 266 vs 264). Run
# top to bottom, the frame has 16 columns at this point.
df = df.drop_duplicates()
df.shape

(5432, 17)

Assign temporary IDs.

In [264]:
# Rows sharing the same biomarker, entity id, entity and condition get the same
# group number; store it as the first column, 'tmp_id'.
biomarker_key_cols = ['biomarker', 'assessed_biomarker_entity_id', 'assessed_biomarker_entity', 'condition']
tmp_ids = df.groupby(biomarker_key_cols).ngroup()
df.insert(loc=0, column='tmp_id', value=tmp_ids)
df.head()

Unnamed: 0,tmp_id,biomarker_id,biomarker,assessed_biomarker_entity,assessed_biomarker_entity_id,assessed_entity_type,condition,condition_id,exposure_agent,exposure_agent_id,best_biomarker_role,specimen,specimen_id,loinc_code,evidence_source,evidence,tag
0,533,,presence of rs113988120 mutation in PAIP2B,PAIP2B,dbSNP:rs113988120,gene,pancreatic cancer,DOID:1793,,,risk,,,,GWAS:PAIP2B,,biomarker;assessed_biomarker_entity_id;assesse...
1,176,,presence of rs10736390 mutation in ACOT11,ACOT11,dbSNP:rs10736390,gene,pancreatic cancer,DOID:1793,,,risk,,,,GWAS:ACOT11,,biomarker;assessed_biomarker_entity_id;assesse...
2,193,,presence of rs10767646 mutation in BDNF-AS,BDNF-AS,dbSNP:rs10767646,gene,pancreatic cancer,DOID:1793,,,risk,,,,GWAS:BDNF-AS,,biomarker;assessed_biomarker_entity_id;assesse...
3,218,,presence of rs10817611 mutation in WHRN,WHRN,dbSNP:rs10817611,gene,pancreatic cancer,DOID:1793,,,risk,,,,GWAS:WHRN,,biomarker;assessed_biomarker_entity_id;assesse...
4,229,,presence of rs10835188 mutation in LIN7C,LIN7C,dbSNP:rs10835188,gene,pancreatic cancer,DOID:1793,,,risk,,,,GWAS:LIN7C,,biomarker;assessed_biomarker_entity_id;assesse...


In [269]:
# Replace the empty 'biomarker_id' column with the generated ids: remove the
# placeholder, then give 'tmp_id' its name.
df = (
    df
    .drop(columns='biomarker_id')
    .rename(columns={'tmp_id': 'biomarker_id'})
)
df.head()

Unnamed: 0,biomarker_id,biomarker,assessed_biomarker_entity,assessed_biomarker_entity_id,assessed_entity_type,condition,condition_id,exposure_agent,exposure_agent_id,best_biomarker_role,specimen,specimen_id,loinc_code,evidence_source,evidence,tag
0,533,presence of rs113988120 mutation in PAIP2B,PAIP2B,dbSNP:rs113988120,gene,pancreatic cancer,DOID:1793,,,risk,,,,GWAS:PAIP2B,,biomarker;assessed_biomarker_entity_id;assesse...
1,176,presence of rs10736390 mutation in ACOT11,ACOT11,dbSNP:rs10736390,gene,pancreatic cancer,DOID:1793,,,risk,,,,GWAS:ACOT11,,biomarker;assessed_biomarker_entity_id;assesse...
2,193,presence of rs10767646 mutation in BDNF-AS,BDNF-AS,dbSNP:rs10767646,gene,pancreatic cancer,DOID:1793,,,risk,,,,GWAS:BDNF-AS,,biomarker;assessed_biomarker_entity_id;assesse...
3,218,presence of rs10817611 mutation in WHRN,WHRN,dbSNP:rs10817611,gene,pancreatic cancer,DOID:1793,,,risk,,,,GWAS:WHRN,,biomarker;assessed_biomarker_entity_id;assesse...
4,229,presence of rs10835188 mutation in LIN7C,LIN7C,dbSNP:rs10835188,gene,pancreatic cancer,DOID:1793,,,risk,,,,GWAS:LIN7C,,biomarker;assessed_biomarker_entity_id;assesse...


In [270]:
# Save the cleaned biomarker table as a TSV, leaving the row index out of the file.
df.to_csv('../home/data/cleaned_data/GlyGen/gwas.tsv', sep = '\t', index = False)