# ClinVar

Original data was pulled from [here](https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz) (FTP link) on 2/14/24.

In [1]:
import pandas as pd
import table_cleaning_functions as tcf

# Preliminary Wrangling

In [45]:
# read in raw clinvar data 
clinvar_raw = pd.read_csv('../home/data/raw_data/GlyGen/clinvar_raw.txt', sep = '\t')
print(clinvar_raw.shape)
clinvar_raw.head()

  clinvar_raw = pd.read_csv('../home/data/raw_data/GlyGen/clinvar_raw.txt', sep = '\t')


(4807886, 40)


Unnamed: 0,#AlleleID,Type,Name,GeneID,GeneSymbol,HGNC_ID,ClinicalSignificance,ClinSigSimple,LastEvaluated,RS# (dbSNP),...,VariationID,PositionVCF,ReferenceAlleleVCF,AlternateAlleleVCF,SomaticClinicalImpact,SomaticClinicalImpactLastEvaluated,ReviewStatusClinicalImpact,Oncogenicity,OncogenicityLastEvaluated,ReviewStatusOncogenicity
0,15041,Indel,NM_014855.3(AP5Z1):c.80_83delinsTGCTGTAAACTGTA...,9907,AP5Z1,HGNC:22197,Pathogenic,0,-,397704705,...,2,4820844,GGAT,TGCTGTAAACTGTAACTGTAAA,-,-,-,-,-,-
1,15041,Indel,NM_014855.3(AP5Z1):c.80_83delinsTGCTGTAAACTGTA...,9907,AP5Z1,HGNC:22197,Pathogenic,0,-,397704705,...,2,4781213,GGAT,TGCTGTAAACTGTAACTGTAAA,-,-,-,-,-,-
2,15042,Deletion,NM_014855.3(AP5Z1):c.1413_1426del (p.Leu473fs),9907,AP5Z1,HGNC:22197,Pathogenic,0,"Jun 29, 2010",397704709,...,3,4827360,GCTGCTGGACCTGCC,G,-,-,-,-,-,-
3,15042,Deletion,NM_014855.3(AP5Z1):c.1413_1426del (p.Leu473fs),9907,AP5Z1,HGNC:22197,Pathogenic,0,"Jun 29, 2010",397704709,...,3,4787729,GCTGCTGGACCTGCC,G,-,-,-,-,-,-
4,15043,single nucleotide variant,NM_014630.3(ZNF592):c.3136G>A (p.Gly1046Arg),9640,ZNF592,HGNC:28986,Uncertain significance,0,"Jun 29, 2015",150829393,...,4,85342440,G,A,-,-,-,-,-,-


In [46]:
clinvar_raw.isnull().sum()

#AlleleID                             0
Type                                  0
Name                                  0
GeneID                                0
GeneSymbol                            0
HGNC_ID                               0
ClinicalSignificance                  0
ClinSigSimple                         0
LastEvaluated                         0
RS# (dbSNP)                           0
nsv/esv (dbVar)                       0
RCVaccession                          0
PhenotypeIDS                          0
PhenotypeList                         0
Origin                                0
OriginSimple                          0
Assembly                              0
ChromosomeAccession                   0
Chromosome                            0
Start                                 0
Stop                                  0
ReferenceAllele                       0
AlternateAllele                       0
Cytogenetic                           0
ReviewStatus                          0


Drop unnecessary rows.

In [47]:
clinvar_raw.columns

Index(['#AlleleID', 'Type', 'Name', 'GeneID', 'GeneSymbol', 'HGNC_ID',
       'ClinicalSignificance', 'ClinSigSimple', 'LastEvaluated', 'RS# (dbSNP)',
       'nsv/esv (dbVar)', 'RCVaccession', 'PhenotypeIDS', 'PhenotypeList',
       'Origin', 'OriginSimple', 'Assembly', 'ChromosomeAccession',
       'Chromosome', 'Start', 'Stop', 'ReferenceAllele', 'AlternateAllele',
       'Cytogenetic', 'ReviewStatus', 'NumberSubmitters', 'Guidelines',
       'TestedInGTR', 'OtherIDs', 'SubmitterCategories', 'VariationID',
       'PositionVCF', 'ReferenceAlleleVCF', 'AlternateAlleleVCF',
       'SomaticClinicalImpact', 'SomaticClinicalImpactLastEvaluated',
       'ReviewStatusClinicalImpact', 'Oncogenicity',
       'OncogenicityLastEvaluated', 'ReviewStatusOncogenicity'],
      dtype='object')

In [48]:
cols = list(clinvar_raw.columns)
cols

['#AlleleID',
 'Type',
 'Name',
 'GeneID',
 'GeneSymbol',
 'HGNC_ID',
 'ClinicalSignificance',
 'ClinSigSimple',
 'LastEvaluated',
 'RS# (dbSNP)',
 'nsv/esv (dbVar)',
 'RCVaccession',
 'PhenotypeIDS',
 'PhenotypeList',
 'Origin',
 'OriginSimple',
 'Assembly',
 'ChromosomeAccession',
 'Chromosome',
 'Start',
 'Stop',
 'ReferenceAllele',
 'AlternateAllele',
 'Cytogenetic',
 'ReviewStatus',
 'NumberSubmitters',
 'Guidelines',
 'TestedInGTR',
 'OtherIDs',
 'SubmitterCategories',
 'VariationID',
 'PositionVCF',
 'ReferenceAlleleVCF',
 'AlternateAlleleVCF',
 'SomaticClinicalImpact',
 'SomaticClinicalImpactLastEvaluated',
 'ReviewStatusClinicalImpact',
 'Oncogenicity',
 'OncogenicityLastEvaluated',
 'ReviewStatusOncogenicity']

In [49]:
drop_cols = [x for x in cols if x not in {'GeneSymbol', 'RS# (dbSNP)', 'PhenotypeList', 'VariationID'}]
clinvar_raw = clinvar_raw.drop(columns = drop_cols)
clinvar_raw.head()

Unnamed: 0,GeneSymbol,RS# (dbSNP),PhenotypeList,VariationID
0,AP5Z1,397704705,Hereditary spastic paraplegia 48,2
1,AP5Z1,397704705,Hereditary spastic paraplegia 48,2
2,AP5Z1,397704709,Hereditary spastic paraplegia 48,3
3,AP5Z1,397704709,Hereditary spastic paraplegia 48,3
4,ZNF592,150829393,Galloway-Mowat syndrome 1,4


Only interested in cancer related rows right now, filter on cancer related conditions.

In [50]:
cancer_related = ['cancer', 'carcinoma', 'leukemia', 'tumor', 'malignancy', 'glioblastoma',
                'melanoma', 'lymphoma', 'sarcoma']

clinvar_raw = clinvar_raw[clinvar_raw['PhenotypeList'].str.contains('|'.join(cancer_related), case = False, na = False)]  
print(clinvar_raw.shape)
clinvar_raw.head()

(422117, 4)


Unnamed: 0,GeneSymbol,RS# (dbSNP),PhenotypeList,VariationID
12,HFE,1800562,Hemochromatosis type 1|Hereditary cancer-predi...,9
13,HFE,1800562,Hemochromatosis type 1|Hereditary cancer-predi...,9
206,TMEM127,121908830,"Pheochromocytoma, susceptibility to|Pheochromo...",108
207,TMEM127,121908830,"Pheochromocytoma, susceptibility to|Pheochromo...",108
528,KLHDC8B,387906223,Classic Hodgkin lymphoma,273


Explode on phenotype list by `|` then `;` and then re-filter (filtering is done again versus doing it before original filter because of sheer size of data).

In [51]:
clinvar_raw['PhenotypeList_new'] = clinvar_raw['PhenotypeList'].str.split('|')
clinvar_raw.head()

Unnamed: 0,GeneSymbol,RS# (dbSNP),PhenotypeList,VariationID,PhenotypeList_new
12,HFE,1800562,Hemochromatosis type 1|Hereditary cancer-predi...,9,"[Hemochromatosis type 1, Hereditary cancer-pre..."
13,HFE,1800562,Hemochromatosis type 1|Hereditary cancer-predi...,9,"[Hemochromatosis type 1, Hereditary cancer-pre..."
206,TMEM127,121908830,"Pheochromocytoma, susceptibility to|Pheochromo...",108,"[Pheochromocytoma, susceptibility to, Pheochro..."
207,TMEM127,121908830,"Pheochromocytoma, susceptibility to|Pheochromo...",108,"[Pheochromocytoma, susceptibility to, Pheochro..."
528,KLHDC8B,387906223,Classic Hodgkin lymphoma,273,[Classic Hodgkin lymphoma]


In [52]:
# explode phenotype list new 
clinvar_raw = clinvar_raw.explode('PhenotypeList_new')
print(clinvar_raw.shape)
clinvar_raw.head()

(822841, 5)


Unnamed: 0,GeneSymbol,RS# (dbSNP),PhenotypeList,VariationID,PhenotypeList_new
12,HFE,1800562,Hemochromatosis type 1|Hereditary cancer-predi...,9,Hemochromatosis type 1
12,HFE,1800562,Hemochromatosis type 1|Hereditary cancer-predi...,9,Hereditary cancer-predisposing syndrome
12,HFE,1800562,Hemochromatosis type 1|Hereditary cancer-predi...,9,not provided
12,HFE,1800562,Hemochromatosis type 1|Hereditary cancer-predi...,9,Hereditary hemochromatosis
12,HFE,1800562,Hemochromatosis type 1|Hereditary cancer-predi...,9,Porphyrinuria;Cutaneous photosensitivity


In [53]:
# drop old phenotype list column
clinvar_raw = clinvar_raw.drop(columns = ['PhenotypeList'])
clinvar_raw.head()

Unnamed: 0,GeneSymbol,RS# (dbSNP),VariationID,PhenotypeList_new
12,HFE,1800562,9,Hemochromatosis type 1
12,HFE,1800562,9,Hereditary cancer-predisposing syndrome
12,HFE,1800562,9,not provided
12,HFE,1800562,9,Hereditary hemochromatosis
12,HFE,1800562,9,Porphyrinuria;Cutaneous photosensitivity


In [54]:
clinvar_raw['PhenotypeList'] = clinvar_raw['PhenotypeList_new'].str.split(';')
clinvar_raw.head()

Unnamed: 0,GeneSymbol,RS# (dbSNP),VariationID,PhenotypeList_new,PhenotypeList
12,HFE,1800562,9,Hemochromatosis type 1,[Hemochromatosis type 1]
12,HFE,1800562,9,Hereditary cancer-predisposing syndrome,[Hereditary cancer-predisposing syndrome]
12,HFE,1800562,9,not provided,[not provided]
12,HFE,1800562,9,Hereditary hemochromatosis,[Hereditary hemochromatosis]
12,HFE,1800562,9,Porphyrinuria;Cutaneous photosensitivity,"[Porphyrinuria, Cutaneous photosensitivity]"


In [55]:
# explode phenotype list again
clinvar_raw = clinvar_raw.explode('PhenotypeList')
print(clinvar_raw.shape)
clinvar_raw.head()

(882110, 5)


Unnamed: 0,GeneSymbol,RS# (dbSNP),VariationID,PhenotypeList_new,PhenotypeList
12,HFE,1800562,9,Hemochromatosis type 1,Hemochromatosis type 1
12,HFE,1800562,9,Hereditary cancer-predisposing syndrome,Hereditary cancer-predisposing syndrome
12,HFE,1800562,9,not provided,not provided
12,HFE,1800562,9,Hereditary hemochromatosis,Hereditary hemochromatosis
12,HFE,1800562,9,Porphyrinuria;Cutaneous photosensitivity,Porphyrinuria


In [56]:
# drop old phenotype list column
clinvar_raw = clinvar_raw.drop(columns = ['PhenotypeList_new'])
clinvar_raw.head()

Unnamed: 0,GeneSymbol,RS# (dbSNP),VariationID,PhenotypeList
12,HFE,1800562,9,Hemochromatosis type 1
12,HFE,1800562,9,Hereditary cancer-predisposing syndrome
12,HFE,1800562,9,not provided
12,HFE,1800562,9,Hereditary hemochromatosis
12,HFE,1800562,9,Porphyrinuria


In [57]:
# re-filter for cancer related diseases
clinvar_raw = clinvar_raw[clinvar_raw['PhenotypeList'].str.contains('|'.join(cancer_related), case = False, na = False)]
print(clinvar_raw.shape)
clinvar_raw.head()

(563364, 4)


Unnamed: 0,GeneSymbol,RS# (dbSNP),VariationID,PhenotypeList
12,HFE,1800562,9,Hereditary cancer-predisposing syndrome
13,HFE,1800562,9,Hereditary cancer-predisposing syndrome
206,TMEM127,121908830,108,Hereditary cancer-predisposing syndrome
207,TMEM127,121908830,108,Hereditary cancer-predisposing syndrome
528,KLHDC8B,387906223,273,Classic Hodgkin lymphoma


In [58]:
clinvar_raw.isnull().sum()

GeneSymbol       0
RS# (dbSNP)      0
VariationID      0
PhenotypeList    0
dtype: int64

Re-format rs ID column.

In [59]:
clinvar_raw['RS# (dbSNP)'] = 'rs' + clinvar_raw['RS# (dbSNP)'].astype(str)
clinvar_raw['RS# (dbSNP)']

12           rs1800562
13           rs1800562
206        rs121908830
207        rs121908830
528        rs387906223
              ...     
4807791           rs-1
4807792           rs-1
4807793           rs-1
4807794           rs-1
4807795           rs-1
Name: RS# (dbSNP), Length: 563364, dtype: object

In [60]:
clinvar_raw = clinvar_raw[~clinvar_raw['RS# (dbSNP)'].str.contains('-1')]
print(clinvar_raw.shape)
clinvar_raw['RS# (dbSNP)'].value_counts()

(451062, 4)


RS# (dbSNP)
rs11540652      132
rs28934576      122
rs587778720     112
rs786201057     104
rs587780070     100
               ... 
rs2137450852      1
rs552306042       1
rs2137457893      1
rs2137458677      1
rs45445694        1
Name: count, Length: 129423, dtype: int64

Rename columns. 

In [61]:
cols = ['gene', 'rs_id', 'variation_id', 'disease']
clinvar_raw.columns = cols
clinvar_raw.head()

Unnamed: 0,gene,rs_id,variation_id,disease
12,HFE,rs1800562,9,Hereditary cancer-predisposing syndrome
13,HFE,rs1800562,9,Hereditary cancer-predisposing syndrome
206,TMEM127,rs121908830,108,Hereditary cancer-predisposing syndrome
207,TMEM127,rs121908830,108,Hereditary cancer-predisposing syndrome
528,KLHDC8B,rs387906223,273,Classic Hodgkin lymphoma


Normalize condition values before isolating.

In [62]:
clinvar_raw['disease'] = clinvar_raw['disease'].str.lower().str.strip()
clinvar_raw.head()

Unnamed: 0,gene,rs_id,variation_id,disease
12,HFE,rs1800562,9,hereditary cancer-predisposing syndrome
13,HFE,rs1800562,9,hereditary cancer-predisposing syndrome
206,TMEM127,rs121908830,108,hereditary cancer-predisposing syndrome
207,TMEM127,rs121908830,108,hereditary cancer-predisposing syndrome
528,KLHDC8B,rs387906223,273,classic hodgkin lymphoma


Isolate condition data for cleaning. 

In [67]:
conditions = clinvar_raw['disease']
conditions = conditions.drop_duplicates()
conditions.to_csv('../home/data/processed_data/GlyGen/clinvar/clinvar_conditions.tsv', sep = '\t', index = False)

In [68]:
conditions.shape

(414,)

Save processed data and isolate conditions to clean manually. 

In [69]:
clinvar_raw.to_csv('../home/data/processed_data/GlyGen/clinvar/clinvar_processed.tsv', sep = '\t', index = False)

# Secondary Wrangling

In [12]:
clinvar_processed = pd.read_csv('../home/data/processed_data/GlyGen/clinvar/clinvar_processed.tsv', sep = '\t')
print(clinvar_processed.shape)
clinvar_processed.head()

(451062, 4)


Unnamed: 0,gene,rs_id,variation_id,disease
0,HFE,rs1800562,9,hereditary cancer-predisposing syndrome
1,HFE,rs1800562,9,hereditary cancer-predisposing syndrome
2,TMEM127,rs121908830,108,hereditary cancer-predisposing syndrome
3,TMEM127,rs121908830,108,hereditary cancer-predisposing syndrome
4,KLHDC8B,rs387906223,273,classic hodgkin lymphoma


In [102]:
clinvar_processed.isnull().sum()

gene            0
rs_id           0
variation_id    0
disease         0
dtype: int64

Load cleaned conditions. 

In [103]:
clinvar_conditions = pd.read_csv('../home/data/processed_data/GlyGen/clinvar/clinvar_conditions.tsv', sep = '\t')
clinvar_drop_conditions = pd.read_csv('../home/data/processed_data/GlyGen/clinvar/clinvar_drop_conditions.tsv', sep = '\t')
clinvar_explode_conditions = pd.read_csv('../home/data/processed_data/GlyGen/clinvar/clinvar_explode_conditions.tsv', sep = '\t')

Drop conditions that are in the drop list.

In [104]:
drop_conditions = set(clinvar_drop_conditions['condition'])
clinvar_processed = clinvar_processed[~clinvar_processed['disease'].isin(drop_conditions)]

Isolate single and list conditions. 

In [105]:
clinvar_explode_condition_rows = clinvar_processed[clinvar_processed['disease'].isin(set(clinvar_explode_conditions['raw_disease']))]
print(clinvar_explode_condition_rows.shape)
clinvar_explode_condition_rows.head()

(4228, 4)


Unnamed: 0,gene,rs_id,variation_id,disease
330,ATM,rs587776547,3019,breast and/or ovarian cancer
335,ATM,rs587776547,3019,breast and/or ovarian cancer
338,ATM,rs774925473,3021,breast and/or ovarian cancer
342,ATM,rs774925473,3021,breast and/or ovarian cancer
353,ATM,rs28904921,3023,breast and/or ovarian cancer


In [106]:
clinvar_single_condition_rows = clinvar_processed[clinvar_processed['disease'].isin(set(clinvar_conditions['disease']))]
print(clinvar_single_condition_rows.shape)
clinvar_single_condition_rows.head()

(445544, 4)


Unnamed: 0,gene,rs_id,variation_id,disease
0,HFE,rs1800562,9,hereditary cancer-predisposing syndrome
1,HFE,rs1800562,9,hereditary cancer-predisposing syndrome
2,TMEM127,rs121908830,108,hereditary cancer-predisposing syndrome
3,TMEM127,rs121908830,108,hereditary cancer-predisposing syndrome
4,KLHDC8B,rs387906223,273,classic hodgkin lymphoma


In [107]:
# sanity check
assert(clinvar_processed.shape[0] == clinvar_explode_condition_rows.shape[0] + clinvar_single_condition_rows.shape[0])

Map single condition rows.

In [108]:
clinvar_single_condition_rows = clinvar_single_condition_rows.merge(clinvar_conditions, how='left', on='disease')
clinvar_single_condition_rows.sample(5)

Unnamed: 0,gene,rs_id,variation_id,disease,condition,doid
353797,BRCA1,rs730881472,950867,hereditary cancer-predisposing syndrome,cancer,:162
69981,PALB2,rs730881867,182740,familial cancer of breast,breast cancer,:1612
287787,ALK,rs552187705,702961,hereditary cancer-predisposing syndrome,cancer,:162
304607,MET,rs1584914144,818832,renal cell carcinoma,renal cell carcinoma,:4450
92176,BRCA1,rs863224762,216670,hereditary breast ovarian cancer syndrome,hereditary breast ovarian cancer syndrome,:5683


In [109]:
clinvar_single_condition_rows.isnull().sum()

gene                0
rs_id               0
variation_id        0
disease             0
condition       35923
doid                0
dtype: int64

In [110]:
clinvar_single_condition_rows.columns

Index(['gene', 'rs_id', 'variation_id', 'disease', 'condition', 'doid'], dtype='object')

In [111]:
clinvar_single_condition_rows['doid'] = 'DOID' + clinvar_single_condition_rows['doid'].astype(str)
clinvar_single_condition_rows.sample(5)

Unnamed: 0,gene,rs_id,variation_id,disease,condition,doid
399297,BRCA1,rs2154475121,1171931,hereditary cancer-predisposing syndrome,cancer,DOID:162
188671,RET,rs191769748,477367,familial medullary thyroid carcinoma,familial medullary thyroid carcinoma,DOID:0050547
77024,MRE11,rs786202951,186437,hereditary cancer-predisposing syndrome,cancer,DOID:162
285187,HOXB13,rs1597934943,690580,"prostate cancer, hereditary, 1",prostate cancer,DOID:10283
26960,BRCA1,rs397508931,54441,"breast-ovarian cancer, familial, susceptibilit...",hereditary breast ovarian cancer syndrome,DOID:5683


In [112]:
clinvar_single_condition_rows = clinvar_single_condition_rows.drop('disease', axis=1)
clinvar_single_condition_rows.sample(5)

Unnamed: 0,gene,rs_id,variation_id,condition,doid
123080,SMARCA4,rs146141457,238531,,DOID:
390437,SMARCA4,rs765751495,1108552,cancer,DOID:162
393230,DICER1,rs552975520,1139621,,DOID:
329659,TSC2,rs397515241,838204,cancer,DOID:162
369137,ALK,rs753812499,1023840,cancer,DOID:162


Map multiple condition rows.

In [113]:
clinvar_explode_condition_rows.sample(5)

Unnamed: 0,gene,rs_id,variation_id,disease
292330,ATR,rs1397771631,702854,familial cutaneous telangiectasia and orophary...
51408,PMS2,rs374704824,127793,breast and/or ovarian cancer
35049,BRCA1,rs80357066,55769,breast and/or ovarian cancer
373973,MSH2,rs1203462814,1021808,breast and/or ovarian cancer
57531,CDH1,rs115817750,136067,breast and/or ovarian cancer


In [114]:
clinvar_explode_conditions.sample(5)

Unnamed: 0,raw_disease,list_diseases
4,colorectal / endometrial cancer,"colorectal cancer:9256,endometrial cancer:1380"
8,increased risk to develop myelodysplastic synd...,"myelodysplastic syndrome:0050908,acute myeloid..."
2,"tumor predisposition syndrome 4, breast/prosta...","breast cancer:1612,prostate cancer:10283,color..."
0,breast and/or ovarian cancer,"breast cancer:1612,ovarian cancer:2394"
3,prostate cancer/brain cancer susceptibility,"prostate cancer:10283,brain cancer:1319"


In [115]:
clinvar_explode_condition_rows = clinvar_explode_condition_rows.merge(clinvar_explode_conditions, how='left', left_on='disease', right_on='raw_disease')
clinvar_explode_condition_rows.sample(5)

Unnamed: 0,gene,rs_id,variation_id,disease,raw_disease,list_diseases
844,BRCA2,rs397507907,52298,breast and/or ovarian cancer,breast and/or ovarian cancer,"breast cancer:1612,ovarian cancer:2394"
1992,RAD51D,rs587781756,141452,breast and/or ovarian cancer,breast and/or ovarian cancer,"breast cancer:1612,ovarian cancer:2394"
3872,ATR,rs778499519,1127830,familial cutaneous telangiectasia and orophary...,familial cutaneous telangiectasia and orophary...,"[TODO]:[TODO],[TODO]:[TODO]"
1138,BRCA1,rs80357066,55769,breast and/or ovarian cancer,breast and/or ovarian cancer,"breast cancer:1612,ovarian cancer:2394"
3022,ATM,rs767070325,378989,breast and/or ovarian cancer,breast and/or ovarian cancer,"breast cancer:1612,ovarian cancer:2394"


In [116]:
clinvar_explode_condition_rows.isnull().sum()

gene             0
rs_id            0
variation_id     0
disease          0
raw_disease      0
list_diseases    0
dtype: int64

In [117]:
clinvar_explode_condition_rows = clinvar_explode_condition_rows.drop(['disease', 'raw_disease'], axis = 1)
clinvar_explode_condition_rows.head()

Unnamed: 0,gene,rs_id,variation_id,list_diseases
0,ATM,rs587776547,3019,"breast cancer:1612,ovarian cancer:2394"
1,ATM,rs587776547,3019,"breast cancer:1612,ovarian cancer:2394"
2,ATM,rs774925473,3021,"breast cancer:1612,ovarian cancer:2394"
3,ATM,rs774925473,3021,"breast cancer:1612,ovarian cancer:2394"
4,ATM,rs28904921,3023,"breast cancer:1612,ovarian cancer:2394"


In [118]:
clinvar_explode_condition_rows['list_diseases'] = clinvar_explode_condition_rows['list_diseases'].str.split(',')
clinvar_explode_condition_rows.head()

Unnamed: 0,gene,rs_id,variation_id,list_diseases
0,ATM,rs587776547,3019,"[breast cancer:1612, ovarian cancer:2394]"
1,ATM,rs587776547,3019,"[breast cancer:1612, ovarian cancer:2394]"
2,ATM,rs774925473,3021,"[breast cancer:1612, ovarian cancer:2394]"
3,ATM,rs774925473,3021,"[breast cancer:1612, ovarian cancer:2394]"
4,ATM,rs28904921,3023,"[breast cancer:1612, ovarian cancer:2394]"


In [119]:
clinvar_explode_condition_rows = clinvar_explode_condition_rows.explode('list_diseases')
print(clinvar_explode_condition_rows.shape)
clinvar_explode_condition_rows.head()

(8462, 4)


Unnamed: 0,gene,rs_id,variation_id,list_diseases
0,ATM,rs587776547,3019,breast cancer:1612
0,ATM,rs587776547,3019,ovarian cancer:2394
1,ATM,rs587776547,3019,breast cancer:1612
1,ATM,rs587776547,3019,ovarian cancer:2394
2,ATM,rs774925473,3021,breast cancer:1612


In [120]:
clinvar_explode_condition_rows[['condition', 'doid']] = clinvar_explode_condition_rows['list_diseases'].str.split(':', expand=True, n=1)
clinvar_explode_condition_rows['doid'] = 'DOID:' + clinvar_explode_condition_rows['doid'].astype(str)
clinvar_explode_condition_rows.drop(columns=['list_diseases'], inplace=True)
clinvar_explode_condition_rows.head()

Unnamed: 0,gene,rs_id,variation_id,condition,doid
0,ATM,rs587776547,3019,breast cancer,DOID:1612
0,ATM,rs587776547,3019,ovarian cancer,DOID:2394
1,ATM,rs587776547,3019,breast cancer,DOID:1612
1,ATM,rs587776547,3019,ovarian cancer,DOID:2394
2,ATM,rs774925473,3021,breast cancer,DOID:1612


Recombine data.

In [121]:
clinvar_processed = pd.concat([clinvar_single_condition_rows, clinvar_single_condition_rows])
clinvar_processed.sample(10)

Unnamed: 0,gene,rs_id,variation_id,condition,doid
421640,MET,rs2117029370,1430735,cancer,DOID:162
163028,PALB2,rs1060502737,410114,breast cancer,DOID:1612
258727,RAD51D,rs28363283,581983,cancer,DOID:162
135158,MTAP,rs4977735,366307,,DOID:
259378,GPC3,rs761660909,578171,,DOID:
48696,ATM,rs587779861,127437,breast cancer,DOID:1612
280192,BRCA1,rs1597864637,639392,hereditary breast ovarian cancer syndrome,DOID:5683
74835,PMS2,rs759192470,185217,cancer,DOID:162
154308,ABRAXAS1,rs370520589,416705,cancer,DOID:162
103092,MLH1,rs63750016,233750,cancer,DOID:162


In [122]:
clinvar_processed.to_csv('../home/data/processed_data/GlyGen/clinvar/clinvar_final_processed.tsv', sep='\t', index = False)

# Final Mappings

In [2]:
clinvar_processed = pd.read_csv('../home/data/processed_data/GlyGen/clinvar/clinvar_final_processed.tsv', sep='\t')
clinvar_processed.head()

Unnamed: 0,gene,rs_id,variation_id,condition,doid
0,HFE,rs1800562,9,cancer,DOID:162
1,HFE,rs1800562,9,cancer,DOID:162
2,TMEM127,rs121908830,108,cancer,DOID:162
3,TMEM127,rs121908830,108,cancer,DOID:162
4,KLHDC8B,rs387906223,273,Hodgkin's lymphoma,DOID:8567


In [3]:
df = pd.DataFrame(columns = tcf.TSV_HEADERS)
df.head()

Unnamed: 0,biomarker_id,biomarker,assessed_biomarker_entity,assessed_biomarker_entity_id,assessed_entity_type,condition,condition_id,exposure_agent,exposure_agent_id,best_biomarker_role,specimen,specimen_id,loinc_code,evidence_source,evidence,tag


In [4]:
df['assessed_biomarker_entity'] = clinvar_processed['gene']
df['biomarker'] = 'presence of ' + clinvar_processed['rs_id'].astype(str) + ' mutation in ' + df['assessed_biomarker_entity']
df['assessed_biomarker_entity_id'] = 'dbSNP:' + clinvar_processed['rs_id'].astype(str)
df['assessed_entity_type'] = 'gene'
df['condition'] = clinvar_processed['condition'].str.strip()
df['condition_id'] = clinvar_processed['doid'].astype(str).str.strip()
df['best_biomarker_role'] = 'risk'
df['evidence_source'] = 'CLINVAR:' + clinvar_processed['variation_id'].astype(str).str.strip()
df['tag'] = 'biomarker;assessed_biomarker_entity_id;assessed_biomarker_entity;condition'

print(df.shape)
df.head()

(891088, 16)


Unnamed: 0,biomarker_id,biomarker,assessed_biomarker_entity,assessed_biomarker_entity_id,assessed_entity_type,condition,condition_id,exposure_agent,exposure_agent_id,best_biomarker_role,specimen,specimen_id,loinc_code,evidence_source,evidence,tag
0,,presence of rs1800562 mutation in HFE,HFE,dbSNP:rs1800562,gene,cancer,DOID:162,,,risk,,,,CLINVAR:9,,biomarker;assessed_biomarker_entity_id;assesse...
1,,presence of rs1800562 mutation in HFE,HFE,dbSNP:rs1800562,gene,cancer,DOID:162,,,risk,,,,CLINVAR:9,,biomarker;assessed_biomarker_entity_id;assesse...
2,,presence of rs121908830 mutation in TMEM127,TMEM127,dbSNP:rs121908830,gene,cancer,DOID:162,,,risk,,,,CLINVAR:108,,biomarker;assessed_biomarker_entity_id;assesse...
3,,presence of rs121908830 mutation in TMEM127,TMEM127,dbSNP:rs121908830,gene,cancer,DOID:162,,,risk,,,,CLINVAR:108,,biomarker;assessed_biomarker_entity_id;assesse...
4,,presence of rs387906223 mutation in KLHDC8B,KLHDC8B,dbSNP:rs387906223,gene,Hodgkin's lymphoma,DOID:8567,,,risk,,,,CLINVAR:273,,biomarker;assessed_biomarker_entity_id;assesse...


In [5]:
# drop rows that are full duplicates
df = df.drop_duplicates()
df.shape

(211968, 16)

In [6]:
df['biomarker_id'] = df.groupby(['biomarker', 'assessed_biomarker_entity_id', 'assessed_biomarker_entity', 'condition']).ngroup()
df.head()

Unnamed: 0,biomarker_id,biomarker,assessed_biomarker_entity,assessed_biomarker_entity_id,assessed_entity_type,condition,condition_id,exposure_agent,exposure_agent_id,best_biomarker_role,specimen,specimen_id,loinc_code,evidence_source,evidence,tag
0,64850.0,presence of rs1800562 mutation in HFE,HFE,dbSNP:rs1800562,gene,cancer,DOID:162,,,risk,,,,CLINVAR:9,,biomarker;assessed_biomarker_entity_id;assesse...
2,14099.0,presence of rs121908830 mutation in TMEM127,TMEM127,dbSNP:rs121908830,gene,cancer,DOID:162,,,risk,,,,CLINVAR:108,,biomarker;assessed_biomarker_entity_id;assesse...
4,96427.0,presence of rs387906223 mutation in KLHDC8B,KLHDC8B,dbSNP:rs387906223,gene,Hodgkin's lymphoma,DOID:8567,,,risk,,,,CLINVAR:273,,biomarker;assessed_biomarker_entity_id;assesse...
6,22218.0,presence of rs137854550 mutation in NF1,NF1,dbSNP:rs137854550,gene,juvenile myelomonocytic leukemia,DOID:0050458,,,risk,,,,CLINVAR:336,,biomarker;assessed_biomarker_entity_id;assesse...
7,22217.0,presence of rs137854550 mutation in NF1,NF1,dbSNP:rs137854550,gene,cancer,DOID:162,,,risk,,,,CLINVAR:336,,biomarker;assessed_biomarker_entity_id;assesse...


In [11]:
df.to_csv('../home/data/cleaned_data/GlyGen/clinvar.tsv', sep = '\t', index = False)