In [1]:
import pandas as pd
import numpy as np
from biological_fuzzy_logic_networks.utils import read_sif
from scipy.sparse import dok_matrix
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Directory prefix used to build every input and output path in this notebook.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
data_dir = "/dccstor/ipc1/CAR/DREAM/DREAMdata/"

In [3]:
# Read the prior-knowledge network from its SIF file. `read_sif` yields the node names and an
# edge container (used below through `.keys()`, so presumably a dict keyed by edge).
nodes, edge_list = read_sif(f"{data_dir}prior_knowledge.sif")
nodes

['PIP3',
 'AKT_S473',
 'p53',
 'RB',
 'GSK3B',
 'AMPK',
 'cleavedCas',
 'SMAD23',
 'ERK12',
 'MSK12',
 'MKK36',
 'H3',
 'p90RSK',
 'SERUM',
 'mTOR',
 'AKT',
 'NFkB',
 'cAMP',
 'PKA',
 'PKC',
 'p38',
 'PLCg2',
 'MET',
 'RAF',
 'p70S6K',
 'PDPK1',
 'AKT_T308',
 'MAPKAPK2',
 'PTEN',
 'b-catenin',
 'S6',
 'CREB',
 'RAS',
 'PI3K',
 'BTK',
 'RAF_S259',
 'MARCKS',
 'SRC',
 'FAK',
 'EGFR',
 'MEK12_S221',
 '4EBP1',
 'SYK',
 'MKK4',
 'JNK',
 'MAP3Ks',
 'INSR',
 'PAK',
 'STAT5',
 'EGF',
 'MEK12',
 'STAT1',
 'STAT3']

In [4]:
# Sanity check: the first two entries of every edge key, taken together, must be exactly the node list.
edge_endpoints = {endpoint for edge in edge_list.keys() for endpoint in (edge[0], edge[1])}
edge_endpoints == set(nodes)

True

In [5]:
# Maps every node of the prior-knowledge network to the gene symbols that encode it.
# Nodes that are not gene products (serum, cAMP, EGF stimulus) keep a single "" placeholder.
protein_to_gene_map = {
 # NOTE(review): PIP3 in a signalling network is normally the lipid PI(3,4,5)P3, which has no
 # gene; "PIP" is the symbol of an unrelated gene — confirm this mapping is intended.
 'PIP3': ["PIP"],
 'AKT_S473': ["AKT1", "AKT2", "AKT3"], # Isoforms
 'p53': ["TP53"],
 'RB': ["RB1"],
 'GSK3B': ["GSK3B"],
 'AMPK': ["PRKAA1", "PRKAA2", "PRKAB1", "PRKAB2", "PRKAG1", "PRKAG2", "PRKAG3"], # Isoforms?
 'cleavedCas': ["CASP3"], # Caspase 3
 'SMAD23': ["SMAD2", "SMAD3"], # Subunits
 'ERK12': ["MAPK1", "MAPK3"], # Subunits
 # NOTE(review): if this node stands for MSK1/2, the genes would be RPS6KA5/RPS6KA4 (already
 # listed under p90RSK); ITGB1 only carries "MSK12" as an alias — confirm.
 'MSK12': ["ITGB1"],
 'MKK36': ["MAP2K6", "MAP2K3"], # Subunits
 # A comma was missing between "H3C7" and "H3C8"; Python joined the two adjacent literals
 # into the non-existent symbol "H3C7H3C8", so neither gene was in the list.
 'H3': ["H3C1", "H3C2", "H3C3", "H3C4", "H3C5", "H3C6", "H3C7", "H3C8",
        "H3C10", "H3C11", "H3C12", "H3C13", "H3C14", "H3C15",
        "H3-3A", "H3-3B", "H3Y1", "H3Y2", "H3-4"],
 'p90RSK': ["RPS6KA1", "RPS6KA2", "RPS6KA3", "RPS6KA4", "RPS6KA5", "RPS6KA6"], # Isoforms
 "SERUM": [""],
 'mTOR': ["MTOR", "DEPTOR", "RPTOR", "RICTOR", "MLST8", # See mtor wiki page, subunits
          "AKT1S1", #PRAS40
          "MAPKAP1", #mSin1
          "PRR5L", "PRR5" # PROTOR
         ], # (the second, duplicated "RICTOR" entry was removed)
 'AKT': ["AKT1", "AKT2", "AKT3"], # Isoforms
 'NFkB': ["NFKB1", "NFKB2", "RELA", "RELB", "REL"], # Subunits and isoforms
 'cAMP': [""],
 'PKA': ["PRKACA", "PRKACB", "PRKACG", "PRKAR1A", "PRKAR1B", "PRKAR2A", "PRKAR2B"], # Subunits and isoforms
 'PKC': ["PRKCA", "PRKCB", "PRKCD", "PRKCE", "PRKCG", "PRKCH", "PRKCQ", "PRKCI", "PRKCZ"], # Isoforms
 'p38': ["MAPK14", "MAPK11", "MAPK12", "MAPK13"], # Isoforms
 'PLCg2': ["PLCG2"],
 'MET': ["MET"],
 'RAF': ["RAF1", "ARAF", "BRAF"], # Isoforms
 'p70S6K': ["RPS6KB1", "RPS6KB2"],
 'PDPK1': ["PDPK1"],
 'AKT_T308': ["AKT1", "AKT2", "AKT3"],
 'MAPKAPK2': ["MAPKAPK2"],
 'PTEN': ["PTEN"],
 'b-catenin': ["CTNNB1"],
 'S6': ["RPS6"],
 'CREB': ["CREB1", "ATF4", "CREB3", "CREB5", "CREB3L2", "CREB3L3", "CREB3L4"], # Isoforms
 'RAS': ["NRAS", "KRAS", "HRAS"], # MANY MORE but these are most important
 'PI3K': ["PIK3CA", "PIK3CB", "PIK3CG", "PIK3CD", # Isoforms and subunits
          "PIK3R1", "PIK3R2", "PIK3R3", "PIK3R4", "PIK3R5", "PIK3R6",
          "PIK3C2A", "PIK3C2B", "PIK3C2G", "PIK3C3"],
 'BTK': ["BTK"],
 'RAF_S259': ["RAF1", "ARAF", "BRAF"], # Isoforms
 'MARCKS': ["MARCKS"],
 'SRC': ["SRC"],
 'FAK': ["PTK2"],
 'EGFR': ["EGFR"],
 'MEK12_S221': ["MAP2K1", "MAP2K2"], # Subunits
 '4EBP1': ["EIF4EBP1"],
 'SYK': ["SYK"],
 'MKK4': ["MAP2K4"],
 'JNK': ["MAPK8", "MAPK9"], # Isoform for JNK1 and JNK2 (JNK3 only found in brain, heart and testes)
 'MAP3Ks': ["MAP3K1", "MAP3K2", "MAP3K3", "MAP3K4", "MAP3K5", "MAP3K6", "MAP3K7", "MAP3K8", "MAP3K9",
            "MAP3K10", "MAP3K11", "MAP3K12", "MAP3K13", "MAP3K14", "MAP3K15", "TAOK1", "TAOK2",
            "TAOK3", "ZAK"], # Isoforms
 'INSR': ["INSR"],
 'PAK': ["PAK1", "PAK2", "PAK3", "PAK4", "PAK5", "PAK6"],
 'STAT5': ["STAT5A", "STAT5B"],
 'EGF': [""],
 'MEK12': ["MAP2K1", "MAP2K2"],
 'STAT1': ["STAT1"],
 'STAT3': ["STAT3"]
}

# Union of all gene symbols over all nodes. The "" placeholders of non-gene nodes are skipped
# so that an empty string is never treated as a gene symbol when filtering tables.
node_gene_set = {gene for gene_list in protein_to_gene_map.values() for gene in gene_list if gene}

In [6]:
def _with_x_prefix(snp_id):
    """Return `snp_id` prefixed with "X" when its first character is a digit, else unchanged."""
    return f"X{snp_id}" if snp_id[0].isdigit() else snp_id


# Load the SNP annotation table; the stored first column is used as index and then replaced
# by a fresh positional one.
snp_mapping_raw = pd.read_csv(f"{data_dir}SNP_mapping.csv", index_col=0)
SNP_mapping = snp_mapping_raw.reset_index(drop=True)
# Ids starting with a digit get an "X" prefix (presumably to match the column names of the
# genotype table — confirm).
SNP_mapping["SNPid"] = [_with_x_prefix(snp_id) for snp_id in SNP_mapping["SNPid"]]
SNP_mapping


Unnamed: 0,SNPid,IlmnID,GenomeBuild,Chr,MapInfo,Source,SourceVersion,WtGenotype,GeneNames,NucleotideA,NucleotideB
0,X200006,200006-0_T_R_1853021091,37.1,9,139926402,ILLUMINA,0,A,"C9orf139,FUT7",A,G
1,X200052,200052-0_B_R_1852966093,37.1,2,220074793,ILLUMINA,0,B,"ABCB6,ATG9A",T,A
2,X200053,200053-0_B_F_1852966094,37.1,2,220075045,ILLUMINA,0,A,"ABCB6,ATG9A",T,C
3,X200078,200078-0_T_R_1853021093,37.1,16,16286614,ILLUMINA,0,A,ABCC6,C,G
4,X200087,200087-0_B_F_1852966095,37.1,16,16246164,ILLUMINA,0,A,ABCC6,T,G
...,...,...,...,...,...,...,...,...,...,...,...
1124671,VGXS34744,VGXS34744-0_T_F_1852963054,37.1,X,153761010,1000genomes,0,B,G6PD,A,G
1124672,VGXS34759,VGXS34759-0_T_F_1858975629,37.1,X,153760484,ILLUMINA,0,B,G6PD,A,C
1124673,VGXS34760,VGXS34760-0_T_F_1858975703,37.1,X,153760484,ILLUMINA,0,A,G6PD,C,G
1124674,VGXS34761,VGXS34761-0_B_F_1852963135,37.1,X,153760472,1000genomes,0,B,G6PD,T,C


In [7]:
def format_mutation(row, sep: str=">"):
    """Format the nucleotide change of one SNP row as "<wild type><sep><other allele>".

    `row` is indexed by "WtGenotype", "NucleotideA" and "NucleotideB". The genotype
    ("A" or "B") selects which nucleotide is the wild type and is written first.
    Any other genotype value yields `np.nan`.
    """
    genotype = row["WtGenotype"]
    if genotype == "A":
        first_key, second_key = "NucleotideA", "NucleotideB"
    elif genotype == "B":
        first_key, second_key = "NucleotideB", "NucleotideA"
    else:
        # Wild type unknown: no direction can be given to the change.
        return np.nan
    return f"{row[first_key]}{sep}{row[second_key]}"

In [8]:
# Allele string per SNP, wild type first ("ref/alt"); rows with an unknown wild type become NaN.
SNP_mapping["allele"] = [format_mutation(row, sep="/") for _, row in SNP_mapping.iterrows()]
# Every variant is written as forward strand. The alternative kept below derives the strand
# from the third "_"-separated field of the IlmnID instead.
# SNP_mapping["strand"] = ["+" if ID.split("_")[2]=="F" else "-" for ID in SNP_mapping["IlmnID"]]
# (The target name on the next line was garbled and raised a NameError, so the "strand"
# column needed by the export was never created.)
SNP_mapping["strand"] = "+"
# Drop SNPs without a usable allele string, then order by chromosome and position.
SNP_mapping = SNP_mapping.dropna(subset=["allele"])
SNP_mapping = SNP_mapping.sort_values(["Chr", "MapInfo"], ascending=True)


In [9]:
# Columns of the headerless, tab-separated query file. "MapInfo" is listed twice
# (presumably start and end position of each single-nucleotide variant — confirm).
query_columns = ["Chr", "MapInfo", "MapInfo", "allele", "strand", "SNPid"]
SNP_mapping[query_columns].to_csv(
    f"{data_dir}SNP_query_file_all_FWD.tsv",
    sep="\t",
    index=False,
    header=False,
)
SNP_mapping

Unnamed: 0,SNPid,IlmnID,GenomeBuild,Chr,MapInfo,Source,SourceVersion,WtGenotype,GeneNames,NucleotideA,NucleotideB,allele,strand
78555,cnvi0146654,cnvi0146654-0_P_F_1860030213,37.1,1,47851,dcgn,0,B,,C,C,C/C,+
78557,cnvi0146656,cnvi0146656-0_M_R_1860253700,37.1,1,50251,dcgn,0,B,,T,T,T/T,+
82519,cnvi0151530,cnvi0151530-1_P_F_1860025716,37.1,1,51938,CNV-design,1,A,,N,A,,+
78556,cnvi0146655,cnvi0146655-0_M_R_1859965134,37.1,1,52651,dcgn,0,B,,T,T,T/T,+
88613,cnvi0159124,cnvi0159124-1_P_F_1860031721,37.1,1,55338,CNV-design,1,B,,N,A,A/N,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1103513,rs9786720,rs9786720-131_B_F_1857712640,37.1,Y,58883690,dbSNP,131,A,AC068123.5-2,T,C,T/C,+
974025,rs7474471,rs7474471-131_T_F_1863187717,37.1,Y,58912652,dbSNP,131,A,,C,G,C/G,+
91201,cnvi0162258,cnvi0162258-1_P_F_1860082712,37.1,Y,59012463,CNV-design,1,A,,N,A,,+
881681,rs6568319,rs6568319-131_B_R_1863355296,37.1,Y,59015360,dbSNP,131,A,,T,C,T/C,+


The Ensembl Variant Effect Predictor (https://www.ensembl.org/Homo_sapiens/Tools/VEP) was run on 25-04-2023 with default settings on the file created above. The results were downloaded as a txt file, which is loaded below.

In [10]:
# Load the tab-separated variant-effect results (one row per reported effect of an uploaded variant).
variant_effect = pd.read_csv(f"{data_dir}Variant_effects.txt", sep="\t")


Unnamed: 0,#Uploaded_variation,Location,Allele,Consequence,IMPACT,SYMBOL,Gene,Feature_type,Feature,BIOTYPE,...,AF,CLIN_SIG,SOMATIC,PHENO,PUBMED,MOTIF_NAME,MOTIF_POS,HIGH_INF_POS,MOTIF_SCORE_CHANGE,TRANSCRIPTION_FACTORS
1859,rs6665000,1:924898-924898,A,missense_variant,MODERATE,SAMD11,ENSG00000187634,Transcript,ENST00000616016.5,protein_coding,...,-,-,-,-,-,-,-,-,-,-
1863,rs6665000,1:924898-924898,A,missense_variant,MODERATE,SAMD11,ENSG00000187634,Transcript,ENST00000618323.5,protein_coding,...,-,-,-,-,-,-,-,-,-,-
2041,cnvi0150926,1:963938-963938,A,coding_sequence_variant,MODIFIER,KLHL17,ENSG00000187961,Transcript,ENST00000338591.8,protein_coding,...,-,-,-,-,-,-,-,-,-,-
2051,cnvi0065227,1:963966-963966,A,coding_sequence_variant,MODIFIER,KLHL17,ENSG00000187961,Transcript,ENST00000338591.8,protein_coding,...,-,-,-,-,-,-,-,-,-,-
2081,cnvi0065230,1:964124-964124,A,coding_sequence_variant,MODIFIER,KLHL17,ENSG00000187961,Transcript,ENST00000338591.8,protein_coding,...,-,-,-,-,-,-,-,-,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2221432,rs7218312,17:58308996-58308996,A,stop_gained,HIGH,TSPOAP1,ENSG00000005379,Transcript,ENST00000268893.10,protein_coding,...,-,-,1,1,-,-,-,-,-,-
2221433,rs7218312,17:58308996-58308996,A,stop_gained,HIGH,TSPOAP1,ENSG00000005379,Transcript,ENST00000343736.9,protein_coding,...,-,-,1,1,-,-,-,-,-,-
2221437,rs7218312,17:58308996-58308996,A,stop_gained,HIGH,TSPOAP1,ENSG00000005379,Transcript,ENST00000580669.6,protein_coding,...,-,-,1,1,-,-,-,-,-,-
2221464,rs6503954,17:58322729-58322729,T,synonymous_variant,LOW,TSPOAP1,ENSG00000005379,Transcript,ENST00000268893.10,protein_coding,...,-,-,-,-,-,-,-,-,-,-


In [16]:
# Consequence terms regarded as "coding" for the filter below.
conding_consequences = [
    "missense_variant",  # Change in AA
    "synonymous_variant",  # Change in nt but not in AA
    "coding_sequence_variant",
    "stop_gained",
    "stop_lost",
    "stop_retained_variant",  # NT mutation in stop codon but stop remains
    "start_lost",
    "incomplete_terminal_codon_variant",  # NT mutation in stop codon of incompletely annotated transcripts
]

# Keep the rows that (a) carry one of the coding consequences and (b) affect a gene of our node set.
# NOTE(review): `isin` matches the whole cell; a "Consequence" value that combines several
# terms in one string would not be selected — confirm this is intended.
has_coding_consequence = variant_effect["Consequence"].isin(conding_consequences)
hits_node_gene = variant_effect["SYMBOL"].isin(node_gene_set)
coding_effects = variant_effect[has_coding_consequence & hits_node_gene]
coding_snps = set(coding_effects["#Uploaded_variation"])

In [19]:
# True when the number of distinct (SNP, gene) pairs equals the number of distinct SNPs,
# i.e. each SNP is associated with exactly one gene.
snp_gene_pairs = coding_effects[["#Uploaded_variation", "SYMBOL"]].drop_duplicates()
len(snp_gene_pairs) == len(coding_snps)

Unnamed: 0,#Uploaded_variation,SYMBOL
7060,cnvi0128184,PRKCZ
30856,rs2261217,MTOR
30883,rs910660,MTOR
31091,rs12122605,MTOR
31171,rs17036508,MTOR
...,...,...
2181047,rs2631299,STAT5B
2182043,rs3213650,STAT5A
2182222,rs9901595,STAT5A
2192307,rs2285591,MAP3K14


In [20]:
# Number of distinct SNPs with a coding consequence in one of the node genes.
len(coding_snps)

62

In [46]:
# List the third "_"-separated field of every IlmnID whose value is neither "R" nor "F".
# Tuple membership replaces the former `in "RF"` substring test, which also accepted the
# values "" and "RF" as if they were valid codes.
[
    field
    for field in (ilmn_id.split("_")[2] for ilmn_id in SNP_mapping["IlmnID"])
    if field not in ("R", "F")
]

['T', 'B', 'T', 'B', 'B', 'T', 'B', 'T', 'T', 'B', 'B', 'T', 'B']

In [23]:
# Genotype table: the first CSV column becomes the index (cell-line names),
# the remaining columns are SNP ids.
SNP_marcotte = pd.read_csv(f"{data_dir}SNP_Marcotte.csv", index_col=0)
SNPs = set(SNP_marcotte.columns)  # every SNP id present in the genotype table
CLs = set(SNP_marcotte.index)  # every cell-line name present in the genotype table

In [24]:
# SNPs of the genotype table whose wild-type genotype is missing in the mapping;
# an empty set means none is affected.
set(SNP_mapping.loc[SNP_mapping["WtGenotype"].isna(), "SNPid"]).intersection(SNPs)

set()

Check how well the VEP (Variant Effect Predictor) results and the given mapping agree on which SNP maps to which gene.

In [38]:
# For every node collect the genotyped SNPs assigned to its genes, once according to the
# "GeneNames" column of the mapping table and once according to the coding-effect table.
nodes_to_snps_mapping = {}
nodes_to_snps_effects = {}
for node, genes in protein_to_gene_map.items():
    # NOTE(review): `isin` compares whole cells, so a "GeneNames" entry that lists several
    # genes in one string does not match any single symbol — confirm this is intended.
    in_mapping = SNP_mapping["GeneNames"].isin(genes)
    in_effects = coding_effects["SYMBOL"].isin(genes)

    mapped_ids = set(SNP_mapping.loc[in_mapping, "SNPid"])
    effect_ids = set(coding_effects.loc[in_effects, "#Uploaded_variation"])

    # Restrict both to SNPs that are actually present in the genotype table.
    nodes_to_snps_mapping[node] = list(mapped_ids & SNPs)
    nodes_to_snps_effects[node] = list(effect_ids & SNPs)

In [39]:
# Per node: SNPs found by only one of the two sources.
in_effects_not_in_mapping = {
    node: set(effect_ids) - set(nodes_to_snps_mapping[node])
    for node, effect_ids in nodes_to_snps_effects.items()
}
in_mapping_not_in_effects = {
    node: set(nodes_to_snps_mapping[node]) - set(effect_ids)
    for node, effect_ids in nodes_to_snps_effects.items()
}

In [40]:
# SNPs per node that the coding-effect table assigns but the gene-name mapping does not.
in_effects_not_in_mapping

{'PIP3': set(),
 'AKT_S473': set(),
 'p53': {'rs11868946', 'rs9889453'},
 'RB': set(),
 'GSK3B': set(),
 'AMPK': {'rs11583679', 'rs11804045', 'rs6669296', 'rs7545497', 'rs778413'},
 'cleavedCas': set(),
 'SMAD23': set(),
 'ERK12': set(),
 'MSK12': set(),
 'MKK36': set(),
 'H3': {'rs360097'},
 'p90RSK': {'rs1059296', 'rs11231825', 'rs3795687', 'rs476037'},
 'SERUM': set(),
 'mTOR': {'rs12122605', 'rs17036508', 'rs2261217', 'rs258282', 'rs910660'},
 'AKT': set(),
 'NFkB': {'rs17113301', 'rs2489012', 'rs2495712'},
 'cAMP': set(),
 'PKA': {'rs4243784', 'rs6576930', 'rs723251'},
 'PKC': set(),
 'p38': set(),
 'PLCg2': set(),
 'MET': set(),
 'RAF': set(),
 'p70S6K': {'rs2447571', 'rs4646823', 'rs4646825'},
 'PDPK1': {'rs11866579'},
 'AKT_T308': set(),
 'MAPKAPK2': set(),
 'PTEN': set(),
 'b-catenin': set(),
 'S6': set(),
 'CREB': set(),
 'RAS': {'kgp2822831'},
 'PI3K': {'rs2926535', 'rs4129180'},
 'BTK': set(),
 'RAF_S259': set(),
 'MARCKS': set(),
 'SRC': set(),
 'FAK': set(),
 'EGFR': set(

In [41]:
# SNPs per node that the gene-name mapping assigns but the coding-effect table does not.
in_mapping_not_in_effects

{'PIP3': {'kgp22852551', 'rs10234174', 'rs315283', 'rs9986765'},
 'AKT_S473': {'GA034580',
  'kgp15713508',
  'kgp22843012',
  'kgp6506022',
  'rs10138227',
  'rs10157763',
  'rs10158245',
  'rs10732305',
  'rs10754807',
  'rs10803152',
  'rs10927035',
  'rs10927040',
  'rs10927046',
  'rs1130233',
  'rs11670912',
  'rs11849304',
  'rs12032342',
  'rs12045585',
  'rs12047209',
  'rs12048930',
  'rs12079019',
  'rs12091469',
  'rs12117580',
  'rs12140414',
  'rs12144559',
  'rs12744297',
  'rs16974157',
  'rs17846825',
  'rs17846831',
  'rs2304186',
  'rs2304188',
  'rs2304189',
  'rs2494731',
  'rs2494738',
  'rs2494739',
  'rs2494743',
  'rs2498794',
  'rs3006938',
  'rs3006939',
  'rs320311',
  'rs320332',
  'rs3730256',
  'rs3856231',
  'rs4375597',
  'rs4658588',
  'rs4803320',
  'rs6686591',
  'rs6703013',
  'rs6704286',
  'rs892120',
  'rs9428588',
  'rs9428966',
  'rs946824',
  'rs971285'},
 'p53': {'kgp10265473',
  'rs1042522',
  'rs11652704',
  'rs12949853',
  'rs1614984',
  '

In [28]:
# `nodes_to_snps` is used by the cells below but was not assigned in any cell of this
# notebook, so a fresh-kernel run stopped here with a NameError.
# NOTE(review): it is bound to the gene-name based mapping, which is consistent with the
# output recorded for this cell — confirm that this is the intended source.
nodes_to_snps = nodes_to_snps_mapping
nodes_to_snps

{'PIP3': ['rs9986765', 'kgp22852551', 'rs10234174', 'rs315283'],
 'AKT_S473': ['rs10803152',
  'rs12048930',
  'rs11670912',
  'rs2304189',
  'rs946824',
  'rs3006938',
  'GA034580',
  'rs12047209',
  'rs2304186',
  'rs2494739',
  'rs12140414',
  'rs4658588',
  'rs4803320',
  'rs10927040',
  'rs11849304',
  'rs12144559',
  'kgp6506022',
  'rs12117580',
  'kgp15713508',
  'rs12744297',
  'rs2494738',
  'rs6704286',
  'rs2494743',
  'rs320332',
  'rs10138227',
  'rs971285',
  'rs1130233',
  'rs10927035',
  'rs9428966',
  'rs12091469',
  'rs16974157',
  'rs2498794',
  'rs892120',
  'rs2494731',
  'rs10754807',
  'rs10927046',
  'rs12079019',
  'rs320311',
  'rs4375597',
  'rs6703013',
  'rs9428588',
  'rs10927034',
  'rs3006939',
  'rs10732305',
  'rs10157763',
  'rs3730256',
  'rs12032342',
  'rs12045585',
  'rs3856231',
  'rs17846831',
  'rs6686591',
  'rs10158245',
  'kgp22843012',
  'rs17846825',
  'rs1058304',
  'rs2304188'],
 'p53': ['rs17887200',
  'rs8079544',
  'rs8073498',
  'rs

In [29]:
# (number of cell lines, number of SNPs) in the genotype table.
SNP_marcotte.shape

(62, 1001902)

In [48]:
# Convert the genotype table to a dictionary-of-keys sparse matrix and pickle it next to the data.
genotype_array = np.array(SNP_marcotte)
sparse = dok_matrix(genotype_array)
sparse_path = f"{data_dir}sparse_marcotte.pkl"
with open(sparse_path, "wb") as handle:
    pickle.dump(sparse, handle)

In [31]:
# Row labels of the genotype table (the cell-line names).
SNP_marcotte.index

Index(['CAL51', 'MDAMB134VI', 'BT20', 'AU565', 'SKBR3', 'MDAMB157', 'JIMT1',
       'MCF10A', 'EVSAT', 'HCC1187', 'BT474', 'KPL1', 'MDAMB231', 'UACC893',
       'HCC2218', 'BT549', 'HCC1143', '184A1', 'EFM192A', 'HCC1806', 'UACC812',
       'T47D', 'HCC1937', 'MDAMB453', 'CAMA1', 'HCC38', 'BT483', 'MDAMB361',
       'MFM223', 'ZR751', 'HCC1428', 'HCC1419', 'EFM19', 'MDAMB468', 'HCC202',
       'MDAMB436', 'HCC1954', 'MCF7', 'CAL851', 'CAL148', 'CAL120', 'Hs578T',
       'MX1', 'HCC2185', 'MACLS2', 'DU4475', 'MCF12A', 'HDQP1', '184B5',
       'HCC1500', 'OCUBM', 'HCC1599', 'HBL100', 'LY2', 'ZR75B', 'HCC3153',
       'HCC70', 'MDAMB175VII', 'MDAMB415', 'MPE600', 'HCC1569', 'UACC3199'],
      dtype='object')

In [32]:
# For every node list the cell lines whose summed values over the node's SNP columns are positive.
nodes_to_cell_lines = {
    node: list(SNP_marcotte.index.values[SNP_marcotte[snp_ids].sum(axis=1) > 0])
    for node, snp_ids in nodes_to_snps.items()
}

In [33]:
# Cell lines per node with a positive summed value over that node's SNP columns.
nodes_to_cell_lines

{'PIP3': ['CAL51',
  'MDAMB134VI',
  'MDAMB157',
  'HCC1187',
  'MDAMB231',
  'HCC1143',
  'HCC1806',
  'HCC1937',
  'CAMA1',
  'BT483',
  'MDAMB361',
  'EFM19',
  'MDAMB468',
  'MDAMB436',
  'CAL148',
  'MX1',
  'HCC3153',
  'HCC70',
  'MPE600',
  'HCC1569'],
 'AKT_S473': ['CAL51',
  'MDAMB134VI',
  'BT20',
  'AU565',
  'SKBR3',
  'MDAMB157',
  'JIMT1',
  'MCF10A',
  'EVSAT',
  'HCC1187',
  'BT474',
  'KPL1',
  'MDAMB231',
  'UACC893',
  'HCC2218',
  'BT549',
  'HCC1143',
  '184A1',
  'EFM192A',
  'HCC1806',
  'UACC812',
  'T47D',
  'HCC1937',
  'MDAMB453',
  'CAMA1',
  'HCC38',
  'BT483',
  'MDAMB361',
  'MFM223',
  'ZR751',
  'HCC1428',
  'HCC1419',
  'EFM19',
  'MDAMB468',
  'HCC202',
  'MDAMB436',
  'HCC1954',
  'MCF7',
  'CAL851',
  'CAL148',
  'CAL120',
  'Hs578T',
  'MX1',
  'HCC2185',
  'MACLS2',
  'DU4475',
  'MCF12A',
  'HDQP1',
  '184B5',
  'HCC1500',
  'OCUBM',
  'HCC1599',
  'HBL100',
  'LY2',
  'ZR75B',
  'HCC3153',
  'HCC70',
  'MDAMB175VII',
  'MDAMB415',
  'MPE600',
 

In [34]:
# All SNP ids that are assigned to at least one node.
node_snps_set = {snp for snp_list in nodes_to_snps.values() for snp in snp_list}

In [None]:
# Display the full set of SNP ids assigned to any node (large output).
node_snps_set

In [47]:
# Print, one per line, the node SNP ids that start with "rs".
for rs_id in (snp_id for snp_id in node_snps_set if snp_id.startswith("rs")):
    print(rs_id)

rs4952774
rs8055243
rs5955559
rs12466402
rs4776881
rs4261913
rs2711305
rs11238349
rs2602465
rs6926393
rs158687
rs12601850
rs6935679
rs3856748
rs9389412
rs2594489
rs12976130
rs6926800
rs516741
rs13427647
rs7429119
rs7779184
rs13232030
rs12566637
rs3817781
rs281477
rs1088672
rs2953
rs4002566
rs3915606
rs6720958
rs6487464
rs13401638
rs7217884
rs9614882
rs180515
rs7803520
rs40239
rs12199654
rs11657742
rs274009
rs1755991
rs10814274
rs3799665
rs12582971
rs11890554
rs17156694
rs636819
rs217513
rs3736430
rs10420008
rs2154754
rs17883843
rs11170563
rs16933873
rs12563929
rs2999487
rs1981
rs11259432
rs11101291
rs6952398
rs41301234
rs2255094
rs17107001
rs2362707
rs1129880
rs1790753
rs10937215
rs7275012
rs17770535
rs6965771
rs2229714
rs198162
rs867286
rs3743135
rs12999695
rs2705897
rs10221473
rs10508307
rs2235283
rs10114882
rs16892149
rs8961
rs6681133
rs2199503
rs817556
rs17766621
rs936214
rs177578
rs10789040
rs763187
rs7199551
rs42733
rs507577
rs160375
rs1286092
rs11671297
rs11249687
rs2140101
rs23