In [89]:
import os
import re

import pandas as pd
import requests
from neo4j import GraphDatabase
from tqdm import tqdm

### Variant

In [167]:
# Cypher: return the property map of every Variation node.
query_variant_nodes = """MATCH (v:Variation)
    RETURN properties(v) AS Variation
"""

# Open a fresh connection for this cell. The driver created in the connection
# cell is closed at the end of that cell, and running a query on a closed
# driver is what produces the warning pointing at `driver.session()`.
driver = create_db_connection(uri, user, password)
try:
    result = execute_query(driver, query_variant_nodes)
finally:
    # Close the connection, even when the query fails
    driver.close()
print(len(result))


1008


  with driver.session() as session:


In [173]:
# Flatten each record's Variation property map into one row per record;
# properties that a node does not carry become None.
variation_fields = (
    'id',
    'expression_hgvs_c',
    'expression_hgvs_p',
    'digest',
    'label',
    'type',
    'expression_hgvs_g',
)
data = [
    {field: variation.get(field, None) for field in variation_fields}
    for variation in (record.get('Variation', {}) for record in result)
]

df = pd.DataFrame(data)

# Preview the first five rows.
df.head(5)

Unnamed: 0,id,expression_hgvs_c,expression_hgvs_p,digest,label,type,expression_hgvs_g
0,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,"[NM_004333.4:c.1799T>A, ENST00000288602.6:c.17...",[NP_004324.2:p.Val600Glu],j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,V600E,Allele,[NC_000007.13:g.140453136A>T]
1,ga4gh:VA.W6xsV-aFm9yT2Bic5cFAV2j0rll6KK5R,,,W6xsV-aFm9yT2Bic5cFAV2j0rll6KK5R,NM_004333.4:c.1799T>A,Allele,
2,ga4gh:VA.Otc5ovrw906Ack087o1fhegB4jDRqCAe,,,Otc5ovrw906Ack087o1fhegB4jDRqCAe,NC_000007.13:g.140453136A>T,Allele,
3,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,"[ENST00000275493.2:c.2369C>T, NM_005228.4:c.23...",[NP_005219.2:p.Thr790Met],sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,T790M,Allele,[NC_000007.13:g.55249071C>T]
4,ga4gh:VA.uldmTYEfqQ0PtALYw8aiE14mYGs5bzkS,,,uldmTYEfqQ0PtALYw8aiE14mYGs5bzkS,NM_005228.4:c.2369C>T,Allele,


### Study

In [1]:


# Function to create a connection to the Neo4j database
def create_db_connection(uri, user, password):
    """Build a Neo4j driver for ``uri`` authenticated with ``(user, password)``.

    The driver is returned as created; this function never closes it.
    """
    credentials = (user, password)
    return GraphDatabase.driver(uri, auth=credentials)

# Function to execute a Cypher query
def execute_query(driver, query):
    """Run ``query`` in a new session of ``driver`` and return all records as a list.

    The records are materialized while the session is still open, so the
    returned list remains usable after the session has been closed.
    """
    with driver.session() as session:
        records = list(session.run(query))
    return records

# Connect to the Neo4j database.
# Connection settings default to the local development instance and can be
# overridden with the NEO4J_URI / NEO4J_USER / NEO4J_PASSWORD environment
# variables, so real credentials never need to be written into the notebook.
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
user = os.environ.get("NEO4J_USER", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "password")
driver = create_db_connection(uri, user, password)

# Fetch the property map of every Study node
query = """MATCH (s:Study)
    RETURN properties(s) AS Study
"""

try:
    # Execute the query
    result = execute_query(driver, query)
finally:
    # Close the connection, even when the query fails
    driver.close()




In [8]:
# Inspect the first returned record to see which properties a Study node carries.
result[0]

<Record Study={'alleleOrigin': 'somatic', 'id': 'civic.eid:238', 'description': 'The T790M mutation in EGFR has been shown to confer resistance to the tyrosine kinase inhibitor erlotinib, and patients harboring this mutation that are placed on the drug are likely to relapse.', 'direction': 'supports', 'predicate': 'predictsResistanceTo', 'type': 'VariantTherapeuticResponseStudy'}>

In [2]:
# Print the property names of the first Study record, one per line.
study_properties = result[0]['Study']
for property_name in study_properties:
    print(property_name)

alleleOrigin
id
description
direction
predicate
type


### Categorical Variation
First, a basic pass at the evidence level: each variant evidence item is normalized, and the resulting variation_id is used to match variants between sources and demonstrate the overlap of evidence across CIViC and MOAlmanac. Later, we will look across studies.

In [174]:
# Cypher: for every Variation, optionally follow HAS_DEFINING_CONTEXT to its
# CategoricalVariation and count the Study nodes linked to that category via
# HAS_VARIANT. Variations without a category still produce one row.
query_variant_categorical = """MATCH (v:Variation)
    OPTIONAL MATCH (v)-[:HAS_DEFINING_CONTEXT]-(c:CategoricalVariation)
    OPTIONAL MATCH (c)-[:HAS_VARIANT]-(s:Study)
    RETURN properties(v) AS Variation,
            properties(c) AS Category,
            COUNT(s) as Count
"""

# Open a fresh connection for this cell. The driver created in the connection
# cell is closed at the end of that cell, and running a query on a closed
# driver is what produces the warning pointing at `driver.session()`.
driver = create_db_connection(uri, user, password)
try:
    # Execute the query
    result = execute_query(driver, query_variant_categorical)
finally:
    # Close the connection, even when the query fails
    driver.close()
print(len(result))


  with driver.session() as session:


1048


In [141]:
# Study count (the query's COUNT(s)) attached to the first returned record.
result[0]['Count']

31

In [175]:
# Variation properties copied verbatim into each row (None when absent).
variation_fields = (
    'id',
    'expression_hgvs_c',
    'expression_hgvs_p',
    'digest',
    'label',
    'type',
    'expression_hgvs_g',
)

# One row per (Variation, Category) record returned by the categorical query.
data = []
for record in result:
    variation = record.get('Variation', {})
    category = record.get('Category', {})
    # The OPTIONAL MATCH can leave Category empty; normalize anything that is
    # not a property map to {} once so the lookups below simply yield None.
    if not isinstance(category, dict):
        category = {}

    row = {field: variation.get(field) for field in variation_fields}
    row['category_id'] = category.get('id')
    row['category_desc'] = category.get('description')
    row['category_label'] = category.get('label')
    # Default to 0 rather than a dict so the column stays numeric even if a
    # record ever lacks the Count field.
    row['applied_study_count'] = record.get('Count', 0)
    data.append(row)

df = pd.DataFrame(data)

# Preview the first five rows.
df.head()

Unnamed: 0,id,expression_hgvs_c,expression_hgvs_p,digest,label,type,expression_hgvs_g,category_id,category_desc,category_label,applied_study_count
0,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,"[NM_004333.4:c.1799T>A, ENST00000288602.6:c.17...",[NP_004324.2:p.Val600Glu],j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,V600E,Allele,[NC_000007.13:g.140453136A>T],moa.variant:144,,BRAF p.V600E (Missense),31
1,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,"[NM_004333.4:c.1799T>A, ENST00000288602.6:c.17...",[NP_004324.2:p.Val600Glu],j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,V600E,Allele,[NC_000007.13:g.140453136A>T],civic.mpid:12,BRAF V600E has been shown to be recurrent in m...,BRAF V600E,70
2,ga4gh:VA.W6xsV-aFm9yT2Bic5cFAV2j0rll6KK5R,,,W6xsV-aFm9yT2Bic5cFAV2j0rll6KK5R,NM_004333.4:c.1799T>A,Allele,,,,,0
3,ga4gh:VA.Otc5ovrw906Ack087o1fhegB4jDRqCAe,,,Otc5ovrw906Ack087o1fhegB4jDRqCAe,NC_000007.13:g.140453136A>T,Allele,,,,,0
4,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,"[ENST00000275493.2:c.2369C>T, NM_005228.4:c.23...",[NP_005219.2:p.Thr790Met],sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,T790M,Allele,[NC_000007.13:g.55249071C>T],moa.variant:242,,EGFR p.T790M (Missense),11


In [176]:
def normalize_variant(variant, timeout=30):
    """Look up ``variant`` with the cancervariants.org variation normalizer.

    Parameters
    ----------
    variant : str
        Variant expression sent as the ``q`` query parameter.
    timeout : float, optional
        Seconds to wait for the service before giving up (default 30).

    Returns
    -------
    dict or None
        The decoded JSON body when the service answers with HTTP 200;
        ``None`` for any other status code or when the request itself fails
        (connection error, timeout, ...).
    """
    url = 'https://normalize.cancervariants.org/variation/normalize'
    # Let requests build the query string so characters such as ':', '>',
    # '*' or '=' in the expression are percent-encoded instead of being
    # interpolated raw into the URL.
    params = {'q': variant, 'hgvs_dup_del_mode': 'default'}
    try:
        # Without a timeout a single stalled connection would block forever.
        r = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException:
        # Same "no result" contract as a non-200 answer, so one failed request
        # does not abort a long batch of lookups.
        return None
    if r.status_code == 200:
        return r.json()
    return None

# Columns filled from the normalizer response; rows that cannot be normalized
# keep None in all of them.
normalized_columns = ['var_id', 'var_type', 'var_loc_id', 'var_loc_start', 'var_loc_end']
for column in normalized_columns:
    df[column] = None

# Warning fragments that mean the normalizer produced no usable variation.
skip_markers = ("Unable to find classification for:", "Unable to translate")

# The same genomic expression appears on several rows (one row per category),
# so remember each response and query the service once per distinct expression.
normalized_cache = {}

for idx, row in tqdm(df.iterrows(), total=len(df)):
    expressions = row['expression_hgvs_g']
    # Skip rows without a genomic HGVS expression (None or an empty list).
    if not isinstance(expressions, (list, tuple)) or not expressions:
        continue

    expression = expressions[0]
    if expression not in normalized_cache:
        normalized_cache[expression] = normalize_variant(expression)
    # Deliberately not named `result`: that name holds the Neo4j query records
    # that other cells read, and must not be clobbered by this loop.
    normalized = normalized_cache[expression]

    if normalized is None:
        continue

    warnings_found = normalized.get('warnings') or []
    if any(marker in warning for warning in warnings_found for marker in skip_markers):
        continue

    variation = normalized.get('variation')
    if not variation:
        # Response without a variation object: nothing to record for this row.
        continue
    location = variation.get('location')
    if not isinstance(location, dict):
        location = {}

    df.at[idx, 'var_id'] = variation.get('id')
    df.at[idx, 'var_type'] = variation.get('type')
    df.at[idx, 'var_loc_id'] = location.get('id')
    df.at[idx, 'var_loc_start'] = location.get('start')
    df.at[idx, 'var_loc_end'] = location.get('end')


1048it [01:08, 15.23it/s]


In [177]:
# Display df with the var_* columns added by the normalization loop.
df

Unnamed: 0,id,expression_hgvs_c,expression_hgvs_p,digest,label,type,expression_hgvs_g,category_id,category_desc,category_label,applied_study_count,var_id,var_type,var_loc_id,var_loc_start,var_loc_end
0,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,"[NM_004333.4:c.1799T>A, ENST00000288602.6:c.17...",[NP_004324.2:p.Val600Glu],j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,V600E,Allele,[NC_000007.13:g.140453136A>T],moa.variant:144,,BRAF p.V600E (Missense),31,ga4gh:VA.LX3ooHBAiZdKY4RfTXcliUmkj48mnD_M,Allele,ga4gh:SL.XutGzMvqbzN-vnxmPt2MJf7ehxmB0opi,140753335,140753336
1,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,"[NM_004333.4:c.1799T>A, ENST00000288602.6:c.17...",[NP_004324.2:p.Val600Glu],j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,V600E,Allele,[NC_000007.13:g.140453136A>T],civic.mpid:12,BRAF V600E has been shown to be recurrent in m...,BRAF V600E,70,ga4gh:VA.LX3ooHBAiZdKY4RfTXcliUmkj48mnD_M,Allele,ga4gh:SL.XutGzMvqbzN-vnxmPt2MJf7ehxmB0opi,140753335,140753336
2,ga4gh:VA.W6xsV-aFm9yT2Bic5cFAV2j0rll6KK5R,,,W6xsV-aFm9yT2Bic5cFAV2j0rll6KK5R,NM_004333.4:c.1799T>A,Allele,,,,,0,,,,,
3,ga4gh:VA.Otc5ovrw906Ack087o1fhegB4jDRqCAe,,,Otc5ovrw906Ack087o1fhegB4jDRqCAe,NC_000007.13:g.140453136A>T,Allele,,,,,0,,,,,
4,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,"[ENST00000275493.2:c.2369C>T, NM_005228.4:c.23...",[NP_005219.2:p.Thr790Met],sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,T790M,Allele,[NC_000007.13:g.55249071C>T],moa.variant:242,,EGFR p.T790M (Missense),11,ga4gh:VA.OvEfBRaS34JkfM0_ZHJVDQEjqtwzyjyp,Allele,ga4gh:SL.ZCgOjF-_T0EOBXGc-6yICYui-jgFzJfY,55181377,55181378
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1043,ga4gh:VA.uaKTab81sgTH6v26fsDJkGcKidJa_GqD,,,uaKTab81sgTH6v26fsDJkGcKidJa_GqD,2-209113113-G-C,Allele,,,,,0,,,,,
1044,ga4gh:VA.y-ioX4e_ySwP_LlplLNp0cz04a8BBr9H,,,y-ioX4e_ySwP_LlplLNp0cz04a8BBr9H,,Allele,,moa.variant:860,,IDH1 p.R132S (Missense),1,,,,,
1045,ga4gh:VA.VbDmOJp91MyXUGNKuKtJfm5WvwgjnLjh,,,VbDmOJp91MyXUGNKuKtJfm5WvwgjnLjh,2-209113113-G-T,Allele,,,,,0,,,,,
1046,ga4gh:VA.2zh_S3hn7AWJd00rhirLhhDW8VcVBo9N,,,2zh_S3hn7AWJd00rhirLhhDW8VcVBo9N,,Allele,,moa.variant:861,,IDH1 p.R132L (Missense),1,,,,,


In [178]:
# Count rows per (var_id, label) pair; a count above 1 means several rows share
# the same normalized variation ID and label.
df[['var_id','label']].value_counts()

var_id                                     label         
ga4gh:VA.gdbp5quI5YscqYY01qlHfuJkT7R41cR7  D816V             3
ga4gh:VA.JX_FH6W6rts4VAX6GWlurlqheqrgHGoT  E384*             2
ga4gh:VA.TAARa2cxRHmOiij9UBwvW-noMDoOq2x9  L858R             2
ga4gh:VA.Ol69g1SmOdYaopX-zIp42cHsWZCWrCj7  H1047L            2
ga4gh:VA.VzsVyqlcWS87LveLKdzeYwvmm7lz9ie1  K642E             2
                                                            ..
ga4gh:VA.NbIeg9oY7URUtee74IS69PsAgTVwqmMS  E81K              1
ga4gh:VA.O7TpCfowJTi-tHr3uIA3ZU_Cg9XUDITD  Y220C             1
ga4gh:VA.OdkVLBI2BYn4rmrjkqjEh6v_9RKfzswA  D770_N771insGT    1
ga4gh:VA.POEl_3_26UPoaUTf3nqH0s77PWQJEGPD  L597R             1
ga4gh:VA.zvOLR_KJgwNfZVpYbUA6IPmR86rWKp-5  D1228N            1
Name: count, Length: 261, dtype: int64

In [179]:
# Keep only the identifier, label, category and study-count columns of df.
# NOTE(review): this rebinds `data`, which the DataFrame-building cells use for
# their list of row dicts — consider a distinct name.
data = df[['id','expression_hgvs_g','var_id','label','category_label','category_id','applied_study_count']]
data

Unnamed: 0,id,expression_hgvs_g,var_id,label,category_label,category_id,applied_study_count
0,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,[NC_000007.13:g.140453136A>T],ga4gh:VA.LX3ooHBAiZdKY4RfTXcliUmkj48mnD_M,V600E,BRAF p.V600E (Missense),moa.variant:144,31
1,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,[NC_000007.13:g.140453136A>T],ga4gh:VA.LX3ooHBAiZdKY4RfTXcliUmkj48mnD_M,V600E,BRAF V600E,civic.mpid:12,70
2,ga4gh:VA.W6xsV-aFm9yT2Bic5cFAV2j0rll6KK5R,,,NM_004333.4:c.1799T>A,,,0
3,ga4gh:VA.Otc5ovrw906Ack087o1fhegB4jDRqCAe,,,NC_000007.13:g.140453136A>T,,,0
4,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,[NC_000007.13:g.55249071C>T],ga4gh:VA.OvEfBRaS34JkfM0_ZHJVDQEjqtwzyjyp,T790M,EGFR p.T790M (Missense),moa.variant:242,11
...,...,...,...,...,...,...,...
1043,ga4gh:VA.uaKTab81sgTH6v26fsDJkGcKidJa_GqD,,,2-209113113-G-C,,,0
1044,ga4gh:VA.y-ioX4e_ySwP_LlplLNp0cz04a8BBr9H,,,,IDH1 p.R132S (Missense),moa.variant:860,1
1045,ga4gh:VA.VbDmOJp91MyXUGNKuKtJfm5WvwgjnLjh,,,2-209113113-G-T,,,0
1046,ga4gh:VA.2zh_S3hn7AWJd00rhirLhhDW8VcVBo9N,,,,IDH1 p.R132L (Missense),moa.variant:861,1


In [180]:
# First 50 rows after ordering by var_id, descending.
data.sort_values('var_id', ascending=False).head(50)

Unnamed: 0,id,expression_hgvs_g,var_id,label,category_label,category_id,applied_study_count
545,ga4gh:VA.8XH05eUE8D7mjRtirrCsZDxZ1gAW_991,"[NC_000007.13:g.116423407G>A, NC_000007.14:g.1...",ga4gh:VA.zvOLR_KJgwNfZVpYbUA6IPmR86rWKp-5,D1228N,MET D1228N,civic.mpid:645,1
758,ga4gh:VA.OTw24jUGXJ_6t8D8lmQBvRaJQBbyrd9e,[NC_000011.9:g.108201096G>A],ga4gh:VA.zqaGzhlafiGlcmXVqwfdZ2EwAKM3xG4b,C2488Y,ATM C2488Y,civic.mpid:1146,1
253,ga4gh:VA.mcg4uK30oZzG7dO94hdwIQaOO09XHtJp,[NC_000004.11:g.55593612_55593614del],ga4gh:VA.zmmhHogAiHozB4_pcU_6L6ISfnTtoIpe,V560DEL,KIT V560DEL,civic.mpid:202,1
353,ga4gh:VA.5kcc1aPV7AaADQmZRAXyXMqEzU7T6GN2,[NC_000006.11:g.152419920_152419921delinsAG],ga4gh:VA.zbjFtas3cyxI78Ph-xrWHNGlJdUE-Hm8,L536Q,ESR1 L536Q,civic.mpid:46,2
219,ga4gh:VA.uBwLZ5VRi7kD4O1yEvezXKYrJ3ZaegK9,[NC_000007.13:g.140453145A>T],ga4gh:VA.zATpR7iDy-_AbeeQ93IgDZwYYZwApall,L597Q,BRAF L597Q,civic.mpid:579,3
213,ga4gh:VA.mZCLwhePJQizohrW8LmmPs62Mzh_I5Y8,[NC_000012.11:g.56478854G>A],ga4gh:VA.z7tojh3NvzxeTqX5seNcHlmCuB-s_-cF,V104M,ERBB3 V104M,civic.mpid:682,1
162,ga4gh:VA.skyk7EM-V-UuFN--3WwDEBwtwwBedVU8,[NC_000007.13:g.55249005G>T],ga4gh:VA.yr3duXsAtLA9Sd79rm8szW7ILvFJAYWv,S768I,EGFR S768I,civic.mpid:558,5
310,ga4gh:VA.P0kqaLBYh5iXcaY2nPaxhHHIQrDUshsW,[NC_000017.10:g.7577548C>T],ga4gh:VA.yr-4Fnb8Q_RBQD-JtGGQemDK3Mby1BXe,G245S,TP53 G245S,civic.mpid:853,1
425,ga4gh:VA.zDDKySmgMwFwLtAqrBXvNpQA_CQ5hToT,[NC_000003.11:g.178936091G>C],ga4gh:VA.yldbvaOLm7SnNhqBQM9XiY3BxKrtNcMN,E545Q,PIK3CA E545Q,civic.mpid:855,1
424,ga4gh:VA.zDDKySmgMwFwLtAqrBXvNpQA_CQ5hToT,[NC_000003.11:g.178936091G>C],ga4gh:VA.yldbvaOLm7SnNhqBQM9XiY3BxKrtNcMN,E545Q,PIK3CA p.E545Q (Missense),moa.variant:459,1


In [181]:
# All rows that share one normalized variation ID.
data.loc[data['var_id'].eq('ga4gh:VA.gdbp5quI5YscqYY01qlHfuJkT7R41cR7')]

Unnamed: 0,id,expression_hgvs_g,var_id,label,category_label,category_id,applied_study_count
58,ga4gh:VA.nhiDwIq1klrGm3wtWO4a4BiS0jdW79Wd,[NC_000004.11:g.55599321A>T],ga4gh:VA.gdbp5quI5YscqYY01qlHfuJkT7R41cR7,D816V,KIT p.D816V (Missense),moa.variant:360,1
59,ga4gh:VA.nhiDwIq1klrGm3wtWO4a4BiS0jdW79Wd,[NC_000004.11:g.55599321A>T],ga4gh:VA.gdbp5quI5YscqYY01qlHfuJkT7R41cR7,D816V,NOT KIT D816V,civic.mpid:4353,1
60,ga4gh:VA.nhiDwIq1klrGm3wtWO4a4BiS0jdW79Wd,[NC_000004.11:g.55599321A>T],ga4gh:VA.gdbp5quI5YscqYY01qlHfuJkT7R41cR7,D816V,KIT D816V,civic.mpid:65,3


In [170]:
# Rows of df whose label is exactly 'D816V'.
# NOTE(review): the saved output only has the Variation columns, so this cell
# appears to have been run before the category columns were added — re-run to refresh.
df.loc[df['label'].eq('D816V')]

Unnamed: 0,id,expression_hgvs_c,expression_hgvs_p,digest,label,type,expression_hgvs_g
48,ga4gh:VA.nhiDwIq1klrGm3wtWO4a4BiS0jdW79Wd,"[NM_000222.2:c.2447A>T, ENST00000288135.5:c.24...",[NP_000213.1:p.Asp816Val],nhiDwIq1klrGm3wtWO4a4BiS0jdW79Wd,D816V,Allele,[NC_000004.11:g.55599321A>T]


#### Inspects

In [160]:
# Rows whose label is missing. Cross-checked in Neo4j with:
#   MATCH (c:CategoricalVariation) WHERE c.id = 'moa.variant:66'  RETURN c LIMIT 25
# which shows that some Variation nodes can have no label on them.
unlabeled_mask = df['label'].isna()
df[unlabeled_mask]


Unnamed: 0,id,expression_hgvs_c,expression_hgvs_p,digest,label,type,expression_hgvs_g,category_id,category_desc,category_label,applied_study_count,var_id,var_type,var_loc_id,var_loc_start,var_loc_end
912,ga4gh:VA.D6NzpWXKqBnbcZZrXNSXj4tMUwROKbsQ,,,D6NzpWXKqBnbcZZrXNSXj4tMUwROKbsQ,,Allele,,moa.variant:66,,ABL1 p.T315I (Missense),6,,,,,
914,ga4gh:VA.37YVc2HpRgXOq3HtsjcL1eiyLhDXLmYy,,,37YVc2HpRgXOq3HtsjcL1eiyLhDXLmYy,,Allele,,moa.variant:68,,ABL1 p.T315A (Missense),3,,,,,
916,ga4gh:VA.ZJZc_8PkTSu-twmaJvj6yQXvPJHElPZc,,,ZJZc_8PkTSu-twmaJvj6yQXvPJHElPZc,,Allele,,moa.variant:70,,ABL1 p.F317L (Missense),3,,,,,
918,ga4gh:VA.SnGz3wUT2JaIid12PoI6OHc4t7LgHVj1,,,SnGz3wUT2JaIid12PoI6OHc4t7LgHVj1,,Allele,,moa.variant:71,,ABL1 p.F317V (Missense),3,,,,,
920,ga4gh:VA.wDDVWfpuxnuYkLj5_0OrnaBvrJAXYcJA,,,wDDVWfpuxnuYkLj5_0OrnaBvrJAXYcJA,,Allele,,moa.variant:72,,ABL1 p.F317I (Missense),3,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1036,ga4gh:VA.zS_-FFo-cPjizcBEraRCMJ-wfCLNXM9F,,,zS_-FFo-cPjizcBEraRCMJ-wfCLNXM9F,,Allele,,moa.variant:476,,PIK3CA p.P539R (Missense),1,,,,,
1039,ga4gh:VA.8aRynLgwo0OYPIuCyiw6BGNd8oLxoGXx,,,8aRynLgwo0OYPIuCyiw6BGNd8oLxoGXx,,Allele,,moa.variant:478,,PIK3CA p.Y1021C (Missense),1,,,,,
1042,ga4gh:VA.aAXNeFGBgeJUGbun-bKvgoW204tC1xdp,,,aAXNeFGBgeJUGbun-bKvgoW204tC1xdp,,Allele,,moa.variant:859,,IDH1 p.R132G (Missense),1,,,,,
1044,ga4gh:VA.y-ioX4e_ySwP_LlplLNp0cz04a8BBr9H,,,y-ioX4e_ySwP_LlplLNp0cz04a8BBr9H,,Allele,,moa.variant:860,,IDH1 p.R132S (Missense),1,,,,,


In [158]:
# Keep only rows that have a label, then look up one transcript-level label.
tdf = data[data['label'].notna()]
tdf.loc[tdf['label'].eq('NM_004333.4:c.1799T>A')]

Unnamed: 0,expression_hgvs_g,var_id,label,category_label,category_id,applied_study_count
2,,,NM_004333.4:c.1799T>A,,,0


In [159]:
# Labelled rows attached to the category with id 'civic.mpid:12'.
tdf.loc[tdf['category_id'].eq('civic.mpid:12')]

Unnamed: 0,expression_hgvs_g,var_id,label,category_label,category_id,applied_study_count
1,[NC_000007.13:g.140453136A>T],ga4gh:VA.LX3ooHBAiZdKY4RfTXcliUmkj48mnD_M,V600E,BRAF V600E,civic.mpid:12,70


#### Weird Regex Stuff

In [125]:
# Label shapes seen on Variation nodes, each with one example.
# Short protein change, e.g. V600E
pattern1 = r'^[A-Za-z]\d+[A-Za-z]{1,3}$'
# Versioned-accession HGVS expression, e.g. NM_004333.4:c.1799T>A
pattern2 = r'^([A-Z]+_\d+\.\d+):(g|c|p)\.((\d+(_\d+)?[A-Z]+>[A-Z]+)|(\d+|\d+_\d+)del(?!ins)|(\d+|\d+_\d+)ins[A-Z]+|(\d+|\d+_\d+)delins[A-Z]+|[A-Z][a-z]{2}\d+[A-Z][a-z]{2}|[A-Z][a-z]{2}\d+del|\d+=)$'
# Dash-separated chrom-pos-ref-alt, e.g. 2-209113113-G-C
pattern3 = r'^\d+-\d+-[A-Za-z]-[A-Za-z]$'
# Bare coding substitution without an accession, e.g. c.393T>C
pattern4 = r'^c\.\d+[A-Z]>[A-Z]$'
# Anything that mentions an Ensembl transcript accession
pattern5 = r'ENST'


def classify_by_pattern(value):
    """Return the name of the first label pattern that ``value`` matches.

    Missing values (None / NaN) and values matching no pattern give
    ``'No Match'``. Patterns 1-4 are tested in order from the start of the
    string; pattern 5 is tested last and may match anywhere in the string.
    """
    if pd.isna(value):
        return 'No Match'

    text = str(value)
    anchored_patterns = (
        ('Pattern 1', pattern1),
        ('Pattern 2', pattern2),
        ('Pattern 3', pattern3),
        ('Pattern 4', pattern4),
    )
    for pattern_name, regex in anchored_patterns:
        if re.match(regex, text):
            return pattern_name

    if re.search(pattern5, text):
        return 'Pattern 5'
    return 'No Match'

# Classify every label and store the pattern name in a new column
label_classes = df['label'].apply(classify_by_pattern)
df['label_regex'] = label_classes


In [128]:
# Number of rows assigned to each pattern class.
df['label_regex'].value_counts()

label_regex
Pattern 2    471
Pattern 1    356
No Match     121
Pattern 3     74
Pattern 5     25
Pattern 4      1
Name: count, dtype: int64

In [99]:
# Rows whose label was classified as Pattern 1.
df.loc[df['label_regex'].eq('Pattern 1')]

Unnamed: 0,id,expression_hgvs_c,expression_hgvs_p,digest,label,type,expression_hgvs_g,category_id,category_desc,category_label,var_id,var_type,var_loc_id,var_loc_start,var_loc_end,label_regex
0,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,"[NM_004333.4:c.1799T>A, ENST00000288602.6:c.17...",[NP_004324.2:p.Val600Glu],j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,V600E,Allele,[NC_000007.13:g.140453136A>T],moa.variant:144,,BRAF p.V600E (Missense),ga4gh:VA.LX3ooHBAiZdKY4RfTXcliUmkj48mnD_M,Allele,ga4gh:SL.XutGzMvqbzN-vnxmPt2MJf7ehxmB0opi,140753335,140753336,Pattern 1
1,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,"[NM_004333.4:c.1799T>A, ENST00000288602.6:c.17...",[NP_004324.2:p.Val600Glu],j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,V600E,Allele,[NC_000007.13:g.140453136A>T],civic.mpid:12,BRAF V600E has been shown to be recurrent in m...,BRAF V600E,ga4gh:VA.LX3ooHBAiZdKY4RfTXcliUmkj48mnD_M,Allele,ga4gh:SL.XutGzMvqbzN-vnxmPt2MJf7ehxmB0opi,140753335,140753336,Pattern 1
4,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,"[ENST00000275493.2:c.2369C>T, NM_005228.4:c.23...",[NP_005219.2:p.Thr790Met],sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,T790M,Allele,[NC_000007.13:g.55249071C>T],moa.variant:242,,EGFR p.T790M (Missense),ga4gh:VA.OvEfBRaS34JkfM0_ZHJVDQEjqtwzyjyp,Allele,ga4gh:SL.ZCgOjF-_T0EOBXGc-6yICYui-jgFzJfY,55181377,55181378,Pattern 1
5,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,"[ENST00000275493.2:c.2369C>T, NM_005228.4:c.23...",[NP_005219.2:p.Thr790Met],sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,T790M,Allele,[NC_000007.13:g.55249071C>T],civic.mpid:34,EGFR T790M was one of the very first mutations...,EGFR T790M,ga4gh:VA.OvEfBRaS34JkfM0_ZHJVDQEjqtwzyjyp,Allele,ga4gh:SL.ZCgOjF-_T0EOBXGc-6yICYui-jgFzJfY,55181377,55181378,Pattern 1
8,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,"[NM_005228.4:c.2573T>G, ENST00000275493.2:c.25...",[NP_005219.2:p.Leu858Arg],S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,L858R,Allele,[NC_000007.13:g.55259515T>G],moa.variant:254,,EGFR p.L858R (Missense),ga4gh:VA.TAARa2cxRHmOiij9UBwvW-noMDoOq2x9,Allele,ga4gh:SL.ulUNwZvajob7nzyrlpOd6uUWZIYCsoWb,55191821,55191822,Pattern 1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,ga4gh:VA.RynDzpGjpLKfmAOrN0yrjRyAeIPqV52Q,"[NM_005631.4:c.1234C>T, ENST00000249373.3:c.12...",[NP_005622.1:p.Leu412Phe],RynDzpGjpLKfmAOrN0yrjRyAeIPqV52Q,L412F,Allele,[NC_000007.13:g.128846398C>T],civic.mpid:1478,,SMO L412F,ga4gh:VA.6kcO_sqNNSHOo6fR6cZJ6_o992202o97,Allele,ga4gh:SL.xuaPDRPulmJSAqoCc1WSSSBhcVvY0fhY,129206556,129206557,Pattern 1
898,ga4gh:VA.rIlkyhIg01Me8yT2_Q2woVzTaTcK-Dz1,,,rIlkyhIg01Me8yT2_Q2woVzTaTcK-Dz1,C284Y,Allele,,civic.mpid:1555,,POLD1 C284Y,,,,,,Pattern 1
899,ga4gh:VA.tAvB46rxfRKnXF1pWq1iRJAzyu-pNEz6,,,tAvB46rxfRKnXF1pWq1iRJAzyu-pNEz6,E374K,Allele,,civic.mpid:1556,,POLD1 E374K,,,,,,Pattern 1
900,ga4gh:VA.ie88C_NJ9fuZjOO1ZgGVoGb6ZU1yYuOb,,,ie88C_NJ9fuZjOO1ZgGVoGb6ZU1yYuOb,Q179X,Allele,,civic.mpid:1562,,NRAS Q179X,,,,,,Pattern 1


In [124]:
# Labels that were classified as Pattern 2.
df.loc[df['label_regex'].eq('Pattern 2'), 'label']


2                          NM_004333.4:c.1799T>A
3                    NC_000007.13:g.140453136A>T
6                          NM_005228.4:c.2369C>T
7                     NC_000007.13:g.55249071C>T
10                    NC_000007.13:g.55259515T>G
                         ...                    
886                        NM_005228.3:c.2590G>A
893    NC_000007.13:g.55249012_55249013insGGCACA
894             NM_005228.3:c.2310_2311insGGCACA
896                        NM_005631.4:c.1234C>T
897                  NC_000007.13:g.128846398C>T
Name: label, Length: 471, dtype: object

In [101]:
# Rows whose label was classified as Pattern 3.
df.loc[df['label_regex'].eq('Pattern 3')]


Unnamed: 0,id,expression_hgvs_c,expression_hgvs_p,digest,label,type,expression_hgvs_g,category_id,category_desc,category_label,var_id,var_type,var_loc_id,var_loc_start,var_loc_end,label_regex
913,ga4gh:VA.HUJOQCml0LngKmUf5IJIYQk9CfKmagbf,,,HUJOQCml0LngKmUf5IJIYQk9CfKmagbf,9-133748283-C-T,Allele,,,,,,,,,,Pattern 3
915,ga4gh:VA.R7udthNB0ErCSOrSgHNUKB1uCLXE5BZ5,,,R7udthNB0ErCSOrSgHNUKB1uCLXE5BZ5,9-133747582-A-G,Allele,,,,,,,,,,Pattern 3
917,ga4gh:VA.UTV6lwIVIZgs38dBRg1TU7HYgG5cObP0,,,UTV6lwIVIZgs38dBRg1TU7HYgG5cObP0,9-133748290-C-G,Allele,,,,,,,,,,Pattern 3
919,ga4gh:VA.gVx_jtWuo12r_n-3PxXKH5eV3L8MDt0y,,,gVx_jtWuo12r_n-3PxXKH5eV3L8MDt0y,9-133748288-T-G,Allele,,,,,,,,,,Pattern 3
921,ga4gh:VA.a8TcUEmtsJGEaOHdkcqe-TGj2z19iLhO,,,a8TcUEmtsJGEaOHdkcqe-TGj2z19iLhO,9-133748288-T-A,Allele,,,,,,,,,,Pattern 3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1040,ga4gh:VA.TCoGnTPgu4-nkn9VkajNGKLRSTN7ei6s,,,TCoGnTPgu4-nkn9VkajNGKLRSTN7ei6s,3-178952007-A-G,Allele,,,,,,,,,,Pattern 3
1041,ga4gh:VA.Fw5XPRvCcynVkUKffAWKjwPTuF7R1FO3,,,Fw5XPRvCcynVkUKffAWKjwPTuF7R1FO3,10-43614996-G-A,Allele,,,,,,,,,,Pattern 3
1043,ga4gh:VA.uaKTab81sgTH6v26fsDJkGcKidJa_GqD,,,uaKTab81sgTH6v26fsDJkGcKidJa_GqD,2-209113113-G-C,Allele,,,,,,,,,,Pattern 3
1045,ga4gh:VA.VbDmOJp91MyXUGNKuKtJfm5WvwgjnLjh,,,VbDmOJp91MyXUGNKuKtJfm5WvwgjnLjh,2-209113113-G-T,Allele,,,,,,,,,,Pattern 3


In [127]:
# First 25 labels that none of the patterns recognised.
df.loc[df['label_regex'].eq('No Match'), 'label'].head(25)

19     NP_000507.1:p.Ile131=
174             E746_A750del
175                    R233*
222             W557_K558del
257                   Q1178*
293                    Q503*
300         R200W (c.598C>T)
303               Q56_V60del
317           D770_N771insGL
321             D770delinsGY
395                   S2289*
414          V769_D770insASV
415          V769_D770insASV
419            M774DELINSWLV
447         L747_P753delinsS
472            D770_N771insG
475          H773_V774insNPH
476         L747_S752delinsQ
479          P772_H773insYNP
480          P772_V774insPHV
488             N486_P490del
490                   K3326*
492                    L938*
496           A502_Y503insAY
506         P551_E554delPMYE
Name: label, dtype: object

In [129]:
# Display df again, now including the label_regex column.
df

Unnamed: 0,id,expression_hgvs_c,expression_hgvs_p,digest,label,type,expression_hgvs_g,category_id,category_desc,category_label,var_id,var_type,var_loc_id,var_loc_start,var_loc_end,label_regex
0,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,"[NM_004333.4:c.1799T>A, ENST00000288602.6:c.17...",[NP_004324.2:p.Val600Glu],j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,V600E,Allele,[NC_000007.13:g.140453136A>T],moa.variant:144,,BRAF p.V600E (Missense),ga4gh:VA.LX3ooHBAiZdKY4RfTXcliUmkj48mnD_M,Allele,ga4gh:SL.XutGzMvqbzN-vnxmPt2MJf7ehxmB0opi,140753335,140753336,Pattern 1
1,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,"[NM_004333.4:c.1799T>A, ENST00000288602.6:c.17...",[NP_004324.2:p.Val600Glu],j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,V600E,Allele,[NC_000007.13:g.140453136A>T],civic.mpid:12,BRAF V600E has been shown to be recurrent in m...,BRAF V600E,ga4gh:VA.LX3ooHBAiZdKY4RfTXcliUmkj48mnD_M,Allele,ga4gh:SL.XutGzMvqbzN-vnxmPt2MJf7ehxmB0opi,140753335,140753336,Pattern 1
2,ga4gh:VA.W6xsV-aFm9yT2Bic5cFAV2j0rll6KK5R,,,W6xsV-aFm9yT2Bic5cFAV2j0rll6KK5R,NM_004333.4:c.1799T>A,Allele,,,,,,,,,,Pattern 2
3,ga4gh:VA.Otc5ovrw906Ack087o1fhegB4jDRqCAe,,,Otc5ovrw906Ack087o1fhegB4jDRqCAe,NC_000007.13:g.140453136A>T,Allele,,,,,,,,,,Pattern 2
4,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,"[ENST00000275493.2:c.2369C>T, NM_005228.4:c.23...",[NP_005219.2:p.Thr790Met],sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,T790M,Allele,[NC_000007.13:g.55249071C>T],moa.variant:242,,EGFR p.T790M (Missense),ga4gh:VA.OvEfBRaS34JkfM0_ZHJVDQEjqtwzyjyp,Allele,ga4gh:SL.ZCgOjF-_T0EOBXGc-6yICYui-jgFzJfY,55181377,55181378,Pattern 1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1043,ga4gh:VA.uaKTab81sgTH6v26fsDJkGcKidJa_GqD,,,uaKTab81sgTH6v26fsDJkGcKidJa_GqD,2-209113113-G-C,Allele,,,,,,,,,,Pattern 3
1044,ga4gh:VA.y-ioX4e_ySwP_LlplLNp0cz04a8BBr9H,,,y-ioX4e_ySwP_LlplLNp0cz04a8BBr9H,,Allele,,moa.variant:860,,IDH1 p.R132S (Missense),,,,,,No Match
1045,ga4gh:VA.VbDmOJp91MyXUGNKuKtJfm5WvwgjnLjh,,,VbDmOJp91MyXUGNKuKtJfm5WvwgjnLjh,2-209113113-G-T,Allele,,,,,,,,,,Pattern 3
1046,ga4gh:VA.2zh_S3hn7AWJd00rhirLhhDW8VcVBo9N,,,2zh_S3hn7AWJd00rhirLhhDW8VcVBo9N,,Allele,,moa.variant:861,,IDH1 p.R132L (Missense),,,,,,No Match


### Disease

### Therapy

### Gene