# Molecular Oncology Almanac Assertion Analysis

In [2]:
from typing import Dict
import json

import pandas as pd
import plotly.express as px
import requests
from ga4gh.core import sha512t24u


## Creating a table with variant (feature) and evidence (assertion) information

In [3]:
def get_feature_digest(feature: Dict) -> str:
    """Compute a digest identifying a MOA feature by its attributes.

    Only the first entry of the feature's ``attributes`` list is used. It is
    serialized to compact JSON with sorted keys, so the key order of the
    incoming dict does not influence the result, and the UTF-8 bytes of that
    JSON are passed to ``sha512t24u``.

    :param feature: MOA feature; must contain a non-empty ``attributes`` list
    :return: Digest of the feature's first attribute set
    """
    first_attributes = feature["attributes"][0]
    canonical_json = json.dumps(
        first_attributes, sort_keys=True, separators=(",", ":")
    )
    return sha512t24u(canonical_json.encode("utf-8"))

In [4]:
# Build a lookup of feature digest -> feature type from the MOA features endpoint.
# The key is the digest of the feature's attributes (see get_feature_digest),
# not the MOA feature_id, so features with identical attributes share one entry.
features_response = requests.get("https://moalmanac.org/api/features", timeout=30)
# Fail here with the HTTP error instead of hitting a NameError (or silently
# reusing stale kernel state) further down when the request did not succeed.
features_response.raise_for_status()
feature_data = features_response.json()

features = {
    get_feature_digest(feature): feature["feature_type"]
    for feature in feature_data
}

len(features)


423

In [5]:
# Build one row per assertion with its single associated feature (identified by
# digest), that feature's type, and the assertion's predictive implication.
assertions_response = requests.get("https://moalmanac.org/api/assertions", timeout=30)
# Fail here with the HTTP error instead of hitting a NameError (or silently
# reusing stale kernel state) further down when the request did not succeed.
assertions_response.raise_for_status()
assertion_data = assertions_response.json()

transformed = []

for assertion in assertion_data:
    assertion_id = assertion["assertion_id"]
    predictive_implication = assertion["predictive_implication"]

    # The analysis assumes exactly one feature per assertion; report and skip
    # anything else rather than guessing which feature to use.
    if len(assertion["features"]) != 1:
        print(f"assertion id ({assertion_id}) does not have 1 feature")
        continue

    feature = assertion["features"][0]
    feature_digest = get_feature_digest(feature)

    transformed.append(
        {
            "assertion_id": assertion_id,
            "feature_id": feature["feature_id"],
            # Feature type is looked up by digest in the features table built earlier.
            "feature_type": features[feature_digest],
            "predictive_implication": predictive_implication,
            "feature_digest": feature_digest,
        }
    )

moa_df = pd.DataFrame(transformed)
moa_df

Unnamed: 0,assertion_id,feature_id,feature_type,predictive_implication,feature_digest
0,1,1,rearrangement,FDA-Approved,RnRyn89cJzVbVM93aw4OA44NIF5zblyP
1,2,2,rearrangement,FDA-Approved,RnRyn89cJzVbVM93aw4OA44NIF5zblyP
2,3,3,rearrangement,FDA-Approved,RnRyn89cJzVbVM93aw4OA44NIF5zblyP
3,4,4,rearrangement,FDA-Approved,RnRyn89cJzVbVM93aw4OA44NIF5zblyP
4,5,5,rearrangement,FDA-Approved,RnRyn89cJzVbVM93aw4OA44NIF5zblyP
...,...,...,...,...,...
869,870,870,somatic_variant,FDA-Approved,1JInmjKzPW9V9q9UKen4VODk1drBadA2
870,871,871,somatic_variant,FDA-Approved,txWE0iDd8r36tzSRZw9tyMcMz9-L5M0g
871,872,872,copy_number,FDA-Approved,PWxJ97XlgUgdA5hvLmyZENRIL9pTYY84
872,873,873,somatic_variant,FDA-Approved,qXCY7CGzeFBIDXYsc18tEVBImkWB_43E


In [6]:
# Features are counted by digest, so assertions that reference features with
# identical attributes contribute a single unique feature.
unique_feature_digests = moa_df["feature_digest"].unique()
len_features = len(unique_feature_digests)
f"Total number of unique features (variants): {len_features}"

'Total number of unique features (variants): 423'

In [7]:
# Assertions are counted by their MOA assertion_id.
unique_assertion_ids = moa_df["assertion_id"].unique()
len_assertions = len(unique_assertion_ids)
f"Total number of unique assertions: {len_assertions}"

'Total number of unique assertions: 874'

### Converting feature types to normalized categories

In [8]:
# Distinct MOA feature types, in order of first appearance in moa_df.
moa_df["feature_type"].unique().tolist()

['rearrangement',
 'somatic_variant',
 'germline_variant',
 'copy_number',
 'microsatellite_stability',
 'mutational_signature',
 'mutational_burden',
 'knockdown',
 'aneuploidy']

In [9]:
# Collapse MOA feature types into the normalized categories used in this analysis.
# Feature types without an entry here ("rearrangement", "copy_number") keep their name.
feature_type_to_category = {
    "aneuploidy": "copy_number",
    "knockdown": "expression",
    "somatic_variant": "protein_consequence",
    "germline_variant": "protein_consequence",
    "microsatellite_stability": "rearrangement",
    "mutational_burden": "other",
    "mutational_signature": "other",
}
# No replacement value is itself a key, so one simultaneous replace is
# equivalent to applying the substitutions one after another.
moa_df["category"] = moa_df["feature_type"].replace(feature_type_to_category)

moa_df.head()

Unnamed: 0,assertion_id,feature_id,feature_type,predictive_implication,feature_digest,category
0,1,1,rearrangement,FDA-Approved,RnRyn89cJzVbVM93aw4OA44NIF5zblyP,rearrangement
1,2,2,rearrangement,FDA-Approved,RnRyn89cJzVbVM93aw4OA44NIF5zblyP,rearrangement
2,3,3,rearrangement,FDA-Approved,RnRyn89cJzVbVM93aw4OA44NIF5zblyP,rearrangement
3,4,4,rearrangement,FDA-Approved,RnRyn89cJzVbVM93aw4OA44NIF5zblyP,rearrangement
4,5,5,rearrangement,FDA-Approved,RnRyn89cJzVbVM93aw4OA44NIF5zblyP,rearrangement


In [10]:
# Normalized categories in order of first appearance; the per-category
# analyses below iterate over this list.
moa_normalized_categories = moa_df["category"].unique().tolist()
moa_normalized_categories

['rearrangement', 'protein_consequence', 'copy_number', 'other', 'expression']

## Adding a numerical impact score based on the predictive implication
The scoring scheme below is modeled on the structure of CIViC scoring.

In [11]:
# Distinct predictive implication labels present in the assertions.
predictive_implication_categories = moa_df["predictive_implication"].unique()
predictive_implication_categories.tolist()

['FDA-Approved',
 'Guideline',
 'Clinical trial',
 'Preclinical',
 'Inferential',
 'Clinical evidence']

In [12]:
# Numeric weight assigned to each MOA predictive implication label.
predictive_implication_scores = {
    "FDA-Approved": 10,
    "Guideline": 10,
    "Clinical evidence": 5,
    "Clinical trial": 5,
    "Preclinical": 1,
    "Inferential": 1,
}

# A label without a score would otherwise end up as a non-numeric entry in the
# score column and only surface later as an obscure error when summing.
unscored_implications = set(moa_df["predictive_implication"].dropna()).difference(
    predictive_implication_scores
)
if unscored_implications:
    raise ValueError(
        "No impact score defined for predictive implication(s): "
        f"{sorted(unscored_implications, key=str)}"
    )

# Mapping (rather than overwriting a copy of the string column in place) yields
# a numeric column and makes this cell safe to re-run.
moa_df["impact_score"] = moa_df["predictive_implication"].map(predictive_implication_scores)

moa_df.head()

Unnamed: 0,assertion_id,feature_id,feature_type,predictive_implication,feature_digest,category,impact_score
0,1,1,rearrangement,FDA-Approved,RnRyn89cJzVbVM93aw4OA44NIF5zblyP,rearrangement,10
1,2,2,rearrangement,FDA-Approved,RnRyn89cJzVbVM93aw4OA44NIF5zblyP,rearrangement,10
2,3,3,rearrangement,FDA-Approved,RnRyn89cJzVbVM93aw4OA44NIF5zblyP,rearrangement,10
3,4,4,rearrangement,FDA-Approved,RnRyn89cJzVbVM93aw4OA44NIF5zblyP,rearrangement,10
4,5,5,rearrangement,FDA-Approved,RnRyn89cJzVbVM93aw4OA44NIF5zblyP,rearrangement,10


### Impact Score Analysis

In [13]:
# Total impact score per normalized category (sum over all of its assertions).
feature_categories_impact_data = {}
for category in moa_normalized_categories:
    impact_category_df = moa_df.loc[moa_df["category"] == category]
    total_sum_category_impact = impact_category_df["impact_score"].sum()

    feature_categories_impact_data[category] = {
        "total_sum_category_impact": total_sum_category_impact
    }
    print(f"{category}: {total_sum_category_impact}")

rearrangement: 644
protein_consequence: 3999
copy_number: 415
other: 62
expression: 12


### Variant (feature) Analysis

In [14]:
# Per category: number of unique features (by digest) and its share of all
# unique features in moa_df.
moa_feature_data = {}
for category in moa_normalized_categories:
    print(category)
    feature_type_df = moa_df.loc[moa_df["category"] == category]

    number_unique_category_features = len(set(feature_type_df["feature_digest"]))
    fraction_category_feature = f"{number_unique_category_features} / {len_features}"
    percent_category_feature = f"{number_unique_category_features / len_features * 100:.2f}%"

    moa_feature_data[category] = {
        "number_unique_category_features": number_unique_category_features,
        "fraction_category_feature": fraction_category_feature,
        "percent_category_feature": percent_category_feature,
    }

    print(f"Number of {category} features in MOA: {fraction_category_feature}")
    print(f"Percent of {category} features in MOA: {percent_category_feature}")
    print("--------------------")

rearrangement
Number of rearrangement features in MOA: 38 / 423
Percent of rearrangement features in MOA: 8.98%
--------------------
protein_consequence
Number of protein_consequence features in MOA: 318 / 423
Percent of protein_consequence features in MOA: 75.18%
--------------------
copy_number
Number of copy_number features in MOA: 47 / 423
Percent of copy_number features in MOA: 11.11%
--------------------
other
Number of other features in MOA: 9 / 423
Percent of other features in MOA: 2.13%
--------------------
expression
Number of expression features in MOA: 11 / 423
Percent of expression features in MOA: 2.60%
--------------------


### Evidence (assertion) Analysis

In [15]:
# Per category: number of unique assertions (by assertion_id) and its share of
# all unique assertions in moa_df.
moa_assertion_data = {}
for category in moa_normalized_categories:
    print(f"category: {category}")
    assertion_type_df = moa_df.loc[moa_df["category"] == category]

    number_unique_category_assertions = len(set(assertion_type_df["assertion_id"]))
    fraction_category_assertion = f"{number_unique_category_assertions} / {len_assertions}"
    percent_category_assertion = f"{number_unique_category_assertions / len_assertions * 100:.2f}%"

    moa_assertion_data[category] = {
        "number_unique_category_assertions": number_unique_category_assertions,
        "fraction_category_assertion": fraction_category_assertion,
        "percent_category_assertion": percent_category_assertion,
    }

    print(f"Number of {category} assertions in MOA: {fraction_category_assertion}")
    print(f"Percent of {category} assertions in MOA: {percent_category_assertion}")

    print("--------------------")

category: rearrangement
Number of rearrangement assertions in MOA: 81 / 874
Percent of rearrangement assertions in MOA: 9.27%
--------------------
category: protein_consequence
Number of protein_consequence assertions in MOA: 656 / 874
Percent of protein_consequence assertions in MOA: 75.06%
--------------------
category: copy_number
Number of copy_number assertions in MOA: 102 / 874
Percent of copy_number assertions in MOA: 11.67%
--------------------
category: other
Number of other assertions in MOA: 23 / 874
Percent of other assertions in MOA: 2.63%
--------------------
category: expression
Number of expression assertions in MOA: 12 / 874
Percent of expression assertions in MOA: 1.37%
--------------------


## Summaries

In [16]:
# Display labels are derived from the category keys (e.g. "copy_number" ->
# "Copy Number") instead of being hard-coded, so each label always matches the
# row it describes even if the order of categories in the data changes.
feature_categories = [
    category.replace("_", " ").title() for category in moa_normalized_categories
]

# Every list below is indexed by the same category order as the labels above.
feature_category_impact_score = [
    feature_categories_impact_data[category]["total_sum_category_impact"]
    for category in moa_normalized_categories
]
feature_category_number = [
    moa_feature_data[category]["number_unique_category_features"]
    for category in moa_normalized_categories
]
feature_category_fraction = [
    moa_feature_data[category]["fraction_category_feature"]
    for category in moa_normalized_categories
]
feature_category_percent = [
    moa_feature_data[category]["percent_category_feature"]
    for category in moa_normalized_categories
]
feature_category_assertion_number = [
    moa_assertion_data[category]["number_unique_category_assertions"]
    for category in moa_normalized_categories
]
feature_category_assertion_fraction = [
    moa_assertion_data[category]["fraction_category_assertion"]
    for category in moa_normalized_categories
]
feature_category_assertion_percent = [
    moa_assertion_data[category]["percent_category_assertion"]
    for category in moa_normalized_categories
]

In [17]:
# Column name -> per-category values for the summary table; insertion order
# determines the column order of the resulting DataFrame.
feature_category_dict = dict(
    [
        ("Category", feature_categories),
        ("Number of Features", feature_category_number),
        ("Fraction of Features", feature_category_fraction),
        ("Percent of Features", feature_category_percent),
        ("Number of Assertions", feature_category_assertion_number),
        ("Fraction of Assertions", feature_category_assertion_fraction),
        ("Percent of Assertions", feature_category_assertion_percent),
        ("Impact Score", feature_category_impact_score),
    ]
)

In [18]:
# Summary table: one row per normalized category.
moa_feature_df = pd.DataFrame(data=feature_category_dict)
moa_feature_df

Unnamed: 0,Category,Number of Features,Fraction of Features,Percent of Features,Number of Assertions,Fraction of Assertions,Percent of Assertions,Impact Score
0,Rearrangement,38,38 / 423,8.98%,81,81 / 874,9.27%,644
1,Protein Consequence,318,318 / 423,75.18%,656,656 / 874,75.06%,3999
2,Copy Number,47,47 / 423,11.11%,102,102 / 874,11.67%,415
3,Other,9,9 / 423,2.13%,23,23 / 874,2.63%,62
4,Expression,11,11 / 423,2.60%,12,12 / 874,1.37%,12


In [19]:

# Combine fraction and percent into one display string, e.g. "38 / 423 (8.98%)".
# The strings are rebuilt from the untouched per-category lists rather than from
# the columns being overwritten, so re-running this cell cannot nest the text
# ("38 / 423 (38 / 423 (8.98%))").
moa_feature_df["Percent of Features"] = [
    f"{fraction} ({percent})"
    for fraction, percent in zip(feature_category_fraction, feature_category_percent)
]
moa_feature_df["Percent of Assertions"] = [
    f"{fraction} ({percent})"
    for fraction, percent in zip(
        feature_category_assertion_fraction, feature_category_assertion_percent
    )
]

In [20]:
# Keep only the category label, the combined percent columns and the impact score.
moa_feature_df_abbreviated = moa_feature_df.drop(
    columns=[
        "Number of Features",
        "Fraction of Features",
        "Number of Assertions",
        "Fraction of Assertions",
    ]
)

In [21]:
# Use the category label as the row index. The guard keeps the cell re-runnable:
# once "Category" is the index it is no longer a column, and calling
# set_index("Category") a second time would raise a KeyError.
if "Category" in moa_feature_df_abbreviated.columns:
    moa_feature_df_abbreviated = moa_feature_df_abbreviated.set_index("Category")
moa_feature_df_abbreviated

Unnamed: 0_level_0,Percent of Features,Percent of Assertions,Impact Score
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Rearrangement,38 / 423 (8.98%),81 / 874 (9.27%),644
Protein Consequence,318 / 423 (75.18%),656 / 874 (75.06%),3999
Copy Number,47 / 423 (11.11%),102 / 874 (11.67%),415
Other,9 / 423 (2.13%),23 / 874 (2.63%),62
Expression,11 / 423 (2.60%),12 / 874 (1.37%),12


In [22]:
# One bubble per category: x = number of assertions, y = summed impact score,
# bubble size and text label = number of unique features.
fig = px.scatter(
    moa_feature_df,
    x="Number of Assertions",
    y="Impact Score",
    color="Category",
    size="Number of Features",
    text="Number of Features",
    size_max=40,
)
fig.show()

In [23]:
# Save the interactive scatter plot as an HTML file in the working directory.
scatterplot_html_path = "moa_feature_categories_impact_scatterplot.html"
fig.write_html(scatterplot_html_path)