# Molecular Oncology Almanac Assertion Analysis

In [243]:

import csv
import pandas as pd
import numpy as np
from civicpy import civic
from pathlib import Path 
import zipfile 
import plotly.express as px
import requests


## Creating a table with variant(feature) and evidence(assertion) information

In [244]:
import requests
import json

r = requests.get("https://moalmanac.org/api/features")
if r.status_code == 200:
    feature_data = r.json()

transformed = []
for feature in feature_data:
    feature_id = feature["feature_id"]
    feature_type = feature["feature_type"]
    
    transformed.append({"feature_id": feature_id,
                        "feature_type": feature_type})
feature_resp_df =  pd.DataFrame(transformed)
print(len(feature_resp_df))

423


In [245]:
import requests
import json

r = requests.get("https://moalmanac.org/api/assertions")
if r.status_code == 200:
    assertion_data = r.json()

transformed = []
for assertion in assertion_data:
    assertion_id = assertion["assertion_id"]
    predictive_implication = assertion["predictive_implication"]
    
    if len(assertion["features"]) != 1:
        print(f"assertion id ({assertion_id}) does not have 1 feature")

    feature_id = assertion["features"][0]["feature_id"]
    transformed.append({"assertion_id": assertion_id,
                        "feature_id": feature_id,
                        "predictive_implication": predictive_implication})
assertion_resp_df =  pd.DataFrame(transformed)
print(len(assertion_resp_df))

874


In [266]:
moa_df = pd.merge(feature_resp_df, assertion_resp_df, on ='feature_id')
moa_df.head()
print(len(moa_df))

423


### Converting feature types to normalized categories

In [247]:
moa_df.feature_type.unique()

array(['rearrangement', 'somatic_variant', 'germline_variant',
       'copy_number', 'microsatellite_stability', 'mutational_signature',
       'mutational_burden', 'knockdown', 'aneuploidy'], dtype=object)

In [248]:
moa_df["category"] = moa_df["feature_type"].copy()

moa_df["category"]= moa_df["category"].replace("aneuploidy", "copy_number")
moa_df["category"]= moa_df["category"].replace("knockdown", "expression")
moa_df["category"]= moa_df["category"].replace("somatic_variant", "protein_consequence")
moa_df["category"]= moa_df["category"].replace("germline_variant", "protein_consequence")
moa_df["category"]= moa_df["category"].replace("microsatellite_stability","rearrangement")
moa_df["category"]= moa_df["category"].replace("mutational_burden","other")
moa_df["category"]= moa_df["category"].replace("mutational_signature","other") 

moa_df.head()

Unnamed: 0,feature_id,feature_type,assertion_id,predictive_implication,category
0,1,rearrangement,1,FDA-Approved,rearrangement
1,12,rearrangement,12,FDA-Approved,rearrangement
2,15,rearrangement,15,FDA-Approved,rearrangement
3,18,rearrangement,18,Guideline,rearrangement
4,21,rearrangement,21,Preclinical,rearrangement


In [249]:
moa_df.category.unique()

array(['rearrangement', 'protein_consequence', 'copy_number', 'other',
       'expression'], dtype=object)

## Adding a numerical impact score based on the predictive implication
This is based on the structure of CIViC scoring

In [250]:
predictive_implication_categories = moa_df.predictive_implication.unique()
predictive_implication_categories

array(['FDA-Approved', 'Guideline', 'Preclinical', 'Inferential',
       'Clinical evidence', 'Clinical trial'], dtype=object)

In [251]:
moa_df["impact_score"] = moa_df["predictive_implication"].copy()

moa_df.loc[moa_df["impact_score"] == "FDA-Approved", "impact_score"] = 10
moa_df.loc[moa_df["impact_score"] == "Guideline", "impact_score"] = 10
moa_df.loc[moa_df["impact_score"] == "Clinical evidence", "impact_score"] = 5
moa_df.loc[moa_df["impact_score"] == "Clinical trial", "impact_score"] = 5
moa_df.loc[moa_df["impact_score"] == "Preclinical", "impact_score"] = 1
moa_df.loc[moa_df["impact_score"] == "Inferential", "impact_score"] = 1

moa_df.head()

Unnamed: 0,feature_id,feature_type,assertion_id,predictive_implication,category,impact_score
0,1,rearrangement,1,FDA-Approved,rearrangement,10
1,12,rearrangement,12,FDA-Approved,rearrangement,10
2,15,rearrangement,15,FDA-Approved,rearrangement,10
3,18,rearrangement,18,Guideline,rearrangement,10
4,21,rearrangement,21,Preclinical,rearrangement,1


### Impact Score Analysis

In [252]:
normalized_categories_list = list(moa_df.category.unique())
normalized_categories_list

['rearrangement', 'protein_consequence', 'copy_number', 'other', 'expression']

In [253]:
feature_categories_impact_data = dict()
for category in normalized_categories_list:
    print(category)
    feature_categories_impact_data[category] = {}
    impact_category_df = moa_df[moa_df.category == category]

    total_sum_category_impact = impact_category_df['impact_score'].sum()
    feature_categories_impact_data[category]["total_sum_category_impact"]=  total_sum_category_impact
    print(total_sum_category_impact)

    print("--------------------")

rearrangement
292
--------------------
protein_consequence
1958
--------------------
copy_number
153
--------------------
other
35
--------------------
expression
11
--------------------


### Variant(feature) Analysis

In [254]:
total_number_features = len(set(moa_df.feature_id))
print(total_number_features)

423


In [255]:
total_number_assertions = len(set(moa_df.assertion_id))
print(total_number_assertions)

423


In [256]:
moa_feature_data = dict()
for category in normalized_categories_list:
    print(category)
    moa_feature_data[category] = {}
    feature_type_df = moa_df[moa_df.category == category]
    
    number_unique_category_features = len(set(feature_type_df.feature_id))
    moa_feature_data[category]["number_unique_category_features"] = number_unique_category_features

    fraction_category_feature = f"{number_unique_category_features} / {total_number_features}"
    moa_feature_data[category]["fraction_category_feature"] = fraction_category_feature
    print(f"Number of {category} features in MOA: {fraction_category_feature}")
    
    percent_category_feature = "{:.2f}".format(number_unique_category_features/total_number_features*100) + "%"
    moa_feature_data[category]["percent_category_feature"] = percent_category_feature
    print(f"Percent of {category} features in MOA: {percent_category_feature}")

    print("--------------------")

rearrangement
Number of rearrangement features in MOA: 38 / 423
Percent of rearrangement features in MOA: 8.98%
--------------------
protein_consequence
Number of protein_consequence features in MOA: 318 / 423
Percent of protein_consequence features in MOA: 75.18%
--------------------
copy_number
Number of copy_number features in MOA: 47 / 423
Percent of copy_number features in MOA: 11.11%
--------------------
other
Number of other features in MOA: 9 / 423
Percent of other features in MOA: 2.13%
--------------------
expression
Number of expression features in MOA: 11 / 423
Percent of expression features in MOA: 2.60%
--------------------


### Evidence(assertion) Analysis

In [257]:
moa_assertion_data = dict()
for category in normalized_categories_list:
    print(category)
    moa_assertion_data[category] = {}
    assertion_type_df = moa_df[moa_df.category == category]
    
    number_unique_category_assertions = len(set(assertion_type_df.assertion_id))
    moa_assertion_data[category]["number_unique_category_assertions"] = number_unique_category_assertions

    fraction_category_assertion = f"{number_unique_category_assertions} / {total_number_assertions}"
    moa_assertion_data[category]["fraction_category_assertion"] = fraction_category_assertion
    print(f"Number of {category} assertions in MOA: {fraction_category_assertion}")
    
    percent_category_assertion = "{:.2f}".format(number_unique_category_assertions/total_number_assertions*100) + "%"
    moa_assertion_data[category]["percent_category_assertion"] = percent_category_assertion
    print(f"Percent of {category} assertions in MOA: {percent_category_assertion}")

    print("--------------------")

rearrangement
Number of rearrangement assertions in MOA: 38 / 423
Percent of rearrangement assertions in MOA: 8.98%
--------------------
protein_consequence
Number of protein_consequence assertions in MOA: 318 / 423
Percent of protein_consequence assertions in MOA: 75.18%
--------------------
copy_number
Number of copy_number assertions in MOA: 47 / 423
Percent of copy_number assertions in MOA: 11.11%
--------------------
other
Number of other assertions in MOA: 9 / 423
Percent of other assertions in MOA: 2.13%
--------------------
expression
Number of expression assertions in MOA: 11 / 423
Percent of expression assertions in MOA: 2.60%
--------------------


## Summaries

In [258]:
feature_categories = ['Rearrangement', 'Protein Consequence', 'Copy Number', 'Other', 'Expression']
feature_category_impact_score= [v["total_sum_category_impact"] for k, v in feature_categories_impact_data.items()]
feature_category_number = [v["number_unique_category_features"] for k, v in moa_feature_data.items()]
feature_category_fraction = [v["fraction_category_feature"] for k, v in moa_feature_data.items()]
feature_category_percent= [v["percent_category_feature"] for k, v in moa_feature_data.items()]
feature_category_assertion_number= [v["number_unique_category_assertions"] for k, v in moa_assertion_data.items()]
feature_category_assertion_fraction= [v["fraction_category_assertion"] for k, v in moa_assertion_data.items()]
feature_category_assertion_percent= [v["percent_category_assertion"] for k, v in moa_assertion_data.items()]

In [259]:
feature_category_dict = {'Category':feature_categories,
                        'Number of Features': feature_category_number,
                        'Fraction of Features': feature_category_fraction,
                        'Percent of Features': feature_category_percent,
                        'Number of Assertions': feature_category_assertion_number,       
                        'Fraction of Assertions':feature_category_assertion_fraction,
                        'Percent of Assertions':feature_category_assertion_percent,
                        'Impact Score':feature_category_impact_score}

In [260]:
moa_feature_df = pd.DataFrame(feature_category_dict)
moa_feature_df

Unnamed: 0,Category,Number of Features,Fraction of Features,Percent of Features,Number of Assertions,Fraction of Assertions,Percent of Assertions,Impact Score
0,Rearrangement,38,38 / 423,8.98%,38,38 / 423,8.98%,292
1,Protein Consequence,318,318 / 423,75.18%,318,318 / 423,75.18%,1958
2,Copy Number,47,47 / 423,11.11%,47,47 / 423,11.11%,153
3,Other,9,9 / 423,2.13%,9,9 / 423,2.13%,35
4,Expression,11,11 / 423,2.60%,11,11 / 423,2.60%,11


In [261]:

moa_feature_df['Percent of Features'] = moa_feature_df['Fraction of Features'].astype(str) + " (" + moa_feature_df['Percent of Features']+ ")"
moa_feature_df['Percent of Assertions'] = moa_feature_df['Fraction of Assertions'].astype(str) + " (" + moa_feature_df['Percent of Assertions']+ ")"

In [262]:
moa_feature_df_abbreviated = moa_feature_df.drop(['Number of Features','Fraction of Features','Number of Assertions', 'Fraction of Assertions'], axis=1)

In [263]:
moa_feature_df_abbreviated =moa_feature_df_abbreviated.set_index("Category")
moa_feature_df_abbreviated

Unnamed: 0_level_0,Percent of Features,Percent of Assertions,Impact Score
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Rearrangement,38 / 423 (8.98%),38 / 423 (8.98%),292
Protein Consequence,318 / 423 (75.18%),318 / 423 (75.18%),1958
Copy Number,47 / 423 (11.11%),47 / 423 (11.11%),153
Other,9 / 423 (2.13%),9 / 423 (2.13%),35
Expression,11 / 423 (2.60%),11 / 423 (2.60%),11


In [264]:
fig= px.scatter(data_frame= moa_feature_df,
                x= 'Number of Assertions',
                y= 'Impact Score',
                size= 'Number of Features',
                size_max= 40,
                text= 'Number of Features',
                color= 'Category')
fig.show()

In [265]:
# fig.write_html("../../../../../../moa_feature_categories_impact_scatterplot.html")