In [1]:
# Data manipulation
# --------------------------------------------------------
import pandas as pd
import numpy as np
import polars as pl
import math
import time

# Visualizations
# --------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import altair as alt
from plotnine import ggplot, geom_point, aes, stat_smooth, facet_wrap
from bokeh.plotting import figure, show
import seaborn.objects as so
import plotly.express as px
import plotly.graph_objects as go

# Custom scripts
# ---------------------------------------------------------
from scripts.data_processing import cleaning, compare_and_drop_duplicates

# Other
# ---------------------------------------------------------
from collections import Counter
import umap
from sklearn.preprocessing import MinMaxScaler, RobustScaler, QuantileTransformer, OrdinalEncoder
from tqdm import tqdm

In [2]:
# Load raw data
def _read_tab_separated(path, **options):
    """Read a tab-delimited text file; extra keyword options go straight to pandas."""
    return pd.read_csv(path, sep="\t", **options)


# All three inputs are parsed as tab-separated, whatever their file extension says.
var_info = _read_tab_separated("./data/raw/variation_information.tsv")
cancer_clinvar = _read_tab_separated("./data/raw/cancermama_clinvarmain.csv", low_memory=False)
# header=None: the first line of this file is read as data, not as column names.
upgenevsrep = _read_tab_separated("./data/raw/UP.geneVsrepList.txt", header=None)

In [3]:
# Clean each dataset
# Every frame goes through the shared `cleaning` helper, tagged with its source name.
var_info, cancer_clinvar, upgenevsrep = (
    cleaning(raw_frame, df_name=source_name)
    for raw_frame, source_name in (
        (var_info, "variation_information"),
        (cancer_clinvar, "cancermama_clinvarmain"),
        (upgenevsrep, "UP.geneVsrepList"),
    )
)

In [4]:
# Join var_info and cancer_clinvar
# Both tables get a composite "<snp id> <ClinInfo>" key built from string-cast columns.
var_key_parts = var_info[["rsID", "ClinInfo"]].astype(str)
var_info["real_id"] = var_key_parts["rsID"] + " " + var_key_parts["ClinInfo"]

clinvar_key_parts = cancer_clinvar[["snpId", "ClinInfo"]].astype(str)
cancer_clinvar["real_id"] = clinvar_key_parts["snpId"] + " " + clinvar_key_parts["ClinInfo"]

# ClinInfo is dropped from the ClinVar side first, so the merged frame only
# carries var_info's copy of that column.
cancer_clinvar = cancer_clinvar.drop(columns=["ClinInfo"])

# Default (inner) join: only keys present in both tables survive.
merged_df = pd.merge(cancer_clinvar, var_info, left_on="real_id", right_on="real_id")

In [5]:
# Remove duplicated columns with the same values
# `compare_and_drop_duplicates` is imported from scripts.data_processing; the cell
# output lists column pairs it compared together with how many values differ.
# NOTE(review): presumably pairs that differ are kept rather than dropped — confirm
# against the helper's implementation.
merged_df = compare_and_drop_duplicates(merged_df)

Columns 'clinSign' and 'ClinClass' have 2 differences.
Columns '_clinSignCode' and 'Classification' have 2 differences.


In [6]:
# Make binary classification: pathogenic (pg) and likely pathogenic (lp) -> 1, all else -> 0.
# The previous implementation only matched 'pg', so likely-pathogenic rows were
# labelled 0, contradicting the stated intent of this cell.
# NOTE(review): assumes the likely-pathogenic code is spelled 'lp' (lowercase, like
# 'pg') in `Classification` — confirm with merged_df['Classification'].unique().
positive_classes = ['pg', 'lp']
# isin() yields False for missing values, so they map to 0 exactly as before.
merged_df['bin_class'] = merged_df['Classification'].isin(positive_classes).astype('int64')

In [7]:
# Columns removed from merged_df, grouped by the reason they are discarded.
# ids
id_columns = ["numSubmit", "_variantId", "real_id", "origName", "rcvAcc", "snpId"]
# url in column, no additional info
url_columns = ["phenotype"]
# _mouseOver have repeated information included in other columns:
other_columns = ["_mouseOver"]

# geneId goes first: it has the same information as gene.
# Each group is dropped in place, in the same order as listed here.
for unwanted_group in (["geneId"], id_columns, url_columns, other_columns):
    merged_df.drop(columns=unwanted_group, inplace=True)

In [8]:
# Fill nans per column
# Placeholder written into each column's missing cells.
# NOTE(review): "unkwown" looks like a typo for "unknown"; it is kept unchanged
# because the value ends up in the saved data and later steps may match on it.
fill_values = {
    # 1 missing value in clinSign and ClinClass for the same row
    "clinSign": "unkwown",
    "ClinClass": "unkwown",
    "molConseq": "unkwown",
    # 45 missing values in phenotypeList
    "phenotypeList": "not provided",
    # 45 missing values in rcvAcc
    # NOTE(review): rcvAcc is listed in id_columns, which an earlier cell drops,
    # so this entry only takes effect if that drop has not been run.
    "rcvAcc": "rcv000000000",
}

for target_column, placeholder in fill_values.items():
    # Columns that are not present in merged_df are skipped.
    if target_column in merged_df.columns:
        merged_df[target_column] = merged_df[target_column].fillna(placeholder)

In [9]:
# For every column, print a separator followed by its five most frequent values.
separator = "=" * 25
for column_name in merged_df.columns:
    top_values = merged_df[column_name].value_counts().nlargest(5)
    # One print call emits: separator, blank line, the counts, blank line.
    print(separator, "", top_values, "", sep="\n")


#chrom
chr17    49125
chr2     30159
chr16    29047
chr11    24704
chr5     24219
Name: count, dtype: int64


chromStart
37025628    32
47414419    32
47414420    22
95099770    17
78131728    16
Name: count, dtype: int64


chromEnd
37025630    33
47414421    28
37025608    19
95099771    17
78131619    16
Name: count, dtype: int64


name
c>t    49778
g>a    47477
a>g    40605
t>c    30231
del    26620
Name: count, dtype: int64


score
1    222786
2     97300
3     10212
0      8501
Name: count, dtype: int64


reserved
0,0,128        146341
0,210,0        113196
210,0,0         49066
137,121,212     25602
128,128,128      4594
Name: count, dtype: int64


blockSizes
1    306825
2     20478
3      2890
4      2190
5      1184
Name: count, dtype: int64


clinSign
uncertain significance                          146311
likely benign                                    98087
pathogenic                                       36150
conflicting classifications of pathogenicity     25599
benign  

In [10]:
# Persist the cleaned, merged table as an interim CSV; index=False leaves the
# DataFrame row index out of the file.
merged_df.to_csv("./data/interim/merged_df.csv", index=False)

Cluster -1 Genes: GLMN, GPC3, HFE, HRAS




    GLMN: GLMN (Glomulin) gene mutations are associated with glomuvenous malformations.
    GPC3: GPC3 (Glypican 3) is involved in cell growth control and differentiation and mutations can cause Simpson-Golabi-Behmel syndrome.
    HFE: HFE gene mutations are linked to hereditary hemochromatosis, affecting iron metabolism.
    HRAS: HRAS is an oncogene associated with the regulation of cell division and mutations can lead to cancer.




Similarity: Genes related to various syndromes and cancers.




Cluster 0 Genes: RTEL1, RUNX1, SBDS, SDHA, SDHAF2, SDHB, SDHC, SDHD, SMAD4, SMARCA4, SMARCB1, SMC1A, SMC3, STK11, SUFU, SYNE1, SYNE2, TERT, TGFBR1, TGFBR2, TMEM127




    RTEL1: Involved in DNA repair and telomere maintenance.
    RUNX1: Key role in hematopoiesis and is associated with leukemia.
    SBDS: Linked to Shwachman-Diamond syndrome.
    SDHA, SDHAF2, SDHB, SDHC, SDHD: Part of the succinate dehydrogenase complex, mutations can cause mitochondrial disorders.
    SMAD4, SMARCA4, SMARCB1: Tumor suppressor genes involved in cancer development.
    STK11, SUFU: Tumor suppressor genes, mutations associated with Peutz-Jeghers syndrome and medulloblastoma respectively.
    TERT, TGFBR1, TGFBR2: Associated with telomere function and TGF-beta signaling pathway.
    SYNE1, SYNE2: Involved in nuclear envelope structure and mutations can cause muscular dystrophies.
    TMEM127: Associated with pheochromocytoma.




Similarity: Predominantly tumor suppressor genes and genes involved in syndromes or cancer.




Cluster 1 Genes: PIK3CA, PMS2, POLD1, POLH, POU6F2, PTCH1, PTEN, RAD51C, RAD51D, RB1, RET, RHBDF2, RHOA, RINT1, RNF43, ROS1, RTEL1, RUNX1, SBDS, SDHA, SDHAF2, SDHB, SDHC, SDHD, SMAD4, SMARCA4, SMARCB1, SMC1A, SMC3, STK11, SUFU, SYNE1, SYNE2, TERT, TGFBR1, TGFBR2, TMEM127




    PIK3CA: Involved in the PI3K/AKT pathway, mutations associated with various cancers.
    PMS2, POLD1, POLH: DNA repair genes, associated with Lynch syndrome.
    POU6F2, PTCH1, PTEN: Involved in developmental processes and tumor suppression.
    RAD51C, RAD51D: Involved in homologous recombination repair, mutations can lead to cancer.
    RB1, RET, RHBDF2: Tumor suppressor genes, mutations associated with retinoblastoma and other cancers.
    RHOA: Involved in signal transduction and cell migration.
    RINT1, RNF43, ROS1: Associated with DNA repair and cancer pathways.
    RTEL1, RUNX1, SBDS, SDHA, SDHAF2, SDHB, SDHC, SDHD: As above, involved in DNA repair, hematopoiesis, mitochondrial function.
    SMAD4, SMARCA4, SMARCB1, SMC1A, SMC3, STK11, SUFU: Tumor suppressor genes.
    SYNE1, SYNE2, TERT, TGFBR1, TGFBR2, TMEM127: As above, involved in nuclear structure, telomere function, and TGF-beta signaling.




Similarity: Predominantly DNA repair genes and tumor suppressors, associated with various cancers and syndromes.




Cluster 2 Genes: CYP21A2, DDB2, DICER1, DIS3L2, EHBP1, EPCAM, ERCC1, ERCC2, ERCC3, ERCC4, ERCC5, FAN1, FANCA, FANCB, FANCC, FANCD2, FANCE, FANCF, FANCG, FANCI, FANCL, FANCM, FH, FLCN, FOS, FSTL3, FTH1, GATA2, GBE1, GCK




    CYP21A2: Associated with congenital adrenal hyperplasia.
    DDB2, DICER1, DIS3L2: Involved in DNA repair and RNA processing.
    EHBP1, EPCAM: Associated with epithelial cell adhesion and migration.
    ERCC1, ERCC2, ERCC3, ERCC4, ERCC5: Involved in nucleotide excision repair.
    FAN1, FANCA, FANCB, FANCC, FANCD2, FANCE, FANCF, FANCG, FANCI, FANCL, FANCM: Involved in Fanconi anemia pathway.
    FH, FLCN: Associated with metabolic disorders and cancer.
    FOS, FSTL3: Involved in signal transduction and cell differentiation.
    FTH1, GATA2, GBE1, GCK: Involved in iron storage, hematopoiesis, and glycogen metabolism.




Similarity: Genes involved in DNA repair, metabolic disorders, and signal transduction.




Cluster 3 Genes: AIP, AKT1, ALK, ANTXR2, APC, ASCC1, ATM, ATR, BARD1, BCL2, BCL2L11, BLM, BMPR1A, BRCA1, BRCA2, BRIP1, BTK, CBL, CD79A, CD79B, CDH1, CDK4, CDK6, CDKN1B, CDKN2A, CEBPA, CHEK1, CHEK2, CIC, CLP1, CUL3, CYLD




    AIP: Associated with pituitary adenomas.
    AKT1, ALK: Involved in cell survival and proliferation.
    ANTXR2: Encodes anthrax toxin receptor 2.
    APC: Tumor suppressor gene, mutations associated with colorectal cancer.
    ASCC1, ATM, ATR: Involved in DNA repair and signal transduction.
    BARD1, BCL2, BCL2L11: Involved in apoptosis regulation.
    BLM, BMPR1A: Involved in DNA repair and bone morphogenesis.
    BRCA1, BRCA2, BRIP1: DNA repair genes, mutations associated with breast cancer.
    BTK, CBL, CD79A, CD79B: Involved in B-cell receptor signaling.
    CDH1, CDK4, CDK6, CDKN1B, CDKN2A: Tumor suppressor genes.
    CEBPA, CHEK1, CHEK2, CIC, CLP1, CUL3, CYLD: Involved in cell cycle regulation and DNA repair.




Similarity: Genes involved in DNA repair, apoptosis regulation, and cell cycle control.




Cluster 4 Genes: LYST, MC1R, MEN1, MET, MLH1, MLH3, MSH2, MSH6, MUTYH, NBN, NDUFA13, NF1, NF2, NFE2L2, NFKBIA, NFKBIE, NIN, NKX2-1, NLRP1, NOTCH1, NQO1, NRAS, NSD1, NTHL1, NTRK1, NTRK2, NTRK3




LYST: Associated with Chediak-Higashi syndrome.
MC1R: Involved in pigmentation and associated with melanoma risk.
MEN1, MET: Tumor suppressor genes.
MLH1, MLH3, MSH2, MSH6: DNA mismatch repair genes.
MUTYH, NBN, NDUFA13: Involved in DNA repair and mitochondrial function.
NF1, NF2: Tumor suppressor genes.
NFE2L2, NFKBIA, NFKBIE: Involved in oxidative stress response and inflammation.




NIN, NKX2-1, NLRP1, NOTCH1: Involved in cell signaling and immune response.
NQO1, NRAS, NSD1, NTHL1: Involved in cellular metabolism and DNA repair.
NTRK1, NTRK2, NTRK3: Neurotrophic receptors, involved in nervous system development.




Similarity: Genes involved in DNA repair, tumor suppression, and signaling pathways.




Cluster 5 Genes: FANCB, FANCC, FANCD2, FANCE, FANCF, FANCG, FANCI, FANCL, FANCM




FANCB, FANCC, FANCD2, FANCE, FANCF, FANCG, FANCI, FANCL, FANCM: All genes are part of the Fanconi anemia pathway, involved in DNA repair.




Similarity: All genes are involved in the Fanconi anemia pathway.




Cluster 6 Genes: ALK, ANTXR1, ANTXR2, APC, ASCC1, ATM, ATR, BARD1, BCL2, BCL2L11, BLM, BMPR1A, BRCA1, BRCA2, BRIP1, BTK, CBL, CD79A, CD79B, CDH1, CDK4, CDK6, CDKN1B, CDKN2A, CEBPA, CHEK1, CHEK2, CIC, CLP1, CUL3, CYLD




Similarity: Same as Cluster 3.




Cluster 7 Genes: KIF1B, KIT, KLHDC8B, LIG4, LYST, MAX, MC1R, MEN1, MET, MLH1, MLH3, MSH2, MSH6, MUTYH, NBN, NDUFA13, NF1, NF2, NFE2L2, NFKBIA, NFKBIE, NIN, NKX2-1, NLRP1, NOTCH1, NQO1, NRAS, NSD1, NTHL1, NTRK1, NTRK2, NTRK3




Similarity: Same as Cluster 4.




Cluster 8 Genes: MSH2, MSH6, MSMB, MSR1, MUTYH, NBN, NDUFA13, NF1, NF2, NFE2L2, NFKBIA, NFKBIE, NIN, NKX2-1, NLRP1, NOTCH1, NQO1, NRAS, NSD1, NTHL1, NTRK1, NTRK2, NTRK3




Similarity: Same as Cluster 4.






In [28]:
# Define similarity reasons for each cluster
# Labels -1 to 5 each have their own description.
similarity_reasons = {
    -1: "Genes related to various syndromes and cancers.",
    0: "Predominantly tumor suppressor genes and genes involved in syndromes or cancer.",
    1: "Predominantly DNA repair genes and tumor suppressors, associated with various cancers and syndromes.",
    2: "Genes involved in DNA repair, metabolic disorders, and signal transduction.",
    3: "Genes involved in DNA repair, apoptosis regulation, and cell cycle control.",
    4: "Genes involved in DNA repair, tumor suppression, and signaling pathways.",
    5: "All genes are involved in the Fanconi anemia pathway.",
}
# Label 6 reuses the description of label 3; labels 7 and 8 reuse that of label 4.
similarity_reasons.update({
    6: similarity_reasons[3],
    7: similarity_reasons[4],
    8: similarity_reasons[4],
})

# Add the new column with similarity reasons to the original dataframe.
# Labels without an entry in the mapping end up with a missing value.
cluster_labels = percentages["labels"]
percentages["similarity_reason"] = cluster_labels.map(similarity_reasons)
percentages

Unnamed: 0,labels,gene,counts,agg_kind,perc,similarity_reason
0,-1,GLMN,1,SINE/Alu,0.24,Genes related to various syndromes and cancers.
1,-1,GPC3,2,"LINE/L1, SINE/Alu",0.47,Genes related to various syndromes and cancers.
2,-1,HFE,2,"LINE/L1, SINE/Alu",0.47,Genes related to various syndromes and cancers.
3,-1,HRAS,2,"SINE/Alu, Simple_repeat",0.47,Genes related to various syndromes and cancers.
4,0,RTEL1,1,LINE/L2,0.24,Predominantly tumor suppressor genes and genes...
...,...,...,...,...,...,...
227,10,NF1,2,"SINE/Alu, SINE/MIR",0.47,
228,10,NF2,3,"LINE/L2, SINE/Alu, SINE/MIR",0.71,
229,10,NTRK1,2,"LINE/L2, SINE/Alu",0.47,
230,10,PALB2,2,"LINE/L2, SINE/Alu",0.47,


# Visualization

In [None]:
# Function to create count DataFrame for a given column
def create_count_df(column, df=None):
    """Count rows for every observed (``column``, ``bin_class``) combination.

    Parameters
    ----------
    column : str
        Name of the column to group by, together with ``bin_class``.
    df : pandas.DataFrame, optional
        Frame to summarise; it must contain ``column`` and ``bin_class``.
        When omitted, the notebook-level ``final_df`` is used, so existing
        ``create_count_df(col)`` calls keep working unchanged.

    Returns
    -------
    pandas.DataFrame
        Columns ``[column, "bin_class", "count"]`` with one row per observed
        combination, sorted by ``count`` in descending order.

    Raises
    ------
    KeyError
        If ``column`` or ``bin_class`` is missing from the frame.
    """
    # Fall back to the global frame only when the caller did not supply one.
    source = final_df if df is None else df
    count_df = (
        source.groupby(by=[column, "bin_class"], observed=True)
        .size()
        .reset_index(name="count")
    )
    # Keep only combinations that actually occur.
    count_df = count_df.loc[count_df["count"] != 0]
    return count_df.sort_values(by=["count"], ascending=False)

In [None]:
# Columns offered in the dropdown; each one becomes its own bar trace.
# NOTE(review): an earlier cell drops `geneId` from `merged_df`; confirm that
# `final_df` still carries it, otherwise create_count_df raises a KeyError here.
columns = ['#chrom', 'score',
       'blockSizes', 'clinSign',  'type', 'geneId',
       'molConseq', 'testedInGtr',
       'origin', "_originCode", "_allTypeCode", 'cytogenetic',
       'ClinClass', 'Classification', "gene", "labels", "agg_kind", "similarity_reason"]


def _count_title(col):
    """Figure title for the trace showing `col`."""
    return f"Row counts by {col} (bar colour = bin_class)"


# Initial figure
fig = go.Figure()

# Add one trace per column; only the first one starts out visible.
for position, col in enumerate(columns):
    count_df = create_count_df(col)
    fig.add_trace(go.Bar(
        x=count_df[col],
        y=count_df['count'],
        name=col,
        # Bars are coloured by the numeric bin_class value of each row.
        marker_color=count_df['bin_class'],
        visible=(position == 0)
    ))

# Create the dropdown menu. Each button shows exactly one trace and also updates
# the title and x-axis label, so the figure always says which column is displayed.
dropdown_buttons = [
    {
        'label': col,
        'method': 'update',
        'args': [
            {'visible': [col == trace.name for trace in fig.data]},
            {'title.text': _count_title(col), 'xaxis.title.text': col},
        ],
    }
    for col in columns
]

# Update layout with dropdown, plus a title and axis labels for the initial view.
fig.update_layout(
    updatemenus=[{
        'buttons': dropdown_buttons,
        'direction': 'down',
        'showactive': True

    }],
    barmode='stack',
    title={'text': _count_title(columns[0])},
    xaxis_title=columns[0],
    yaxis_title='count'
)

fig.show()

In [None]:
# Stacked bar count of `kind` values per `labels`.
# A bare expression is only rendered when it is the last statement of a cell, so
# the Plot is shown explicitly; before, it was built and silently discarded.
so.Plot(grouped_df, x="labels", color="kind").add(so.Bar(), so.Count(), so.Stack()).show()

# gene x labels matrix holding the number of rows per pair (0 where a pair is absent).
pivot_df = grouped_df.pivot_table(index='gene', columns='labels', aggfunc='size', fill_value=0)

# Draw the heatmap on its own axes so it cannot land on a previously active figure.
heatmap_fig, heatmap_ax = plt.subplots()
sns.heatmap(pivot_df, ax=heatmap_ax)
heatmap_ax.set_title("Row count per gene and label")
plt.show()

In [None]:
# Create faceted pie charts
# One pie per `labels` value (three per row); slices are genes sized by `perc`.
fig = px.pie(
    percentages,
    values='perc',
    names='gene',
    facet_col='labels',
    facet_col_wrap=3,
    title='Pie Charts of Gene Percentages by Label',
).update_traces(textinfo="none")  # no text drawn on the slices

fig.show()