In [1]:
# Dependency management
# --------------------------------------------------------
import sys
from pathlib import Path
# Bootstrap: resolve the parent of the current working directory and put its
# "scripts" folder on sys.path so that `setup_environment` can be imported below.
project_root = Path("..").resolve()
sys.path.append(str(project_root / "scripts"))
from setup_environment import setup_paths
# Rebind project_root to whatever setup_paths() returns; every path used later in
# the notebook is built from this value.
# NOTE(review): presumably the canonical project root -- confirm in scripts/setup_environment.py.
project_root = setup_paths()

# Data manipulation
# --------------------------------------------------------
import pandas as pd
import numpy as np
import polars as pl
import math
import time

# Visualizations
# --------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import altair as alt
from plotnine import ggplot, geom_point, aes, stat_smooth, facet_wrap
from bokeh.plotting import figure, show
import seaborn.objects as so
import plotly.express as px

# Custom scripts
# ---------------------------------------------------------
from data_processing.cleaner import cleaning, compare_and_drop_duplicates
from helper.loader import read_csv

# Other
# ---------------------------------------------------------
from collections import Counter
from tqdm import tqdm

In [2]:
# Read the two raw source tables; both files are parsed as tab-separated text.
raw_dir = f"{project_root}/data/raw"
var_info = pd.read_csv(f"{raw_dir}/variation_information.tsv", sep="\t")
# low_memory=False turns off chunked parsing, so each column's dtype is inferred
# from the whole file rather than chunk by chunk.
cancer_clinvar = pd.read_csv(
    f"{raw_dir}/cancermama_clinvarmain.csv",
    sep="\t",
    low_memory=False,
)

In [3]:
# Run the project's cleaning helper on both tables and rebind each name to the result.
# df_name is forwarded to the helper as-is
# (presumably used for logging/reporting -- confirm in data_processing.cleaner).
var_info, cancer_clinvar = (
    cleaning(var_info, df_name="variation_information"),
    cleaning(cancer_clinvar, df_name="cancermama_clinvarmain"),
)

In [4]:
# Join var_info and cancer_clinvar on a composite key: "<variant id> <ClinInfo>".
# The id comes from rsID on the var_info side and from snpId on the ClinVar side.
# NOTE(review): astype(str) may turn missing values into literal text such as "nan",
# so rows with missing ids could end up sharing a key -- confirm this is acceptable.
var_info_key = var_info["rsID"].astype(str) + " " + var_info["ClinInfo"].astype(str)
clinvar_key = cancer_clinvar["snpId"].astype(str) + " " + cancer_clinvar["ClinInfo"].astype(str)

var_info = var_info.assign(real_id=var_info_key)
# ClinInfo is removed from the ClinVar side so only var_info's copy reaches the merged frame.
cancer_clinvar = cancer_clinvar.assign(real_id=clinvar_key).drop(columns=["ClinInfo"])

# Default (inner) join: only keys present in both tables are kept.
merged_df = cancer_clinvar.merge(var_info, on="real_id")

In [5]:
# Remove duplicated columns with the same values
# compare_and_drop_duplicates is a project helper (data_processing.cleaner). Its printed
# report lists column pairs together with the number of values in which they differ.
# NOTE(review): confirm in the helper which column of an identical pair is kept.
merged_df = compare_and_drop_duplicates(merged_df)

Columns 'clinSign' and 'ClinClass' have 2 differences.
Columns '_clinSignCode' and 'Classification' have 2 differences.


In [6]:
# Binary target: pathogenic ("pg") and likely pathogenic ("lp") map to 1,
# every other Classification value (including missing ones) maps to 0.
# The previous version only tested for "pg", so likely-pathogenic variants were
# labelled 0 despite the stated intent of treating PG and LP as positive.
# NOTE(review): assumes likely pathogenic is encoded as "lp" in Classification --
# confirm with merged_df["Classification"].value_counts().
positive_classes = ["pg", "lp"]
merged_df["bin_class"] = merged_df["Classification"].isin(positive_classes).astype("int64")

In [7]:
# Columns to discard, grouped by the reason given for removing them.
# geneId carries the same information as gene, and _originCode the same as origin;
# _allTypeCode and ClinInfo are dropped together with them.
redundant_columns = ["geneId", "_originCode", "_allTypeCode", "ClinInfo"]
# Columns treated as identifiers.
id_columns = ["numSubmit", "_variantId", "origName", "rcvAcc", "snpId", "Start", "End"]
# Column that only holds a URL and adds no information.
url_columns = ["phenotype"]
# _mouseOver repeats information already present in other columns.
other_columns = ["_mouseOver"]

# Single drop instead of four in-place ones; a missing column still raises KeyError.
merged_df = merged_df.drop(
    columns=redundant_columns + id_columns + url_columns + other_columns
)

In [8]:
# Placeholder used to fill missing values, keyed by column name.
nan_placeholders = {
    # 1 missing value in clinSign and ClinClass for the same row
    # NOTE(review): "unkwown" looks like a misspelling of "unknown"; kept as-is because
    # changing it alters the stored data -- confirm nothing relies on this spelling before fixing.
    "clinSign": "unkwown",
    "ClinClass": "unkwown",
    "molConseq": "unkwown",
    # 45 missing values in phenotypeList
    "phenotypeList": "not provided",
    # rcvAcc is already removed by the id-column drop in the previous cell, so this
    # entry only takes effect if that drop is changed.
    "rcvAcc": "rcv000000000",
}

# Only columns that actually exist in merged_df are touched.
for column_name, placeholder in nan_placeholders.items():
    if column_name in merged_df.columns:
        merged_df[column_name] = merged_df[column_name].fillna(placeholder)

In [9]:
# Print the five most frequent values of every column, each preceded by a
# separator line and surrounded by blank lines.
separator = "=" * 25
for column_name in merged_df.columns:
    top_values = merged_df[column_name].value_counts().nlargest(5)
    print(separator, "", top_values, "", sep="\n")


#chrom
chr17    49125
chr2     30159
chr16    29047
chr11    24704
chr5     24219
Name: count, dtype: int64


chromStart
37025628    32
47414419    32
47414420    22
95099770    17
78131728    16
Name: count, dtype: int64


chromEnd
37025630    33
47414421    28
37025608    19
95099771    17
78131619    16
Name: count, dtype: int64


name
c>t    49778
g>a    47477
a>g    40605
t>c    30231
del    26620
Name: count, dtype: int64


score
1    222786
2     97300
3     10212
0      8501
Name: count, dtype: int64


reserved
0,0,128        146341
0,210,0        113196
210,0,0         49066
137,121,212     25602
128,128,128      4594
Name: count, dtype: int64


blockSizes
1    306825
2     20478
3      2890
4      2190
5      1184
Name: count, dtype: int64


clinSign
uncertain significance                          146311
likely benign                                    98087
pathogenic                                       36150
conflicting classifications of pathogenicity     25599
benign  

# OncoKB classification per gene

In [10]:
# Per-gene classification table stored with the external data.
oncokb_path = f"{project_root}/data/external/oncokb_classification.csv"
oncokb_df = pd.read_csv(oncokb_path)
# Lower-case the gene symbols before joining on them
# (presumably merged_df["Gene"] is already lower-case after cleaning -- confirm).
oncokb_df = oncokb_df.assign(gen=oncokb_df["gen"].str.lower())
# Left join keeps every row of merged_df; the join key and the description column
# coming from the OncoKB table are discarded afterwards.
merged_df = (
    merged_df
    .merge(oncokb_df, how="left", left_on="Gene", right_on="gen")
    .drop(columns=["gen", "description"])
)
merged_df.columns

Index(['#chrom', 'chromStart', 'chromEnd', 'name', 'score', 'reserved',
       'blockSizes', 'clinSign', 'reviewStatus', 'type', 'molConseq',
       'testedInGtr', 'phenotypeList', 'origin', 'cytogenetic', 'vcfDesc',
       '_clinSignCode', 'simplified_hgvs', 'Gene', 'ClinClass',
       'Classification', 'bin_class', 'classification_oncokb'],
      dtype='object')

In [11]:
print(merged_df['classification_oncokb'].value_counts())

classification_oncokb
tumor_supressor     273593
oncogen              27616
unknown              22058
tumor_supressor*      7545
oncogen*              4242
double_function       3745
Name: count, dtype: int64


# Gene labels obtained from upgenevsrep clustering

In [12]:
# Labels produced by the upgenevsrep clustering step.
labels_path = f"{project_root}/data/processed/labels.csv"
upgenevsrep_clustering_labels = pd.read_csv(labels_path)
# Lower-case the gene symbols so they are compared case-insensitively with "Gene".
upgenevsrep_clustering_labels = upgenevsrep_clustering_labels.assign(
    gen=upgenevsrep_clustering_labels["gen"].str.lower()
)
# merge() defaults to an inner join, so rows whose Gene has no entry in labels.csv
# are dropped here. NOTE(review): confirm that dropping those rows is intended.
merged_df = (
    merged_df
    .merge(upgenevsrep_clustering_labels, left_on="Gene", right_on="gen")
    .rename(columns={'label': 'gen_label'})
)
merged_df.head()

Unnamed: 0,#chrom,chromStart,chromEnd,name,score,reserved,blockSizes,clinSign,reviewStatus,type,...,bin_class,classification_oncokb,gen,gen_label,string_per_umap_cluster,string_per_umap_cluster_description,string_total_clustering,string_total_clustering_description,x_position,y_position
0,chr11,67483073,67483074,c>t,1,128,1,uncertain significance,criteria_provided_no_conflict,single nucleotide variant,...,0,unknown,aip,9,-1,not_in_network,1,"DNA repair pathways, full network, and HDR thr...",0.215948,0.33245
1,chr11,67483074,67483075,c>g,1,2100,1,benign,criteria_provided_no_conflict,single nucleotide variant,...,0,unknown,aip,9,-1,not_in_network,1,"DNA repair pathways, full network, and HDR thr...",0.215948,0.33245
2,chr11,67483135,67483136,a>g,1,2100,1,benign,criteria_provided_no_conflict,single nucleotide variant,...,0,unknown,aip,9,-1,not_in_network,1,"DNA repair pathways, full network, and HDR thr...",0.215948,0.33245
3,chr11,67483146,67483147,c>g,1,128,1,uncertain significance,criteria_provided_no_conflict,single nucleotide variant,...,0,unknown,aip,9,-1,not_in_network,1,"DNA repair pathways, full network, and HDR thr...",0.215948,0.33245
4,chr11,67483153,67483154,g>a,1,2100,1,likely benign,criteria_provided_no_conflict,single nucleotide variant,...,0,unknown,aip,9,-1,not_in_network,1,"DNA repair pathways, full network, and HDR thr...",0.215948,0.33245


In [14]:
# Persist the merged, annotated table under data/interim; index=False leaves the
# row index out of the CSV.
merged_df.to_csv(f"{project_root}/data/interim/merged_df.csv", index=False)