In [1]:
# Dependency management
# --------------------------------------------------------
import sys
from pathlib import Path
# Provisional root: the parent of the current working directory, made absolute.
project_root = Path("..").resolve()
# Put <root>/scripts on the import path so the setup_environment import on the
# next line can be resolved.
sys.path.append(str(project_root / "scripts"))
from setup_environment import setup_paths
# Rebind project_root to whatever setup_paths() returns; the data-loading cell
# interpolates this value into the CSV path.
# NOTE(review): presumably setup_paths() also makes the "Custom scripts"
# packages importable -- confirm against the setup_environment module.
project_root = setup_paths()

# Data manipulation
# --------------------------------------------------------
import pandas as pd
import numpy as np
import polars as pl
import math
import time
import joblib

# Visualizations
# --------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import altair as alt
from plotnine import ggplot, geom_point, aes, stat_smooth, facet_wrap
from bokeh.plotting import figure, show
import seaborn.objects as so
import plotly.express as px

# Custom scripts
# ---------------------------------------------------------
from data_processing.cleaner import (cleaning, 
                                     compare_and_drop_duplicates)
from feature_selection.selector import (select_vars, 
                                       choose_variable_to_drop, 
                                       corr_comparison, 
                                       mutual_information)

# Correlation
# ---------------------------------------------------------
from dython.nominal import associations

# Dimension Reduction
# ---------------------------------------------------------
import umap

# Preprocessing
# ---------------------------------------------------------
from sklearn.preprocessing import (MinMaxScaler, 
                                   RobustScaler, 
                                   QuantileTransformer, 
                                   OrdinalEncoder,
                                  LabelEncoder)
from sklearn import preprocessing
from category_encoders import BinaryEncoder

# Random Forest
# ---------------------------------------------------------
from sklearn.ensemble import RandomForestClassifier

# Support Vector Machine Classifier
# ---------------------------------------------------------
from sklearn.svm import SVC

# XGBoost
# ---------------------------------------------------------
import xgboost as xgb
from xgboost import XGBClassifier

# Model selection
# ---------------------------------------------------------
from sklearn.model_selection import (train_test_split, 
                                     RepeatedStratifiedKFold, 
                                     StratifiedKFold, 
                                     cross_val_score)

# Metrics
# ---------------------------------------------------------
from sklearn.metrics import (accuracy_score, 
                             f1_score, 
                             precision_score, 
                             recall_score, 
                             confusion_matrix,
                             make_scorer,
                             classification_report,
                             ConfusionMatrixDisplay
                             )

# Hyperparameter optimization
# ----------------------------------------------------------
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

# Other
# ---------------------------------------------------------
from collections import Counter
from tqdm import tqdm
import random
# Seeds Python's built-in `random` generator only.
# NOTE(review): NumPy-based randomness is not seeded in this cell -- confirm that
# stochastic steps further down receive an explicit seed / random_state.
random.seed(2024)
import warnings
# From this point on, FutureWarning messages are suppressed.
warnings.simplefilter("ignore", FutureWarning)

In [2]:
# Read the merged dataset stored under data/interim, then report its
# dimensions and column labels separated by a blank line.
merged_df = pd.read_csv(f"{project_root}/data/interim/merged_df.csv")
print(merged_df.shape, merged_df.columns, sep="\n\n")

(328116, 31)

Index(['#chrom', 'chromStart', 'chromEnd', 'name', 'score', 'reserved',
       'blockSizes', 'clinSign', 'reviewStatus', 'type', 'molConseq',
       'testedInGtr', 'phenotypeList', 'origin', 'cytogenetic', 'vcfDesc',
       '_clinSignCode', 'simplified_hgvs', 'Gene', 'ClinClass',
       'Classification', 'bin_class', 'classification_oncokb', 'gen',
       'gen_label', 'string_per_umap_cluster',
       'string_per_umap_cluster_description', 'string_total_clustering',
       'string_total_clustering_description', 'x_position', 'y_position'],
      dtype='object')


In [3]:
# Final columns to drop
# Candidate columns to exclude from the frame. Not every name is present in
# merged_df (compare with the column listing printed after loading), so labels
# that are missing must be skipped rather than raise.
final_drop_check = [
    "geneId", "_originCode", "_allTypeCode", "ClinInfo",
    "reserved", "numSubmit", "_variantId", "real_id",
    "origName", "rcvAcc", "snpId", "Start", "End", "phenotype",
    "_mouseOver", "Classification", "_clinSignCode",
    "clinSign", "ClinClass", "chromStart", "chromEnd", "gen",
    "vcfDesc", "testedInGtr", "string_per_umap_cluster", "string_total_clustering",
]

# Remove every listed column that exists in a single call. errors="ignore"
# skips labels that are not in the frame, replacing the per-column membership
# check and avoiding one full-frame copy per dropped column.
merged_df = merged_df.drop(columns=final_drop_check, errors="ignore")

# Display the remaining column labels as the cell output.
merged_df.columns

Index(['#chrom', 'name', 'score', 'blockSizes', 'reviewStatus', 'type',
       'molConseq', 'phenotypeList', 'origin', 'cytogenetic',
       'simplified_hgvs', 'Gene', 'bin_class', 'classification_oncokb',
       'gen_label', 'string_per_umap_cluster_description',
       'string_total_clustering_description', 'x_position', 'y_position'],
      dtype='object')

<hr style="height:10px; border-width:0">


# XGBoost Balanced dataframe