In [3]:
import pandas as pd
import numpy as np
import anndata as an
import scanpy as sc
import matplotlib.pyplot as plt
import torch
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import mean_squared_error
from typing import Optional, Union, Dict
from scanpy.get import _get_obs_rep, _set_obs_rep
from scipy.sparse import issparse
from xgboost import XGBClassifier

In [None]:
# Fix the global NumPy seed so the notebook is reproducible under Restart & Run All.
# pandas' DataFrame.sample falls back to this global state when no random_state is passed.
# NOTE(review): torch is imported above; if the classifiers rely on it, seed it as well.
SEED = 42
np.random.seed(SEED)

# LOAD DATA

In [None]:
import preliminary_functions as pr
import data_utils as du
import classifier as cl

In [None]:
# Read both input tables; the first column of each file becomes the row index.
df_1 = pd.read_csv("unfiltered_data_1.txt", engine="python", index_col=0)
df_2 = pd.read_csv("unfiltered_data_2.txt", engine="python", index_col=0)

# SUBSAMPLE & MERGE

In [None]:
# Balance the two tables by down-sampling the larger one to the row count of
# the smaller one, then stack them into a single merged frame `df`.
if df_1.shape[0] > df_2.shape[0]:
    df_1_sample = df_1.sample(df_2.shape[0], axis = 0)
    df = pd.concat([df_1_sample, df_2])
elif df_2.shape[0] > df_1.shape[0]:
    df_2_sample = df_2.sample(df_1.shape[0], axis = 0)
    df = pd.concat([df_1, df_2_sample])
else:
    # Equal row counts: nothing to down-sample. Without this branch `df` was
    # never assigned and the cells that use it failed with a NameError.
    df = pd.concat([df_1, df_2])

In [None]:
# Registry of the frames to process, keyed by the label used in the logs and result tables.
data = {
    'Dataframe 1': df_1,
    'Dataframe 2': df_2,
    'Dataframe Merged': df,
}

# DATA INSPECTION

In [None]:
# Preview the first rows of df_1 (read from unfiltered_data_1.txt).
df_1.head()

In [None]:
# (rows, columns) of df_1.
df_1.shape

In [None]:
# Preview the first rows of df_2 (read from unfiltered_data_2.txt).
df_2.head()

In [None]:
# (rows, columns) of df_2.
df_2.shape

In [None]:
# Preview the first rows of the merged frame built from df_1 and df_2.
df.head()

In [None]:
# (rows, columns) of the merged frame.
df.shape

# FILTERING

In [None]:
# Run every frame in `data` through the filtering steps and collect the
# filtered result, keyed by the same label, in `processed_data`.
processed_data = {}
for name, dataframe in data.items():
    # Log the dataset label; interpolating the frame itself would dump its whole repr.
    print(f'=== PROCESSING DATAFRAME {name} ===')
    print(f'Shape before filtering: {dataframe.shape}')

    # NOTE(review): the return value is discarded, so this presumably filters
    # `dataframe` in place -- confirm in preliminary_functions.
    pr.remove_noncoding(dataframe) #Removing non-coding genes
    print(f'Shape after removing non-coding genes: {dataframe.shape}')

    # Drops every column that contains a missing value. This is in place, so the
    # frame stored in `data` is modified too, and the later "raw" cleaning cell
    # reads that modified frame -- NOTE(review): confirm this is intended.
    dataframe.dropna(axis=1, inplace=True) #Removing missing values
    print(f'Shape after removing missing values: {dataframe.shape}')

    dataframe = pr.remove_duplicates(dataframe) #Removing duplicates
    print(f'Shape after removing duplicates: {dataframe.shape}')

    dataframe = pr.sparsity_threshold(dataframe) #Removing columns with sparsity exceeding the threshold
    print(f'Shape after applying sparsity threshold: {dataframe.shape}')

    # Columns from position 2 onward are cast to int and used as the AnnData matrix
    # (assumes the first two columns are metadata -- TODO confirm).
    adata = an.AnnData(dataframe.iloc[:,2:].astype(int)) #Converting dataframe into AnnData object
    adata.obs["Condition"] = dataframe["Condition"]
    adata = pr.remove_mito(adata) #Removing cells with high mitochondrial count

    # Back to a DataFrame: rows/columns take the AnnData obs/var names and
    # "Condition" is placed as the first column.
    processed_dataframe = pd.DataFrame(adata.X)
    processed_dataframe.index = adata.obs_names
    processed_dataframe.columns = adata.var_names
    processed_dataframe.insert(0, 'Condition', adata.obs["Condition"])

    processed_data[name] = processed_dataframe

# CLEANING 
This step is specific to the dataset I used; see `data_utils.py` for more information about it.

In [None]:
# Apply the dataset-specific cleaning to every filtered frame, keeping the same labels.
clean_data_filtered = {
    name: du.data_cleaning(df_filtered)
    for name, df_filtered in processed_data.items()
}

In [None]:
# Apply the dataset-specific cleaning to every frame in `data`, keeping the same labels.
# `.items()` is required: iterating the dict directly yields only its string keys,
# and unpacking a key into `name, d` raises ValueError.
# NOTE(review): the filtering cell drops NaN columns from these frames in place,
# so they may no longer be the untouched inputs by the time this runs -- confirm.
clean_data_raw = {}
for name, d in data.items():
    df_clean = du.data_cleaning(d)

    clean_data_raw[name] = df_clean

# CLASSIFIER

In [None]:
# Equalise the row counts of the two cleaned raw frames by down-sampling
# whichever one is larger; when they already match, nothing changes.
raw_1 = clean_data_raw['Dataframe 1']
raw_2 = clean_data_raw['Dataframe 2']
rows_1, rows_2 = raw_1.shape[0], raw_2.shape[0]

if rows_1 > rows_2:
    clean_data_raw['Dataframe 1'] = raw_1.sample(rows_2, axis=0)
elif rows_2 > rows_1:
    clean_data_raw['Dataframe 2'] = raw_2.sample(rows_1, axis=0)

## Elastic Net

In [None]:
# Tune the SGD-based classifier on every filtered, cleaned dataset and collect
# one summary record per dataset.
results = []  # List to collect results

# The loop variable is `frame`, not `df`: reusing `df` would overwrite the merged
# DataFrame of that name, so re-running earlier cells would show a different frame.
for name, frame in clean_data_filtered.items():
    print(f'=== PROCESSING DATAFRAME {name} FILTERED ===')
    # NOTE(review): features start at column 2 here but at column 1 in the raw-data
    # cells -- confirm against the column layout returned by du.data_cleaning.
    X = frame.iloc[:, 2:]
    y = frame["Condition"]

    max_accuracy, avg_accuracy, avg_mse, avg_roc = cl.tune_sdg_classifier(X, y)

    # Append results as a dictionary
    results.append({
        "Dataset": name,
        "Max Accuracy": max_accuracy,
        "Avg Accuracy": avg_accuracy,
        "Avg MSE": avg_mse,
        "Avg ROC AUC": avg_roc
    })

# Convert the list of results into a DataFrame
filtered_enet_results = pd.DataFrame(results)

In [None]:
# For every cleaned raw dataset: pick a binning setting, bin the counts with the
# preprocessor, then tune the SGD-based classifier on the binned matrix.
enet_results = []  # List to store the results

# The loop variable is `frame`, not `df`: reusing `df` would overwrite the merged
# DataFrame of that name, so re-running earlier cells would show a different frame.
for name, frame in clean_data_raw.items():
    print(f'=== PROCESSING DATAFRAME {name} RAW ===')

    print('=== BINNING ===')
    # Every column after the first is cast to int and used as the AnnData matrix;
    # "Condition" is carried along as an observation annotation.
    adata = an.AnnData(frame.iloc[:,1:].astype(int))
    adata.obs["Condition"] = frame["Condition"]

    # The value returned here is passed to the preprocessor as its `binning` setting.
    bins = cl.tune_binning_enet(adata)

    preprocessor = cl.Preprocessor(
        use_key = "X",
        filter_gene_by_counts=0,
        filter_cell_by_counts=False,
        normalize_total=False,
        result_normed_key="X_normed",
        log1p=False,
        result_log1p_key="X_log1p",
        subset_hvg=False,
        hvg_flavor="seurat_v3",
        binning=bins,
        result_binned_key="X_binned"
    )

    preprocessor(adata)

    # Rebuild a labelled DataFrame from the "X_binned" layer, with "Condition" first.
    df_enet = pd.DataFrame(adata.layers["X_binned"])
    df_enet.index = adata.obs_names
    df_enet.columns = adata.var_names
    df_enet.insert(0, 'Condition', adata.obs["Condition"])

    print('=== CLASSIFIER ===')

    X = df_enet.iloc[:, 1:]
    y = df_enet["Condition"]

    max_accuracy, avg_accuracy, avg_mse, avg_roc = cl.tune_sdg_classifier(X, y)

    # Save results
    enet_results.append({
        "Dataset": name,
        "Max Accuracy": max_accuracy,
        "Avg Accuracy": avg_accuracy,
        "Avg MSE": avg_mse,
        "Avg ROC AUC": avg_roc
    })

# Convert to DataFrame
raw_enet_results = pd.DataFrame(enet_results)

## XGBoost

In [None]:
# Tune the XGBoost classifier on every filtered, cleaned dataset and collect
# one summary record per dataset.
results = []  # List to collect results

# The loop variable is `frame`, not `df`: reusing `df` would overwrite the merged
# DataFrame of that name, so re-running earlier cells would show a different frame.
for name, frame in clean_data_filtered.items():
    print(f'=== PROCESSING DATAFRAME {name} FILTERED ===')
    # NOTE(review): features start at column 2 here but at column 1 in the raw-data
    # cells -- confirm against the column layout returned by du.data_cleaning.
    X = frame.iloc[:, 2:]
    y = frame["Condition"]

    max_accuracy, avg_accuracy, avg_mse, avg_roc = cl.tune_xgb_classifier(X, y)

    # Append results as a dictionary
    results.append({
        "Dataset": name,
        "Max Accuracy": max_accuracy,
        "Avg Accuracy": avg_accuracy,
        "Avg MSE": avg_mse,
        "Avg ROC AUC": avg_roc
    })

# Convert the list of results into a DataFrame
filtered_xgb_results = pd.DataFrame(results)

In [None]:
# For every cleaned raw dataset: pick a binning setting, bin the counts with the
# preprocessor, then tune the XGBoost classifier on the binned matrix.
xgb_results = []  # List to store the results

# The loop variable is `frame`, not `df`: reusing `df` would overwrite the merged
# DataFrame of that name, so re-running earlier cells would show a different frame.
for name, frame in clean_data_raw.items():
    print(f'=== PROCESSING DATAFRAME {name} RAW ===')

    print('=== BINNING ===')
    # Every column after the first is cast to int and used as the AnnData matrix;
    # "Condition" is carried along as an observation annotation.
    adata = an.AnnData(frame.iloc[:,1:].astype(int))
    adata.obs["Condition"] = frame["Condition"]

    # The value returned here is passed to the preprocessor as its `binning` setting.
    bins = cl.tune_binning_xgb(adata)

    preprocessor = cl.Preprocessor(
        use_key = "X",
        filter_gene_by_counts=0,
        filter_cell_by_counts=False,
        normalize_total=False,
        result_normed_key="X_normed",
        log1p=False,
        result_log1p_key="X_log1p",
        subset_hvg=False,
        hvg_flavor="seurat_v3",
        binning=bins,
        result_binned_key="X_binned"
    )

    preprocessor(adata)

    # Rebuild a labelled DataFrame from the "X_binned" layer, with "Condition" first.
    df_xgb = pd.DataFrame(adata.layers["X_binned"])
    df_xgb.index = adata.obs_names
    df_xgb.columns = adata.var_names
    df_xgb.insert(0, 'Condition', adata.obs["Condition"])

    print('=== CLASSIFIER ===')

    X = df_xgb.iloc[:, 1:]
    y = df_xgb["Condition"]

    max_accuracy, avg_accuracy, avg_mse, avg_roc = cl.tune_xgb_classifier(X, y)

    # Save results
    xgb_results.append({
        "Dataset": name,
        "Max Accuracy": max_accuracy,
        "Avg Accuracy": avg_accuracy,
        "Avg MSE": avg_mse,
        "Avg ROC AUC": avg_roc
    })

# Convert to DataFrame
raw_xgb_results = pd.DataFrame(xgb_results)