<center> <h1> GANs on cell profiles </center> </h1>


## 1) Adaptation of naive_classifier notebook to get trained xgboost compatible with torch
The goal of this Notebook is to adapt the [example of Diane](https://github.com/dlmbl/knowledge_extraction/blob/main/solution.ipynb) to create GANs but on cell profiles so 1D data instead of 2D. 

In [7]:
import polars as pl
import pandas as pd

import numpy as np
import cupy as cp

from tqdm import tqdm

from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from xgboost import XGBClassifier

from features_engineering import features_drop_corr
from features_engineering import features_drop_corr_gpu

from data_split import StratifiedGroupKFold_custom

import torch
import torch.nn as nn 
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from torch.nn import CrossEntropyLoss

import lightning as L

import custom_dataset

In [8]:
metadata_df = pd.read_csv("target2_eq_moa2_metadata", index_col="ID")
features_df = pd.read_csv("target2_eq_moa2_features", index_col="ID")
nan_col = features_df.columns[features_df.isna().sum(axis=0) > 0]
nan_col, len(nan_col)
inf_col = features_df.columns[(features_df == np.inf).sum(axis=0) > 0]
inf_col, len(inf_col)
features_df = features_df[features_df.columns[(features_df.isna().sum(axis=0) == 0) & 
                                            ((features_df == np.inf).sum(axis=0) == 0)]]
metadata_df = metadata_df.assign(moa_id=LabelEncoder().fit_transform(metadata_df["moa"]))
features_df = features_df.sort_index().reset_index(drop=True)
metadata_df = metadata_df.sort_index().reset_index()

In [9]:
kfold = list(StratifiedGroupKFold_custom().split(
    features_df, metadata_df["moa_id"], metadata_df["Metadata_InChIKey"]))

In [10]:
X = torch.tensor(features_df.values, dtype=torch.float)
y = torch.tensor(metadata_df["moa_id"].values, dtype=torch.long)
dataset_fold = {i: 
                {"train": custom_dataset.RowDataset(X[kfold[i][0]], 
                                                      y[kfold[i][0]]),
                 "test": custom_dataset.RowDataset(X[kfold[i][1]], 
                                                     y[kfold[i][1]])}
                for i in range(len(kfold))}


In [52]:
class SklearnModel(nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, x):
        x = x.cpu().detach().numpy()
        return torch.tensor(self.model.predict_proba(x))
        
def train_classifier(dataset_fold, 
                     classifier,
                     classifier_params): # True, False  
    
    trained_model = {i: 0 for i in list(dataset_fold.keys())}
    
    # Lists to store precision-recall auc score, train and test accuracy for each fold
    train_auc_fold = []
    train_f1_fold = []
    test_auc_fold = []
    test_f1_fold = []
    
    # Perform stratified k-fold cross-validation
    for i in tqdm(list(dataset_fold.keys())):
        # Initialize the Classifier
        model = classifier(**classifier_params)
        X_train, y_train = list(map(lambda x: x.cpu().detach().numpy(), dataset_fold[i]["train"][:]))
        X_test, y_test = list(map(lambda x: x.cpu().detach().numpy(), dataset_fold[i]["test"][:]))

        # Fit the classifier on training data
        model.fit(X_train, y_train)
        
        # Predict probabilities of the positive class for train and test data
        y_train_scores = model.predict_proba(X_train)  
        y_train_pred = model.predict(X_train)
        y_test_scores = model.predict_proba(X_test)    
        y_test_pred = model.predict(X_test)

           
        train_auc = roc_auc_score(y_train, y_train_scores,
                                  multi_class="ovr",
                                  average="macro",
                                  labels=model.classes_)

        test_auc = roc_auc_score(y_test, y_test_scores,
                                 multi_class="ovr",
                                 average="macro",
                                 labels=model.classes_)
        
        train_f1 = f1_score(y_train, y_train_pred, 
                   average="macro",
                   labels=model.classes_)
        
        test_f1 = f1_score(y_test, y_test_pred, 
                           average="macro",
                           labels=model.classes_)
                           
        print(f"Train ROC-AUC: {train_auc:.4f} - f1: {train_f1:.4f} \n", 
              f"Test ROC-AUC: {test_auc:.4f} - f1: {test_f1:.4f} ")
        # Append train and test accuracy, train and test AUC score to respective lists
        train_auc_fold.append(train_auc)
        train_f1_fold.append(train_f1)
        test_auc_fold.append(test_auc)
        test_f1_fold.append(test_f1)
        trained_model[i] = SklearnModel(model)

    # Average metrics across all folds
    mean_train_auc = np.mean(train_auc_fold)
    mean_train_f1 = np.mean(train_f1)
    mean_test_auc = np.mean(test_auc_fold)
    mean_test_f1 = np.mean(test_f1_fold)

    print(f"Mean over fold Train ROC-AUC Score: {mean_train_auc:.4f}")
    print(f"Mean over fold Train f1 Score: {mean_train_f1:.4f}")
    print(f"Mean over fold Test ROC-AUC Score: {mean_test_auc:.4f}")
    print(f"Mean over fold Test f1 Score: {mean_test_f1:.4f}")
    
    return trained_model

In [53]:
classifier = XGBClassifier
classifier_params = {'device': 'cuda',
  'objective': 'multi:softprob',
  'eval_metric': 'auc',
  'eta': 0.009220975613350597,
  'reg_alpha': 2.0306587420421414,
  'reg_lambda': 0.016003601665119382,
  'gamma': 0.04020765654946003,
  'max_depth': 5,
  'max_leaves': 5,
  'min_child_weight': 7.574433676104173,
  'n_jobs': 192,
  'random_state': 42,
  'subsample': 0.4971871649904771,
  'colsample_bytree': 0.9712341267382383,
  'colsample_bylevel': 0.9994344494972629,
  'colsample_bynode': 0.7014104562833469}#In case of huge imbalance. set between 1-10 is usual in case of usage


trained_model = train_classifier(dataset_fold, 
                                 classifier,
                                 classifier_params)


 20%|██        | 1/5 [00:03<00:15,  3.78s/it]

Train ROC-AUC: 0.9545 - f1: 0.7529 
 Test ROC-AUC: 0.8548 - f1: 0.5187 


 40%|████      | 2/5 [00:07<00:10,  3.64s/it]

Train ROC-AUC: 0.9703 - f1: 0.7981 
 Test ROC-AUC: 0.7624 - f1: 0.4450 


 60%|██████    | 3/5 [00:10<00:07,  3.62s/it]

Train ROC-AUC: 0.9636 - f1: 0.7680 
 Test ROC-AUC: 0.8223 - f1: 0.4813 


 80%|████████  | 4/5 [00:14<00:03,  3.58s/it]

Train ROC-AUC: 0.9634 - f1: 0.7915 
 Test ROC-AUC: 0.6984 - f1: 0.3115 


100%|██████████| 5/5 [00:18<00:00,  3.62s/it]

Train ROC-AUC: 0.9579 - f1: 0.7632 
 Test ROC-AUC: 0.8446 - f1: 0.5203 
Mean over fold Train ROC-AUC Score: 0.9619
Mean over fold Train f1 Score: 0.7632
Mean over fold Test ROC-AUC Score: 0.7965
Mean over fold Test f1 Score: 0.4554



