In [None]:
# -*- coding: utf-8 -*-
import scanpy as sc
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import f1_score

from sklearn.linear_model import LogisticRegression
import seaborn as sns

Simplified version of https://github.com/clinicalml/sc-foundation-eval/blob/main/scBERT/scbert_baselines_LR.ipynb

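`SAMPLING_FRACS` lists the fractions of the training data on which to train
(the paper shows that logistic regression outperforms scBERT even for small fractions).
`NREPS` sets the number of repeated runs; the train/test split is the same for every run, and each repetition draws a different random subsample of the training set.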

If a sampling fraction is 1.0, then `NREPS` should be 1; otherwise we just get repeated runs
on the same data.

In [None]:
# Load the Zheng68K dataset from a local .h5ad file.
# NOTE(review): the original comment ended with a dangling "download" --
# presumably download instructions were meant to follow; confirm the data source.
zheng_h5ad_path = "/data/scBERT/Zheng68K.h5ad"
zheng_data = sc.read_h5ad(zheng_h5ad_path)
# Bare last expression so the notebook displays a summary of the loaded object.
zheng_data

In [None]:
# Pull out the data matrix (`.X`) and the `celltype` column of `.obs`;
# these are the features and targets used by the classifier below.
data, label = zheng_data.X, zheng_data.obs["celltype"]

In [None]:
# --- Experiment configuration -------------------------------------------------
# NREPS: number of repeated runs; repetition k seeds the training subsample with k.
# SAMPLING_FRACS: fractions of the training split to train on.
NREPS = 1
SAMPLING_FRACS = [1.0]
TEST_SIZE = 0.2    # fraction of cells held out for evaluation
SPLIT_SEED = 2022  # fixed so every (rep, frac) run is scored on the same held-out cells
C_REG = 0.1        # inverse regularisation strength passed to LogisticRegression

# --- Per-run records: entry i of every list describes the same (rep, frac) run --
ks = []
fracs = []
cs = []
train_accs = []
test_accs = []
test_f1s = []

# The split uses a fixed random_state, so it is identical for every run
# (update Aug 2023: hold train/val across all runs). Compute it once up front
# instead of re-deriving the same indices inside the loop.
sss = StratifiedShuffleSplit(n_splits=1, test_size=TEST_SIZE, random_state=SPLIT_SEED)
index_train, index_val = next(sss.split(data, label))

# The splitter yields integer *positions*. `label` is a pandas Series, and plain
# `label[positions]` only works through pandas' deprecated positional fallback
# when the index is not integer-typed; `.iloc` makes the positional intent explicit.
X_test, y_test = data[index_val], label.iloc[index_val]

for k in np.arange(NREPS):
    for frac in SAMPLING_FRACS:
        ks.append(k)
        fracs.append(frac)
        print("frac {}, rep {}".format(frac, k))

        # Downsample the training set; seeding with k makes repetition k reproducible.
        # NOTE(review): LogisticRegression below is built without `random_state`, so
        # (per the scikit-learn docs) liblinear should draw from NumPy's global RNG --
        # this global seed therefore also pins the fit. Keep it unless `random_state`
        # is passed explicitly.
        np.random.seed(k)
        index_train_small = np.random.choice(
            index_train, round(index_train.shape[0] * frac), replace=False
        )
        X_train, y_train = data[index_train_small], label.iloc[index_train_small]

        print("Loaded data...")

        # C=0.1 was always the best value in an earlier cross-validated sweep over
        # C in [1e-3, 1e-2, 1e-1, 1] (L1 penalty, liblinear solver) on the training
        # data, so the sweep is no longer re-run for every configuration.
        c = C_REG
        cs.append(c)  # keep `cs` the same length as the other per-run records

        lr = LogisticRegression(penalty="l1", C=c, solver="liblinear")
        lr.fit(X_train, y_train)

        train_acc = lr.score(X_train, y_train)
        test_acc = lr.score(X_test, y_test)
        print("train set accuracy: " + str(np.around(train_acc, 4)))
        print("test set accuracy: " + str(np.around(test_acc, 4)))

        # Despite the name, this is the macro F1 on the held-out test split.
        val_macro_f1 = f1_score(y_test, lr.predict(X_test), average="macro")
        print("test set macro F1: " + str(np.around(val_macro_f1, 4)))

        train_accs.append(train_acc)
        test_accs.append(test_acc)
        test_f1s.append(val_macro_f1)

        print("\n")