## Cross Prediction using DEGs


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

### Train a model on DEGs from Multiome and test it on scRNA-seq

In [None]:
# Feature matrix: multiome gene expression restricted to the DEGs listed in
# Top_Degs_GC_PB_multiome.csv (produced in Step1).
# index_col=False keeps every CSV column as data; the first column is then
# dropped (presumably an exported row-name/cell-id column - confirm against Step1).
X = pd.read_csv(".../x_train_DEGs.csv", index_col=False ).iloc[:, 1:]

# Labels for the multiome cells: column 'V1' of Y_GC_PB.csv holds 0 for GC cells
# and 1 for PB cells.
Y = pd.read_csv(".../Y_GC_PB.csv", index_col=False).loc[:, 'V1']

In [None]:
# Hold out 20% of the multiome cells for within-dataset evaluation; the fixed
# random_state keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only, then apply the same transform to
# the held-out split so no test-set statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# L1-penalised (lasso) logistic regression.
lasso = LogisticRegression(penalty='l1',solver='liblinear', max_iter=10000)
lasso.fit(X_train, Y_train)

# Hard 0/1 class labels (kept for any later inspection).
Y_pred= lasso.predict(X_test)

# ROC AUC is a ranking metric and must be computed from continuous scores.
# Feeding it hard labels collapses the ROC to a single threshold, so use the
# predicted probability of the positive class (label 1, second column).
Y_score = lasso.predict_proba(X_test)[:, 1]

auc = roc_auc_score(Y_test, Y_score)
print(f"AUC : {auc:.2f}")

### Test the same model on scRNA

In [None]:
# x_test_scRNA_DEGs is a subset of gene expression matrix of scRNA-seq dataset which contains only DEGs in Top_Degs_GC_PB_multiome.csv
# Available from Step1
# NOTE: the paths below are placeholders and must point at the scRNA-seq files.
X_test1 = pd.read_csv("/x_test_scRNA_DEGs.csv", index_col=False)
# Y_GC_PB.csv contain 0 for GC cells and 1 for PB cells in scRNA-seq dataset
Y_test1 = pd.read_csv("...r/Y_GC_PB.csv", index_col=False)

# Keep only the label column and drop the first column of the expression matrix,
# mirroring how the training data was prepared.
Y_test1 = Y_test1.loc[:, 'V1']
X_test1 = X_test1.iloc[:, 1:]

# Reuse the scaler fitted on the multiome training split (do not refit on the
# external dataset). Assumes the gene columns match the training matrix - verify.
X_test1 = scaler.transform(X_test1)

# Hard 0/1 class labels (kept for any later inspection).
Y_pred1= lasso.predict(X_test1)

# Score the AUC with the predicted probability of class 1, not the hard labels;
# hard labels reduce the ROC to a single operating point.
Y_score1 = lasso.predict_proba(X_test1)[:, 1]
auc = roc_auc_score(Y_test1, Y_score1)
print(f"AUC : {auc:.2f}")

#### Plotting ROC

In [None]:
# Probabilities for the first test set (held-out multiome cells).
# roc_curve needs continuous scores to sweep thresholds; hard labels from
# predict() would give a degenerate 3-point curve.
Y_score = lasso.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(Y_test, Y_score)
auc1 = roc_auc_score(Y_test, Y_score)

# Probabilities for the second test set (scRNA-seq cells).
Y_score1 = lasso.predict_proba(X_test1)[:, 1]
fpr1, tpr1, _ = roc_curve(Y_test1, Y_score1)
auc2 = roc_auc_score(Y_test1, Y_score1)

plt.figure()
plt.plot(fpr, tpr, color='#B83636', linestyle='solid', linewidth=4, marker='o', markersize=4, alpha=0.8,
         label=f'Multiome held-out (AUC = {auc1:.2f})')

plt.plot(fpr1, tpr1, color='#7BDE7B',linestyle='dotted', linewidth=4, marker='s', markersize=4, alpha=0.8,
         label=f'scRNA-seq (AUC = {auc2:.2f})')

# Chance-level diagonal for reference.
plt.plot([0, 1], [0, 1], color='gray', linestyle='--', linewidth=1)

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')

# output path to save the ROC plot
out_path = '.../ROC_Train_Multiome_DEGs.svg'
# Save before show(): some backends clear the figure once it has been shown.
plt.savefig(out_path, format='svg')
plt.show()

### Train a model on DEGs from scRNA-seq and test it on Multiome

In [None]:
# In this part of the notebook x_train_DEGs is the scRNA gene expression matrix
# restricted to the DEGs listed in Top_Degs_GC_PB_scRNA.csv (produced in Step1).
# index_col=False keeps every CSV column as data; the first column is then
# dropped (presumably an exported row-name/cell-id column - confirm against Step1).
X = pd.read_csv(".../x_train_DEGs.csv", index_col=False ).iloc[:, 1:]

# Labels for the scRNA-seq cells: column 'V1' of Y_GC_PB.csv holds 0 for GC cells
# and 1 for PB cells.
Y = pd.read_csv(".../Y_GC_PB.csv", index_col=False).loc[:, 'V1']

In [None]:
# Hold out 20% of the scRNA-seq cells for within-dataset evaluation; the fixed
# random_state keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only, then apply the same transform to
# the held-out split so no test-set statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# L1-penalised (lasso) logistic regression.
lasso = LogisticRegression(penalty='l1',solver='liblinear', max_iter=10000)
lasso.fit(X_train, Y_train)

# Hard 0/1 class labels (kept for any later inspection).
Y_pred= lasso.predict(X_test)

# Score with the predicted probability of class 1. The earlier `1 - Y_pred`
# inverted the predictions, which reports 1 - AUC and disagrees with every
# other AUC computed in this notebook.
Y_score = lasso.predict_proba(X_test)[:, 1]

auc = roc_auc_score(Y_test, Y_score)
print(f"AUC : {auc:.2f}")

### Test the same model on Multiome

In [None]:
# x_test_multiome_DEGs in this part of the code
# is a subset of gene expression matrix of multiome which contains only DEGs in Top_Degs_GC_PB_scRNA.csv
# Available from Step1
X_test1 = pd.read_csv(".../x_test_multiome_DEGs.csv", index_col=False)
# Y_GC_PB.csv contain 0 for GC cells and 1 for PB cells in multiome dataset
Y_test1 = pd.read_csv(".../Y_GC_PB.csv", index_col=False)

# Keep only the label column and drop the first column of the expression matrix,
# mirroring how the training data was prepared.
Y_test1 = Y_test1.loc[:, 'V1']
X_test1 = X_test1.iloc[:, 1:]
# Reuse the scaler fitted on the scRNA-seq training split (do not refit on the
# external dataset). Assumes the gene columns match the training matrix - verify.
X_test1 = scaler.transform(X_test1)

# Hard 0/1 class labels (kept for any later inspection).
Y_pred1= lasso.predict(X_test1)

# Score the AUC with the predicted probability of class 1, not the hard labels;
# hard labels reduce the ROC to a single operating point.
Y_score1 = lasso.predict_proba(X_test1)[:, 1]

auc = roc_auc_score(Y_test1, Y_score1)
print(f"AUC : {auc:.2f}")

#### Plotting ROC

In [None]:
# Probabilities for the first test set (held-out scRNA-seq cells).
# roc_curve needs continuous scores to sweep thresholds; hard labels from
# predict() would give a degenerate 3-point curve.
Y_score = lasso.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(Y_test, Y_score)
auc1 = roc_auc_score(Y_test, Y_score)

# Probabilities for the second test set (multiome cells).
Y_score1 = lasso.predict_proba(X_test1)[:, 1]
fpr1, tpr1, _ = roc_curve(Y_test1, Y_score1)
auc2 = roc_auc_score(Y_test1, Y_score1)

plt.figure()
plt.plot(fpr, tpr, color='#B83636', linestyle='solid', linewidth=4, marker='o', markersize=4, alpha=0.8,
         label=f'scRNA-seq held-out (AUC = {auc1:.2f})')

plt.plot(fpr1, tpr1, color='#7BDE7B',linestyle='dotted', linewidth=4, marker='s', markersize=4, alpha=0.8,
         label=f'Multiome (AUC = {auc2:.2f})')

# Chance-level diagonal for reference.
plt.plot([0, 1], [0, 1], color='gray', linestyle='--', linewidth=1)

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')

#output path to save ROC
out_path = '.../ROC_Train_scRNA_DEGs.svg'
plt.savefig(out_path, format= 'svg')
# Close the figure after saving to release it (the plot is written to disk only).
plt.close()