In [1]:
#!pip install scanpy
import pandas as pd
import numpy as np
import scipy
import scanpy as sc
from anndata import AnnData, read_h5ad
from scipy.sparse import issparse, csr_matrix
import tarfile
import gzip
import scipy.io as sio
import h5py
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score, precision_score

In [2]:
def load_h5(adata_h5):
    """Read a 10x Genomics-formatted HDF5 file through scanpy.

    Parameters
    ----------
    adata_h5
        Path of the ``.h5`` file; forwarded unchanged to ``sc.read_10x_h5``.

    Returns
    -------
    The object returned by ``sc.read_10x_h5`` (the notebook converts it to a
    DataFrame afterwards with ``.to_df()``).

    Notes
    -----
    ``gex_only=False`` is passed so that, per the scanpy documentation, the
    result is not restricted to gene-expression features.  This helper does
    not convert ``X`` to a sparse matrix and does not de-duplicate variable
    names; do that at the call site if it is needed.
    """
    return sc.read_10x_h5(adata_h5, gex_only=False)

In [3]:
# Load the GSE143423 expression file. At this point `df` holds load_h5's
# return value, not a DataFrame; the next cell converts it with `.to_df()`.
df = load_h5('BRCA_GSE143423_expression.h5')

In [4]:
# Rebind `df` to the DataFrame produced by `.to_df()`.
# NOTE(review): because the same name is reused, this cell cannot be re-run
# on its own (a pandas DataFrame has no `.to_df`); re-run the load cell first.
df = df.to_df()

In [5]:
# Preview the first rows (one row per cell barcode, one column per gene).
df.head()

Unnamed: 0,FO538757.2,NOC2L,KLHL17,PLEKHN1,PERM1,HES4,ISG15,AGRN,RNF223,C1orf159,...,AC136616.1,BX004987.4,AC145212.4,AC145212.2,AC011043.1,AL592183.1,AC007325.4,AL354822.1,AC004556.1,AC240274.1
tnbc1_AAACCTGCACAAGCCC,0.361251,0.62607,0.0,0.0,0.0,1.283912,0.0,0.0,0.0,0.0,...,0.0,0.361251,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tnbc1_AAACCTGGTGGGTCAA,0.0,0.734069,0.0,0.0,0.0,0.432931,0.432931,0.9652,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.432931,0.0,0.0,0.0
tnbc1_AAACCTGTCAACGGCC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.380288,0.0,0.0,0.0,0.0,0.0
tnbc1_AAAGATGCATTTCAGG,0.0,0.0,0.0,0.0,0.0,0.646184,0.0,0.204622,0.0,0.204622,...,0.204622,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tnbc1_AAAGATGGTTAAGGGC,0.0,0.0,0.0,0.0,0.0,0.687111,0.15243,0.0,0.0,0.0,...,0.505999,0.15243,0.15243,0.0,0.0,0.0,0.284668,0.0,0.0,0.0


In [6]:
# Read the tab-separated per-cell metadata table for GSE143423.
with open('BRCA_GSE143423_CellMetainfo_table.tsv', 'r') as meta_file:
    labels = pd.read_csv(meta_file, sep='\t')

In [7]:
# Binary target as a NumPy integer array: 1 where the malignancy annotation is
# exactly 'Malignant cells', 0 for every other value.
Malignant = (
    (labels['Celltype (malignancy)'] == 'Malignant cells')
    .astype(int)
    .to_numpy()
)

In [8]:
# Attach the binary label as a new column. `Malignant` is a plain NumPy array,
# so the assignment is positional: row i of the metadata table labels row i of `df`.
# NOTE(review): nothing here checks that the metadata rows are in the same
# order as `df.index` — confirm the two files list cells in the same order.
df['Malignant'] = Malignant
df.head()

Unnamed: 0,FO538757.2,NOC2L,KLHL17,PLEKHN1,PERM1,HES4,ISG15,AGRN,RNF223,C1orf159,...,BX004987.4,AC145212.4,AC145212.2,AC011043.1,AL592183.1,AC007325.4,AL354822.1,AC004556.1,AC240274.1,Malignant
tnbc1_AAACCTGCACAAGCCC,0.361251,0.62607,0.0,0.0,0.0,1.283912,0.0,0.0,0.0,0.0,...,0.361251,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
tnbc1_AAACCTGGTGGGTCAA,0.0,0.734069,0.0,0.0,0.0,0.432931,0.432931,0.9652,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.432931,0.0,0.0,0.0,1
tnbc1_AAACCTGTCAACGGCC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.380288,0.0,0.0,0.0,0.0,0.0,1
tnbc1_AAAGATGCATTTCAGG,0.0,0.0,0.0,0.0,0.0,0.646184,0.0,0.204622,0.0,0.204622,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
tnbc1_AAAGATGGTTAAGGGC,0.0,0.0,0.0,0.0,0.0,0.687111,0.15243,0.0,0.0,0.0,...,0.15243,0.15243,0.0,0.0,0.0,0.284668,0.0,0.0,0.0,1


In [9]:
# Separate the predictors (every column except the label) from the target.
X = df.drop(columns='Malignant')
y = df['Malignant']

# Hold out 20% of the rows, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [10]:
# Fit logistic regression on the training partition; `fit` returns the
# estimator itself, so construction and fitting are chained.
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Hard class predictions for the held-out rows.
y_pred = log_reg.predict(X_test)

# Held-out metrics, each computed independently from (y_test, y_pred).
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)



In [12]:
# Print every held-out metric on its own line as "<name> <value>".
# NOTE(review): the saved output reports 1.0 for all scores on a random
# row-level split — worth a sanity check (label alignment, possible leakage).
for metric_name, metric_value in (
    ('recall', recall),
    ('precision', precision),
    ('accuracy', accuracy),
    ('f1', f1),
    ('conf_matrix', conf_matrix),
):
    print(f'{metric_name} {metric_value}')

recall 1.0
precision 1.0
accuracy 1.0
f1 1.0
conf_matrix [[ 55   0]
 [  0 820]]


In [16]:
# Display the predicted labels for the held-out rows.
y_pred

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

## Second Dataset

In [18]:
# Load the GSE148673 expression file, replacing the first dataset in `df`.
# `df` holds load_h5's return value here; the next cell calls `.to_df()` on it.
df = load_h5('BRCA_GSE148673_expression.h5')

In [19]:
# Rebind `df` to the DataFrame produced by `.to_df()`.
# NOTE(review): not re-runnable on its own once `df` is already a DataFrame
# (pandas has no `.to_df`); re-run the preceding load cell first.
df = df.to_df()

In [17]:
# Read the tab-separated per-cell metadata table for GSE148673.
with open('BRCA_GSE148673_CellMetainfo_table.tsv', 'r') as meta_file:
    labels = pd.read_csv(meta_file, sep='\t')

In [20]:
# Binary target as a NumPy integer array: 1 where the malignancy annotation is
# exactly 'Malignant cells', 0 for every other value.
Malignant = (
    (labels['Celltype (malignancy)'] == 'Malignant cells')
    .astype(int)
    .to_numpy()
)

In [21]:
# Attach the binary label as a new column. `Malignant` is a plain NumPy array,
# so the assignment is positional: row i of the metadata table labels row i of `df`.
# NOTE(review): nothing here checks that the metadata rows are in the same
# order as `df.index` — confirm the two files list cells in the same order.
df['Malignant'] = Malignant
df.head()

Unnamed: 0,AL627309.1,AP006222.1,AL732372.1,AL732372.2,AL669831.5,FAM87B,LINC00115,FAM41C,AL645608.2,SAMD11,...,AC011484.1,AC010642.1,RNF225,AP000350.1,AP000350.2,AP000356.1,MTCO1P20,LINC01315,TRPM2-AS,Malignant
GSM4476486@GATCGTACAGATCGGA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
GSM4476486@GGGCATCCATTCCTCG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
GSM4476488@AAGTCTGCAATCGGTT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
GSM4476488@ACGGAGACAGGTCTCG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
GSM4476488@CACCAGGTCCTCCTAG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [22]:
# Separate the predictors (every column except the label) from the target.
X = df.drop(columns='Malignant')
y = df['Malignant']

# Hold out 20% of the rows, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [23]:
# Fit logistic regression on the training partition; `fit` returns the
# estimator itself, so construction and fitting are chained.
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Hard class predictions for the held-out rows.
y_pred = log_reg.predict(X_test)

# Held-out metrics, each computed independently from (y_test, y_pred).
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

In [24]:
# Print every held-out metric on its own line as "<name> <value>".
for metric_name, metric_value in (
    ('recall', recall),
    ('precision', precision),
    ('accuracy', accuracy),
    ('f1', f1),
    ('conf_matrix', conf_matrix),
):
    print(f'{metric_name} {metric_value}')

recall 0.999083409715857
precision 0.9936189608021878
accuracy 0.9961389961389961
f1 0.9963436928702011
conf_matrix [[ 974    7]
 [   1 1090]]


In [30]:
# Display the predicted labels for the held-out rows.
y_pred

array([0, 1, 1, ..., 0, 0, 0])