In [1]:
import anndata
import pandas as pd
from pathlib import Path

### 读取数据文件

In [2]:
# Directory holding the dataset; later cells reuse `path` to re-read the file.
path = Path("data/laughney20_lung/")
# Registry of loaded AnnData objects, keyed by a short dataset name.
adatas = {'laughney': anndata.read_h5ad(path / "adata.h5ad")}
adatas

{'laughney': AnnData object with n_obs × n_vars = 40505 × 19222
     obs: 'raw', 'major', 'major_hallmark_corrected', 'tier_0', 'tier_1', 'tier_2', 'tier_3', 'tier_0_hallmark_corrected'
     var: 'gene_symbol'}

In [3]:
# Reads the file from disk again (not cached in `adatas`) just to display its summary.
anndata.read_h5ad(path / "adata.h5ad")

AnnData object with n_obs × n_vars = 40505 × 19222
    obs: 'raw', 'major', 'major_hallmark_corrected', 'tier_0', 'tier_1', 'tier_2', 'tier_3', 'tier_0_hallmark_corrected'
    var: 'gene_symbol'

### 矩阵特征

In [4]:
# Display the matrix stored in `.X` of the loaded AnnData object.
adatas['laughney'].X

<40505x19222 sparse matrix of type '<class 'numpy.float32'>'
	with 65068723 stored elements in Compressed Sparse Row format>

### 表格特征

In [5]:
# Display the `obs` table with every column cast to string (returns a new frame; `obs` itself is not modified).
adatas['laughney'].obs.astype('str')

Unnamed: 0,raw,major,major_hallmark_corrected,tier_0,tier_1,tier_2,tier_3,tier_0_hallmark_corrected
0,Tm,Tumor,Immune,Tumor,Immune,Lymphoid,T,Normal
1,MACROPHAGE,Tumor,Immune,Tumor,Immune,Myeloid,Macrophage,Normal
2,DENDRITIC,Tumor,Immune,Tumor,Immune,Myeloid,Dendritic,Normal
3,Tm,Tumor,Immune,Tumor,Immune,Lymphoid,T,Normal
4,Tm,Tumor,Immune,Tumor,Immune,Lymphoid,T,Normal
...,...,...,...,...,...,...,...,...
40500,Tm,Tumor,Immune,Tumor,Immune,Lymphoid,T,Normal
40501,Tm,Tumor,Immune,Tumor,Immune,Lymphoid,T,Normal
40502,Tm,Tumor,Immune,Tumor,Immune,Lymphoid,T,Normal
40503,Tm,Tumor,Immune,Tumor,Immune,Lymphoid,T,Normal


### DataLoader

In [6]:
import torch
from torch.utils.data import DataLoader, Dataset


In [7]:
# Reads the file from disk once more to display its summary; the result is not assigned to any name.
anndata.read_h5ad(path / "adata.h5ad")

AnnData object with n_obs × n_vars = 40505 × 19222
    obs: 'raw', 'major', 'major_hallmark_corrected', 'tier_0', 'tier_1', 'tier_2', 'tier_3', 'tier_0_hallmark_corrected'
    var: 'gene_symbol'

In [11]:
# Define the dataset class
from sklearn.preprocessing import LabelEncoder
class MyDataset(Dataset):
    """Map-style dataset over the label-encoded ``obs`` table of ``adata.h5ad``.

    The file is read from the notebook-level ``path`` directory. Every ``obs``
    column is cast to string and label-encoded; the encoded
    ``tier_0_hallmark_corrected`` column becomes the target and the remaining
    encoded columns become the features returned by ``__getitem__``.

    NOTE(review): the feature columns are other annotation columns of the same
    cells (e.g. ``tier_0``); confirm they do not trivially determine the target.

    Attributes:
        x1: ``data.X`` as read from the file (stored, but not returned by
            ``__getitem__`` in this version of the class).
        x2: NumPy array of the label-encoded ``obs`` columns, target excluded.
        y: ``torch.long`` tensor holding the encoded target column.
        encoders: Dict mapping each ``obs`` column name to the ``LabelEncoder``
            fitted on it, so integer codes can be mapped back to names.
    """

    def __init__(self):
        # Depends on `path` being defined by an earlier notebook cell.
        data = anndata.read_h5ad(path / "adata.h5ad")
        self.x1 = data.X

        obs = data.obs.astype('str')
        self.encoders = {}
        for col in obs.columns:
            # After the cast above every column holds strings, so all of them
            # are encoded unconditionally. A `dtype == 'object'` guard would
            # skip the encoding wherever pandas reports a dedicated string
            # dtype instead of `object`.
            encoder = LabelEncoder()
            obs[col] = encoder.fit_transform(obs[col])
            self.encoders[col] = encoder

        self.x2 = obs.drop('tier_0_hallmark_corrected', axis=1).to_numpy()
        self.y = torch.tensor(obs['tier_0_hallmark_corrected'].values, dtype=torch.long)

    def __len__(self):
        """Return the number of samples (length of the target tensor)."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``(encoded obs features, encoded target)`` for sample ``idx``."""
        return self.x2[idx], self.y[idx]
dataset = MyDataset()
# Shuffled mini-batches of 200; the final, smaller batch is kept.
train_loader = DataLoader(
    dataset,
    batch_size=200,
    shuffle=True,
    drop_last=False,
)

In [12]:
# Pull a single batch from the loader and show its labels; `xx` / `yy` stay
# bound afterwards for inspection in later cells.
for batch in train_loader:
    xx, yy = batch
    print(yy)
    break

tensor([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 1, 0, 0, 0, 1])


In [10]:
# NOTE(review): this cell's execution count (10) is lower than the cells above
# it (11, 12), and the saved output has 20 rows although the loader above uses
# batch_size=200 - the output presumably comes from an earlier run; re-run to refresh.
xx

tensor([[11,  3,  3,  0,  3,  1,  3],
        [14,  6,  3,  1,  3,  0,  4],
        [11,  3,  3,  0,  3,  1,  3],
        [ 7,  3,  3,  0,  3,  1,  2],
        [ 1,  6,  3,  1,  3,  1,  1],
        [13,  3,  3,  0,  3,  0,  4],
        [ 4,  6,  1,  1,  1,  2,  7],
        [19,  6,  3,  1,  3,  0,  6],
        [18,  6,  3,  1,  3,  0,  6],
        [17,  6,  3,  1,  3,  0,  6],
        [18,  6,  3,  1,  3,  0,  6],
        [17,  6,  3,  1,  3,  0,  6],
        [ 4,  6,  1,  1,  1,  2,  7],
        [17,  6,  3,  1,  3,  0,  6],
        [ 5,  6,  2,  1,  2,  2,  7],
        [ 4,  1,  1,  0,  1,  2,  7],
        [17,  6,  3,  1,  3,  0,  6],
        [ 8,  6,  3,  1,  3,  1,  5],
        [18,  3,  3,  0,  3,  0,  6],
        [17,  6,  3,  1,  3,  0,  6]])

### 均匀采样

In [13]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
import numpy as np
import scanpy as sc
class MyDataset(Dataset):
    """Map-style dataset yielding ``(expression row, obs features, target)``.

    The file ``adata.h5ad`` is read from the notebook-level ``path`` directory
    and ``sc.pp.log1p`` is applied to it. Every ``obs`` column is cast to
    string and label-encoded; the encoded ``tier_0_hallmark_corrected`` column
    is the target, the remaining encoded columns are standardised with
    ``StandardScaler`` and used as the second feature block.

    NOTE(review): the second feature block consists of other annotation
    columns of the same cells (e.g. ``tier_0``); confirm they do not trivially
    determine the target.

    Attributes:
        x1: ``data.X`` after the ``log1p`` call; rows are densified on access.
        x2: Standardised array of the label-encoded ``obs`` columns.
        y: ``torch.long`` tensor holding the encoded target column.
    """

    def __init__(self):
        # Depends on `path` being defined by an earlier notebook cell.
        data = anndata.read_h5ad(path / "adata.h5ad")
        # The return value is not used, so this relies on log1p updating
        # `data` in place before `.X` is read below.
        sc.pp.log1p(data)
        self.x1 = data.X

        obs = data.obs.astype('str')
        for col in obs.columns:
            # Every column holds strings after the cast above, so encode all of
            # them unconditionally rather than guarding on `dtype == 'object'`.
            obs[col] = LabelEncoder().fit_transform(obs[col])

        self.y = torch.tensor(obs['tier_0_hallmark_corrected'].values, dtype=torch.long)
        features = obs.drop('tier_0_hallmark_corrected', axis=1).to_numpy()
        self.x2 = StandardScaler().fit_transform(features)

    def __len__(self):
        """Return the number of samples (length of the target tensor)."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``(dense x1 row, x2 row, target)`` for sample ``idx``."""
        # `.toarray()` is called on the selected row, so `x1` must support it
        # (e.g. a SciPy sparse matrix); `[0]` turns the 1 x n result into a vector.
        dense_row = self.x1[idx].toarray()[0]
        return dense_row, self.x2[idx], self.y[idx]

    def get_sample_weights(self):
        """Return one weight per sample, equal to ``1 / (size of its class)``.

        Intended for ``WeightedRandomSampler`` so that every class is drawn
        with equal total probability.

        Returns:
            A float64 tensor with one entry per element of ``self.y``.
        """
        labels = torch.as_tensor(self.y).cpu().numpy().reshape(-1)
        # `inverse` maps each sample to the position of its label among the
        # sorted unique labels. Indexing the counts by that position (rather
        # than by the raw label value) stays correct when the labels present
        # are not exactly 0..K-1.
        _, inverse, counts = np.unique(labels, return_inverse=True, return_counts=True)
        samples_weight = 1. / counts[inverse.reshape(-1)]
        return torch.from_numpy(samples_weight)
    
dataset = MyDataset()
# Per-sample weights drive the sampler; one epoch draws as many samples as
# there are weights.
samples_weight = dataset.get_sample_weights()
sampler = WeightedRandomSampler(
    weights=samples_weight,
    num_samples=len(samples_weight),
)
train_loader = DataLoader(dataset, batch_size=200, sampler=sampler)

In [14]:
# Pull a single batch from the weighted loader and show its labels.
for batch in train_loader:
    x1, x2, y = batch
    print(y)
    break

tensor([1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1,
        1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0,
        0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1,
        0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1,
        1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0,
        0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1,
        0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
        1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0,
        0, 0, 1, 0, 1, 0, 0, 0])


### 划分数据集

In [61]:
import numpy as np


[0 1 2 3 4 5 6 9]


In [15]:
from torch.utils.data import random_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
import torch
import anndata
import pandas as pd
from pathlib import Path
import numpy as np

class MyDataset(Dataset):
    """Train / held-out split of the dataset as ``(expression row, obs features, target)``.

    The file ``data/laughney20_lung/adata.h5ad`` is read, every ``obs`` column
    is cast to string and label-encoded, and the encoded
    ``tier_0_hallmark_corrected`` column is used as the target. Rows are
    assigned to the training split by a seeded uniform draw, so two instances
    built with the same ``seed`` and ``train_fraction`` but different ``mode``
    are complementary.

    NOTE(review): the second feature block consists of other annotation
    columns of the same cells (e.g. ``tier_0``); confirm they do not trivially
    determine the target.
    NOTE(review): no log1p transform is applied to ``X`` in this class -
    confirm that is intended.

    Args:
        mode: ``'train'`` selects the training rows; any other value selects
            the held-out rows.
        train_fraction: Probability with which a row is assigned to the
            training split (default ``0.8``).
        seed: Seed of the private random generator that draws the split
            (default ``0``).

    Attributes:
        x1: Rows of ``data.X`` belonging to the selected split.
        x2: Label-encoded ``obs`` columns (target excluded) of the selected
            split, standardised with statistics of the training rows.
        y: ``torch.long`` tensor of encoded targets for the selected split.
    """

    def __init__(self, mode='train', train_fraction=0.8, seed=0):
        data = anndata.read_h5ad(Path("data/laughney20_lung/") / "adata.h5ad")
        expression = data.X

        obs = data.obs.astype('str')
        for col in obs.columns:
            # Every column holds strings after the cast above, so encode all of
            # them unconditionally rather than guarding on `dtype == 'object'`.
            obs[col] = LabelEncoder().fit_transform(obs[col])

        targets = torch.tensor(obs['tier_0_hallmark_corrected'].values, dtype=torch.long)
        features = obs.drop('tier_0_hallmark_corrected', axis=1).to_numpy()

        # A private RandomState yields the same draws as seeding the global
        # NumPy generator with `seed`, without resetting global random state
        # for the rest of the notebook.
        rng = np.random.RandomState(seed)
        train_mask = rng.rand(len(targets)) < train_fraction
        keep = train_mask if mode == 'train' else ~train_mask

        # Fit the scaler on training rows only so that statistics of the
        # held-out rows never influence the training features.
        scaler = StandardScaler().fit(features[train_mask])

        self.x1 = expression[keep]
        self.x2 = scaler.transform(features[keep])
        self.y = targets[torch.from_numpy(keep)]

    def __len__(self):
        """Return the number of samples in the selected split."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``(dense x1 row, x2 row, target)`` for sample ``idx``."""
        # `.toarray()` is called on the selected row, so `x1` must support it
        # (e.g. a SciPy sparse matrix); `[0]` turns the 1 x n result into a vector.
        dense_row = self.x1[idx].toarray()[0]
        return dense_row, self.x2[idx], self.y[idx]

    def get_sample_weights(self):
        """Return one weight per sample, equal to ``1 / (size of its class)``.

        Intended for ``WeightedRandomSampler`` so that every class is drawn
        with equal total probability.

        Returns:
            A float64 tensor with one entry per element of ``self.y``.
        """
        labels = torch.as_tensor(self.y).cpu().numpy().reshape(-1)
        # `inverse` maps each sample to the position of its label among the
        # sorted unique labels. Indexing the counts by that position (rather
        # than by the raw label value) stays correct when a class is absent
        # from this split and the labels present are not exactly 0..K-1.
        _, inverse, counts = np.unique(labels, return_inverse=True, return_counts=True)
        samples_weight = 1. / counts[inverse.reshape(-1)]
        return torch.from_numpy(samples_weight)
# Build both splits; each constructor call reads the file and selects its own rows.
train_dataset, test_dataset = MyDataset('train'), MyDataset('test')

# Only the training loader uses the weighted sampler; the test loader is given
# neither a sampler nor shuffling.
samples_weight = train_dataset.get_sample_weights()
sampler = WeightedRandomSampler(
    weights=samples_weight,
    num_samples=len(samples_weight),
)

train_loader = DataLoader(train_dataset, batch_size=200, sampler=sampler)
test_loader = DataLoader(test_dataset, batch_size=200)

In [17]:
# Pull a single batch from the test loader and show its labels; `x1`, `x2`
# and `y` stay bound afterwards for the inspection cells below.
for batch in test_loader:
    x1, x2, y = batch
    print(y)
    break

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])


In [19]:
# Shape of the `x1` batch left bound by the loop over `test_loader` above.
x1.shape

torch.Size([200, 19222])

In [20]:
# Shape of the `x2` batch left bound by the loop over `test_loader` above.
x2.shape

torch.Size([200, 7])

In [21]:
# Labels of the batch left bound by the loop over `test_loader` above.
y

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])