## Set the working directory to the folder where the mtANN main function is located

In [None]:
import os

# Relative paths used after this point are resolved against this directory.
# The location is machine-specific: change it to your own mtANN folder.
PROJECT_DIR = "/home/xiongyx/mtANN"
os.chdir(PROJECT_DIR)

## 0. Import python modules

In [1]:
# NOTE(review): wildcard import; `mtANN` (called in section 2) is presumably
# provided by this module — confirm, and prefer an explicit import if so.
from model import *
import numpy as np
import pandas as pd
import warnings
# Suppress all warnings from here on so the cell outputs stay readable.
warnings.filterwarnings("ignore")

## 1. Read in data

mtANN currently expects input data in csv format, where rows are samples (cells) and columns are features (genes). The cell-type labels of each dataset are stored in a separate csv file named after the dataset with the suffix `_label` (for example, `human.csv` and `human_label.csv`).
    
Here we use the Pancreas data as an example: download the dataset file and unzip it, then place its csv files in `./datasets/panc/`, which is the directory the cells below read from.

### 1.1 Get the names of all sequencing data

In [9]:
# os.walk yields one (dirpath, dirnames, filenames) tuple per directory visited.
# Only the top-level tuple is needed (files[2] is its list of file names), so
# take the first item. Unpacking the generator into two names would fail unless
# the walk happened to yield exactly two directories.
files = next(os.walk("./datasets/panc/"), None)
if files is None:
    # os.walk yields nothing for a missing or unreadable directory.
    raise FileNotFoundError("data directory ./datasets/panc/ was not found")

In [15]:
# Reduce every file name to its dataset name: drop everything from the first
# "_" onwards, then drop everything from the first "." onwards.
file = [entry.split("_")[0] for entry in files[2]]
# np.unique removes duplicates and returns the names sorted.
tech = np.unique([stem.split(".")[0] for stem in file])
print(tech)

['human' 'muraro' 'seg' 'xin']


### 1.2 read in target data

In [18]:
# Position in `tech` of the dataset that will be annotated.
dt = 0
target = tech[dt]
print("target data is {}".format(target))

# Expression matrix: first csv column is used as the row index.
target_expression = pd.read_csv("./datasets/panc/{}.csv".format(target), header=0, index_col=0)

target_dataset = {}
# Rescale every row so that it sums to 10000. The row totals are computed with
# the vectorized DataFrame.sum instead of a per-row Python lambda.
target_dataset["expression"] = target_expression.div(target_expression.sum(axis=1), axis=0) * 10000
# Labels are taken from the column named "x" of the matching *_label.csv file.
target_dataset["cell_type"] = pd.read_csv("./datasets/panc/{}_label.csv".format(target), header=0, index_col=False).x.values

target data is human


### 1.3 read in remaining data as reference datasets

In [19]:
# Normalized expression matrices and label arrays of the reference datasets;
# position i of epr_s corresponds to position i of label_s.
epr_s = []
label_s = []
# Visit every dataset except the target in ascending index order. Iterating a
# set here would leave the order of the references to set-iteration order,
# which is not guaranteed.
for ds in (idx for idx in range(len(tech)) if idx != dt):
    source = tech[ds]
    print("source is {}".format(source))
    source_dataset = {}
    source_dataset["expression"] = pd.read_csv("./datasets/panc/{}.csv".format(source), header=0, index_col=0)
    # Labels are taken from the column named "x" of the matching *_label.csv file.
    source_dataset["cell_type"] = pd.read_csv("./datasets/panc/{}_label.csv".format(source), header=0, index_col=False).x.values
    # Rescale every row so that it sums to 10000, using the vectorized row sum
    # rather than a per-row Python lambda.
    source_dataset["expression"] = source_dataset["expression"].div(source_dataset["expression"].sum(axis=1), axis=0) * 10000
    epr_s.append(source_dataset["expression"])
    label_s.append(source_dataset["cell_type"])

source is muraro
source is seg
source is xin


## 2. Fit mtANN model

There are six parameters to input:

**expression_s**: A list of gene expression matrices for reference datasets, each matrix is formatted as a dataframe object where rows are cells and columns are genes. <br>
**label_s**: A list of cell-type labels corresponding to references in expression_s. Its length is equal to the length of expression_s.<br>
**expression_t**: The gene expression matrix of target dataset whose format is the same as reference datasets.<br>
**threshold**: Either "default" or a number between 0 and 1. It specifies whether the threshold for unseen cell-type identification is chosen by the method's default procedure or set by the user.<br>
**gene_select**: Either "default" or any other value. "default" means that the eight default gene selection methods are used; any other value means that all the genes in expression_s are used.<br>
**CUDA**: A boolean parameter that indicates whether to use the GPU.

In [20]:
# Fit mtANN on the reference datasets and annotate the target dataset, using
# the default threshold and the default gene selection on the GPU.
mid_annotation, final_annotation, m, threshold = mtANN(
    expression_s=epr_s,
    label_s=label_s,
    expression_t=target_dataset["expression"],
    threshold="default",
    gene_select="default",
    CUDA=True,
)

Selecting genes with default methods
Convert 0-th ref to R object
Convert 1-th ref to R object
Convert 2-th ref to R object
the number of references is 3 
Selecting genes with gc
Gene number is 12570
Cell number is 2119
Selecting genes with gc
Gene number is 14288
Cell number is 2108
Selecting genes with gc
Gene number is 14912
Cell number is 1492
training 1-th classification model
training 2-th classification model
training 3-th classification model
training 4-th classification model
training 5-th classification model
training 6-th classification model
training 7-th classification model
training 8-th classification model
training 9-th classification model
training 10-th classification model
training 11-th classification model
training 12-th classification model
training 13-th classification model
training 14-th classification model
training 15-th classification model
training 16-th classification model
training 17-th classification model
training 18-th classification model
training 19

We obtain four outputs:

**mid_annotation**: A numpy object containing the intermediate annotation results. <br>
**final_annotation**: A numpy object containing the annotation results, in which cells selected by the default threshold are labeled "unassigned". <br>
**m**: A numpy object which is the unseen cell-type identification metric. <br>
**threshold**: A numeric object which is the selected default threshold.

## 3. Evaluation

In [21]:
# A cell is counted as correct when the first column of final_annotation
# equals the label loaded for the target dataset.
is_correct = final_annotation[:, 0] == target_dataset["cell_type"]
accuracy = is_correct.sum() / len(final_annotation)
print("The annotation accuracy with the default threshold is {}".format(accuracy))

The annotation accuracy with the default threshold is 0.8704741882737678


## 4. Save all the results to output dir

In [None]:
# DataFrame.to_csv does not create missing folders, so make sure the output
# directory exists before writing.
os.makedirs("./output", exist_ok=True)
pd.DataFrame(mid_annotation).to_csv("./output/mid_annotation_{}.csv".format(target), index=False)
pd.DataFrame(final_annotation).to_csv("./output/final_annotation_{}.csv".format(target), index=False)
pd.DataFrame(m).to_csv("./output/metric_{}.csv".format(target), index=False)
# The DataFrame constructor rejects a bare scalar, which is what a single
# numeric threshold would be. np.atleast_1d wraps a scalar into a one-element
# array and passes array input through unchanged.
pd.DataFrame(np.atleast_1d(threshold)).to_csv("./output/threshold_{}.csv".format(target), index=False)