# Installing Required Packages

In [None]:
import sys
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install anndata
!{sys.executable} -m pip install cas-tools

In [1]:
import pandas as pd
import anndata as ad

# Checking content of PBMC spreadsheet

In [13]:
# Workbook location and the worksheet to inspect
file_path = "data/blood.xlsx"
sheet_name = "PBMC3_Yoshida_2022_PBMC"

# Open the workbook once and parse both regions of the sheet from it.
with pd.ExcelFile(file_path) as workbook:
    # Metadata block: the first sheet row is used as the header, then at most
    # 12 further rows are read.
    metadata_df = workbook.parse(sheet_name=sheet_name, skiprows=0, nrows=12)

    # Annotation table: the first 13 sheet rows are skipped and the next row
    # supplies the column names; everything after it is data.
    data_df = workbook.parse(sheet_name=sheet_name, skiprows=13)

In [14]:
# Display the metadata rows read from the top of the sheet.
metadata_df

Unnamed: 0,#rationale_dois,https://doi.org/10.1038/s41586-021-04345-x
0,#author_name,Masahiro Yoshida
1,#orcid,https://orcid.org/0000-0002-3521-5322
2,#author_list,Kaylee B. Worlock|Ni Huang|Rik G. H. Lindeboom
3,#PATH TO DATASET,/home/cap/pbmcBioNetwork/PBMC3_Yoshida/PBMC3_Y...
4,#HCA LINK,
5,#matrix_file_id,https://cellxgene.cziscience.com/e/2a498ace-87...
6,#CAP LINK,
7,#MANUAL / ALGORITHMIC,The manual blood cell type annotation was vali...
8,#OTHER INFORMATION,"Fig 3d - marker genes, Extended Data Fig. 2c -..."
9,#COMMENTS,Need synonyms and categories


In [15]:
# Display the annotation table read from below the metadata rows.
data_df

Unnamed: 0,labelset,cell_label,marker_gene_evidence,SURFACE PROTEIN MARKERS,BCR / TCR,SYNONYMS,CATEGORIES,EVIDENCE
0,cell_type,Classical monocyte,"CD4, SELL, FCER1G, CD14, PLSCR1",CD14,,,,
1,cell_type,CD4-positive helper T cell,"CD3D, CD4, CD27, IL7R","CD3D, CD4, CD27, SELL, IL7R, CD45RO",TCR,,,
2,cell_type,"Naive thymus-derived CD8-positive, alpha-beta ...","CD3D, CD8A, CCR7, CD27, SELL, IL7R","CD3D, CD8A, CCR7, CD27, SELL, IL7R, CD45RA",TCR,,,
3,cell_type,"Naive thymus-derived CD4-positive, alpha-beta ...","CD3D, CD4, CCR7, CD27, SELL, IL7R","CD3D, CD4, CCR7, CD27, SELL, IL7R",TCR,,,
4,cell_type,"Central memory CD8-positive, alpha-beta T cell","CD3D, CD8A, CD27, IL7R","CD3D, CD8A, CD27, IL7R, CD45RO",TCR,,,
...,...,...,...,...,...,...,...,...
97,annotation_broad,Plasma,,,,,,
98,annotation_broad,Plasma,,,,,,
99,annotation_broad,DC,,,,,,
100,annotation_broad,Baso/Eos,,,,,,


# Spreadsheet to CAS Conversion

In this example, we use the [PBMC dataset](https://cellxgene.cziscience.com/e/2a498ace-872a-4935-984b-1afa70fd9886.cxg/).

## Command-line Arguments:

- `--spreadsheet`: Path to the spreadsheet file.
- `--sheet`: Target sheet name in the spreadsheet.
- `--anndata`: Path to the AnnData file. If not provided, AnnData will be downloaded using CxG LINK in the spreadsheet.
- `--labelsets`: List to determine the rank of labelsets in the spreadsheet. If not provided, ranks will be determined using the order of CELL LABELSET NAME.
- `--output`: Output CAS file name (default: output.json).

## Usage Example:

Navigate to the source directory and run the following command:

```bash
python -m cas spreadsheet2cas --spreadsheet path/to/spreadsheet.xlsx --sheet sheet_name --labelsets item1 item2 item3 --output path/to/output_file.json
```

## Current Arguments

- spreadsheet: "data/blood.xlsx"
- anndata: "2a498ace-872a-4935-984b-1afa70fd9886.h5ad"
- labelsets: "annotation_detailed_fullNames", "annotation_broad"
- output: "output/pbmc_cas.json"

In [29]:
# Use this command if the AnnData (.h5ad) file has not already been downloaded into the notebooks directory.
# Without --anndata the CLI downloads the file automatically, so the run takes longer because it includes the download step.
# !cas spreadsheet2cas --spreadsheet data/blood.xlsx  --labelsets annotation_detailed_fullNames annotation_broad --output output/pbmc_cas.json

In [18]:
# Convert the spreadsheet to a CAS JSON file using the AnnData file passed via --anndata.
# --labelsets lists the labelset names that determine their rank.
!cas spreadsheet2cas --spreadsheet data/blood.xlsx --anndata 2a498ace-872a-4935-984b-1afa70fd9886.h5ad  --labelsets annotation_detailed_fullNames annotation_broad --output output/pbmc_cas.json

# Checking the content of the CAS JSON file

In [24]:
import json

# Load the CAS file written by the spreadsheet2cas step above.
# The encoding is set explicitly: without it open() uses the platform's
# locale encoding, which can mis-decode non-ASCII text on some systems.
with open("output/pbmc_cas.json", "r", encoding="utf-8") as f:
    cas = json.load(f)

In [25]:
# Show the labelset entries (name, description, rank) stored in the CAS file.
cas["labelsets"]

[{'name': 'annotation_detailed_fullNames', 'description': '', 'rank': '0'},
 {'name': 'annotation_broad', 'description': '', 'rank': '1'}]

In [26]:
# Preview the first five annotation records.
# NOTE(review): each record includes its full cell_ids list, so this output is very long.
cas["annotations"][:5]

[{'labelset': 'annotation_detailed_fullNames',
  'cell_label': 'Classical monocyte',
  'marker_gene_evidence': ['CD4', ' SELL', ' FCER1G', ' CD14', ' PLSCR1'],
  'cell_ids': ['CV001_KM10202384-CV001_KM10202394_AAAGCAAAGGGTCTCC-1',
   'S12_TTGACTTCACCTCGTT-1',
   'S20_GACGTTATCCCTAACC-1',
   'S14_TACTCATTCCAAACAC-1',
   'S13_CTGATAGCAAAGGCGT-1',
   'S12_GTTCGGGCAGCCAGAA-1',
   'S12_CGAGAAGCAAGACACG-1',
   'CV001_KM10202404-CV001_KM10202416_GTTTCTACACCAGGTC-1',
   'CV001_KM10202384-CV001_KM10202394_CGTTGGGTCAGGCCCA-1',
   'CV001_KM10202390-CV001_KM10202400_CGAGCACCAAGCCATT-1',
   'S23_TCGCGTTTCAAGGTAA-1',
   'S14_TCACAAGGTGCCTGTG-1',
   'CV001_KM10202406-CV001_KM10202418_TCATTACAGACTAGAT-1',
   'S24_TTGAACGGTAAAGTCA-1',
   'CV001_KM10202407-CV001_KM10202419_GATCTAGAGACTTGAA-1',
   'S19_TGCCAAACAATCGAAA-1',
   'S23_GAATAAGTCGGTGTCG-1',
   'S14_GACTACACAAGAGGCT-1',
   'S24_GTCCTCAGTAGAGTGC-1',
   'CV001_KM10202389-CV001_KM10202399_GCAGTTATCGGATGTT-1',
   'S16_GCAGCCATCTTCGGTC-1',
   'S23_C