In [1]:
import scanpy as sc

In [2]:
import pandas as pd
import numpy as np

# Download data

In [63]:
# Fetch the `pbmc3k` example dataset through scanpy's `datasets` module.
# `adata` is the object that the expected snippets in this notebook operate on.
adata = sc.datasets.pbmc3k()

In [15]:
# Make the parent directory importable so the project-local `functions`
# package (imported in a later cell) can be found.
# The membership check keeps this cell idempotent: re-running it no longer
# appends another duplicate ".." entry to sys.path.
# NOTE: ".." is resolved relative to the kernel's current working directory.
import sys

if ".." not in sys.path:
    sys.path.append("..")


{'title': 'dendrogram', 'type': 'object', 'properties': {'adata': {'title': 'Adata', 'description': 'Annotated data matrix\n This is the object type:AnnData', 'type': 'string'}, 'groupby': {'title': 'Groupby', 'description': 'No description available.\n This is the object type:str | Sequence[str]', 'type': 'string'}, 'n_pcs': {'title': 'N Pcs', 'description': 'Use this many PCs. If `n_pcs==0` use `.X` if `use_rep is None`.\n This is the object type:int | None', 'type': 'string'}, 'use_rep': {'title': 'Use Rep', 'description': "Use the indicated representation. `'X'` or any key for `.obsm` is valid.\nIf `None`, the representation is chosen automatically:\nFor `.n_vars` < :attr:`~scanpy._settings.ScanpyConfig.N_PCS` (default: 50), `.X` is used, otherwise 'X_pca' is used.\nIf 'X_pca' is not present, it’s computed with default parameters or `n_pcs` if present.\n This is the object type:str | None", 'type': 'string'}, 'var_names': {'title': 'Var Names', 'description': 'List of var_names to 

# Preprocessing PP

In [37]:
# Get the pydantic classes for all the methods in scanpy.pp.
# `generated_classes` is reused below, both when binding tools to the LLM and
# when constructing the output parser.
from functions.autogenerate_with_defaults_and_method_name import generate_pydantic_classes
# Alternative import of the same name from another module, currently commented out:
#from functions.og_autogenerate_func import generate_pydantic_classes
generated_classes = generate_pydantic_classes(sc.pp)

In [39]:
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import PydanticToolsParser

# Chat model used by every query cell below (gpt-4-turbo, temperature=0).
# NOTE(review): no API key is passed here — presumably it is read from the
# environment; confirm before running on a fresh kernel.
llm = ChatOpenAI(model="gpt-4-turbo", temperature=0)
# Expose the generated pydantic classes to the model as tools.
llm_with_tools = llm.bind_tools(generated_classes)
# Pipe the model output into a parser built from the same classes.
chain = llm_with_tools | PydanticToolsParser(tools=generated_classes)

#### Mark mitochondrial genes to use for QC

Expected code snippet:

`adata.var["mt"] = adata.var_names.str.startswith("MT-")`

In [25]:
# Prompt for the "mark mitochondrial genes" step.
# NOTE(review): the expected snippet for this step uses
# `adata.var_names.str.startswith`, which is not a `sc.pp` function, while the
# bound tools are generated from `sc.pp` only — verify that a tool call is
# actually expected here. The recorded run of this cell ended in
# KeyboardInterrupt.
query = [
	("system", "You're an expert data scientist"), 
	("human", "I want to mark mitochondrial genes of my adata object by checking the basic strings in var_names"),
]
result = chain.invoke(query)
result

KeyboardInterrupt: 

#### calculate qc_metrics including mitochondrial as covariate

Expected code snippet:

```
sc.pp.calculate_qc_metrics(
    adata, qc_vars=["mt"], percent_top=None, log1p=False, inplace=True
)
```

In [40]:
# Prompt for the QC-metrics step; the system message additionally asks the
# model to parameterize the call according to the provided schema.
query = [
	("system", "You're an expert data scientist. You know how to use scanpy. Please make sure you parameterize a function call according to the provided schema"), 
	("human", "I want to use scanpy pp to compute quality control metrics"),
]
# Rebuilds `chain` with the same expression used where it was first defined.
chain = llm_with_tools | PydanticToolsParser(tools=generated_classes)
result = chain.invoke(query)
# NOTE(review): the recorded output of this cell is an empty list, i.e. no
# parsed tool call came back for this prompt.
result

[]

#### filter genes detected in fewer than 3 cells
Expected code snippet:

```
sc.pp.filter_genes(adata, min_cells=3)
```

In [41]:
# Prompt for the gene-filtering step (expected call: filter_genes with min_cells=3).
query = [
	("system", "You're an expert data scientist. You know how to use scanpy. You are supposed to output a correctly parametized function call"), 
	("human", "I want to use scanpy pp to filter genes with at least 3 cells"),
]
result = chain.invoke(query)
# NOTE(review): in the recorded output `min_cells=3` matches the expected
# snippet, but the `data` field holds a small dict rather than the `adata` object.
result

[filter_genes(data={'n_obs': 10, 'n_vars': 5}, min_counts=None, min_cells=3, max_counts=None, max_cells=None, inplace=True, copy_param=False)]

#### filter cells with less than 200 genes
Expected code snippet:

```
sc.pp.filter_cells(adata, min_genes=200)
```

In [44]:
# Prompt for the cell-filtering step (expected call: filter_cells with min_genes=200).
query = [
	("system", "You're an expert data scientist. You know how to use scanpy. You are supposed to output a correctly parametized function call"), 
	("human", "I want to use scanpy pp to filter cells with at least 200 genes"),
]
result = chain.invoke(query)
# NOTE(review): in the recorded output `min_genes=200` matches the expected
# snippet, but the `data` field holds a small dict rather than the `adata` object.
result

[filter_cells(data={'n_obs': 1000, 'n_vars': 2000}, min_counts=None, min_genes=200, max_counts=None, max_genes=None, inplace=True, copy_param=False)]

#### filter cells with more than 2500 genes
Expected code snippet:

```
sc.pp.filter_cells(adata, max_genes=2500)
```

#### filter cells, keeping only those with less than 5% mitochondrial fraction
Expected code snippet:

```
adata = adata[adata.obs.pct_counts_mt < 5, :].copy()

```

#### store raw counts in a separate layer
Expected code snippet:

```
adata.layers['counts'] = adata.X.copy()
```

#### normalize expression
Expected code snippet:

```
sc.pp.normalize_total(adata, target_sum=1e4)
```

In [45]:
# Prompt for the normalization step (expected call: normalize_total with target_sum=1e4).
query = [
	("system", "You're an expert data scientist. You know how to use scanpy. You are supposed to output a correctly parametized function call"), 
	("human", "I want to normalize my cells to a target sum of 10000"),
]
result = chain.invoke(query)
# NOTE(review): in the recorded output `target_sum=10000` matches the expected
# snippet, but the `adata` field is an empty dict rather than the `adata` object.
result

[normalize_total(adata={}, target_sum=10000, exclude_highly_expressed=False, max_fraction=0.05, key_added=None, layer=None, layers=None, layer_norm=None, inplace=True, copy_param=False)]

#### log transform
Expected code snippet:

```
sc.pp.log1p(adata)
```

#### calculate highly variable genes
Expected code snippet:

```
sc.pp.highly_variable_genes(adata)
```

#### compute PCA
Expected code snippet:

```
sc.pp.pca(adata)  # or, equivalently: sc.tl.pca(adata)
```

#### compute neighbors
Expected code snippet:

```
sc.pp.neighbors(adata)
```

# Tools TL

## Embeddings

#### UMAP
Expected code snippet:

```
sc.tl.umap(adata)
```

#### TSNE
Expected code snippet:

```
sc.tl.tsne(adata)
```

#### Diffusion maps
Expected code snippet:

```
sc.tl.diffmap(adata)
```

#### force directed graph
Expected code snippet:

```
sc.tl.draw_graph(adata)
```

## Clustering & trajectories

#### Leiden
Expected code snippet:

```
sc.tl.leiden(adata, resolution = X)
```

#### Pseudotime
Expected code snippet:

```
sc.tl.dpt(adata)
```

#### marker genes of leiden clusters
Expected code snippet:

```
sc.tl.rank_genes_groups(adata, groupby = "leiden")
```

# Plotting PL

## QC

#### Plot highest expressed genes

Expected code snippets:
```
sc.pl.highest_expr_genes(adata, n_top=20)
```

#### Plot highly variable genes

Expected code snippets:
```
sc.pl.highly_variable_genes(adata)
```

## Scatters

#### scatter plot

Pick the categories in obs you want to use as the x and y axes (must be continuous variables).
Pick the category to color the dots with; it can be continuous or categorical.

Expected code snippet:

```
sc.pl.scatter(adata, x = "category_x", y = "category_y", color = "color_category")
```

## Embeddings

#### UMAP

Pick the category in obs/var you want to plot

Expected code snippet:

```
sc.pl.umap(adata, color = 'category')
```

#### UMAP multiple categories

Pick the categories in obs/var you want to plot

Expected code snippet:

```
sc.pl.umap(adata, color = ["category_1", ..., "category_N"])
```

#### TSNE

Pick the category in obs/var you want to plot

Expected code snippet:

```
sc.pl.tsne(adata, color = 'category')
```

#### draw_graph

Pick the category in obs/var you want to plot

Expected code snippet:

```
sc.pl.draw_graph(adata, color = 'category')
```

#### pca

Pick the category in obs/var you want to plot

Expected code snippet:

```
sc.pl.pca(adata, color = 'category')
```

## Generic

#### Heatmap

pick the list of genes you want to plot. e.g.:
```
[
'KIR3DL2-1',
'AL590523.1',
'CT476828.1',
'PNRC2-1',
'SRSF10-1',
'AC145205.1',
'BAGE5',
'CU459201.1',
'AC002321.2',
'AC002321.1'
]
```

pick the category to group the heatmap by, from the .obs

Expected code snippets:

```
sc.pl.heatmap(adata, groupby = "grouping category", var_names = list_of_genes)
```


#### Dotplot

pick the list of genes you want to plot. e.g.:
```
[
'KIR3DL2-1',
'AL590523.1',
'CT476828.1',
'PNRC2-1',
'SRSF10-1',
'AC145205.1',
'BAGE5',
'CU459201.1',
'AC002321.2',
'AC002321.1'
]
```

pick the category to group the dotplot by, from the .obs

Expected code snippets:

```
sc.pl.dotplot(adata, groupby = "grouping category", var_names = list_of_genes)
```


#### Matrixplot

pick the list of genes you want to plot. e.g.:
```
[
'KIR3DL2-1',
'AL590523.1',
'CT476828.1',
'PNRC2-1',
'SRSF10-1',
'AC145205.1',
'BAGE5',
'CU459201.1',
'AC002321.2',
'AC002321.1'
]
```

pick the category to group the matrix by, from the .obs

Expected code snippets:

```
sc.pl.matrixplot(adata, groupby = "grouping category", var_names = list_of_genes)
```


#### Violinplot

pick the list of genes you want to plot. e.g.:
```
[
'KIR3DL2-1',
'AL590523.1',
'CT476828.1',
'PNRC2-1',
'SRSF10-1',
'AC145205.1',
'BAGE5',
'CU459201.1',
'AC002321.2',
'AC002321.1'
]
```

pick the category to group the violin columns by, from the .obs

Expected code snippets:

```
sc.pl.violin(adata, keys = list_of_genes, groupby = "grouping category")
```


#### Cluster Map


pick the category to group cells by, from the .obs

Expected code snippets:

```
sc.pl.clustermap(adata, obs_keys=None)
```


## Marker genes

### Heatmap

Expected code snippets
```
sc.pl.rank_genes_groups_heatmap(adata, n_genes=5, groupby=None)
```

### Dotplot with logfoldchanges

Set vmin and vmax to +/- 2.5

Expected code snippets
```
sc.pl.rank_genes_groups_dotplot(adata, n_genes=5, groupby=None, values_to_plot="logfoldchanges", vmin = -2.5, vmax = 2.5)
```

### Matrixplot with logfoldchanges

Set vmin and vmax to +/- 2.5

Expected code snippets
```
sc.pl.rank_genes_groups_matrixplot(adata, n_genes=5, groupby=None, values_to_plot="logfoldchanges", vmin = -2.5, vmax = 2.5)
```