## Install required packages
Run the following command in a terminal:

       pip install -r requirements.txt

## Import packages

In [1]:
import pandas as pd
import numpy as np
import os
import json
import obonet
import inflect
import networkx as nx
import matplotlib.pyplot as plt
from openai import OpenAI
from pydantic import BaseModel
import google.generativeai as genai
from dotenv import load_dotenv
load_dotenv()


  from .autonotebook import tqdm as notebook_tqdm


True

## Define output format

In [2]:
class cellTypeFormat(BaseModel):
       """Structured-output schema holding a single cell type name.

       Used in this notebook as the element type of the Gemini
       ``response_schema`` and as the OpenAI ``response_format``.
       """
       # Cell type name returned by the model.
       cellType: str

## Read cell clusters file 
file path set to ./Data/all.csv

In [3]:
def read_cluster(path):
    """Load the cell-cluster table from a CSV file.

    The ``marker`` column is printed as a quick sanity check before the
    full table is handed back to the caller.

    Args:
        path: Location of the CSV file; it must contain a ``marker`` column.

    Returns:
        The parsed table as a ``pandas.DataFrame``.
    """
    clusters = pd.read_csv(path)
    marker_column = clusters['marker']
    print(marker_column)
    return clusters
# Load the full cluster table and show its first row as a sample record.
gene_list = read_cluster('./Data/all.csv')
print(gene_list.iloc[0])

0       MS4A1, TNFRSF13B, IGHM, IGHD, AIM2, CD79A, LIN...
1       MS4A1, COCH, AIM2, BANK1, SSPN, CD79A, TEX9, R...
2       IGHM, IGHD, CD79A, IL4R, MS4A1, CXCR4, BTG1, T...
3       IGHA2, MZB1, TNFRSF17, DERL3, TXNDC5, TNFRSF13...
4       GZMH, CD4, FGFBP2, ITGB1, GZMA, CST7, GNLY, B2...
                              ...                        
1125    KLRD1,NKG7,XCL2,CTSW,XCL1,GNLY,GZMA,KLRB1,KLRC...
1126    JCHAIN,MZB1,DERL3,IGHG2,IGHA2,TNFRSF17,SDC1,FC...
1127    TFF3,PKHD1L1,PROX1,NTS,FLT4,RELN,GPR182,STAB2,...
1128    VWF,ACKR1,PLVAP,PECAM1,AQP1,RAMP3,CLEC14A,SOX1...
1129    MSLN,UPK3B,MUC16,CALB2,KLK11,BICDL1,CHAC1,TGM1...
Name: marker, Length: 1130, dtype: object
dataset                                                               Azimuth
tissue                                                                   PBMC
marker                      MS4A1, TNFRSF13B, IGHM, IGHD, AIM2, CD79A, LIN...
manual_annotation                                         Intermediate B cell
manual_C

## Split gene list
Split the gene list into multiple chunks so that each prompt does not exceed the model's input size.

In [5]:
# Generate the user message for each chunk
def generate_user_message(gene_list:  pd.DataFrame, max_chunk_size=100) -> list:
    """Build "<tissue> : <markers>" prompt lines, grouped into chunks.

    Rows are chunked by position rather than by index label, so every chunk
    (except possibly the last) holds exactly ``max_chunk_size`` rows even when
    the frame has been filtered, re-sorted or carries a non-integer index.

    Args:
        gene_list: Table with string columns ``tissue`` and ``marker``.
        max_chunk_size: Maximum number of rows per chunk; must be >= 1.

    Returns:
        A list of ``pandas.Series``, one per chunk. Each Series keeps the
        original index labels of its rows and holds one
        ``"<tissue> : <marker>"`` string per row. An empty frame yields ``[]``.

    Raises:
        ValueError: If ``max_chunk_size`` is smaller than 1.
    """
    chunk_size = int(max_chunk_size)
    if chunk_size < 1:
        raise ValueError("max_chunk_size must be a positive integer")

    messages = []
    for start in range(0, len(gene_list), chunk_size):
        # Positional slice: independent of whatever labels the index carries.
        chunk = gene_list.iloc[start:start + chunk_size]
        # Element-wise string concatenation gives one prompt line per row.
        messages.append(chunk['tissue'] + ' : ' + chunk['marker'])
    return messages

# Generate messages with a tunable chunk size
# (10 rows per chunk here; the function's default is 100).
messages = generate_user_message(gene_list, max_chunk_size=10)  # Adjust max_chunk_size as needed
print(messages[0])

0    PBMC : MS4A1, TNFRSF13B, IGHM, IGHD, AIM2, CD7...
1    PBMC : MS4A1, COCH, AIM2, BANK1, SSPN, CD79A, ...
2    PBMC : IGHM, IGHD, CD79A, IL4R, MS4A1, CXCR4, ...
3    PBMC : IGHA2, MZB1, TNFRSF17, DERL3, TXNDC5, T...
4    PBMC : GZMH, CD4, FGFBP2, ITGB1, GZMA, CST7, G...
5    PBMC : TCF7, CD4, CCR7, IL7R, FHIT, LEF1, MAL,...
6    PBMC : MKI67, TOP2A, PCLAF, CENPF, TYMS, NUSAP...
7    PBMC : IL7R, TMSB10, CD4, ITGB1, LTB, TRAC, AQ...
8    PBMC : IL7R, CCL5, FYB1, GZMK, IL32, GZMA, KLR...
9    PBMC : RTKN2, FOXP3, AC133644.2, CD4, IL2RA, T...
dtype: object


## Gemini AI

### model configuration
Set `GOOGLE_API_KEY` in the `.env` file.

In [6]:
# Authenticate with the key read from the GOOGLE_API_KEY environment variable.
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
# NOTE: these names are spelled "gemeni"; annotateCell_Gemini refers to them by that spelling.
gemeni_model = genai.GenerativeModel("gemini-1.5-flash")
# Request JSON output whose schema is a list of cellTypeFormat objects.
gemeni_config = genai.GenerationConfig(response_mime_type="application/json",response_schema=list[cellTypeFormat])

### Send prompt to gemini

In [7]:
# Maximum number of message chunks annotateCell_Gemini sends before it stops.
earlyStop = 2
# Instruction text placed at the start of every Gemini prompt.
Prompt = "Identify cell types using the following tissue name and markers separately for each row. Only provide the cell type name. Do not show numbers before the name. Some can be a mixture of multiple cell types."
def annotateCell_Gemini(messages: list)->list:
    """Send chunks of marker rows to Gemini and collect the raw replies.

    Reads the module-level ``Prompt``, ``earlyStop``, ``gemeni_model`` and
    ``gemeni_config`` at call time. Each prompt is echoed with ``print``
    before it is sent.

    Args:
        messages: Iterable of chunks; each chunk is an iterable of strings
            (one ``"<tissue> : <markers>"`` line per row).

    Returns:
        A list with the ``.text`` of each Gemini response, one entry per
        chunk sent. At most ``earlyStop`` chunks are sent.
    """
    responses = []
    for sent, message in enumerate(messages):
        # Stop once `earlyStop` chunks have been sent.
        if sent >= earlyStop:
            break
        # Keep the instruction on its own line. Previously the first row was
        # glued onto the final sentence ("...cell types.PBMC : ...").
        prompt = Prompt + '\n' + '\n'.join(message) + '\n'
        print(prompt)
        response = gemeni_model.generate_content(
            prompt,
            generation_config=gemeni_config,
        )
        responses.append(response.text)
    return responses


In [8]:
# Run the Gemini annotation (limited to the first `earlyStop` chunks) and show the raw responses.
annotateCell_Gemini_results = annotateCell_Gemini(messages)
print(annotateCell_Gemini_results)

Identify cell types using the following tissue name and markers separately for each row. Only provide the cell type name. Do not show numbers before the name. Some can be a mixture of multiple cell types.PBMC : MS4A1, TNFRSF13B, IGHM, IGHD, AIM2, CD79A, LINC01857, RALGPS2, BANK1, CD79B
PBMC : MS4A1, COCH, AIM2, BANK1, SSPN, CD79A, TEX9, RALGPS2, TNFRSF13C, LINC01781
PBMC : IGHM, IGHD, CD79A, IL4R, MS4A1, CXCR4, BTG1, TCL1A, CD79B, YBX3
PBMC : IGHA2, MZB1, TNFRSF17, DERL3, TXNDC5, TNFRSF13B, POU2AF1, CPNE5, HRASLS2, NT5DC2
PBMC : GZMH, CD4, FGFBP2, ITGB1, GZMA, CST7, GNLY, B2M, IL32, NKG7
PBMC : TCF7, CD4, CCR7, IL7R, FHIT, LEF1, MAL, NOSIP, LDHB, PIK3IP1
PBMC : MKI67, TOP2A, PCLAF, CENPF, TYMS, NUSAP1, ASPM, PTTG1, TPX2, RRM2
PBMC : IL7R, TMSB10, CD4, ITGB1, LTB, TRAC, AQP3, LDHB, IL32, MAL
PBMC : IL7R, CCL5, FYB1, GZMK, IL32, GZMA, KLRB1, TRAC, LTB, AQP3
PBMC : RTKN2, FOXP3, AC133644.2, CD4, IL2RA, TIGIT, CTLA4, FCRL3, LAIR2, IKZF2

Identify cell types using the following tissue name 

## GPT

### model configuration

In [None]:
# OpenAI client built with default arguments.
# NOTE(review): presumably the API key is picked up from the environment (.env via load_dotenv) — confirm.
client = OpenAI()

### upload file

In [None]:
def upload_cell_cluster(filename):
    """Upload a cell-cluster file to OpenAI and print the stored file list.

    Uses the module-level ``client``.

    Args:
        filename: Path of the local file to upload.

    Returns:
        The object returned by ``client.files.create`` for the new upload.
    """
    # Context manager closes the handle even if the upload raises.
    with open(filename, "rb") as handle:
        uploaded = client.files.create(
            file=handle,
            # The Files API expects the plural "assistants"; "assistant" is
            # not one of the accepted purpose values.
            purpose="assistants",
        )
    print(client.files.list())
    return uploaded

### send prompt to GPT

In [None]:
def annotateCell_GPT(messages: list)->list:
    """Send chunks of marker rows to GPT and collect the parsed replies.

    Uses the module-level ``client`` and the ``cellTypeFormat`` schema.

    Args:
        messages: Iterable of chunks. A chunk is either a ready-made string
            or an iterable of strings (e.g. a pandas Series with one
            ``"<tissue> : <markers>"`` line per row).

    Returns:
        A list with the parsed message of the first choice of each
        completion, one entry per chunk.
    """
    system_prompt = (
        "Identify cell types using the following tissue name and markers separately for each row. "
        + "Only provide the cell type name. "
        + "Do not show numbers before the name. "
        + "Some can be a mixture of multiple cell types."
    )
    responses = []
    for message in messages:
        # Chat message content must be plain text, so a chunk of rows is
        # flattened to one line per row before it is sent.
        user_content = message if isinstance(message, str) else "\n".join(message)
        # NOTE(review): response_format is a single cellTypeFormat, so a chunk
        # with several rows appears to yield only one cellType — confirm intended.
        completion = client.beta.chat.completions.parse(
            model="gpt-4o-2024-08-06",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_content},
            ],
            response_format=cellTypeFormat,
        )
        responses.append(completion.choices[0].message.parsed)
    return responses


## Scoring system

### result parsing

In [9]:
def json_parsing(annotateCell_results: list)->list:
    """Extract the cell type names from raw JSON model responses.

    Args:
        annotateCell_results: Iterable of JSON strings. Each string decodes
            to a list of objects with a ``cellType`` key, or to one such
            object on its own.

    Returns:
        A flat list of all ``cellType`` values, in response order.

    Raises:
        json.JSONDecodeError: If a response is not valid JSON.
        KeyError: If an entry has no ``cellType`` key.
    """
    cell_types = []
    for item in annotateCell_results:
        parsed = json.loads(item.strip())
        # A bare object is treated as a one-element list; iterating the dict
        # directly would walk its keys and fail on entry['cellType'].
        entries = [parsed] if isinstance(parsed, dict) else parsed
        cell_types.extend(entry['cellType'] for entry in entries)
    return cell_types

In [15]:
# Flatten the raw Gemini responses into one list of predicted cell type names.
cell_types = json_parsing(annotateCell_Gemini_results)
print(cell_types)

['B cells', 'B cells', 'B cells', 'B cells', 'Cytotoxic T cells', 'T cells', 'Proliferating cells', 'T cells', 'Cytotoxic T cells', 'T regulatory cells', 'T cells, activated NK cells', 'T cells', 'T cells', 'Cytotoxic T cells', 'Monocytes', 'Dendritic cells', 'Dendritic cells', 'B cells', 'Monocytes', 'Macrophages']


### CL correspondence
Match each entry of `cell_types` to its CL ID in the Cell Ontology.

In [11]:
# URL for Cell Ontology (CO) OBO file
# NOTE(review): CO_URL is not referenced in the cells shown here; the ontology is loaded from OBO_FILE_PATH.
CO_URL = 'http://purl.obolibrary.org/obo/CL.obo'
# Path of a local OBO file, passed to load_ontology further down.
OBO_FILE_PATH = 'oboNet/cl.obo'
def load_ontology(url):
    """Parse an OBO ontology into a graph via ``obonet.read_obo``.

    Args:
        url: Forwarded unchanged to ``obonet.read_obo``.

    Returns:
        Whatever ``obonet.read_obo`` returns for ``url``.
    """
    return obonet.read_obo(url)

# Shared inflect engine; lookup_clid_by_name uses it to singularise names before comparing them.
inflector = inflect.engine()

def lookup_clid_by_name(graph: nx.Graph, cell_name: str) -> "str | None":
    """Find the ontology term whose name matches ``cell_name``.

    Both the query and every node's ``name`` attribute are lower-cased and
    singularised with the module-level ``inflector`` before an exact
    comparison, so "B cells" and "B cell" are treated as the same name.

    Args:
        graph: Ontology graph whose nodes carry a ``name`` attribute.
        cell_name: Cell type name to look up. Surrounding whitespace is
            ignored.

    Returns:
        The ID of the first node whose normalised name equals the normalised
        query, or ``None`` when nothing matches or when ``cell_name`` is not
        a non-empty string (e.g. ``None`` or NaN).
    """
    def _normalise(name):
        lowered = name.lower()
        # Fall back to the lower-cased name when singular_noun returns a
        # falsy value instead of a singular form.
        return inflector.singular_noun(lowered) or lowered

    # Missing annotations would otherwise crash on .lower(), and an empty
    # string should never be handed to inflect.
    if not isinstance(cell_name, str):
        return None
    cell_name = cell_name.strip()
    if not cell_name:
        return None

    normalized_name = _normalise(cell_name)

    # Search for the term by normalized name
    for node, data in graph.nodes(data=True):
        name = data.get('name')
        if not isinstance(name, str) or not name:
            continue
        if _normalise(name) == normalized_name:
            return node  # Return the term ID if a match is found

    return None

# Load the Cell Ontology
# (from the local file at OBO_FILE_PATH, not from CO_URL).
graph = load_ontology(OBO_FILE_PATH)


In [12]:
# Look up the ontology ID for every predicted cell type; names for which
# lookup_clid_by_name finds no matching term come back as None.
clids = []
for cell_type in cell_types:
  clids.append(lookup_clid_by_name(graph, cell_type))

print(clids)

['CL:0000236', 'CL:0000236', 'CL:0000236', 'CL:0000236', 'CL:0000910', 'CL:0000084', None, 'CL:0000084', 'CL:0000910', None, None, 'CL:0000084', 'CL:0000084', 'CL:0000910', 'CL:0000576', 'CL:0000451', 'CL:0000451', 'CL:0000236', 'CL:0000576', 'CL:0000235']


### Calculate the distance between two CLIDs
Convert the Cell Ontology graph to an undirected graph
and calculate the distance between two nodes (the manual CLID and the LLM-annotated CLID).

In [44]:
# Undirected copy of the ontology graph.
# NOTE(review): the cells shown below pass `graph`, not `ugraph`, to the distance helpers — confirm which is intended.
ugraph = graph.to_undirected()

def calculate_difference(graph:nx.Graph, clid_1:str, clid_2:str)->int:
    """Shortest-path length from ``clid_1`` to ``clid_2`` in ``graph``.

    Args:
        graph: Graph to search. On a directed graph the result depends on
            the argument order.
        clid_1: Source node ID.
        clid_2: Target node ID.

    Returns:
        The number of edges on the shortest path, or ``-1`` when either ID
        is missing (``None``, NaN, or not a node of ``graph``) or when no
        path exists.
    """
    # With a None endpoint, shortest_path_length switches to its
    # "all sources / all targets" mode and returns a dict or iterator
    # instead of an int, so missing IDs are rejected up front.
    if clid_1 is None or clid_2 is None:
        return -1
    if clid_1 not in graph or clid_2 not in graph:
        return -1
    try:
        return nx.shortest_path_length(graph, source=clid_1, target=clid_2)
    except (nx.NetworkXNoPath, nx.NodeNotFound):
        # Only "no route" is mapped to -1; any other error propagates.
        return -1
def calculate_difference_name(graph:nx.Graph, type_1:str, type_2:str)->int:
    """Shortest-path length between two cell types given by name.

    Each name is resolved with ``lookup_clid_by_name`` and the resulting IDs
    are passed to ``calculate_difference`` in the same order.

    Args:
        graph: Ontology graph used for both the lookup and the path search.
        type_1: Name of the source cell type.
        type_2: Name of the target cell type.

    Returns:
        The value of ``calculate_difference`` for the two resolved IDs, or
        ``-1`` when either name cannot be resolved.
    """
    clid_1 = lookup_clid_by_name(graph, type_1)
    clid_2 = lookup_clid_by_name(graph, type_2)
    # An unresolved name must not reach the path search: a None endpoint
    # makes shortest_path_length return a collection rather than an int.
    if clid_1 is None or clid_2 is None:
        return -1
    return calculate_difference(graph, clid_1, clid_2)


In [47]:
# Argument order matters: the same pair of IDs gives a different result when swapped.
print(calculate_difference(graph, 'CL:0002250', 'CL:0009016'))
print(calculate_difference(graph,  'CL:0009016','CL:0002250'))
# CL:0002250 = intestinal crypt stem cell
# CL:0009016 = intestinal crypt stem cell of large intestine
# Pass the broader type (i.e. the LLM annotation) as the second argument.
print(calculate_difference_name(graph, 'T cells', 'Cytotoxic T cells'))
print(calculate_difference_name(graph, 'Cytotoxic T cells', 'T cells'))

-1
1
-1
3


In [43]:
# Distance from each manual CLID to the LLM-annotated CLID of the same row
# (-1 when either ID is missing or no path is found).
diff_result=[]
for i in range(len(clids)):
  clid_1 = clids[i]  # LLM annotated CLID
  clid_2 = gene_list["manual_CLID"].iloc[i]  # manually annotated CLID (by position)
  # Missing manual IDs are float NaN, never the string "nan", so they are
  # detected with pd.notna rather than a string comparison.
  if clid_1 is not None and pd.notna(clid_2):
    difference = calculate_difference(graph, clid_2, clid_1)
    diff_result.append(difference)
  else:
    diff_result.append(-1)
  print(clid_1, clid_2)
print(diff_result)

CL:0000236 CL:0000818
CL:0000236 CL:0000787
CL:0000236 CL:0000788
CL:0000236 CL:0000980
CL:0000910 CL:0000934
CL:0000084 CL:0000895
None nan
CL:0000084 CL:0000904
CL:0000910 CL:0000905
None CL:0000815
None CL:0000900
CL:0000084 nan
CL:0000084 CL:0000907
CL:0000910 CL:0000913
CL:0000576 nan
CL:0000451 CL:0000990
CL:0000451 CL:0000990
CL:0000236 CL:0000784
CL:0000576 CL:0001054
CL:0000235 nan
[2, 3, 3, 3, -1, 3, -1, 5, -1, -1, -1, -1, 5, -1, -1, 1, 1, -1, 1, -1]


### Assign new scoring system to all.csv

In [39]:
# Column-name prefixes of the annotation methods to score; assign_score reads "<prefix>_CLID" for each.
MODELS = ['gpt4aug3', 'gpt4mar23','gpt3.5aug3', 'CellMarker2.0','SingleR', 'ScType']
# Column-name prefix of the manual annotation; assign_score reads "manual_CLID".
MANUAL = 'manual'

In [40]:
def assign_score(gene_list: pd.DataFrame):
    """Add one ontology-distance column per model to a copy of ``gene_list``.

    For every prefix in the module-level ``MODELS`` list, the distance from
    the manual CLID to that model's CLID is computed row by row with
    ``calculate_difference`` on the module-level ``graph`` and stored in a
    new ``"<prefix>_aggrement_modified"`` column.

    Args:
        gene_list: Table containing ``"<MANUAL>_CLID"`` and one
            ``"<prefix>_CLID"`` column per entry of ``MODELS``.

    Returns:
        A copy of ``gene_list`` with the added score columns; the input
        frame is not modified.
    """
    gene_list_scored = gene_list.copy()
    # Rows are paired by position, so the frame does not need a default
    # 0..n-1 index.
    manual_clids = gene_list[MANUAL + '_CLID'].tolist()
    for model in MODELS:
        model_clids = gene_list[model + '_CLID'].tolist()
        # Column name spelling is kept as is: it is part of the output schema.
        gene_list_scored[model + '_aggrement_modified'] = [
            calculate_difference(graph, manual_clid, model_clid)
            for manual_clid, model_clid in zip(manual_clids, model_clids)
        ]
    return gene_list_scored


In [41]:
# Score every model's CLID against the manual CLID and show the resulting table.
gene_list_scored = assign_score(gene_list)
print(gene_list_scored)

            dataset       tissue  \
0           Azimuth         PBMC   
1           Azimuth         PBMC   
2           Azimuth         PBMC   
3           Azimuth         PBMC   
4           Azimuth         PBMC   
...             ...          ...   
1125  tabulasapiens  Vasculature   
1126  tabulasapiens  Vasculature   
1127  tabulasapiens  Vasculature   
1128  tabulasapiens  Vasculature   
1129  tabulasapiens  Vasculature   

                                                 marker  \
0     MS4A1, TNFRSF13B, IGHM, IGHD, AIM2, CD79A, LIN...   
1     MS4A1, COCH, AIM2, BANK1, SSPN, CD79A, TEX9, R...   
2     IGHM, IGHD, CD79A, IL4R, MS4A1, CXCR4, BTG1, T...   
3     IGHA2, MZB1, TNFRSF17, DERL3, TXNDC5, TNFRSF13...   
4     GZMH, CD4, FGFBP2, ITGB1, GZMA, CST7, GNLY, B2...   
...                                                 ...   
1125  KLRD1,NKG7,XCL2,CTSW,XCL1,GNLY,GZMA,KLRB1,KLRC...   
1126  JCHAIN,MZB1,DERL3,IGHG2,IGHA2,TNFRSF17,SDC1,FC...   
1127  TFF3,PKHD1L1,PROX1,NTS,FLT4,RE

In [42]:
# Save the scored table as Data/all_modified.csv.
# NOTE(review): to_csv also writes the DataFrame index as an extra leading column by default — confirm that is wanted.
gene_list_scored.to_csv('Data/all_modified.csv')