In [1]:
import bs4
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_community.llms import Ollama
#from langchain_experimental.llms.ollama_functions import OllamaFunctions
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings



# log in to VPN or harvard wifi for endpoint

In [12]:
#create the LLM 
ccb_endpoint = 'http://compute-gc-17-255.o2.rc.hms.harvard.edu:11434'

llm2 = Ollama(base_url= ccb_endpoint, model="llama2", temperature=0)
llm3 = Ollama(base_url= ccb_endpoint, model="llama3", temperature=0)
#create the embedding model
oembed = OllamaEmbeddings(base_url=ccb_endpoint, model="nomic-embed-text")
#oembed = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
llmcode = Ollama(base_url= ccb_endpoint, model="codellama", temperature=0)


In [8]:
#this doesnt really need LCEL so we'll just pass everthing through a template
from langchain_core.prompts import PromptTemplate

code_template = PromptTemplate.from_template(
    """ This is the code I have: '{query}', and the error i'm getting is: '{error}'. Please fix the error in my code. """
)

code_template.format(query="funny", error="chickens")

" This is the code I have: 'funny', and the error i'm getting is: 'chickens'. Please fix the error in my code. "

In [5]:
print(llm.invoke(code_template.format(query="funny", error="chickens")))


 The error in your code is that you are trying to use a string literal as an argument for a function that expects a variable. In Python, string literals are immutable, so you can't pass them as variables.

To fix the error, you need to define a variable that holds the string value you want to pass to the function, like this:
```
chicken = 'funny'
my_function(chicken)
```
Alternatively, you can use a tuple or list to pass multiple values to a function, like this:
```
my_function((chicken, 'funny'))
```
I hope this helps! Let me know if you have any other questions.


In [6]:
#programmatic access to csv
import polars as pl
df = pl.read_csv('BiocCodeErrors.csv')
questions = df['question'].to_list()
errors =  df['error'].to_list()

for q, e in zip(questions,errors):
    print(q)
    print(e)

    

library(SummarizedExperiment)

nrows <- 200; ncols <- 6
counts <- matrix(runif(nrows * ncols, 1, 1e4), nrows)
rowRanges <- GRanges(rep(c("chr1", "chr2"), c(50, 150)),
                     IRanges(floor(runif(200, 1e5, 1e6)), width=100),
                     strand=sample(c("+", "-"), 200, TRUE),
                     feature_id=sprintf("ID%03d", 1:200))
colData <- DataFrame(Treatment=rep(c("ChIP", "Input"), 3),
                     row.names=LETTERS[1:6]) 
rse <- SummarizedExperiment(assays=SimpleList(counts=counts),
                            rowRanges=colData, colData=rowRanges)
Error in validObject(.Object) :
  invalid class “RangedSummarizedExperiment” object: 1: invalid object for slot "rowRanges" in class "RangedSummarizedExperiment": got class "DFrame", should be or extend class "GenomicRanges_OR_GRangesList"
invalid class “RangedSummarizedExperiment” object: 2:
        'x@assays' is not parallel to 'x'
library(SummarizedExperiment)

nrows <- 200
ncols <- 6
counts <- matrix(runi

In [11]:
#run llama2 
resllama = []
for q, e in zip(questions,errors):
    result = llm.invoke(code_template.format(query=q, error=e))
    resllama.append(result)

resllama

['\nThe error message you\'re getting is because you are trying to pass a `DataFrame` object as the `rowRanges` argument in the `SummarizedExperiment()` function, but the `rowRanges` argument expects an object of class `GenomicRanges_OR_GRangesList`.\n\nYou can fix this error by converting the `DataFrame` object to a `GenomicRanges_OR_GRangesList` object using the `toGRangesList()` method. Here\'s an example of how you can modify your code:\n```\nlibrary(SummarizedExperiment)\n\n# Create a matrix of counts\ncounts <- matrix(runif(nrows * ncols, 1, 1e4), nrows)\n\n# Create a GRanges object for the row ranges\nrowRanges <- GRanges(rep(c("chr1", "chr2"), c(50, 150)),\n                     IRanges(floor(runif(200, 1e5, 1e6)), width=100),\n                     strand=sample(c("+", "-"), 200, TRUE),\n                     feature_id=sprintf("ID%03d", 1:200))\n\n# Convert the DataFrame to a GRangesList object\ncolData <- DataFrame(Treatment=rep(c("ChIP", "Input"), 3),\n                     row

In [12]:
#run codellama
resllamacode = []
for q, e in zip(questions,errors):
    result = llmcode.invoke(code_template.format(query=q, error=e))
    resllamacode.append(result)

resllamacode

['\nIt looks like you are trying to create a `SummarizedExperiment` object using the `SimpleList` function, but it is expecting an object of class `GRanges_OR_GRangesList`. You can try changing your code to use the `GRanges` function instead of `DFrame`, like this:\n```\nlibrary(SummarizedExperiment)\n\nnrows <- 200; ncols <- 6\ncounts <- matrix(runif(nrows * ncols, 1, 1e4), nrows)\nrowRanges <- GRanges(rep(c("chr1", "chr2"), c(50, 150)),\n                     IRanges(floor(runif(200, 1e5, 1e6)), width=100),\n                     strand=sample(c("+", "-"), 200, TRUE),\n                     feature_id=sprintf("ID%03d", 1:200))\ncolData <- DataFrame(Treatment=rep(c("ChIP", "Input"), 3),\n                     row.names=LETTERS[1:6]) \nrse <- SummarizedExperiment(assays=SimpleList(counts=counts),\n                            rowRanges=rowRanges, colData=colData)\n```\nThis should fix the error and allow you to create a `SummarizedExperiment` object.',
 '\nThe issue is that you are trying t

In [13]:
df = df.with_columns(pl.Series(name="Response_llama2_temp0", values=resllama)) 
df = df.with_columns(pl.Series(name="Response_codellama_temp0", values=resllamacode)) 
df

category,ground_truth,question,error,Response_llama2_temp0,Response_codellama_temp0
str,str,str,str,str,str
"""Swapped argume…","""library(Summar…","""library(Summar…","""Error in valid…",""" The error mes…",""" It looks like…"
"""Inconsistent a…","""library(Summar…","""library(Summar…","""Error in valid…",""" The error mes…",""" The issue is …"
"""Invalid input …","""library(Summar…","""library(Summar…","""Error in Summa…",""" The issue is …",""" The issue is …"
"""Antipattern: a…","""library(Summar…","""library(Summar…",,""" The issue wit…",""" It looks like…"
"""Antipattern: R…","""df <- data.fra…","""df <- data.fra…",,""" The issue wit…",""" The issue wit…"
…,…,…,…,…,…
"""Accessing by r…","""library(scRNAs…","""library(scRNAs…","""Error in Summa…",""" The error mes…",""" The issue is …"
"""Old install me…","""if (!require(""…","""source(""https:…","""Error: With R …","""  The error me…",""" The error mes…"
"""Antipattern: U…","""m <- matrix(1:…","""m <- matrix(1:…","""Error in if (c…",""" The issue wit…",""" The issue wit…"
"""Package not lo…","""library(Genomi…","""rowRanges <- G…","""Error in GRang…",""" The `GRanges`…",""" The issue is …"


In [14]:
import pathlib

path: pathlib.Path = "llama2BiocCodeErrors.csv"
df.write_csv(path, separator=",")

In [4]:
## rag retreival

#load from persist 
T500bioc_db = Chroma(persist_directory="./T500-vignettes-vectordb/", embedding_function=oembed)
#test retrival

retriever = T500bioc_db.as_retriever(search_type="similarity", search_kwargs={"k": 6})

retrieved_docs = retriever.invoke("How many classes are there in a SummarizedExperiment object?")
print(retrieved_docs[0].page_content)

#> 4 253949420929     4 1_4       old  62
#> 5 253949420929     5 2_1     young  25
Using SummarizedExperiment object
GWENA is also compatible with the use of SummarizedExperiment. The previous dataset can
therefore be transformed as one and used in the next steps
se_kuehne <- SummarizedExperiment::SummarizedExperiment(
  assays = list(expr = t(kuehne_expr)),
  colData = S4Vectors::DataFrame(kuehne_traits)
)
S4Vectors::metadata(se_kuehne) <- list(


In [20]:
# Retrieve and generate using the relevant snippets of the code.
retriever = T500bioc_db.as_retriever(search_type="similarity", search_kwargs={"k": 6})

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)
from langchain_core.prompts import ChatPromptTemplate

template = """ 
I have a code snippet shown below that either throws an error or performs poorly. After that I will provide context from Bioconductor's top 500 most used packages. Using the context given, fix the code as your final answer.  
Code:{question}
=========
Context:
{context}
=========
FINAL ANSWER: """
prompt = ChatPromptTemplate.from_template(template)

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm2
    | StrOutputParser()
)

In [6]:
#programmatic access to csv
import polars as pl
df = pl.read_csv('BiocCodeErrors.csv')
questions = df['question'].to_list()
errors =  df['error'].to_list()

for q, e in zip(questions,errors):
    print(q)
    print(e)


library(SummarizedExperiment)

nrows <- 200; ncols <- 6
counts <- matrix(runif(nrows * ncols, 1, 1e4), nrows)
rowRanges <- GRanges(rep(c("chr1", "chr2"), c(50, 150)),
                     IRanges(floor(runif(200, 1e5, 1e6)), width=100),
                     strand=sample(c("+", "-"), 200, TRUE),
                     feature_id=sprintf("ID%03d", 1:200))
colData <- DataFrame(Treatment=rep(c("ChIP", "Input"), 3),
                     row.names=LETTERS[1:6]) 
rse <- SummarizedExperiment(assays=SimpleList(counts=counts),
                            rowRanges=colData, colData=rowRanges)
Error in validObject(.Object) :
  invalid class “RangedSummarizedExperiment” object: 1: invalid object for slot "rowRanges" in class "RangedSummarizedExperiment": got class "DFrame", should be or extend class "GenomicRanges_OR_GRangesList"
invalid class “RangedSummarizedExperiment” object: 2:
        'x@assays' is not parallel to 'x'
library(SummarizedExperiment)

nrows <- 200
ncols <- 6
counts <- matrix(runi

In [21]:
#run llama2 rag
resllama2 = []
result = []
for q, e in zip(questions,errors):
    result = rag_chain.invoke((q,"\n\n",e))
    resllama2.append(result)

resllama2

['The issue with your code is that you are trying to create a `RangedSummarizedExperiment` object, but you are missing some of the required fields. Here\'s how you can fix it:\n\n1. Add the `rowRanges` field to the `SummarizedExperiment` object:\n```R\ndata <- SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=colData)\n```\n2. Update the `rowRanges` field to include the `strand` and `feature_id` fields:\n```R\nrowRanges <- GRanges(rep(c("chr1", "chr2"), c(50, 150)),\n                     IRanges(floor(runif(200, 1e5, 1e6)), width=100),\n                     strand=sample(c("+", "-"), 200, TRUE),\n                     feature_id=sprintf("ID%03d", 1:200))\nnames(rowRanges) <- paste0("cg", 1:200)\n```\n3. Update the `colData` field to include the `Treatment` and `row.names` fields:\n```R\ncolData <- DataFrame(Treatment=rep(c("ChIP", "Input"), 5),\n                     row.names=paste0("S", 1:20),\n                     group=rep(c("group1", "group2"), c(10

In [24]:
df = pl.read_csv('llama2BiocCodeErrors.csv')
#s = pl.Series("Response_llama2_Bioc_RAG",answers_rag)
df = df.with_columns(pl.Series(name="Response_llama2_Bioc_RAG", values=resllama2)) 
df 


category,ground_truth,question,error,Response_llama2_temp0,Response_codellama_temp0,Response_llama2_Bioc_RAG
str,str,str,str,str,str,str
"""Swapped argume…","""library(Summar…","""library(Summar…","""Error in valid…",""" The error mes…",""" It looks like…","""The issue with…"
"""Inconsistent a…","""library(Summar…","""library(Summar…","""Error in valid…",""" The error mes…",""" The issue is …","""The code you p…"
"""Invalid input …","""library(Summar…","""library(Summar…","""Error in Summa…",""" The issue is …",""" The issue is …","""The issue with…"
"""Antipattern: a…","""library(Summar…","""library(Summar…",,""" The issue wit…",""" It looks like…","""The code you p…"
"""Antipattern: R…","""df <- data.fra…","""df <- data.fra…",,""" The issue wit…",""" The issue wit…","""The code you p…"
…,…,…,…,…,…,…
"""Accessing by r…","""library(scRNAs…","""library(scRNAs…","""Error in Summa…",""" The error mes…",""" The issue is …","""The issue with…"
"""Old install me…","""if (!require(""…","""source(""https:…","""Error: With R …","""  The error me…",""" The error mes…","""The code you p…"
"""Antipattern: U…","""m <- matrix(1:…","""m <- matrix(1:…","""Error in if (c…",""" The issue wit…",""" The issue wit…","""The issue with…"
"""Package not lo…","""library(Genomi…","""rowRanges <- G…","""Error in GRang…",""" The `GRanges`…",""" The issue is …","""The error mess…"


In [25]:
import pathlib

path: pathlib.Path = "llama2BiocCodeErrors.csv"
df.write_csv(path, separator=",")