In [3]:
!pip install crewai==0.28.8 crewai_tools==0.1.6 langchain_community==0.0.29



In [4]:
!pip install --quiet bitsandbytes
!pip install --quiet --upgrade accelerate
!pip install --quiet sentencepiece
# !pip install flash-attn --no-build-isolation


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m342.1/342.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h

In [5]:
!pip uninstall bitsandbytes -y
!pip install bitsandbytes==0.45.0


Found existing installation: bitsandbytes 0.45.3
Uninstalling bitsandbytes-0.45.3:
  Successfully uninstalled bitsandbytes-0.45.3
Collecting bitsandbytes==0.45.0
  Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl (69.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.1/69.1 MB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.0


In [10]:
import os
import torch
import json
import logging
from torch import bfloat16
from crewai import Agent, Task, Crew
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_community.llms import HuggingFacePipeline
from langchain_core.tools import StructuredTool
from langchain_core.pydantic_v1 import BaseModel, Field
from IPython.display import Markdown
import re

In [11]:
# Configure the root logger at DEBUG level so the logging.info / logging.warning
# calls in the helper functions below are emitted.
# NOTE(review): DEBUG is very verbose — consider INFO once debugging is finished.
logging.basicConfig(level=logging.DEBUG)


In [9]:
# LLaMA chat checkpoint used for parameter extraction
# (an 8-bit bitsandbytes build, judging by the repository name).
model_id_llama = "alokabhishek/Llama-2-7b-chat-hf-bnb-8bit"
tokenizer_llama = AutoTokenizer.from_pretrained(
    model_id_llama,
    use_fast=True,
)
model_llama = AutoModelForCausalLM.from_pretrained(
    model_id_llama,
    device_map="auto",
)
# Bundle model and tokenizer into a text-generation pipeline, then wrap it
# so it can be handed to the CrewAI agents as `llm`.
pipe_llama = pipeline(
    "text-generation",
    model=model_llama,
    tokenizer=tokenizer_llama,
)
llm = HuggingFacePipeline(pipeline=pipe_llama)


tokenizer_config.json:   0%|          | 0.00/1.76k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


model.safetensors.index.json:   0%|          | 0.00/41.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.02G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/183 [00:00<?, ?B/s]

Device set to use cuda:0


In [12]:
# Load ProtGPT2 model for protein sequence generation.
# Builds a text-generation pipeline from the "nferruz/ProtGPT2" checkpoint;
# generate_sequence() calls this pipeline with the extracted parameters.
protgpt2_pipeline = pipeline('text-generation', model="nferruz/ProtGPT2")


config.json:   0%|          | 0.00/850 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/655k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/314k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

Device set to use cuda:0


In [26]:
# Define Parameter Extraction Schema
class ParameterExtractionInput(BaseModel):
    # Input schema with a single required string field holding the user's prompt.
    # NOTE(review): not referenced by any other visible cell — confirm it is still needed.
    user_prompt: str

In [27]:
# Validated set of generation parameters. Every field has a default, so
# ParameterExtractionOutput() with no arguments yields a complete parameter set;
# extract_parameters() relies on that as its fallback when parsing fails.
class ParameterExtractionOutput(BaseModel):
    # Passed to the ProtGPT2 pipeline as the text to continue.
    context_sequence: str = Field(default="MKVSA", description="User-provided starting sequence")
    # The remaining fields are forwarded as keyword arguments of the same name.
    max_length: int = Field(default=100, description="Max sequence length")
    do_sample: bool = Field(default=True, description="Enable sampling")
    top_k: int = Field(default=950, description="Top-k filtering")
    repetition_penalty: float = Field(default=1.2, description="Repetition penalty")
    num_return_sequences: int = Field(default=10, description="Number of sequences to generate")
    eos_token_id: int = Field(default=0, description="End-of-sequence token ID")


In [28]:
def extract_parameters(user_prompt: str) -> dict:
    """Extract structured ProtGPT2 generation parameters from a free-text request.

    The LLaMA pipeline is prompted to answer with a JSON object. Only the newly
    generated text is inspected: the instruction prompt itself contains a JSON
    template, so searching the full text (prompt + completion) would match the
    template instead of the model's answer and always fall back to defaults.

    Args:
        user_prompt: The user's request, e.g. "generate 5 sequences starting
            with MKVSA, max length 210".

    Returns:
        A dict with the fields of ``ParameterExtractionOutput``. Fields missing
        from the model's answer take the schema defaults. If no JSON object in
        the answer validates, the full set of defaults is returned and a
        warning is logged.
    """
    system_prompt = (
        "Extract the following parameters from the given user prompt and return them strictly in valid JSON format:\n"
        "{\n"
        '  "context_sequence": "string",\n'
        '  "max_length": "integer",\n'
        '  "do_sample": "boolean",\n'
        '  "top_k": "integer",\n'
        '  "repetition_penalty": "float",\n'
        '  "num_return_sequences": "integer",\n'
        '  "eos_token_id": "integer"\n'
        "}"
    )
    input_text = f"{system_prompt}\nUser Prompt: {user_prompt}"

    # return_full_text=False asks the pipeline for the completion only.
    response = pipe_llama(input_text, return_full_text=False)[0]["generated_text"]
    # Defensive: drop the prompt if it was echoed back anyway.
    if response.startswith(input_text):
        response = response[len(input_text):]

    # Try every JSON object in the answer; the model may repeat the type
    # template (which fails validation) before giving the real values.
    last_error = None
    for candidate in _iter_json_objects(response):
        try:
            return ParameterExtractionOutput.parse_obj(candidate).dict()
        except Exception as e:  # best effort: any validation problem means "try the next one"
            last_error = e

    if last_error is None:
        last_error = "no JSON object found"
    logging.warning(f"Error parsing LLM response: {last_error}. Using default values.")
    return ParameterExtractionOutput().dict()


def _iter_json_objects(text: str):
    """Yield each top-level JSON object (as a dict) that can be decoded from ``text``."""
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            value, end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            # Not valid JSON from this brace; move on to the next one.
            start = text.find("{", start + 1)
            continue
        if isinstance(value, dict):
            yield value
        # Resume after the decoded value so nested objects are not re-yielded.
        start = text.find("{", end)

In [29]:
# Define Sequence Generation Schema
class SequenceGenerationInput(BaseModel):
    # Same field names and types as ParameterExtractionOutput, but no field has a
    # default here, so every value must be supplied.
    # NOTE(review): not referenced by any other visible cell — confirm it is still needed.
    context_sequence: str = Field(description="Starting protein sequence")
    max_length: int = Field(description="Max sequence length")
    do_sample: bool = Field(description="Enable sampling")
    top_k: int = Field(description="Top-k filtering")
    repetition_penalty: float = Field(description="Repetition penalty")
    num_return_sequences: int = Field(description="Number of sequences to generate")
    eos_token_id: int = Field(description="End-of-sequence token ID")


In [30]:
def generate_sequence(params: dict) -> list:
    """Run the ProtGPT2 pipeline with the given settings and return the generated texts.

    Args:
        params: Mapping that must contain "context_sequence" (the text to
            continue) plus "max_length", "do_sample", "top_k",
            "repetition_penalty", "num_return_sequences" and "eos_token_id",
            which are forwarded to the pipeline as keyword arguments.

    Returns:
        The "generated_text" entry of every result returned by the pipeline.

    Raises:
        KeyError: If any of the required keys is missing from ``params``.
    """
    logging.info(f"Generating sequences with parameters: {params}")

    prompt = params["context_sequence"]
    option_names = (
        "max_length",
        "do_sample",
        "top_k",
        "repetition_penalty",
        "num_return_sequences",
        "eos_token_id",
    )
    generation_options = {option: params[option] for option in option_names}

    outputs = protgpt2_pipeline(prompt, **generation_options)

    generated_texts = []
    for item in outputs:
        generated_texts.append(item["generated_text"])
    return generated_texts


In [32]:
# Define Agents
# Agent responsible for turning the user's request into model parameters.
# Uses the shared LLaMA-backed `llm`; delegation is switched off.
param_extraction_agent = Agent(
    llm=llm,
    verbose=True,
    allow_delegation=False,
    role="Parameter Extraction Specialist",
    goal="Extract structured parameters for protein generation",
    backstory="Expert in translating user intent into precise model parameters.",
)

# Agent responsible for producing the protein sequences.
# Same `llm`, same settings; only the persona differs.
generation_agent = Agent(
    llm=llm,
    verbose=True,
    allow_delegation=False,
    role="Protein Sequence Generator",
    goal="Generate protein sequences based on structured parameters.",
    backstory="A specialist in protein language modeling, capable of generating realistic sequences.",
)


In [33]:
# Define Tools
from crewai_tools import BaseTool

class ParameterExtractionTool(BaseTool):
    # CrewAI tool wrapper around extract_parameters().
    # `name` and `description` are the tool's declared metadata.
    name: str = "Parameter Extraction Tool"
    description: str = "Extracts structured parameters for ProtGPT2 from a user prompt."

    def _run(self, user_prompt: str) -> dict:
        """Return the parameter dict produced by extract_parameters(user_prompt)."""
        return extract_parameters(user_prompt)

class SequenceGenerationTool(BaseTool):
    # CrewAI tool wrapper around generate_sequence().
    # `name` and `description` are the tool's declared metadata.
    name: str = "Sequence Generation Tool"
    description: str = "Generates protein sequences based on extracted parameters."

    def _run(self, params: dict) -> list:
        """Return the list of sequences produced by generate_sequence(params).

        ``params`` must carry the keys generate_sequence() reads
        (context_sequence, max_length, do_sample, top_k, repetition_penalty,
        num_return_sequences, eos_token_id).
        """
        return generate_sequence(params)

In [34]:
# One instance of each tool; these are the objects passed to the tasks via `tools=[...]`.
extract_tool = ParameterExtractionTool()
generate_tool = SequenceGenerationTool()


In [35]:
# Define Tasks
# First task: turn the user's request into a structured parameter set.
# The `{user_prompt}` placeholder is filled from the `inputs` dict passed to
# crew.kickoff(); without it the user's request never reaches the agent.
param_extraction_task = Task(
    description=(
        "Extracts necessary parameters from the user input.\n"
        "User input: {user_prompt}"
    ),
    expected_output="A structured parameter list for sequence generation.",
    tools=[extract_tool],
    agent=param_extraction_agent,
    verbose=True
)

# Second task: produce protein sequences with the sequence-generation tool,
# carried out by the generation agent.
generation_task = Task(
    agent=generation_agent,
    tools=[generate_tool],
    description="Generates protein sequences based on extracted parameters.",
    expected_output="A list of generated protein sequences.",
    verbose=True,
)


In [43]:
from crewai import Process  # Ensure Process is imported
# Initialize Crew
# Assemble both agents and both tasks (in extraction -> generation order) into one crew.
# NOTE(review): the tools are attached to the tasks, not to the agents — confirm they
# are still reachable when the manager delegates under the hierarchical process.
crew = Crew(
    agents=[param_extraction_agent, generation_agent],
    tasks=[param_extraction_task, generation_task],
    verbose=2,
    process=Process.hierarchical,  # a manager coordinates the agents instead of running tasks strictly in sequence
    manager_llm=llm,  # the manager uses the same local LLaMA-backed LLM as the agents
)

In [44]:
# User Input
# Blocks until the user types a request; the text is later passed to crew.kickoff() as `user_prompt`.
# NOTE(review): interactive input makes "Restart & Run All" non-reproducible — consider a constant for unattended runs.
user_input = input("Enter your protein generation request (including a starting sequence if desired): ")


Enter your protein generation request (including a starting sequence if desired):  generate 5 protein sequences starting with MKVSA and with maximum length 210 


In [None]:
# Run the Full Workflow
# The request is supplied under the key "user_prompt"; it only reaches the agents
# where a task or agent text contains a matching `{user_prompt}` placeholder.
result = crew.kickoff(inputs={"user_prompt": user_input})


[1m[92m [DEBUG]: Working Agent: Crew Manager[00m
[1m[92m [INFO]: Starting Task: Extracts necessary parameters from the user input.[00m


[1m> Entering new CrewAgentExecutor chain...[0m
[32;1m[1;3m[0m[32;1m[1;3m[0m[32;1m[1;3m[0m[32;1m[1;3m[0m

In [46]:
# Manual check of extract_parameters() that bypasses the crew entirely.
# The request asks for max length 210 and top_k 899; if the printed dict shows the
# schema defaults (max_length 100, top_k 950) instead, the fallback path was taken.
user_test_input = "Generate a protein sequence starting with MKVSA, max length 210, top_k 899."

# Call parameter extraction manually
extracted_params = extract_parameters(user_test_input)

print(extracted_params)


{'context_sequence': 'MKVSA', 'max_length': 100, 'do_sample': True, 'top_k': 950, 'repetition_penalty': 1.2, 'num_return_sequences': 10, 'eos_token_id': 0}
