# LLMs

In [1]:
pip install -q python-dotenv openpyxl

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd

In [3]:
# Read the raw spreadsheet that sits next to the notebook into a DataFrame.
df = pd.read_excel(io='Raw Data.xlsx')

In [4]:
from dotenv import load_dotenv
import os
# Read key.env, then look the two API keys up in the process environment.
# os.getenv returns None for a variable that is not defined.
load_dotenv(dotenv_path='key.env')
OAI_KEY = os.getenv('OPENAI_API_KEY')
GROQ_KEY = os.getenv('GROQ_API_KEY')

In [5]:
import openai
from openai import OpenAI
# get client

# Normalise the answer so inputs such as "Paid" or " paid " are recognised;
# anything other than "paid" keeps selecting the free backend, as before.
choice = input('Willing to use paid or free?').strip().lower()

if choice == 'paid':
    # paid, get your key and add in key.env file
    if not OAI_KEY:
        raise ValueError("OPENAI_API_KEY is missing or empty; add it to key.env")
    client = OpenAI(api_key=OAI_KEY)
else:
    # free, get your free key and add in key.env file
    # create your api key https://console.groq.com/keys
    if not GROQ_KEY:
        # Fail early: with api_key=None the openai client falls back to the
        # OPENAI_API_KEY environment variable, which would send the OpenAI
        # key to the Groq endpoint.
        raise ValueError("GROQ_API_KEY is missing or empty; add it to key.env")
    client = OpenAI(
        base_url="https://api.groq.com/openai/v1",
        api_key=GROQ_KEY,
    )

Willing to use paid or free? free


# Few-shot inference using LLMs

In [6]:
def get_predicted_linker(heavy_chain,light_chain,payload,model=None):
    """Ask the chat model for a linker SMILES using a few-shot prompt.

    The request goes through the module-level ``client``. The system message
    holds the task description and three worked examples; the user message
    carries the new heavy chain, light chain and payload.

    Args:
        heavy_chain: Heavy chain sequence, interpolated into the user message.
        light_chain: Light chain sequence, interpolated into the user message.
        payload: Payload SMILES, interpolated into the user message.
        model: Optional model id. When None it is chosen from the client's
            base URL: 'llama-3.1-8b-instant' for the Groq endpoint, otherwise
            'gpt-4o-mini'.

    Returns:
        The text of the first completion choice with surrounding whitespace
        removed (returned unchanged when the content is empty or None).
    """
    if model is None:
        # List of models
        # Free:- https://console.groq.com/docs/models
        # Paid:- https://openai.com/api/pricing/
        # The two backends do not share model ids, so pick a default that
        # matches whichever endpoint the client was created for.
        uses_groq = "groq" in str(getattr(client, "base_url", ""))
        model = 'llama-3.1-8b-instant' if uses_groq else 'gpt-4o-mini'

    system_prompt = """
            You are an AI assistant in Medical Domain specifically in Antibody-drug conjugate. Your task is to predict or suggest the Linker SMILES based on sequences such as Heavy Chain Sequences, Light Chain Sequences, and Payload SMILES.
            
            Here are some examples:
            
            Example 1:
            Heavy Chain Sequences: QVQLVQSGAEVKKPGSSVKVSCKASGGTFSNYWMHWVRQAPGQGLEWMGATYRGHSDTYYNQKFKGRVTITADKSTSTAYMELSSLRSEDTAVYYCARGAIYDGYDVLDNWGQGTLVTVSSASTKGPSVFPLAPSSKSTSGGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSSLGTQTYICNVNHKPSNTKVDKKVEPKSCDKTHTCPPCPAPELLGGPSVFLFPPKPKDTLMISRTPEVTCVVVDVSHEDPEVKFNWYVDGVEVHNAKTKPREEQYNSTYRVVSVLTVLHQDWLNGKEYKCKVSNKALPAPIEKTISKAKGQPREPQVYTLPPSRDELTKNQVSLTCLVKGFYPSDIAVEWESNGQPENNYKTTPPVLDSDGSFFLYSKLTVDKSRWQQGNVFSCSVMHEALHNHYTQKSLSLSPGK
            Light Chain Sequences: DIQMTQSPSSLSASVGDRVTITCSASQDISNYLNWYQQKPGKAPKLLIYYTSNLHSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQYRKLPWTFGQGTKLEIKRTVAAPSVFIFPPSDEQLKSGTASVVCLLNNFYPREAKVQWKVDNALQSGNSQESVTEQDSKDSTYSLSSTLTLSKADYEKHKVYACEVTHQGLSSPVTKSFNRGEC
            Payload SMILES: CC[C@H](C)[C@@H]([C@@H](CC(=O)N1CCC[C@H]1[C@@H]([C@@H](C)C(=O)N[C@@H](CC2=CC=CC=C2)C(=O)O)OC)OC)N(C)C(=O)[C@H](C(C)C)NC(=O)[C@H](C(C)C)NC
            Linker SMILES: O=C(O)CCCCCN1C(=O)C=CC1=O
            
            Example 2:
            Heavy Chain Sequences: EVQLVESGGGLVQPGGSLRLSCAASGYTFSSYWIEWVRQAPGKGLEWIGEILPGGGDTNYNEIFKGRATFSADTSKNTAYLQMNSLRAEDTAVYYCTRRVPIRLDYWGQGTLVTVSSASTKGPSVFPLAPSSKSTSGGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSSLGTQTYICNVNHKPSNTKVDKKVEPKSCDKTHTCPPCPAPELLGGPSVFLFPPKPKDTLMISRTPEVTCVVVDVSHEDPEVKFNWYVDGVEVHNAKTKPREEQYNSTYRVVSVLTVLHQDWLNGKEYKCKVSNKALPAPIEKTISKAKGQPREPQVYTLPPSREEMTKNQVSLTCLVKGFYPSDIAVEWESNGQPENNYKTTPPVLDSDGSFFLYSKLTVDKSRWQQGNVFSCSVMHEALHNHYTQKSLSLSPGK
            Light Chain Sequences: DIQLTQSPSSLSASVGDRVTITCKASQSVDYEGDSFLNWYQQKPGKAPKLLIYAASNLESGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQSNEDPLTFGQGTKVEIKRTVAAPSVFIFPPSDEQLKSGTASVVCLLNNFYPREAKVQWKVDNALQSGNSQESVTEQDSKDSTYSLSSTLTLSKADYEKHKVYACEVTHQGLSSPVTKSFNRGEC
            Payload SMILES: CC[C@H](C)[C@@H]([C@@H](CC(=O)N1CCC[C@H]1[C@@H]([C@@H](C)C(=O)N[C@H](C)[C@H](C2=CC=CC=C2)O)OC)OC)N(C)C(=O)[C@H](C(C)C)NC(=O)[C@H](C(C)C)NC
            Linker SMILES: CC(C)[C@@H](C(=O)N[C@@H](CCCNC(=O)N)C(=O)NC1=CC=C(C=C1)CO)NC(=O)CCCCCN2C(=O)C=CC2=O
    
            Example 3:
            Heavy Chain Sequences: EVQLLESGGGLVQPGGSLRLSCAASGFTFSNYAMSWVRQAPGKGLEWVSSISGSGDYTYYTDSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCARSPWGYYLDSWGQGTLVTVSSASTKGPSVFPLAPSSKSTSGGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSSLGTQTYICNVNHKPSNTKVDKRVEPKSCDKTHTCPPCPAPELLGGPSVFLFPPKPKDTLMISRTPEVTCVVVDVSHEDPEVKFNWYVDGVEVHNAKTKPREEQYNSTYRVVSVLTVLHQDWLNGKEYKCKVSNKALPAPIEKTISKAKGQPREPQVYTLPPSREEMTKNQVSLTCLVKGFYPSDIAVEWESNGQPENNYKTTPPVLDSDGSFFLYSKLTVDKSRWQQGNVFSCSVMHEALHNHYTQKSLSLSPGK
            Light Chain Sequences: DIQMTQSPPSLSASAGDRVTITCRASQGISSRLAWYQQKPEKAPKSLIYAASSLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQYNSYPYTFGQGTKLEIKRTVAAPSVFIFPPSDEQLKSGTASVVCLLNNFYPREAKVQWKVDNALQSGNSQESVTEQDSKDSTYSLSSTLTLSKADYEKHKVYACEVTHQGLSSPVTKSFNRGEC
            Payload SMILES: CC[C@H](C)[C@@H]([C@@H](CC(=O)N1CCC[C@H]1[C@@H]([C@@H](C)C(=O)N[C@H](C)[C@H](C2=CC=CC=C2)O)OC)OC)N(C)C(=O)[C@H](C(C)C)NC(=O)[C@H](C(C)C)NC
            Linker SMILES: CC(C)[C@@H](C(=O)N[C@@H](CCCNC(=O)N)C(=O)NC1=CC=C(C=C1)CO)NC(=O)CCCCCN2C(=O)C=CC2=O
            
            When given new sequences, respond with only the predicted Linker SMILES and nothing else.
            """
    user_prompt = f"""
            Heavy Chain Sequences: {heavy_chain}
            Light Chain Sequences: {light_chain}
            Payload SMILES: {payload}
            """

    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    )

    content = completion.choices[0].message.content
    # The prompt asks for the bare SMILES, so drop any stray whitespace or
    # newlines the model wraps around it.
    return content.strip() if content else content

In [7]:
# Look up one record by its index label and ask the model for its linker.
row_num = 10
heavy, light, payload = (
    df.loc[row_num, column]
    for column in ('Heavy Chain Sequences', 'Light Chain Sequences', 'Payload SMILES')
)

get_predicted_linker(heavy, light, payload)

'CC(C)[C@H](C(=O)N[C@@H](CCCNC(=O)N)C(=O)NC1=CC=C(C=C1)CO)NC(=O)CCCCCN2C(=O)C=CC2=C'

In [8]:
# Same lookup as the previous cell, for a different record.
row_num = 12
heavy, light, payload = (
    df.loc[row_num, column]
    for column in ('Heavy Chain Sequences', 'Light Chain Sequences', 'Payload SMILES')
)

get_predicted_linker(heavy, light, payload)

'CC(C)[C@@H](C(=O)N[C@H](CCC1=C2C=CC(=C3)S(C=O)N(C3)C(=O)CC2)[C@@H]2CCC(=O)NC(C([C@H](C)O)O)O)NC(=O)CCCCCN4C(=O)C=CC4=O'