## Setup

N.B. Based on an [OpenAI example notebook](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_finetune_chat_models.ipynb). [gptchem](https://github.com/kjappelbaum/gptchem) uses an outdated version of the OpenAI API.

In [7]:

from typing import Union, List, Dict
import pathlib as pl
import json
import os
from datetime import datetime

import openai
import pandas as pd
import numpy as np

from ai4chem.tokenizers import gpt_num_tokens_from_messages
from ai4chem.data import Deep4ChemDataset, ChemFluorDataset, split_on_unique_smiles

with open('/Users/azadoks/.zshrc.d/10_openai_bot.sh', 'r') as f:
    api_key = f.read().split('=')[1].strip()

client = openai.OpenAI(api_key=api_key)

## Data preparation

In [8]:
def get_system_message(property_name: str) -> str:
    return f"You predict properties of molecules with high accuracy to assist chemists. You are to predict the {property_name} maximum wavelength in nanometers of the provided molecule when dissolved in the provided solvent."

def get_user_message(row, representation: Union[str, List[str]]="smiles") -> str:
    def _get_molecule_line(row, representation: str) -> str:
        return f"Molecule {representation}: {row[f'chromophore_{representation}']}"
    def _get_solvent_line(row, representation: str) -> str:
        return f"Solvent {representation}: {row[f'solvent_{representation}']}"

    if isinstance(representation, str):
        molecule_lines = _get_molecule_line(row, representation)
        solvent_lines = _get_solvent_line(row, representation)
    else:
        molecule_lines = "\n".join([_get_molecule_line(row, rep) for rep in representation])
        solvent_lines = "\n".join([_get_solvent_line(row, rep) for rep in representation])

    return f'{molecule_lines}\n\n{solvent_lines}'

def get_train_conversation(row, property_name: str, representation: Union[str, List[str]]="smiles") -> Dict:
    messages = []
    messages.append({'role': 'system', 'content': get_system_message(property_name)})
    messages.append({'role': 'user', 'content': get_user_message(row, representation)})
    property_key = f'{property_name.lower()}_max'
    messages.append({'role': 'assistant', 'content': f'{int(row[property_key]):3d} nm'})
    return {'messages': messages}

def get_test_conversation(row, property_name: str, representation: Union[str, List[str]]="smiles") -> Dict:
    messages = []
    messages.append({'role': 'system', 'content': get_system_message(property_name)})
    messages.append({'role': 'user', 'content': get_user_message(row, representation)})
    property_key = f'{property_name.lower()}_max'
    return {'messages': messages}, f'{int(row[property_key]):3d} nm'

def get_prompt(row, property_name: str, representation: Union[str, List[str]]="smiles") -> str:
    return f"What is the {property_name} maximum wavelength of {row[f'chromophore_{representation}']} dissolved in {row[f'solvent_{representation}']}?"

def get_completion(row, property_name: str) -> str:
    property_key = f'{property_name.lower()}_max'
    return f' {int(row[property_key]):3d} nm'

def get_train_prompt_completion(row, property_name: str, representation: Union[str, List[str]]="smiles") -> Dict:
    return {
        'prompt': get_prompt(row, property_name, representation),
        'completion': get_completion(row, property_name)
    }

def get_inverse_prompt(row, property_name: str, representation: str="smiles") -> str:
    property_key = f'{property_name.lower()}_max'
    return f"What is a chromophore + solvent pair with an {property_name} maximum wavelength of {int(row[property_key]):3d} nm?"

def get_inverse_completion(row, representation: str="smiles") -> str:
    return f"Molecule {representation}: {row[f'chromophore_{representation}']}$$$ Solvent {representation}: {row[f'solvent_{representation}']}$$$"

def get_inverse_prompt_completion(row, property_name: str, representation: str="smiles") -> Dict:
    return {
        'prompt': get_inverse_prompt(row, property_name, representation),
        'completion': get_inverse_completion(row, representation)
    }

def write_jsonl(data: list, filename: os.PathLike) -> None:
    with open(filename, 'w') as f:
        for item in data:
            f.write(json.dumps(item) + '\n')

def count_tokens(data: List[Dict]) -> int:
    return sum(
        gpt_num_tokens_from_messages(row['messages'], model="gpt-3.5-turbo-0613")
        for row in data
    )

def make_completion_conversations(data: pd.DataFrame, property_name: str, direction: str, representation: Union[str, List[str]]="smiles") -> List[Dict]:
    if direction == 'forward':
        return data.apply(lambda x: get_train_prompt_completion(x, property_name, representation), axis=1)
    if direction == 'inverse':
        return data.apply(lambda x: get_inverse_prompt_completion(x, property_name, representation), axis=1)
    raise ValueError(f"Invalid direction: {direction}")

In [9]:
chemfluor_path = '../data/chemfluor/data.csv'
deep4chem_path = '../data/deep4chem/data.csv'

chemfluor = ChemFluorDataset(chemfluor_path, canonicalize_smiles=True)
deep4chem = Deep4ChemDataset(deep4chem_path, canonicalize_smiles=True)

combined_df = pd.concat([chemfluor.clean_data, deep4chem.clean_data], ignore_index=True).reset_index(drop=True)
n_unique_smiles = len(combined_df['chromophore_smiles'].unique())

In [10]:
from rdkit import Chem

gpt_gen = [
    {
        'chromophore_smiles': 'CC1=CC(=O)C2=C(O1)C=CC=C2',
        'solvent_smiles': 'CC#N'
    },
    {
        'chromophore_smiles': 'CC1=CC=C(C=C1)[N+](=O)[O-]',
        'solvent_smiles': 'ClCCl'
    },
    {
        'chromophore_smiles': 'C1=CC(=CC=C1C=CC2=CC=CC=C2)C=O',
        'solvent_smiles': 'C1CCOC1'
    },
    {
        'chromophore_smiles': 'CCN(CC)C(=O)C1=CC=CC2=C1C=CC=C2',
        'solvent_smiles': 'CCOCC'
    },
    {
        'chromophore_smiles': 'CC1=CC(=O)C=C2C1C=CC(=CC2=O)C#N',
        'solvent_smiles': 'OCC'
    },
]

for (i, pair) in enumerate(gpt_gen):
    gpt_gen[i] = {k: Chem.MolToSmiles(Chem.MolFromSmiles(v)) for k, v in pair.items()}

for pair in gpt_gen:
    for (k, v) in pair.items():
        print(f'{k}: {v} {v in combined_df[k].values}')
    print()

chromophore_smiles: Cc1cc(=O)c2ccccc2o1 False
solvent_smiles: CC#N True

chromophore_smiles: Cc1ccc([N+](=O)[O-])cc1 False
solvent_smiles: ClCCl True

chromophore_smiles: O=Cc1ccc(C=Cc2ccccc2)cc1 False
solvent_smiles: C1CCOC1 True

chromophore_smiles: CCN(CC)C(=O)c1cccc2ccccc12 False
solvent_smiles: CCOCC True

chromophore_smiles: CC1=CC(=O)C=C2C(=O)C=C(C#N)C=CC12 False
solvent_smiles: CCO True



In [18]:
# n_trains = (10, 50, 100, 500, 1000)
n_trains = (1000, )
seed = 9997
directions = ('forward', 'inverse')
property_names = ('emission', 'absorption')
representation = 'smiles'
model_name = 'babbage-002'

for n_train in n_trains:
    splits = (n_train, 0, n_unique_smiles - n_train)
    train_df, val_df, test_df = split_on_unique_smiles(combined_df, splits, seed=seed)
    print(n_train, train_df.shape, val_df.shape, test_df.shape)
    for property_name in property_names:
        for direction in directions:
            # experiment_name = f'{n_train:d}-{representation.lower()[:3]}-{property_name[:3].lower()}-{direction[:1].lower()}{model_name[:1].lower()}{datetime.now().strftime("%d%H%M")}'
            experiment_name = f'{n_train:d}{property_name[:1].lower()}{direction[:1].lower()}{model_name[:1].lower()}{datetime.now().strftime("%d%H%M")}'
            experiment_data_path = pl.Path(f'../data/{experiment_name}')
            experiment_data_path.mkdir(exist_ok=True)

            train_convos = make_completion_conversations(train_df, property_name, direction, representation)
            val_convos = make_completion_conversations(val_df, property_name, direction, representation)
            test_convos = make_completion_conversations(test_df, property_name, direction, representation)

            train_filename = str(experiment_data_path / "train.jsonl")
            write_jsonl(train_convos, train_filename)
            if len(val_convos) > 0:
                val_filename = str(experiment_data_path / "validate.jsonl")
                write_jsonl(val_convos, val_filename)
            test_filename = str(experiment_data_path / "test.jsonl")
            write_jsonl(test_convos, test_filename)

            with open(train_filename, "rb") as train_fd:
                training_response = client.files.create(
                    file=train_fd, purpose="fine-tune"
                )
            train_file_id = training_response.id
            print("Training file ID:", train_file_id)

            if len(val_convos) > 0:
                with open(val_filename, "rb") as val_fd:
                    val_response = client.files.create(
                        file=val_fd, purpose="fine-tune"
                    )
                val_file_id = val_response.id
                print("Validation file ID:", val_file_id)
            else:
                val_file_id = openai.NOT_GIVEN

            response = client.fine_tuning.jobs.create(
                training_file=train_file_id,
                validation_file=val_file_id,
                model="babbage-002",
                suffix=experiment_name,
            )

            job_id = response.id

            print("Job ID:", response.id)
            print("Status:", response.status)

            with open(experiment_data_path / "job_id.txt", "w") as f:
                f.write(job_id)

1000 (2460, 4) (0, 4) (16899, 4)
Training file ID: file-UxONBSdgKf5SefPiUZnwk5s7
Job ID: ftjob-MxXjF6V2LFHxrt6Wvuo3TuDS
Status: validating_files
Training file ID: file-7l0p7LkuWxApdJkR7bZ69OmE
Job ID: ftjob-NWjQY5JkLod5upQuXVINcaaZ
Status: validating_files
Training file ID: file-G4yh7TXVGKhKVRQJcMSsQotb
Job ID: ftjob-f6VT5IebaJQCO6ZyayXaCT1x
Status: validating_files
Training file ID: file-WcDg8mzKb3L6w7HcTk0Fa1Vc


RateLimitError: Error code: 429 - {'error': {'message': "This fine-tune request has been rate-limited. Your organization has reached the maximum of 3 active requests (0 running, 3 pending) for the model 'babbage-002'.", 'type': 'invalid_request_error', 'param': None, 'code': 'rate_limit_exceeded'}}

In [None]:
direction = 'inverse'
property_name = "absorption"
representation = "SMILES"
model_name = "babbage-002"

experiment_name = f'{representation.lower()[:3]}-{property_name[:3].lower()}-{direction[:1].lower()}{model_name[:1].lower()}{datetime.now().strftime("%d%H%M")}'

if direction == 'forward':
    if 'gpt' in model_name:
        train_data = train_df.apply(lambda row: get_train_conversation(row, property_name, representation), axis=1).tolist()
        validate_data = validate_df.apply(lambda row: get_train_conversation(row, property_name, representation), axis=1).tolist()
        test_data = test_df.apply(lambda row: get_train_conversation(row, property_name, representation), axis=1).tolist()
    elif 'babbage' or 'davinci' in model_name:
        train_data = train_df.apply(lambda row: get_train_prompt_completion(row, property_name, representation), axis=1).tolist()
        validate_data = validate_df.apply(lambda row: get_train_prompt_completion(row, property_name, representation), axis=1).tolist()
        test_data = test_df.apply(lambda row: get_train_prompt_completion(row, property_name, representation), axis=1).tolist()
elif direction == 'inverse':
    if 'babbage' or 'davinci' in model_name:
        train_data = train_df.apply(lambda row: get_inverse_prompt_completion(row, property_name, representation), axis=1).tolist()
        validate_data = validate_df.apply(lambda row: get_inverse_prompt_completion(row, property_name, representation), axis=1).tolist()
        test_data = test_df.apply(lambda row: get_inverse_prompt_completion(row, property_name, representation), axis=1).tolist()

print(experiment_name)
for conversation in train_data[:5]:
    print(conversation)

In [None]:
experiment_data_path = pl.Path(f'../data/{experiment_name}')
experiment_data_path.mkdir(exist_ok=True)

train_filename = str(experiment_data_path / "train.jsonl")
write_jsonl(train_data, train_filename)

validate_filename = str(experiment_data_path / "validate.jsonl")
write_jsonl(validate_data, validate_filename)

test_filename = str(experiment_data_path / "test.jsonl")
write_jsonl(test_data, test_filename)

!head -n 5 $train_filename
!wc -l $train_filename
!wc -l $validate_filename
!wc -l $test_filename

In [None]:
with open(train_filename, "rb") as train_fd:
    training_response = client.files.create(
        file=train_fd, purpose="fine-tune"
    )

train_file_id = training_response.id

with open(validate_filename, "rb") as validate_fd:
    validate_response = client.files.create(
        file=validate_fd, purpose="fine-tune"
    )
validate_file_id = validate_response.id

print(experiment_name)
print("Training file ID:", train_file_id)
print("Validation file ID:", validate_file_id)

In [None]:
response = client.fine_tuning.jobs.create(
    training_file=train_file_id,
    validation_file=validate_file_id,
    model="babbage-002",
    suffix=experiment_name,
)

job_id = response.id

print("Job ID:", response.id)
print("Status:", response.status)

with open(experiment_data_path / "job_id.txt", "w") as f:
    f.write(job_id)