In [1]:
from typing import Union, List, Dict
import pathlib as pl
import json
import os
from datetime import datetime

import openai
import tiktoken
import pandas as pd
from rdkit import Chem
from sklearn.model_selection import train_test_split

# Location of a local shell snippet that holds the API key.
# NOTE(review): presumably a line of the form `export NAME=<key>` — confirm the format.
_API_KEY_FILE = '/Users/azadoks/.zshrc.d/10_openai_bot.sh'

if os.path.exists(_API_KEY_FILE):
    with open(_API_KEY_FILE, 'r') as f:
        # The key is the text between the first and second '=' in the file.
        api_key = f.read().split('=')[1].strip()
else:
    # The snippet only exists on one machine; everywhere else fall back to the
    # conventional environment variable instead of failing on the missing file.
    api_key = os.environ.get('OPENAI_API_KEY')
    if not api_key:
        raise RuntimeError(
            f'No OpenAI API key found: {_API_KEY_FILE} does not exist and '
            'the OPENAI_API_KEY environment variable is not set.'
        )

# Shared API client used by the cells below.
client = openai.OpenAI(api_key=api_key)

In [58]:
def load_jsonl(filename: os.PathLike) -> List[dict]:
    """Read a JSON Lines file and return its records in file order.

    Each non-blank line is parsed independently with ``json.loads``. Blank or
    whitespace-only lines (such as a trailing empty line) are skipped rather
    than raising a ``JSONDecodeError``.

    Args:
        filename: Path of the ``.jsonl`` file to read.

    Returns:
        A list with one parsed object per non-blank line; empty if the file
        contains no records.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    # JSON text is UTF-8 by specification, so do not depend on the locale default.
    with open(filename, 'r', encoding='utf-8') as f:
        # Iterate the handle directly so the whole file is never held as a list of lines.
        for line in f:
            line = line.strip()
            if line:
                data.append(json.loads(line))
    return data

def predict(job_id: str, prompts) -> List:
    """Run every prompt through the model produced by a fine-tuning job.

    The job is looked up once to resolve the fine-tuned model id, then one
    completion request is issued per prompt with ``temperature=0`` and
    generation stopped at the string ``'nm'``.

    Args:
        job_id: Identifier of the fine-tuning job whose model should be used.
        prompts: Iterable of prompts. Each item is either the prompt text or
            a dict whose ``'prompt'`` entry holds the text.

    Returns:
        A list with one completion response (converted with ``to_dict()``)
        per prompt, in the same order as ``prompts``.

    Raises:
        ValueError: If the job has no fine-tuned model yet, which the API
            reports as ``None`` until the job has finished successfully.
    """
    job = client.fine_tuning.jobs.retrieve(job_id)
    fine_tuned_model_id = job.fine_tuned_model
    if fine_tuned_model_id is None:
        # Fail before issuing any request instead of sending model=None to the API.
        raise ValueError(
            f'Fine-tuning job {job_id!r} has no fine-tuned model yet; '
            'it may still be running or may have failed.'
        )

    responses = []
    for prompt in prompts:
        if isinstance(prompt, dict):
            prompt = prompt['prompt']
        completion = client.completions.create(
            model=fine_tuned_model_id,
            prompt=prompt,
            stop='nm',
            temperature=0
        )
        responses.append(completion.to_dict())

    return responses

In [59]:
# Experiments to evaluate; each name is also the experiment's directory under ../data/.
experiment_names = ['sel-abs-b231212', 'sel-emi-b231204', 'smi-abs-b231214', 'smi-emi-b231214']

# Maps each experiment name to its fine-tuning job id and its three JSONL data splits.
experiment_data = {}
for experiment_name in experiment_names:
    experiment_dir = pl.Path(f'../data/{experiment_name}/')
    # job_id.txt holds the fine-tuning job id; surrounding whitespace is dropped.
    job_id = (experiment_dir / 'job_id.txt').read_text().strip()
    # The splits are read in the order train, validate, test.
    train_data, validate_data, test_data = [
        load_jsonl(experiment_dir / split_file)
        for split_file in ('train.jsonl', 'validate.jsonl', 'test.jsonl')
    ]
    experiment_data[experiment_name] = {
        'job_id': job_id,
        'train_data': train_data,
        'validate_data': validate_data,
        'test_data': test_data,
    }

In [60]:
def get_molecule(prompt: str) -> str:
    """Extract the token that follows the word 'molecule' in a prompt.

    The prompt is split on the literal substring ``'molecule'`` and the first
    whitespace-delimited token of the piece after its first occurrence is
    returned.

    Raises:
        IndexError: If ``'molecule'`` does not occur in ``prompt`` or no
            token follows it.
    """
    return prompt.split('molecule')[1].split(maxsplit=1)[0]

def get_solvent(prompt: str) -> str:
    """Extract the token that follows the phrase 'dissolved in' in a prompt.

    The prompt is split on the literal substring ``'dissolved in'`` and the
    first whitespace-delimited token of the piece after its first occurrence
    is returned.

    Raises:
        IndexError: If ``'dissolved in'`` does not occur in ``prompt`` or no
            token follows it.
    """
    return prompt.split('dissolved in')[1].split(maxsplit=1)[0]

In [63]:
# Number of test records (taken from the start of each test split) to evaluate.
N = 500

# Per-experiment evaluation results, keyed by experiment name.
results = {}
for experiment_name, experiment in experiment_data.items():
    print(experiment_name)
    results[experiment_name] = {}
    data = experiment['test_data'][:N]
    prompts = [d['prompt'] for d in data]
    completions = predict(experiment['job_id'], prompts)

    # The reference value is the first token of the stored completion; the prediction is
    # the whole returned text, both parsed as integers.
    results[experiment_name]['y'] = [int(d['completion'].strip().split()[0]) for d in data]
    results[experiment_name]['completions'] = completions
    results[experiment_name]['yhat'] = [int(c['choices'][0]['text'].strip()) for c in completions]
    results[experiment_name]['molecules'] = [get_molecule(d['prompt']) for d in data]
    results[experiment_name]['solvents'] = [get_solvent(d['prompt']) for d in data]

    results_filename = pl.Path(f'../data/{experiment_name}/results.json')
    if results_filename.exists():
        # Prepend previously saved results so the file accumulates across runs.
        # NOTE(review): every run evaluates the same first N test records, so merging
        # appends repeat entries for those records — confirm this is intended.
        with open(results_filename, 'r') as f:
            old_results = json.load(f)
        for key in results[experiment_name].keys():
            results[experiment_name][key] = old_results[key] + results[experiment_name][key]

        # Keep the previous file as a timestamped backup. This only makes sense (and only
        # succeeds) when a previous file exists, so it belongs inside this branch.
        results_filename.rename(results_filename.with_suffix(f'.{datetime.now().isoformat()}.json'))

    with open(results_filename, 'w') as f:
        json.dump(results[experiment_name], f)

sel-abs-b231212
sel-emi-b231204
smi-abs-b231214
smi-emi-b231214


In [None]:
# Reference (y) and predicted (yhat) integer values, kept in matching order.
y, yhat = [], []
for row_label, row in validate_df.head(500).iterrows():
    messages, expected = get_validate_converation(row, property_name, representation)
    response = client.chat.completions.create(
        temperature=0,
        max_tokens=12,
        messages=messages['messages'],
        model=fine_tuned_model_id,
    )
    content = response.choices[0].message.content
    print(expected, content)
    # Only the first whitespace-delimited token of each string is parsed as an integer.
    expected_token = expected.strip().split()[0]
    y.append(int(expected_token))
    predicted_token = content.strip().split()[0]
    yhat.append(int(predicted_token))

# Persist both lists together once all rows have been processed.
with open('smi-emis-gpt-3.5-turbo-0125-5k.json', 'w') as f:
    json.dump({'y': y, 'yhat': yhat}, f)

In [None]:
import matplotlib.pyplot as plt
fig, ax = plt.subplots(dpi=300, figsize=(6,5))

# NOTE(review): `colors` is computed but not passed to the scatter call below, which uses
# a fixed colour; `cs_srgb` and `wavelength_to_spectrum` must be defined elsewhere — confirm
# whether this list is still needed.
colors = [
    cs_srgb.spec_to_rgb(wavelength_to_spectrum(wavelength))
    for wavelength in yhat
]
# Dashed diagonal marks where predicted equals true.
ax.plot([-2000, 2000], [-2000, 2000], linestyle='--', color='tab:gray', zorder=0)
ax.scatter(y, yhat, c='tab:blue', marker='.')
ax.set_xlabel('True')
ax.set_ylabel('Predicted')
# Raw string: '\l' and '\m' are not valid Python escape sequences, so a plain literal
# triggers an invalid-escape warning; the resulting text is unchanged.
ax.set_title(r'$\lambda_{\mathrm{max}}$ (nm)')
ax.set_aspect('equal')
ax.set_xlim(-50, 1050)
ax.set_ylim(-50, 1050)

fig.savefig('smi-emis-gpt-3.5-turbo-0125-5k.png')