In [3]:
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    max_error,
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
    r2_score,
)

from gptchem.gpt_regressor import GPTRegressor
from gptchem.tuner import Tuner


In [11]:

def split_dictionary(input_dict, chunk_size):
    """Split a dictionary into consecutive chunks of at most ``chunk_size`` items.

    Items are taken in the dictionary's iteration order. Every returned chunk
    is a new, non-empty ``dict``; only the last chunk may hold fewer than
    ``chunk_size`` items.

    Args:
        input_dict: Mapping to split (anything exposing ``.items()``).
        chunk_size: Maximum number of items per chunk; must be positive.

    Returns:
        A list of dictionaries that together contain every item of
        ``input_dict`` exactly once. An empty input yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        # A non-positive size can never be filled and would emit empty chunks.
        raise ValueError(f"chunk_size must be positive, got {chunk_size!r}")

    chunks = []
    current = {}
    for key, value in input_dict.items():
        if len(current) >= chunk_size:
            # Current chunk is full: store it and start a fresh one.
            chunks.append(current)
            current = {}
        current[key] = value

    # Flush the trailing chunk only if it holds data, so that an empty input
    # produces [] rather than [{}].
    if current:
        chunks.append(current)
    return chunks


In [18]:
# Load the pickled QM9 mapping from disk.
# NOTE(review): later cells treat the keys as SMILES strings and the values as
# energies; confirm against how the pickle was produced.
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load pickles from a trusted source.
with open(file="qm9_key_smiles_0_val_u0_atom_b3lyp.pickle", mode="rb") as pickle_file:
    qm9_data = pickle.load(pickle_file)

# Report how many entries were loaded.
print(len(qm9_data))

# Partition the mapping into chunks of at most 10 entries each.
dicts_list = split_dictionary(input_dict=qm9_data, chunk_size=10)


130258


In [None]:
# Fit one GPTRegressor per chunk of `dicts_list`.
# Every name assigned in this loop (regressor, train_*/test_* splits, ...) is
# rebound on each iteration, so once the loop ends only the objects from the
# last completed iteration remain available to later cells.
# NOTE(review): the cell output shows a training file being uploaded during
# `fit`; presumably each iteration starts its own remote fine-tuning job --
# confirm the expected runtime/cost before looping over every chunk.
for i, mydict in enumerate(dicts_list):

    # Progress log: chunk index and number of entries in this chunk.
    print(f"fitting dict {i}")
    print(len(mydict))
    # Convert the keys and values of the dictionary into separate lists
    smiles_list = list(mydict.keys())
    energies_list = list(mydict.values())

    # Split the dataset into train (90%) and test (10%) sets
    # random_state is fixed so the split is reproducible across runs.
    # NOTE(review): dicts_list is built with chunks of 10 entries, so a 10%
    # test split presumably leaves a single test sample per chunk -- confirm
    # this is intended.
    train_smiles, test_smiles, train_energies, test_energies = train_test_split(
        smiles_list, energies_list, test_size=0.1, random_state=42
    )

    # A new regressor (with a new Tuner) is constructed for every chunk.
    regressor = GPTRegressor(
        property_name="atomization energy in kcal/mol", # this is the property name we will use in the prompt template
        tuner=Tuner(n_epochs=8, learning_rate_multiplier=0.02, wandb_sync=False),
    )
    # Fit the regressor with the train set
    regressor.fit(train_smiles, train_energies)
    
    
    
    
    
    
    
    

fitting dict 0
10


Upload progress: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.19k/1.19k [00:00<00:00, 990kit/s]


Uploaded file from /afs/cs.wisc.edu/u/g/m/gmerz2/public/hackathon/chemgpt_qm9/out/20230329_164907/train.jsonl: file-lvDnAsHdMpVyHLEl4iQhhGWl


In [7]:


# Make predictions using the test set.
# `regressor`, `test_smiles` and `test_energies` are whatever the training
# loop last assigned, i.e. they belong to the most recently fitted chunk.
y_pred = regressor.predict(test_smiles)
y_true = test_energies

# Compute the MSE once and derive the RMSE from it. The `squared=False`
# keyword of mean_squared_error is deprecated in scikit-learn 1.4 and removed
# in 1.6, so taking the square root explicitly works on every version.
mse = mean_squared_error(y_true, y_pred)

# Calculate the regression metrics between predictions and test_energies.
# NOTE(review): r2_score is not well-defined with fewer than two samples; if
# the test split holds a single molecule, expect NaN and a warning for "r2".
regression_metrics = {
    "r2": r2_score(y_true, y_pred),
    "max_error": max_error(y_true, y_pred),
    "mean_absolute_error": mean_absolute_error(y_true, y_pred),
    "mean_squared_error": mse,
    "rmse": mse ** 0.5,
    "mean_absolute_percentage_error": mean_absolute_percentage_error(y_true, y_pred),
}

print(regression_metrics)

Upload progress: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18.8M/18.8M [00:00<00:00, 17.9Git/s]


KeyboardInterrupt: 