In [1]:
import os
from sklearn.metrics import confusion_matrix, accuracy_score
from openai import OpenAI
import pandas as pd
import numpy as np
import json

from prepare_plasticc import config
from tokenizer import LCTokenizer

In [2]:
api_key = os.environ.get('OPENAI_API_KEY', None)

In [3]:
client = OpenAI(api_key=api_key)

In [4]:
df_meta = pd.read_csv('plasticc/plasticc_train_metadata.csv.gz')
df_meta = df_meta.sample(frac=1, random_state=42)

df = pd.read_csv('plasticc/plasticc_train_lightcurves.csv.gz')

In [5]:
train_size= int(0.8 * len(df_meta))
val_size = int(0.1 * len(df_meta))

# Tokens 

In [53]:
tokenizer = LCTokenizer(-10000, 10000, 500, 1000, 500, bands=config["bands"],
                            transform=np.arcsinh, inverse_transform=np.sinh,
                            min_sn=3, window_size=1)

In [54]:
system = 'You are an assistant that can analyse tokens that encode the flux and time intervals between observations in different passbands of the LSST. You can also use the photometric redshift. \
Use this data to infer the class of the object. Answers should be 6, 15, 16, 42, 52, 53, 62, 64, 65, 67, 88, 90, 92, 95.'

train_messages = []
val_messages = []
for i, row in df_meta.iterrows():
    df_object = df.loc[(df["object_id"] == row["object_id"]), :]
    tokens = tokenizer.encode(df_object)
    message = []
    message.append({
        'role': 'system',
        'content': system
    })
    message.append({
        'role': 'user',
        'content': 'The photometric redshift is %s. The tokens are %s' % (row['hostgal_photoz'], json.dumps(tokens))  
    })
    message.append({
        'role': 'assistant',
        'content': json.dumps(int(row['true_target']))
    })
    if i <= train_size:
        train_messages.append({'messages': message})
    elif train_size < i <= train_size + val_size:
        val_messages.append({'messages':message})

In [55]:
with open("chatgpt/train_data_tokens.jsonl", 'w') as f:
    for message in train_messages:
        f.write(json.dumps(message) + "\n")
        
with open("chatgpt/val_data_tokens.jsonl", 'w') as f:
    for message in val_messages:
        f.write(json.dumps(message) + "\n")

In [56]:
y_pred, y_true = [], []
for i, message in enumerate(val_messages[0:100]):
    print(i)
    resp = client.chat.completions.create(
                    #model='ft:gpt-3.5-turbo-0125:personal:plasticc4:9GwzYWLy', # 100
                    model='ft:gpt-3.5-turbo-0125:personal:plasticc:9GoMgdwp', # 500
                    messages=message['messages'][:2],
                    temperature=0,
    )
    y_pred.append(int(resp.choices[0].message.content))
    y_true.append(int(message['messages'][2]['content']))

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [57]:
confusion_matrix(y_true, y_pred)

array([[10,  0,  0,  0,  1,  0,  1,  0,  0,  1,  0,  0],
       [ 0, 13,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 3,  0,  8,  0,  3,  0,  0,  0,  0,  1,  0,  0],
       [ 0,  0,  0,  0,  1,  0,  0,  0,  0,  2,  0,  0],
       [ 0,  0,  1,  0,  3,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0],
       [ 0,  2,  0,  0,  0,  0,  9,  0,  1,  0,  0,  0],
       [ 0,  0,  0,  0,  2,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0],
       [ 1,  1,  8,  0,  4,  0,  2,  0,  0, 11,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  3,  0],
       [ 1,  0,  2,  0,  1,  0,  1,  0,  0,  0,  0,  1]])

In [58]:
accuracy_score(y_true, y_pred)

0.59

In [48]:
val_messages[0]

{'messages': [{'role': 'system',
   'content': 'You are an assistant that can analyse tokens that encode the flux and time intervals between observations in different passbands of the LSST. You can also use the photometric redshift. Use this data to infer the class of the object. Answers should be 6, 15, 16, 42, 52, 53, 62, 64, 65, 67, 88, 90, 92, 95.'},
  {'role': 'user',
   'content': 'The photometric redshift is 0.424. The tokens are [93, 678, 48, 274, 12, 274, 35, 578, 45, 372, 12, 475, 58, 570, 19, 471, 48, 366, 28, 366, 43, 468]'},
  {'role': 'assistant', 'content': '90'}]}

# Fluxes

In [12]:
system = 'You are an assistant that can analyse time, passband and flux observations from the LSST.  You can also use the photometric redshift.\
Use them to infer the class of the object. Answers should be 6, 15, 16, 42, 52, 53, 62, 64, 65, 67, 88, 90, 92, 95.'

In [13]:
train_messages = []
val_messages = []

for i, row in df_meta.iterrows():
    df_object = df.loc[(df["object_id"] == row["object_id"]) & (df["detected_bool"] == 1), :].reset_index(drop=True)
    df_object.loc[:, 'mjd'] = df_object.loc[:, 'mjd'] - min(df_object['mjd'])
    df_object.loc[:, 'flux'] = round(np.arcsinh(df_object.loc[:, 'flux']), 1)
    df_object.loc[:, 'mjd'] = round(df_object.loc[:, 'mjd'], 1)
    obs = df_object[['mjd', 'passband', 'flux']].to_json(orient="split", index=False)
    message = []
    message.append({
        'role': 'system',
        'content': system
    })
    message.append({
        'role': 'user',
        'content': 'The photometric redshift is %s. The measurements are %s' % (row['hostgal_photoz'], obs) 
    })
    message.append({
        'role': 'assistant',
        'content': json.dumps(int(row['true_target']))
    })
    if i <= train_size:
        train_messages.append({'messages': message})
    elif train_size < i <= train_size + val_size:
        val_messages.append({'messages':message})

In [14]:
train_messages[1]

{'messages': [{'role': 'system',
   'content': 'You are an assistant that can analyse time, passband and flux observations from the LSST.  You can also use the photometric redshift.Use them to infer the class of the object. Answers should be 6, 15, 16, 42, 52, 53, 62, 64, 65, 67, 88, 90, 92, 95.'},
  {'role': 'user',
   'content': 'The photometric redshift is 0.0. The measurements are {"columns":["mjd","passband","flux"],"data":[[0.0,3,-8.7],[77.7,4,-8.2],[353.8,2,-6.3],[396.0,3,-8.4],[458.8,2,-9.4]]}'},
  {'role': 'assistant', 'content': '16'}]}

In [15]:
with open("chatgpt/train_data_fluxes.jsonl", 'w') as f:
    for message in train_messages:
        f.write(json.dumps(message) + "\n")
with open("chatgpt/val_data_fluxes.jsonl", 'w') as f:
    for message in val_messages:
        f.write(json.dumps(message) + "\n")

In [16]:
y_pred, y_true = [], []
for i, message in enumerate(val_messages[0:10]):
    print(i)
    resp = client.chat.completions.create(
                    model='ft:gpt-3.5-turbo-0125:personal:plasticc2:9GuAZ5lr',
                    messages=message['messages'][:2],
                    temperature=0,
    )
    y_pred.append(int(resp.choices[0].message.content))
    y_true.append(int(message['messages'][2]['content']))

0
1
2
3
4
5
6
7
8
9


In [17]:
confusion_matrix(y_true, y_pred)

array([[0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 1, 1, 0],
       [0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 3, 1, 0],
       [0, 0, 0, 0, 0, 1, 0]])

In [18]:
accuracy_score(y_true, y_pred)

0.1