In [1]:
import configparser
import json
import os
import pickle
import re
import time
from datetime import datetime, timedelta

import numpy as np
import openai
from tqdm import tqdm


def embed_text(text):
    """Return the embedding that the OpenAI API produces for ``text``.

    ``text`` is sent to ``openai.Embedding.create`` using the
    ``text-embedding-ada-002`` model; the value returned is the
    ``'embedding'`` entry of the first item in the response's ``"data"`` list.
    Any exception raised by the client propagates to the caller.
    """
    response = openai.Embedding.create(
        model="text-embedding-ada-002",
        input=text,
    )
    first_item = response["data"][0]
    return first_item['embedding']
    

def make_random_date(
        start: datetime = datetime(2000, 1, 1),
        end: datetime = datetime(2050, 12, 31)):
    """Pick a random moment in ``[start, end)`` with one-second resolution.

    The draw uses NumPy's global random state (``np.random.randint``), so
    seeding ``np.random`` makes the result reproducible.

    Args:
        start: Inclusive lower bound of the range.
        end: Exclusive upper bound of the range.

    Returns:
        A 3-tuple ``(dt, long_str, short_str)``:
        ``dt`` is the sampled ``datetime``, ``long_str`` is ``dt`` formatted
        as ``"%A, %B %d, %Y"`` and ``short_str`` is ``dt`` formatted as
        ``"%Y-%m-%d"``.

    Raises:
        ValueError: If ``end`` is not later than ``start`` (raised by
            ``np.random.randint`` for an empty range).
    """
    # randint works on integers; timestamp() returns a float, so reduce the
    # bounds to whole seconds explicitly instead of relying on implicit casts.
    low_ts = int(start.timestamp())
    high_ts = int(end.timestamp())
    # The default upper bound (year 2050) is ~2.56e9 seconds, which does not
    # fit in int32. Request int64 so the draw also works where NumPy's default
    # integer is 32-bit.
    random_ts = int(np.random.randint(low_ts, high_ts, dtype=np.int64))
    dt = datetime.fromtimestamp(random_ts)
    return dt, dt.strftime("%A, %B %d, %Y"), dt.strftime("%Y-%m-%d")


def generate_dates(prompts, n=20):
    """Build date-prefixed prompts with the date each prompt refers to.

    For every pass (``n`` passes in total) and every entry in ``prompts``, a
    fresh random "today" is drawn with ``make_random_date`` and prepended to
    the prompt text.

    Args:
        prompts: Iterable of mappings with two keys:
            ``"prompt"`` - the prompt text, and
            ``"td"`` - the offset into the past that the prompt refers to.
            ``"td"`` is either the empty string (the prompt refers to no
            date) or ``-<N>d`` / ``-<N>w`` / ``-<N>mo`` for N days, weeks or
            months ago. A month is approximated as 30 days.
        n: Number of passes over ``prompts``.

    Returns:
        A list of ``(date_str, new_prompt, label)`` tuples, where
        ``date_str`` is the third value returned by ``make_random_date``,
        ``new_prompt`` is ``"Today is <date>. <prompt>"`` and ``label`` is the
        ``(year, month, day)`` of the referenced date, or ``(-1, -1, -1)``
        when ``"td"`` is empty.

    Raises:
        AssertionError: If a ``"td"`` value does not match the expected
            format; the offending prompt is attached to the error.
    """
    times = {"w": "weeks",
             "d": "days"}
    # Either the empty string, or "-<digits>" followed by d / w / mo.
    pat = re.compile(r"(?:|\-(\d+)([dw]|mo))$")

    sample = []
    for _ in range(n):
        for prompt in prompts:
            m = pat.match(prompt["td"])
            assert m, prompt
            mult, tm = m.group(1), m.group(2)
            if mult is None:
                # Empty "td": the prompt does not point at any date.
                td = None
            elif tm == "mo":
                # Calendar months vary in length; 30 days is an approximation.
                td = timedelta(days=30 * int(mult))
            else:
                td = timedelta(**{times[tm]: int(mult)})

            curr_date, curr_date_str, curr_date_str2 = make_random_date()
            # Compare against None explicitly: a zero offset ("-0d") yields
            # timedelta(0), which is falsy but still refers to a real date
            # (today) and must not be labelled as "no date".
            if td is not None:
                after_date = curr_date - td
                label = (after_date.year, after_date.month, after_date.day)
            else:
                label = (-1, -1, -1)

            new_prompt = f"Today is {curr_date_str}. {prompt['prompt']}"
            sample.append((curr_date_str2, new_prompt, label))
    return sample


def make_samples(dates):
    """Lazily attach an embedding to every generated prompt.

    Args:
        dates: Iterable of ``(date_str, prompt, label)`` tuples, as produced
            by ``generate_dates``.

    Yields:
        ``(date_str, prompt, label, emb)`` where ``emb`` is the result of
        ``embed_text(prompt)``.

    A failed embedding call is retried indefinitely with a one-second pause
    between attempts. Only ``Exception`` subclasses are retried, so
    ``KeyboardInterrupt`` / ``SystemExit`` still stop the loop (e.g. when the
    kernel is interrupted).
    """
    for date_str, prompt, label in tqdm(dates):
        while True:
            try:
                emb = embed_text(prompt)
                break
            except Exception:
                # Best-effort retry; note that a permanent failure
                # (for example a bad API key) will keep retrying forever.
                time.sleep(1)
        yield (date_str, prompt, label, emb)


Make sure your OpenAI API token is set in the config file `config.ini`, as `OPENAI_TOKEN` under the `[MAIN]` section.

In [2]:
# Initial prompts, you can add more if you feel like it
PROMPTS_PATH = "prompts.json"

# Configuration. Only the OPENAI_TOKEN is required
CONFIG_PATH = "config.ini"

# Where to save the generated samples
SAVE_PATH = "full_samples2.pkl"

# How many iterations through the prompts in PROMPTS_PATH to make
N_ITER = 250

config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so check that list to fail with a clear message
# instead of an opaque KeyError on the next line.
if not config.read(CONFIG_PATH):
    raise FileNotFoundError(
        f"Could not read config file {CONFIG_PATH!r}; "
        "it must define OPENAI_TOKEN in a [MAIN] section."
    )
openai.api_key = config["MAIN"]["OPENAI_TOKEN"]

# JSON text is UTF-8; do not depend on the platform's default encoding.
with open(PROMPTS_PATH, "r", encoding="utf-8") as f:
    prompts = json.load(f)

Generate training data

In [3]:
def _save_checkpoint(samples, path):
    """Pickle ``samples`` to ``path`` without ever leaving a partial file.

    The data is written to ``<path>.tmp`` first and then moved over ``path``
    with ``os.replace``, so an interruption in the middle of a dump cannot
    corrupt the previous checkpoint.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "wb") as f:
        pickle.dump(samples, f)
    os.replace(tmp_path, path)


full_dates = generate_dates(prompts, n=N_ITER)
# make_samples is a generator: embeddings are requested one at a time as the
# loop below consumes it.
full_samples = make_samples(full_dates)

samples = []
for i, sample in enumerate(full_samples):
    samples.append(sample)
    # Periodic checkpoint so a failure late in the run does not lose
    # everything collected so far.
    if i % 100 == 0:
        _save_checkpoint(samples, SAVE_PATH)

# Final save with the complete list.
_save_checkpoint(samples, SAVE_PATH)


  1%|          | 35/6500 [00:18<41:48,  2.58it/s]  