# Data preparation

In [2]:
%load_ext autoreload
%autoreload 2

import os
import sys

# Make the parent of the current working directory importable so that
# project-local modules can be imported from this notebook.
# The path is normalised (no trailing "..") and inserted only once, so
# re-running this cell does not keep growing sys.path with duplicates.
src_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if src_path not in sys.path:
    sys.path.insert(0, src_path)

from dotenv import load_dotenv, find_dotenv
# Load settings from the local .env file. OPENAI_API_KEY is later read with
# os.getenv in OpenAIHandler, so presumably it is expected to come from here.
load_dotenv(find_dotenv()) # read local .env file

True

In [3]:
import pandas as pd
df = pd.read_csv("../data/articles.csv")

In [33]:
# Instruction header placed in front of every batch of articles sent to the LLM.
# It asks for a JSON list with one object per article, each holding the article
# number ("content") and the extracted entities ("entities").
# The pieces below are adjacent string literals that Python concatenates as-is,
# so every piece must end with its own separator (space or newline).
USER_PROMPT = (
    "I will give you batches of contents of articles. For each article, extract all named entities "
    "such as cryptocurrency, company, place, date, etc, into a list of objects with the properties "
    "entity_type and value. "
    "Please structure the answer in json format ready to be loaded by json.loads(). The answer should be a "
    "list of objects only with fields called content and entities, where entities is the list described "
    "previously. For the content field, copy the number of the content only! "
    "Please do not add any extra characters and make sure it is a list with objects in valid json format!\n"
)

class DataFormatter:
    """Assembles the text prompts that are sent to the LLM.

    All methods are classmethods; the class carries no state.
    """

    @classmethod
    def format_data(cls, data_points: list, is_example: bool, start_index: int) -> str:
        """Render data points as newline-terminated lines.

        Args:
            data_points: Items to render; each is converted with ``str``.
            is_example: When false, every item is preceded by a
                ``Content number <n>`` line.
            start_index: Number assigned to the first item; following items
                count upwards from it.

        Returns:
            The concatenated text, or an empty string for no data points.
        """
        pieces = []
        for offset, item in enumerate(data_points):
            if not is_example:
                pieces.append(f"Content number {start_index + offset }\n")
            pieces.append(str(item) + "\n")
        return "".join(pieces)

    @classmethod
    def format_batch(cls, context_msg: str, data_points: list, start_index: int) -> str:
        """Return ``context_msg`` followed by the numbered data points."""
        return context_msg + cls.format_data(data_points, False, start_index)

    @classmethod
    def format_prompt(cls, inference_posts: list, start_index: int):
        """Build the full user prompt for one batch of articles.

        The result is the shared instruction header, a line stating how many
        JSON objects are expected, and the numbered article contents.
        """
        expected_count_line = (
            f"You must generate exactly a list of {len(inference_posts)} json objects, "
            f"using the contents provided under CONTENTS FOR GENERATION\n"
        )
        contents_section = cls.format_batch(
            "\nCONTENTS FOR GENERATION: \n", inference_posts, start_index
        )
        return USER_PROMPT + expected_count_line + contents_section

In [34]:
from openai import OpenAI
import logging
import json
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Upper bound on the prompt size forwarded to the model. It is applied as a
# plain string slice, so it counts characters, not tokens.
MAX_LENGTH = 16384
# System message sent with every request.
SYSTEM_PROMPT = "You are an assistant whose job is to help crypto traders make informed decisions based on market sentiment"

class OpenAIHandler:
    """Sends prompts to the OpenAI chat-completions API and parses the JSON list in the reply.

    Failures never propagate to the caller: API errors and unparsable replies
    are logged and reported as an empty list, so one bad batch does not abort
    a long run.
    """

    def __init__(self, gpt_model: str = "gpt-3.5-turbo"):
        """Read the API key from the OPENAI_API_KEY environment variable.

        Args:
            gpt_model: Name of the chat model to query.
        """
        self.api_key = os.getenv("OPENAI_API_KEY")
        self.gpt_model = gpt_model

    def request(self, prompt: str) -> list:
        """Send one prompt and return the parsed list from the model's reply.

        Args:
            prompt: User message; only the first MAX_LENGTH characters are sent.

        Returns:
            The decoded JSON list, or ``[]`` when the API call fails or the
            reply does not contain a valid JSON list.
        """
        if len(prompt) > MAX_LENGTH:
            # Truncation can cut an article in half, so make it visible.
            logging.warning(
                "Prompt has %d characters; truncating to %d", len(prompt), MAX_LENGTH
            )

        try:
            client = OpenAI(api_key=self.api_key)
            logging.info("Sending batch to LLM")
            chat_completion = client.chat.completions.create(
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": prompt[:MAX_LENGTH]},
                ],
                model=self.gpt_model,
            )
            response = chat_completion.choices[0].message.content
        except Exception as e:
            # Deliberately broad: any transport/API failure skips this batch only.
            logging.error(f"Skipping batch! An error occurred while communicating with API: {e}")
            return []

        # The message content can be missing; treat that like an empty reply.
        payload = self.clean_response(response or "")
        if not payload:
            logging.error("Skipping batch! No JSON list found in the model response")
            return []

        try:
            return json.loads(payload)
        except json.JSONDecodeError as e:
            # Reported separately so parsing problems are not mistaken for API outages.
            logging.error(f"Skipping batch! The model response is not valid JSON: {e}")
            return []

    @staticmethod
    def clean_response(response: str) -> str:
        """Extract the outermost ``[...]`` span from a model reply.

        Returns:
            The text from the first ``[`` to the last ``]`` inclusive, or an
            empty string when no such well-ordered pair exists.
        """
        start_index = response.find("[")
        end_index = response.rfind("]")
        if start_index == -1 or end_index < start_index:
            return ""
        return response[start_index : end_index + 1]

In [35]:
class DatasetGenerator:
    """Runs entity extraction over a DataFrame of articles, batch by batch."""

    def __init__(
        self,
        api_communicator: "OpenAIHandler",
        data_formatter: "DataFormatter",
    ):
        """Store the collaborators.

        Args:
            api_communicator: Object whose ``request(prompt)`` returns a list
                of result objects (an empty list for a failed batch).
            data_formatter: Object whose ``format_prompt(batch, start_index)``
                returns the prompt for one batch.
        """
        self.api_communicator = api_communicator
        self.data_formatter = data_formatter

    def generate_training_data(self, df, batch_size: int = 1) -> list:
        """Extract entities for every row of ``df`` and attach the article text.

        Args:
            df: DataFrame with a ``content`` column holding the article text.
            batch_size: Number of articles sent per request; must be >= 1.

        Returns:
            All result objects in request order. For each batch the i-th
            result gets its ``content`` field replaced by the i-th article of
            that batch. Failed batches contribute no results.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

        response = []
        for i in range(0, len(df), batch_size):
            batch = df.iloc[i : i + batch_size]["content"].to_list()
            initial_prompt = self.data_formatter.format_prompt(batch, i)
            batch_response = self.api_communicator.request(initial_prompt)

            if len(batch_response) != len(batch):
                logging.warning(
                    "Batch starting at row %d: expected %d results, got %d",
                    i,
                    len(batch),
                    len(batch_response),
                )

            # Pair results with articles by position *within this batch*; zip
            # stops at the shorter side, so short or empty replies are safe.
            for article, item in zip(batch, batch_response):
                if isinstance(item, dict):
                    item["content"] = article

            response.extend(batch_response)

        return response

In [36]:
# Contents of the first three articles, as a plain Python list.
batch = df["content"].iloc[:3].to_list()


In [37]:
# Sanity check: build the prompt for the first article and inspect it by eye.
import textwrap

test = df["content"].iloc[0]
prompt = DataFormatter.format_prompt([test], start_index=0)

# Alternative view, wrapped at 150 columns:
# print(textwrap.fill(prompt, 150))
print(prompt)

I will give you batches of contents of articles. For each article, extract all named entities such as cryptocurrency, company, place, date, etc, into a list of objects with the properties entity_type and valuePlease structure the answer in json format ready to be loaded by json.loads(). The answer should be alist of objects only with fields called content and entities, where entities is the list described previously. For the content field, copy the number of the content only!.Please do not add any extra characters and make sure it is a list with objects in valid json format!
You must generate exactly a list of 1 json objects, using the contents provided under CONTENTS FOR GENERATION

CONTENTS FOR GENERATION: 
Content number 0
Bitcoin  BTC  $68,460  dipped 2% on May 28 after wallets labeled as belonging to the collapsed crypto exchange Mt. Gox moved 107,547 BTC worth nearly $7.3 billion to an unknown wallet — with more transactions coming in by the hour.   BTC   $68,460  The move is ahe

In [38]:
# Run entity extraction over every article, one LLM request per batch.
# A failed batch adds nothing: the handler returns an empty list for it.
response = []
batch_size = 1
handler = OpenAIHandler()  # built once; it is the same for every batch
for i in range(0, len(df), batch_size):
    batch = df.iloc[i : i + batch_size]["content"].to_list()
    initial_prompt = DataFormatter.format_prompt(batch, i)
    response += handler.request(initial_prompt)


2024-05-28 13:50:03,381 - INFO - Sending batch to LLM
2024-05-28 13:50:12,108 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-05-28 13:50:12,125 - INFO - Sending batch to LLM
2024-05-28 13:50:16,997 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-05-28 13:50:17,015 - INFO - Sending batch to LLM
2024-05-28 13:50:25,522 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-05-28 13:50:25,537 - INFO - Sending batch to LLM
2024-05-28 13:50:30,436 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-05-28 13:50:30,452 - INFO - Sending batch to LLM
2024-05-28 13:50:35,456 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-05-28 13:50:35,485 - INFO - Sending batch to LLM
2024-05-28 13:50:39,346 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK

In [39]:
# Preview only the first few results; the full list is too long to display.
response[:3]

[{'content': '0',
  'entities': [{'entity_type': 'cryptocurrency', 'value': 'Bitcoin'},
   {'entity_type': 'cryptocurrency', 'value': 'BTC'},
   {'entity_type': 'amount', 'value': '$68,460'},
   {'entity_type': 'company', 'value': 'Mt. Gox'},
   {'entity_type': 'amount', 'value': '107,547 BTC'},
   {'entity_type': 'amount', 'value': '$7.3 billion'},
   {'entity_type': 'date', 'value': 'May 28'},
   {'entity_type': 'company', 'value': 'Whale Alert'},
   {'entity_type': 'amount', 'value': '3,999 BTC'},
   {'entity_type': 'amount', 'value': '32,499 BTC'},
   {'entity_type': 'time', 'value': '1:41 am to 4:46 am UTC'},
   {'entity_type': 'amount', 'value': '2,000 BTC'},
   {'entity_type': 'amount', 'value': '$7.29 billion'},
   {'entity_type': 'company', 'value': 'Arkham Intelligence'},
   {'entity_type': 'company', 'value': 'Nagashima Ohno and Tsunematsu'},
   {'entity_type': 'amount', 'value': '$67,875'},
   {'entity_type': 'amount', 'value': '$69,374'},
   {'entity_type': 'company', 'val

In [10]:
# Try the project's QueryMetaExtractor on a single free-text question.
# NOTE(review): the output saved with this cell is an OutputParserException
# ("Could not parse function call: 'function_call'"), so the call fails as
# committed. The execution count (10) is also lower than the cells above,
# i.e. it was run out of order. Fix or remove before sharing the notebook.
from rag.retriever import QueryMetaExtractor

QueryMetaExtractor().generate_response("should i yolo in?")

OutputParserException: Could not parse function call: 'function_call'