# Data preparation

In [5]:
%load_ext autoreload
%autoreload 2

from qdrant_client import QdrantClient, models
import os
import sys

# Do this to enable importing modules
src_path = os.path.join(os.path.abspath(""), "..")
sys.path.insert(0, src_path)

from feature_pipeline.settings import settings

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
CLEAN_COLLECTION = False

_client = QdrantClient(
    host=settings.QDRANT_DATABASE_HOST,
    port=settings.QDRANT_DATABASE_PORT,
)

_client.delete(
    collection_name="cleaned_articles",
    points_selector=models.FilterSelector(
        filter=models.Filter(
            must=[
                models.FieldCondition(
                    key="source",
                    match=models.MatchValue(value="news_api"),
                ),
            ],
        )
    ),
)

if CLEAN_COLLECTION:
    _client.delete_collection(collection_name="cleaned_articles")

In [7]:
from feature_pipeline.db.qdrant import connection as client
    
def fetch_all_cleaned_content(collection_name: str) -> list:
        all_cleaned_contents = []

        scroll_response = client.scroll(collection_name=collection_name, limit=10000)
        points = scroll_response[0]

        for point in points:
            # cleaned_content = point.payload["cleaned_content"]
            cleaned_content = point.payload
            if cleaned_content:
                all_cleaned_contents.append(cleaned_content)

        return all_cleaned_contents

In [8]:
articles = fetch_all_cleaned_content("cleaned_articles")
len(articles)

9

In [31]:
import textwrap

i = 2

# for article in articles:
#     wrapped_string = textwrap.fill(article["cleaned_content"], width=100)
#     print("\n")
#     print(article["source"])
#     print(wrapped_string)

In [18]:
USER_PROMPT = (
    f"I will give you batches of contents of articles. Please generate me exactly 1 instruction for each of them. The article "
    f"for which you have to generate the instructions is under Content number x lines. Please structure the answer in json format,"
    f"ready to be loaded by json.loads(), a list of objects only with fields called instruction and content. For the content field, copy the number of the content only!."
    f"Please do not add any extra characters and make sure it is a list with objects in valid json format!\n"
)


class DataFormatter:
    @classmethod
    def format_data(cls, data_points: list, is_example: bool, start_index: int) -> str:
        text = ""
        for index, data_point in enumerate(data_points):
            if not is_example:
                text += f"Content number {start_index + index }\n"
            text += str(data_point) + "\n"
        return text

    @classmethod
    def format_batch(cls, context_msg: str, data_points: list, start_index: int) -> str:
        delimiter_msg = context_msg
        delimiter_msg += cls.format_data(data_points, False, start_index)
        return delimiter_msg

    @classmethod
    def format_prompt(cls, inference_posts: list, start_index: int):
        initial_prompt = USER_PROMPT
        initial_prompt += f"You must generate exactly a list of {len(inference_posts)} json objects, using the contents provided under CONTENTS FOR GENERATION\n"
        initial_prompt += cls.format_batch(
            "\nCONTENTS FOR GENERATION: \n", inference_posts, start_index
        )
        return initial_prompt

In [10]:
from openai import OpenAI
import logging
import json
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

MAX_LENGTH = 16384
SYSTEM_PROMPT = "You are a news writer whose job is reporting about the cryptocurrency market trends and developments"

class OpenAIHandler:
    def __init__(self, gpt_model: str = "gpt-3.5-turbo"):
        self.api_key = settings.OPENAI_API_KEY
        self.gpt_model = gpt_model

    def request(self, prompt: str) -> list:
            try:
                client = OpenAI(api_key=self.api_key)
                logging.info("Sending batch to LLM")
                chat_completion = client.chat.completions.create(
                    messages=[
                        {"role": "system", "content": SYSTEM_PROMPT},
                        {"role": "user", "content": prompt[:MAX_LENGTH]},
                    ],
                    model=self.gpt_model,
                )
                response = chat_completion.choices[0].message.content
                return json.loads(self.clean_response(response))
            except Exception as e:
                logging.error(f"Skipping batch! An error occurred while communicating with API: {e}")
                return []

    @staticmethod
    def clean_response(response: str) -> str:
        start_index = response.find("[")
        end_index = response.rfind("]")
        return response[start_index : end_index + 1]

In [15]:
OpenAIHandler().clean_response("some text")

''

In [14]:
OpenAIHandler().request("Bitcoin crash 2024")

2024-05-23 12:57:14,567 - INFO - Sending batch to LLM
2024-05-23 12:57:17,614 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-05-23 12:59:14,350 - ERROR - Skipping batch! An error occurred while communicating with API: Expecting value: line 1 column 1 (char 0)


[]

In [33]:
class DatasetGenerator:
    def __init__(
        self,
        api_communicator: OpenAIHandler,
        data_formatter: DataFormatter,
    ):
        self.api_communicator = api_communicator
        self.data_formatter = data_formatter

    def generate_training_data(self, collection_name: str, batch_size: int = 1):
        all_contents = self.fetch_all_cleaned_content(collection_name)
        response = []
        for i in range(0, len(all_contents), batch_size):
            batch = all_contents[i : i + batch_size]
            initial_prompt = self.data_formatter.format_prompt(batch, i)
            response += self.api_communicator.request(initial_prompt)
            for j in range(i, i + batch_size):
                response[j]["content"] = all_contents[j]

        # self.push_to_comet(response, collection_name)
        return response
    
    def fetch_all_cleaned_content(self, collection_name: str) -> list:
        all_cleaned_contents = []

        scroll_response = client.scroll(collection_name=collection_name, limit=10000)
        points = scroll_response[0]

        for point in points:
            cleaned_content = point.payload["cleaned_content"]
            # cleaned_content = point.payload
            if cleaned_content:
                all_cleaned_contents.append(cleaned_content)

        return all_cleaned_contents

In [37]:
articles[0]["cleaned_content"]

'Hong Kongs Office of the Privacy Commissioner for Personal Data (PCPD) has concluded its inquiry into the Worldcoin project, determining that its operations in Hong Kong violated its Personal Data (Privacy) Ordinance (PDPO). In a May 22 notice, Privacy Commissioner Ada Chung Lai-lingissued an enforcement notice to Worldcoin, ordering the immediate halt of all project operations in Hong Kong that involve scanning and collecting iris and facial images of the public using iris scanning devices. The PCPD started its investigationagainst the Worldcoin project in January 2024 to determine whether the identity verification methods posed serious risks to citizens personal data privacy and violated the requirements of the PDPO. The PCPD conducted 10 covert visits at six premises involved in operating the Worldcoin project from December 2023 to January 2024. According to the PCPD, collecting face images was unnecessary for verifying the humanness of participants, as the iris scanning device ope

In [47]:
# check prompt
prompt = DataFormatter.format_prompt([articles[0]["cleaned_content"]], 0)
import textwrap
# print(textwrap.fill(prompt, 150))
print(prompt)

I will give you batches of contents of articles. Please generate me exactly 1 instruction for each of them. The article for which you have to generate the instructions is under Content number x lines. Please structure the answer in json format,ready to be loaded by json.loads(), a list of objects only with fields called instruction and content. For the content field, copy the number of the content only!.Please do not add any extra characters and make sure it is a list with objects in valid json format!
You must generate exactly a list of 1 json objects, using the contents provided under CONTENTS FOR GENERATION

CONTENTS FOR GENERATION: 
Content number 0
Hong Kongs Office of the Privacy Commissioner for Personal Data (PCPD) has concluded its inquiry into the Worldcoin project, determining that its operations in Hong Kong violated its Personal Data (Privacy) Ordinance (PDPO). In a May 22 notice, Privacy Commissioner Ada Chung Lai-lingissued an enforcement notice to Worldcoin, ordering th

In [48]:
response = DatasetGenerator(
    api_communicator=OpenAIHandler(),
    data_formatter=DataFormatter()    
).generate_training_data("cleaned_articles")

2024-05-23 13:50:39,334 - INFO - HTTP Request: POST http://localhost:6333/collections/cleaned_articles/points/scroll "HTTP/1.1 200 OK"
2024-05-23 13:50:39,349 - INFO - Sending batch to LLM
2024-05-23 13:50:41,098 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-05-23 13:50:41,113 - INFO - Sending batch to LLM
2024-05-23 13:50:42,735 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-05-23 13:50:42,753 - INFO - Sending batch to LLM
2024-05-23 13:50:44,685 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-05-23 13:50:44,698 - INFO - Sending batch to LLM
2024-05-23 13:50:45,806 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-05-23 13:50:45,818 - INFO - Sending batch to LLM
2024-05-23 13:50:46,933 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-05-23 13:50:46,946 - INFO

In [49]:
response

[{'instruction': 'Investigate the Worldcoin project and determine whether their operations in Hong Kong violated the Personal Data (Privacy) Ordinance, as concluded by the Office of the Privacy Commissioner for Personal Data (PCPD)',
  'content': 'Hong Kongs Office of the Privacy Commissioner for Personal Data (PCPD) has concluded its inquiry into the Worldcoin project, determining that its operations in Hong Kong violated its Personal Data (Privacy) Ordinance (PDPO). In a May 22 notice, Privacy Commissioner Ada Chung Lai-lingissued an enforcement notice to Worldcoin, ordering the immediate halt of all project operations in Hong Kong that involve scanning and collecting iris and facial images of the public using iris scanning devices. The PCPD started its investigationagainst the Worldcoin project in January 2024 to determine whether the identity verification methods posed serious risks to citizens personal data privacy and violated the requirements of the PDPO. The PCPD conducted 10 c