# Data preparation

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

from qdrant_client import QdrantClient
import os
import sys

# Put the parent of the current working directory at the front of the module
# search path so that project packages can be imported by the cells below.
src_path = os.path.join(os.path.abspath(""), os.pardir)
sys.path[:0] = [src_path]

## MongoDB data

In [13]:
# Destructive switch: when True, the "articles" collection of the
# "crypto-articles" database is dropped. Left False so that a plain
# "Run All" never deletes data.
CLEAN_MONGODB = False

if CLEAN_MONGODB:
    # Imported only when a cleanup is actually requested.
    from pymongo import MongoClient

    host = "mongodb://localhost:30001,localhost:30002,localhost:30003/?replicaSet=my-replica-set"

    # Use the client as a context manager so its connections are closed as
    # soon as the drop has completed instead of lingering in the kernel.
    with MongoClient(host) as mongo_client:
        db = mongo_client["crypto-articles"]
        db["articles"].drop()

## Qdrant Data

In [14]:
from data_generation.settings import settings
from qdrant_client.http.models import Batch, Distance, VectorParams

EMBEDDING_SIZE = 384
CLEAN_COLLECTION = False

# Qdrant client pointed at the host/port taken from the project settings.
_client = QdrantClient(
    host=settings.QDRANT_DATABASE_HOST,
    port=settings.QDRANT_DATABASE_PORT,
)

if CLEAN_COLLECTION:
    # Collection name -> vectors_config used when the collection is re-created.
    _vectors_config_by_collection = {
        "cleaned_articles": {},
        "vector_articles": VectorParams(size=EMBEDDING_SIZE, distance=Distance.COSINE),
    }

    # Delete both collections first, then create them again in the same order.
    for _name in _vectors_config_by_collection:
        _client.delete_collection(collection_name=_name)
    for _name, _vectors_config in _vectors_config_by_collection.items():
        _client.create_collection(collection_name=_name, vectors_config=_vectors_config)

# Last expression of the cell: display the collections known to the server.
_client.get_collections()

2024-06-12 13:08:51,161 - INFO - HTTP Request: GET http://localhost:6333/collections "HTTP/1.1 200 OK"


CollectionsResponse(collections=[CollectionDescription(name='vector_articles'), CollectionDescription(name='cleaned_articles')])

In [31]:
import textwrap

# NOTE(review): `i` is not read anywhere in the visible cells before a later
# `for i in ...` loop rebinds it; presumably a leftover, confirm before removing.
i = 2

# Disabled preview: for each article it would print the article's "source"
# followed by its "cleaned_content" wrapped to 100 columns.
# NOTE(review): `articles` is not defined in any visible cell, so it has to be
# loaded before this loop can be re-enabled.
# for article in articles:
#     wrapped_string = textwrap.fill(article["cleaned_content"], width=100)
#     print("\n")
#     print(article["source"])
#     print(wrapped_string)

In [15]:
from data_generation.utils.openai_helper import OpenAIHandler
from data_generation.utils.data_formatter import DataFormatter
from data_generation.data_generator import DatasetGenerator

In [19]:
# Only the first MAX_TRAINING_ARTICLES fetched items are sent for generation.
MAX_TRAINING_ARTICLES = 200

collection_name = "cleaned_articles"

# Build the generator from its two collaborators.
openai_handler = OpenAIHandler()
formatter = DataFormatter()
generator = DatasetGenerator(openai_handler, formatter)

# Fetch the cleaned content of the collection, then keep the leading slice.
all_contents = generator.fetch_all_cleaned_content(collection_name)
selected_contents = all_contents[:MAX_TRAINING_ARTICLES]

# NOTE(review): the second positional argument (1) is presumably the batch
# size; confirm against DatasetGenerator.generate_training_data.
training_data = generator.generate_training_data(selected_contents, 1)

2024-06-12 13:15:38,269 - INFO - HTTP Request: POST http://localhost:6333/collections/cleaned_articles/points/scroll "HTTP/1.1 200 OK"
2024-06-12 13:15:38,285 - INFO - Sending batch to LLM
2024-06-12 13:15:39,795 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-06-12 13:15:39,816 - INFO - Sending batch to LLM
2024-06-12 13:15:41,465 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-06-12 13:15:41,482 - INFO - Sending batch to LLM
2024-06-12 13:15:43,135 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-06-12 13:15:43,155 - INFO - Sending batch to LLM
2024-06-12 13:15:44,602 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-06-12 13:15:44,620 - INFO - Sending batch to LLM
2024-06-12 13:15:45,806 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-06-12 13:15:45,821 - INFO

In [20]:
# Push the generated training data to Comet under the collection's name.
# Per the recorded log output, this writes `cleaned_articles.json` and uploads
# it as a new version of the `cleaned_articles` artifact.
generator.push_to_comet(training_data, collection_name)

2024-06-12 13:23:39,948 - INFO - Starting to push data to Comet: cleaned_articles
[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/cancamilo/crypto-reporter/818bb19458cd47168551642fe8078982

2024-06-12 13:23:41,779 - INFO - Writing data to file: cleaned_articles.json
2024-06-12 13:23:41,783 - INFO - Data written to file successfully
2024-06-12 13:23:41,784 - INFO - Artifact created and file added: cleaned_articles.json
[1;38;5;39mCOMET INFO:[0m Artifact 'cleaned_articles' version 6.0.0 created (previous was: 5.0.0)
[1;38;5;39mCOMET INFO:[0m Scheduling the upload of 1 assets for a size of 334.75 KB, this can take some time
[1;38;5;39mCOMET INFO:[0m Artifact 'cancamilo/cleaned_articles:6.0.0' has started uploading asynchronously
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ------------------------

In [61]:
# Fetch the cleaned content stored in the "cleaned_articles" collection and
# display how many items came back.
cleaned_articles = generator.fetch_all_cleaned_content("cleaned_articles")
len(cleaned_articles)

2024-06-04 17:00:12,985 - INFO - HTTP Request: POST http://localhost:6333/collections/cleaned_articles/points/scroll "HTTP/1.1 200 OK"


30

In [None]:
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import JsonOutputParser

# Request generated samples for a small slice of the cleaned articles, one
# batch at a time, printing every prompt so it can be inspected.
all_contents = cleaned_articles[140:150]
batch_size = 1


openai_handler = OpenAIHandler()
formatter = DataFormatter()

response = []

for i in range(0, len(all_contents), batch_size):
    batch = all_contents[i : i + batch_size]
    initial_prompt = formatter.format_prompt(batch, i)
    print(initial_prompt)
    batch_result = openai_handler.request(initial_prompt)

    # only process batch if response is valid
    if len(batch_result) > 0:
        # Pair every returned item with the source text of *this* batch.
        # Indexing `response` by the batch offset breaks as soon as an earlier
        # batch returned nothing (or fewer items than requested), or when the
        # final batch is shorter than `batch_size`; zipping keeps the pairing
        # local to the batch and therefore always aligned.
        for item, content in zip(batch_result, batch):
            item["content"] = content
        response += batch_result

# Last expression of the cell: display the collected results.
response


This data can be pushed to Comet in order to keep it as an asset. This asset will serve as the input for fine-tuning a model.

In [1]:
# Currently disabled: would push `response` to Comet under `collection_name`.
# NOTE(review): `self` is not defined at notebook scope; the `generator`
# instance created in an earlier cell is presumably the intended receiver.
# Confirm before re-enabling.
# self.push_to_comet(response, collection_name)

In [19]:
import pandas as pd

# Put the cleaned articles into a single-column frame and add `length`: the
# number of tokens obtained by splitting each text on single spaces (runs of
# consecutive spaces therefore contribute empty tokens to the count).
df = pd.DataFrame(cleaned_articles, columns=["content"])
df["length"] = df["content"].apply(lambda x: len(x.split(" ")))

# Display the summary statistics of the lengths; without this final expression
# the cell would produce no output on a fresh run.
df.describe()

Unnamed: 0,length
count,323.0
mean,175.275542
std,98.662932
min,13.0
25%,112.0
50%,169.0
75%,219.0
max,1023.0


In [27]:
# Sanity check: count the rows whose `content` value is missing.
df["content"].isna().sum()

0