In [94]:
import numpy as np
from typing import List, Dict, Tuple, DefaultDict, Any
from collections import defaultdict
import time
import pickle
import os
from pinecone import Pinecone
import concurrent.futures

import openai
# Resolve the OpenAI API key: a local `config` module takes precedence, and the
# OPENAI_API_KEY environment variable is the fallback when no such module exists.
try:
    import config
except ImportError:
    openai.api_key = os.environ.get("OPENAI_API_KEY")
else:
    openai.api_key = config.OPENAI_API_KEY


# Default output locations for Dataset.save_class / Dataset.save_data.
# Built with os.path.join so they resolve on every platform; on Windows the
# resulting strings are identical to the previous hard-coded "..\\..\\..." values.
PATH_TO_DATASET_PKL = os.path.join("..", "..", "dataset", "embeddings", "dataset.pkl")
PATH_TO_DATASET_DICT_PKL = os.path.join("..", "..", "dataset", "embeddings", "dataset_dict.pkl")
# Width of every embedding row; must match the vectors returned by the
# embedding model used in Dataset.get_embeddings.
LEN_EMBEDDINGS = 1536
# Read from the environment; None when the variable is not set.
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')

from text_splitter import TokenSplitter, split_into_sentences


# Running tally of skipped entries, keyed by failure message. Every known
# message starts at zero; Dataset.get_alignment_texts adds unseen messages
# on the fly when it catches a MissingDataException.
error_count_dict = dict.fromkeys(
    (
        "Entry has no source.",
        "Entry has no title.",
        "Entry has no text.",
        "Entry has no URL.",
        "Entry has wrong citation level.",
    ),
    0,
)


class MissingDataException(Exception):
    """Error whose message names the data an entry is missing.

    Dataset.get_alignment_texts catches it and tallies the message in
    error_count_dict instead of aborting the run.
    """


class Dataset:
    """Collects text blocks from a folder of documents, embeds them and uploads them.

    Typical flow: ``get_alignment_texts`` fills the block lists, then
    ``get_embeddings`` (or ``load_embeddings``) sets ``self.embeddings``, and
    finally ``post_embeddings_to_pinecone`` uploads one vector per block.
    """

    # Folder read by get_alignment_texts() when none is passed. Built with
    # os.path.join so it resolves on every platform; on Windows it is the same
    # string as the previously hard-coded "..\\..\\dataset\\selected_texts".
    DEFAULT_TEXTS_FOLDER = os.path.join("..", "..", "dataset", "selected_texts")

    def __init__(self,
            min_tokens_per_block: int = 300, # Minimum number of tokens per block.
            max_tokens_per_block: int = 400, # Maximum number of tokens per block.
        ):
        """Create an empty dataset.

        Args:
            min_tokens_per_block: Lower bound handed to the text splitter.
            max_tokens_per_block: Upper bound handed to the text splitter.

        ``self.embeddings`` is not created here; it only exists after
        ``get_embeddings`` or ``load_embeddings`` has been called.
        """
        self.min_tokens_per_block = min_tokens_per_block  # for the text splitter
        self.max_tokens_per_block = max_tokens_per_block  # for the text splitter

        self.metadata: List[Dict[str, Any]] = []  # One dict per block: {"title": "<file title>: Section <n>", "text": <block>}.
        self.embedding_strings: List[str] = []  # One entry per block, in the same order as self.metadata.
        self.embeddings_metadata_index: List[int] = []  # For every block, the 0-based position of the file it came from.

        self.articles_count: DefaultDict[str, int] = defaultdict(int)  # Number of articles per source. E.g.: {'source1': 10, 'source2': 20, 'total': 30}

        self.total_articles_count = 0

        self.total_char_count = 0
        self.total_word_count = 0
        self.total_sentence_count = 0
        self.total_block_count = 0

        self.sources_so_far: List[str] = []
        self.info_types: Dict[str, List[str]] = {}

    def get_alignment_texts(self, folder_path: str = DEFAULT_TEXTS_FOLDER):
        """Read every file in ``folder_path``, split it into blocks and record them.

        For each file the title is the file name without ``.txt``. The text is
        split by ``TokenSplitter`` and every block gets its own metadata dict
        (``title`` = "<title>: Section <n>", ``text`` = the block), appended to
        ``self.metadata`` in step with ``self.embedding_strings``.

        Args:
            folder_path: Directory holding the text files. Defaults to
                ``Dataset.DEFAULT_TEXTS_FOLDER``.

        A ``MissingDataException`` raised while processing a file is not
        propagated; its message is tallied in ``error_count_dict``.
        """
        text_splitter = TokenSplitter(self.min_tokens_per_block, self.max_tokens_per_block)

        for filename in os.listdir(folder_path):
            file_path = os.path.join(folder_path, filename)
            if not os.path.isfile(file_path):
                # Sub-folders cannot be opened as text; skip them instead of crashing.
                continue

            # NOTE(review): no encoding is passed, so the platform default is
            # used -- confirm it matches how the text files were saved.
            with open(file_path, "r") as file:
                try:
                    # Title = file name
                    title = filename.replace(".txt", "")

                    text = file.read()

                    # we're keeping the text so we inc the article count
                    self.total_articles_count += 1

                    # Signature handed to the splitter together with the text.
                    signature = f"Title: {title}, " if title else "Title: None, "
                    signature = signature.replace("\n", " ")

                    blocks = text_splitter.split(text, signature)
                    for section_number, block in enumerate(blocks, start=1):
                        # A fresh dict per block: reusing one dict for the whole
                        # file would leave every entry showing the last section.
                        block_metadata = {
                            "title": title + f": Section {section_number}",
                            "text": block,
                        }
                        print (block_metadata["title"])
                        self.metadata.append(block_metadata)

                    self.embedding_strings.extend(blocks)
                    self.embeddings_metadata_index.extend([self.total_articles_count-1] * len(blocks))

                    # Update counts
                    self.total_char_count += len(text)
                    self.total_word_count += len(text.split())
                    self.total_sentence_count += len(split_into_sentences(text))
                    self.total_block_count += len(blocks)

                except MissingDataException as e:
                    if str(e) not in error_count_dict:
                        error_count_dict[str(e)] = 0
                    error_count_dict[str(e)] += 1

    def get_embeddings(self):
        """Embed every string in ``self.embedding_strings`` into ``self.embeddings``.

        The strings are sent to the OpenAI embeddings endpoint in batches from
        a thread pool. The result is an array with one row of
        ``LEN_EMBEDDINGS`` values per string, in the same order as
        ``self.embedding_strings``.

        NOTE(review): ``openai.Embedding.create`` is the interface of openai
        package versions before 1.0 -- confirm the installed version.
        """
        def get_embeddings_at_index(texts: List[str], batch_idx: int, batch_size: int = 200): # int, np.ndarray
            # Returns (batch_idx, array of shape (batch_size, LEN_EMBEDDINGS)) so the
            # caller can write the rows back at the right offset.
            embeddings = np.zeros((batch_size, LEN_EMBEDDINGS))
            openai_output = openai.Embedding.create(
                model="text-embedding-ada-002", 
                input=texts
            )['data']
            for i, embedding in enumerate(openai_output):
                embeddings[i] = embedding['embedding']
            return batch_idx, embeddings

        batch_size = 5
        rate_limit = 3500 / 60  # Maximum embeddings per second

        start = time.time()
        self.embeddings = np.zeros((len(self.embedding_strings), LEN_EMBEDDINGS))

        with concurrent.futures.ThreadPoolExecutor() as executor:
            # Every batch is submitted up front; the last batch may be shorter, so
            # its real length is passed as batch_size.
            futures = [executor.submit(
                get_embeddings_at_index, 
                self.embedding_strings[batch_idx:batch_idx+batch_size], 
                batch_idx,
                len(self.embedding_strings[batch_idx:batch_idx+batch_size])
            ) for batch_idx in range(0, len(self.embedding_strings), batch_size)]
            num_completed = 0
            for future in concurrent.futures.as_completed(futures):
                batch_idx, embeddings = future.result()
                num_completed += embeddings.shape[0]
                self.embeddings[batch_idx:batch_idx+embeddings.shape[0]] = embeddings

                # NOTE: this sleep only paces how fast results are consumed; the
                # requests themselves were already queued on the pool above.
                elapsed_time = time.time() - start
                expected_time = num_completed / rate_limit
                sleep_time = max(expected_time - elapsed_time, 0)
                time.sleep(sleep_time)

                print(f"Completed {num_completed}/{len(self.embedding_strings)} embeddings in {elapsed_time:.2f} seconds.")


    def post_embeddings_to_pinecone(self):
        """Upsert every embedding into the "justitia" Pinecone index, namespace "ns1".

        Each vector is sent as
        ``{"id": "<row number>", "values": [...], "metadata": self.metadata[row]}``
        in chunks of ``CHUNK_SIZE`` vectors per request.

        Raises:
            ValueError: If ``self.embeddings`` and ``self.metadata`` differ in
                length. Rows are paired by position, so a mismatch would attach
                metadata to the wrong vectors.
        """
        total_vectors = len(self.embeddings)
        if total_vectors != len(self.metadata):
            raise ValueError(
                f"Cannot post embeddings: {total_vectors} embeddings but "
                f"{len(self.metadata)} metadata entries."
            )

        pc = Pinecone(api_key=PINECONE_API_KEY)
        index = pc.Index("justitia")

        # Post embeddings to Pinecone
        print("Posting embeddings to Pinecone...")
        CHUNK_SIZE = 100  # Adjust this value based on your requirements
        for chunk_start in range(0, total_vectors, CHUNK_SIZE):
            chunk_end = min(chunk_start + CHUNK_SIZE, total_vectors)
            # Build one chunk at a time rather than materialising every vector first.
            chunk_vectors = [
                {"id": str(i), "values": self.embeddings[i].tolist(), "metadata": self.metadata[i]}
                for i in range(chunk_start, chunk_end)
            ]
            index.upsert(vectors=chunk_vectors, namespace="ns1")
        print("Embeddings posted to Pinecone.")


    def save_embeddings(self, path: str):
        """Write ``self.embeddings`` to ``path`` with ``np.save``."""
        np.save(path, self.embeddings)

    def load_embeddings(self, path: str):
        """Replace ``self.embeddings`` with the array stored at ``path``."""
        self.embeddings = np.load(path)

    def save_class(self, path: str = PATH_TO_DATASET_PKL):
        """Pickle this whole object to ``path``."""
        # Save the class to a pickle file
        print(f"Saving class to {path}...")
        with open(path, 'wb') as f:
            pickle.dump(self, f)

    def save_data(self, path: str = PATH_TO_DATASET_DICT_PKL):
        """Pickle the dataset's contents as a plain dict to ``path``.

        The embeddings are stored as float32. Requires ``self.embeddings`` to
        have been set by ``get_embeddings`` or ``load_embeddings``.
        """
        # Save the data to a pickle file
        print(f"Saving data to {path}...")
        data = {
            "metadata": self.metadata,
            "embedding_strings": self.embedding_strings,
            "embeddings_metadata_index": self.embeddings_metadata_index,
            "embeddings": self.embeddings.astype(np.float32),
            "articles_count": self.articles_count,
            "total_articles_count": self.total_articles_count,
            "total_char_count": self.total_char_count,
            "total_word_count": self.total_word_count,
            "total_sentence_count": self.total_sentence_count,
            "total_block_count": self.total_block_count
        }
        with open(path, 'wb') as f:
            pickle.dump(data, f)

In [95]:
# Split into 200-300 token blocks (the class defaults are 300-400), then read
# and split every text file; one "<title>: Section <n>" line is printed per block.
dataset = Dataset(min_tokens_per_block=200, max_tokens_per_block=300)
dataset.get_alignment_texts()

Disney+_PrivacyPolicy: Section 1
Disney+_PrivacyPolicy: Section 2
Disney+_PrivacyPolicy: Section 3
Disney+_PrivacyPolicy: Section 4
Disney+_PrivacyPolicy: Section 5
Disney+_PrivacyPolicy: Section 6
Disney+_PrivacyPolicy: Section 7
Disney+_PrivacyPolicy: Section 8
Disney+_PrivacyPolicy: Section 9
Disney+_PrivacyPolicy: Section 10
Disney+_PrivacyPolicy: Section 11
Disney+_PrivacyPolicy: Section 12
Disney+_PrivacyPolicy: Section 13
Disney+_PrivacyPolicy: Section 14
Disney+_PrivacyPolicy: Section 15
Disney+_PrivacyPolicy: Section 16
Disney+_PrivacyPolicy: Section 17
Disney+_UkandEUprivacyrights: Section 1
Disney+_UkandEUprivacyrights: Section 2
Google_PrivacyPolicy: Section 1
Google_PrivacyPolicy: Section 2
Google_PrivacyPolicy: Section 3
Google_PrivacyPolicy: Section 4
Google_PrivacyPolicy: Section 5
Google_PrivacyPolicy: Section 6
Google_PrivacyPolicy: Section 7
Google_PrivacyPolicy: Section 8
Google_PrivacyPolicy: Section 9
Google_PrivacyPolicy: Section 10
Google_PrivacyPolicy: Section 

In [None]:
# dataset.get_embeddings()

In [90]:
# Reuse the embeddings stored on disk instead of recomputing them. Re-enable
# the save line below after dataset.get_embeddings() to (re)write that file.
# NOTE(review): rows are matched to blocks by position -- confirm the file was
# produced from the same texts and block sizes as the dataset built above.
# dataset.save_embeddings("../../dataset/embeddings/embeddings.npy")
dataset.load_embeddings("../../dataset/embeddings/embeddings.npy")

In [91]:
# Pickle the whole Dataset object to disk; PATH_TO_DATASET is reused by the
# reload cell that follows.
PATH_TO_DATASET = "../../dataset/embeddings/dataset.pkl"
dataset.save_class(PATH_TO_DATASET)

Saving class to ../../dataset/embeddings/dataset.pkl...


In [92]:
# Reload the pickled Dataset. The `with` block closes the file handle, which
# a bare open() inside the call never did.
# NOTE: pickle.load can execute arbitrary code -- only load files you wrote yourself.
with open(PATH_TO_DATASET, "rb") as dataset_file:
    dataset = pickle.load(dataset_file)

In [93]:
# Upload every embedding, paired with its block metadata, to the Pinecone index.
dataset.post_embeddings_to_pinecone()

Posting embeddings to Pinecone...
Embeddings posted to Pinecone.
