# Week 1 - Notebook 1 - Ada 

In this notebook, I create vector embeddings for the podcast transcripts using OpenAI's `text-embedding-ada-002` (Ada) embedding model.

In [54]:
#standard libraries
import json
import os
import time
from collections import defaultdict
from typing import List, Dict, Tuple, Union, Callable
from math import ceil

#external libraries
import pandas as pd
import numpy as np
from rich import print
from torch import cuda
from tqdm.notebook import tqdm
import tiktoken 
from llama_index.text_splitter import SentenceSplitter
from llama_index import ServiceContext
from llama_index.embeddings import OpenAIEmbedding
from tiktoken import Encoding
import openai
from getpass import getpass

#external files
from preprocessing import FileIO


## Step 1 -->  Import YouTube/Podcast Transcripts
***

In [6]:
# Build the path to the raw transcript file, relative to the notebook's working directory.
#root folder on Google Colab is: /content/
root_folder = 'data/'
data_file = 'impact_theory_data.json'
data_path = os.path.join(root_folder, data_file)
# bare expression so the notebook displays the resolved path
data_path

'data/impact_theory_data.json'

You should see 384 unique podcast entries, imported as a list of dictionaries, with each dictionary being a single episode.

In [7]:
# Load the corpus (one dictionary per episode).
# The encoding is pinned to UTF-8 instead of relying on the platform default, which is
# not UTF-8 on every OS and could corrupt or fail on non-ASCII transcript text.
with open(data_path, encoding='utf-8') as f:
    data = json.load(f)
print(f'Total # of episodes: {len(data)}')

# Tokenize the content

In [11]:
#instantiate the tiktoken tokenizer that corresponds to ChatGPT-3.5-Turbo
#NOTE: `encoding` is re-assigned in a later cell to the tokenizer for the ada embedding model
encoding = tiktoken.encoding_for_model('gpt-3.5-turbo-0613')

#### OpenAI Ada Embeddings

In [18]:
# set API key
# getpass() prompts for the key interactively, so it is never typed into the notebook source
openai.api_key = getpass()

In [19]:
#define the model we want to use for the tokenizer
#(this rebinds `encoding`, replacing the gpt-3.5-turbo tokenizer created in the earlier cell)
model_type = "text-embedding-ada-002"
encoding = tiktoken.encoding_for_model(model_type)

### Extract Contents

In [20]:
# Pull the raw transcript text out of each episode record (stored under the 'content' key)
contents = [episode['content'] for episode in data]

## Split Contents


In [None]:
# Chunking configuration: chunk size is measured with the tokenizer handed to the splitter
chunk_size = 256
gpt35_txt_splitter = SentenceSplitter(
    tokenizer=encoding.encode,
    chunk_overlap=0,
    chunk_size=chunk_size,
)
# Nested result: splits[i] is the list of text chunks produced from episode i
splits = [gpt35_txt_splitter.split_text(transcript) for transcript in tqdm(contents)]

## Create Metadata

Create metadata in this step before losing the episode information for the chunks.

In [161]:
def add_metadata(content_splits: List[List[str]],
                 corpus: List[dict]) -> List[dict]:
    '''
    Attach episode-level metadata to every text chunk.

    Args:
        content_splits: One list of text chunks per episode; content_splits[i]
            must hold the chunks derived from corpus[i].
        corpus: The episode dictionaries. Each must contain a 'video_id' key;
            the full-length 'content' value is dropped from the output.

    Returns:
        A flat list with one dictionary per chunk. Each dictionary contains the
        episode's metadata (every key except 'content'), a 'doc_id' of the form
        '<video_id>_<chunk index>', and 'content' set to the chunk text.
        Metadata values are copied shallowly, i.e. shared with the corpus entry.
    '''
    docs = []

    for i, episode in enumerate(content_splits):
        # Episode record that these chunks were split from
        episode_meta = corpus[i]
        for j, split in enumerate(episode):
            # Fresh dict per chunk so that chunks never share a mutable record
            doc = {k: v for k, v in episode_meta.items() if k != 'content'}
            # The id is built from the parent episode's video_id plus the chunk position
            doc['doc_id'] = f"{episode_meta['video_id']}_{j}"
            doc['content'] = split
            docs.append(doc)
    return docs

In [212]:
# Build one record per chunk, each carrying the metadata of its parent episode
docs = add_metadata(content_splits=splits, corpus=data)

### Estimate the cost of the job

In [22]:
#should match previously calculated token count for corpus
def get_cost(encoder: Encoding,
             corpus: Union[str, List[str]],
             price: float=0.0001,
             return_tokens: bool=False
             ) -> Union[float, Tuple[float, Union[List[int], List[List[int]]]]]:
    '''
    Given a text encoder and a corpus of text, this function will calculate the total
    cost, as determined by the price param, of using the OpenAI API. The price is
    per 1,000 tokens.

    Args:
        encoder: Tokenizer exposing encode() and encode_batch().
        corpus: A single string, or a list of strings.
        price: Price per 1,000 tokens.
        return_tokens: When True, the encoded tokens are returned alongside the cost.

    Returns:
        The cost as a float, or a (cost, tokens) tuple when return_tokens is True.
        tokens is a list of token ids for a string corpus and a list of such
        lists (one per text) for a list corpus.

    Raises:
        TypeError: If corpus is neither a str nor a list.
    '''
    if isinstance(corpus, str):
        encoded = encoder.encode(corpus)
        token_count = len(encoded)
    elif isinstance(corpus, list):
        encoded = encoder.encode_batch(corpus)
        token_count = sum(map(len, encoded))
    else:
        # Fail with a clear message instead of an unbound-variable error further down
        raise TypeError(
            f'corpus must be a str or a list of str, got {type(corpus).__name__}'
        )
    print(f'Total Tokens: {token_count}')
    # True division: floor division would drop any partial block of 1,000 tokens
    # and under-report the cost (e.g. 999 tokens would cost $0).
    cost = token_count / 1000 * price
    print(f'Total Cost: ${cost:.2f}')
    if return_tokens:
        return cost, encoded
    return cost

In [23]:
# Estimate the embedding cost for the full set of episode transcripts
get_cost(encoder=encoding, corpus=contents)

0.5958

In [213]:
# Embedding client used to embed the chunks in batches (embed_batch_size=150)
# NOTE(review): no model name is passed, so the library default applies -- presumably
# text-embedding-ada-002; confirm against the installed llama_index version
embed_model = OpenAIEmbedding(embed_batch_size=150)
# NOTE(review): service_context is not used by the cells shown here -- presumably needed later; verify
service_context = ServiceContext.from_defaults(embed_model=embed_model)

We flatten the list of lists of strings into a single list of strings because `OpenAIEmbedding` from LlamaIndex only accepts a list of strings. This lets the method embed the whole corpus in batches. However, flattening removes the first dimension of the nested list (the episodes), so the episode information is lost at this point.

In [56]:
# Collapse the per-episode nesting into one flat list of chunk strings, keeping episode order
flattened_content = [chunk for episode_chunks in splits for chunk in episode_chunks]

In [58]:
# now we just have a list of all the chunks from the podcast, without episode information
# (the displayed value is the total number of chunks across all episodes)
len(flattened_content)

26448

In [59]:
%%time
#get ada embeddings for every chunk in flattened_content (progress bar enabled)
#NOTE(review): the later cells assume ada_embs[i] belongs to flattened_content[i] -- confirm the library preserves input order
ada_embs = embed_model.get_text_embedding_batch(flattened_content, show_progress=True)

Generating embeddings:   0%|          | 0/26448 [00:00<?, ?it/s]

CPU times: user 10.8 s, sys: 1.79 s, total: 12.6 s
Wall time: 2min 59s


`ada_embs` is a list of lists: each sublist is the embedding vector for one text chunk (the chunks were created with `chunk_size = 256`).

In [214]:
# add embeddings in the same order to the main dictionary with metadata
for i, part in enumerate(docs):
    part['content_embedding'] = ada_embs[i]

In [207]:
#convert output to DataFrame to visualize and then save results as parquet
#(two columns only: the chunk text and its embedding -- no episode metadata in this frame)
llama_df = pd.DataFrame({'text': flattened_content, 'embeddings': ada_embs})

In [208]:
# save to parquet for later use
folder_path = './embeddings'
file_name = 'impact_theory_ada_embs.parquet'
file_path = os.path.join(folder_path, file_name)

# exist_ok=True creates the folder only when it is missing, without a separate
# check-then-create step that could race with another process
os.makedirs(folder_path, exist_ok=True)
llama_df.to_parquet(file_path, index=False)

In [216]:
#instantiate FileIO Class (helper imported from the local preprocessing module)
io = FileIO()

#Define your output path
outpath = "data/impact-theory_ada_1216.parquet"

#save the chunk records (metadata, chunk text and embedding) to disk
#NOTE(review): overwrite=False presumably refuses to replace an existing file, which
#would matter when this cell is re-run -- confirm in preprocessing.FileIO
io.save_as_parquet(file_path=outpath, data=docs, overwrite=False)

[32m2023-12-16 14:52:15.435[0m | [1mINFO    [0m | [36mpreprocessing[0m:[36msave_as_parquet[0m:[36m41[0m - [1mDataFrame saved as parquet file here: data/impact-theory_ada_1216.parquet[0m


In [217]:
#Verify that you can reload data

# read back the file written to outpath and display the record at index 1 as a spot check
data_with_vectors = io.load_parquet(file_path=outpath)
data_with_vectors[1]

Shape of data: (26448, 12)
Memory Usage: 2.42+ MB


{'title': "THE BIG AI RESET: The Next Global SuperPower Isn't Who You Think | Ian Bremmer",
 'video_id': 'nXJBccSwtB8',
 'playlist_id': 'PL8qcvQ7Byc3OJ02hbWJbHWePh4XEg3cvo',
 'length': 5410,
 'thumbnail_url': 'https://i.ytimg.com/vi/nXJBccSwtB8/hq720.jpg',
 'views': 138628,
 'episode_url': 'https://www.youtube.com/watch?v=nXJBccSwtB8&list=PL8qcvQ7Byc3OJ02hbWJbHWePh4XEg3cvo',
 'guest': 'Ian Bremmer',
 'summary': "In this episode, Ian Bremmer discusses the rise of big tech as a third superpower and the potential dangers and opportunities it presents. He highlights the immense power held by tech companies in shaping society, the economy, and national security, emphasizing their sovereignty over the digital world. Bremmer expresses concerns about the growing influence of AI and its potential to outstrip government regulation, leading to a reality where tech companies wield significant power over individuals. He also delves into the risks associated with AI proliferation, including the pote

## Index with Weaviate