In [1]:
import os
import json
import pandas as pd
from tqdm.auto import tqdm
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pinecone import Pinecone, ServerlessSpec


In [2]:
# Set up API keys
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") or input("Enter your OpenAI API key: ")

# Set up Pinecone API key
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY") or input("Enter your Pinecone API key: ")

In [3]:
#Load and flatten the playlist_transcripts.json file.
# Load the JSON file
with open("playlist_transcripts.json", "r") as f:
    data = json.load(f)

# Flatten the data to extract only the text
texts = []
for video_id, transcripts in data.items():
    for entry in transcripts:
        if isinstance(entry, dict):  # Ensure entry is a dictionary
            texts.append(entry["text"])  # Collect only the transcript text

# Convert to DataFrame for processing
df = pd.DataFrame({"text": texts})

# Check the structure of the DataFrame
print(df.head())

                                                text
0  - 19 things to know before\nyou go to Tokyo, J...
1                         I'm Chris, this is Topher.
2                   We are the internet's number one
3             human and stuffed panda traveling duo.
4                        This is Yellow Productions,


In [4]:
# Initialize variables for chunking
chunk_size = 4000  # Number of characters per chunk
chunk_overlap = 800  # Overlapping characters between chunks
chunks = []  # To store the resulting chunks
current_chunk = ""  # Accumulator for the current chunk

# Iterate through all rows in the DataFrame
for row in df["text"]:
    if len(current_chunk) + len(row) + 1 <= chunk_size:  # Add row if it fits in the current chunk
        current_chunk += row + " "  # Add a space between sentences
    else:
        chunks.append(current_chunk.strip())  # Save the full chunk
        current_chunk = row + " "  # Start a new chunk with overlap
        # Add overlap from the end of the previous chunk
        if chunk_overlap > 0 and len(chunks[-1]) > chunk_overlap:
            current_chunk = chunks[-1][-chunk_overlap:] + current_chunk

# Add the last chunk if it has content
if current_chunk:
    chunks.append(current_chunk.strip())

# Convert chunks into a DataFrame for embedding
chunk_df = pd.DataFrame({"text": chunks})

# Inspect the chunked DataFrame
print(f"Number of chunks: {len(chunk_df)}")
print(chunk_df.head())

Number of chunks: 490
                                                text
0  - 19 things to know before\nyou go to Tokyo, J...
1  city is the friendly limousine bus. It's a big...
2  an really understand English, all that well. W...
3  you're at a ticket window\nto buy Shinkansen t...
4  e, the staff might be running after you, after...


In [5]:
# Initialize the OpenAI embedding model
embed = OpenAIEmbeddings(
    model="text-embedding-ada-002",
    openai_api_key=OPENAI_API_KEY,
)

  embed = OpenAIEmbeddings(


In [6]:
#Connect to Pinecone and create the index
# Initialize Pinecone
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY") or input("Enter your Pinecone API key: ")
pc = Pinecone(api_key=PINECONE_API_KEY)

# Define the Pinecone index
index_name = "japan-travel-guide"  # Corrected index name to use lower case alphanumeric characters and hyphens - "youtube-transcripts-index-updated"

# Delete an existing index if the limit is reached
if len(pc.list_indexes()) >= 5:
    pc.delete_index(pc.list_indexes()[0].name)  # Extract the name from the IndexModel object

if index_name not in pc.list_indexes():
    pc.create_index(
        index_name,  # Corrected positional argument
        dimension=1536,  # Dimension for "text-embedding-ada-002"
        metric="dotproduct",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Connect to the index
index = pc.Index(index_name)

In [7]:
###ChatGBT with Source Metadata

# Add source metadata during indexing
batch_size = 100  # Define batch size for efficiency

# Process the DataFrame in batches
for i in tqdm(range(0, len(chunk_df), batch_size)):
    i_end = min(len(chunk_df), i + batch_size)
    batch = chunk_df.iloc[i:i_end]

    # Extract chunks for embedding
    documents = batch["text"].tolist()

    # Create embeddings for the text
    embeds = embed.embed_documents(documents)

    # Generate unique IDs and add source metadata
    ids = [f"chunk-{i+j}" for j in range(len(batch))]
    metadata = [{"text": chunk, "source": f"chunk-{i+j}"} for j, chunk in enumerate(documents)]

    # Add embeddings and metadata to Pinecone
    index.upsert(vectors=zip(ids, embeds, metadata))

  0%|          | 0/5 [00:00<?, ?it/s]

In [8]:
#direct queries to the Pinecone index - just to test if it works
# Query the Pinecone index
query = "Where can you find the best Ramen in Tokyo?" # Examples:How much percent of Japan's population live in Tokyo?
query_embedding = embed.embed_query(query)

# Search the index with metadata included
search_results = index.query(vector=query_embedding, top_k=1, include_metadata=True)

# Display the result
print("Search Results:")
for result in search_results["matches"]:
    print(f"Chunk ID: {result['id']}")
    print(f"Score: {result['score']}")
    if "metadata" in result and "text" in result["metadata"]:
        print(f"Matched Text: {result['metadata']['text']}")
    else:
        print("No metadata found.")

Search Results:
Chunk ID: chunk-230
Score: 0.865899384
Matched Text: f you are with a party, the little booths can open up, so you can be next to them. But it's super great if you
don't like to see people. It's super well-thought out. I also want to point out in the bathroom, it has the most rolls of toilet paper I have ever seen on a wall before. It was amazing. Well, let me tell you,
if you come to Fukuoka, check out Ichiran, the
ramen is really good and you don't have to talk to anybody. Well, I hope you enjoyed this video. If this was your first time here, please click here to subscribe, or click here and here to
watch some of my other videos. Thanks, bye bye! for some of the best shopping in Fukuoka visit Canal City this huge complex right here it's called a city in a city it's that big it's Japan's largest private development ever there's a canal thatruns right through the middle of it there's a fountain show in the center complex hundreds of shops there's the Grand Hyatt Hotel t

In [11]:
import os
from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains import RetrievalQA, RetrievalQAWithSourcesChain
from langchain_openai import ChatOpenAI

In [12]:
#
# Define metadata field for text
text_field = "text"

# Initialize the vector store using the Pinecone index you've created ->  initialzie so I can access Database
vectorstore = Pinecone(
    index=index,  # Use the `index` object you already initialized in Step 2
    embedding=embed,  # Pass the embedding object
    text_key=text_field  # Key for metadata
)


  vectorstore = Pinecone(
