# 👉Import all Libraries

In [1]:
from dotenv import load_dotenv
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_openai import OpenAI
from langchain.docstore.document import Document
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
import json
import os



load_dotenv()

api_key = os.getenv("OPENAI_API_KEY")

llm = OpenAI(api_key=api_key)

# 👉Load All the data

In [2]:
def load_metadata(metadata_file):
    with open(metadata_file, 'r') as file:
        metadata = json.load(file)
    return metadata

def load_transcript(transcript_file, metadata):
    with open(transcript_file, 'r') as file:
        data = json.load(file)
    
    transcript_text = ""

    for channel in data['results']['channels']:
        for alternative in channel['alternatives']:
            for word_info in alternative['words']:
                word = word_info['punctuated_word']
                transcript_text += word + " "
    
    video_id = data['metadata']['sha256']
    title = metadata.get('title', 'Unknown Title')
    
    return {
        "text": transcript_text.strip(),
        "metadata": {
            "video_id": video_id,
            "title": title
        }
    }

# 👉Make chunks of the data


In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def chunk_transcript(transcript, max_chunk_size, overlap):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=max_chunk_size,
        chunk_overlap=overlap,
        length_function=len,
        separators=['.', ',', '\n', '\n\n']
    )
    
    chunks = text_splitter.split_text(transcript['text'])
    
    chunked_data = []

    for i, chunk in enumerate(chunks):
        chunked_data.append({
            "chunk_id": f"{transcript['metadata']['video_id']}_{i}",
            "title": transcript['metadata']['title'],
            "text": chunk.strip()
        })

    return chunked_data


# 👉Now Save the Chunks into json file

In [4]:
def save_chunks_to_file(combined_chunks, output_file):
    output_dir = "Chunks"
    
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    output_file_path = os.path.join(output_dir, output_file)
    
    with open(output_file_path, 'w') as file:
        json.dump(combined_chunks, file, indent=4)
    
    print(f"All chunks saved to: {output_file_path}")

videos_directory = os.path.join("transcripts", "videos")
combined_chunks = []

for video_folder in os.listdir(videos_directory):
    video_path = os.path.join(videos_directory, video_folder)
    transcript_path = os.path.join(video_path, "transcript.json")
    metadata_path = os.path.join(video_path, "metadata.json")
    
    if os.path.isfile(transcript_path) and os.path.isfile(metadata_path):
        metadata = load_metadata(metadata_path)
        transcript = load_transcript(transcript_path, metadata)
        chunked_data = chunk_transcript(transcript, max_chunk_size=800, overlap=100)
        combined_chunks.extend(chunked_data)

save_chunks_to_file(combined_chunks, "chunks.json")


All chunks saved to: Chunks\chunks.json


# 👉Create Embeddings and store these into vector database


In [5]:

embedding = OpenAIEmbeddings(api_key=api_key)

current_dir = os.getcwd()

chunk_file_path = os.path.join(current_dir, "Chunks", "chunks.json")
print(f"Loading chunks from: {chunk_file_path}")

# Load the chunks
with open(chunk_file_path, "r") as file:
    chunks = json.load(file)


# Convert chunks to Document objects
documents = [Document(page_content=chunk["text"], metadata=chunk) for chunk in chunks]

vectorstore = Chroma.from_documents(documents=documents, embedding=embedding, persist_directory='./chroma_db')


Loading chunks from: e:\ML and Data Science work\Challenge\datawars-llm-challenges\Chunks\chunks.json


# 👉Set up retriever

In [9]:
# Set up retriever
retriever = vectorstore.as_retriever(search_kwargs={"k": 1})

# Perform a retrieval
response = retriever.invoke("HOW TO GROUP A DATAFRAME IN PANDAS?")

print("Retrive Relavent Documents")
for result in response:
    print("Metadata:")
    for key, value in result.metadata.items():
        print(f"{key}: {value}")
    print()


Retrive Relavent Documents
Metadata:
chunk_id: 90912b0487bb8ee8331333237b0103d9dc06f1c317ff932fcdc9f44b1a76d489_22
text: . So, this object contains a bunch of groups, and to better understand what this is, let's take a look at an individual group, that this DataFrame has. Now, before we do that, I am going to set this as a variable so that we can reuse this, and not have to retype our code over and over, and also it will be easier to read. So I am going to call this country group, and I'm just going to set this equal to this df.groupby. And now, instead of typing this every time, we can just reference this country group variable here. So now let's take a look at one of these groups. So since we grouped our rows by country, then we can grab a specific group by country name. So I'll grab the group for the United States
title: Python Pandas Tutorial (Part 8): Grouping and Aggregating - Analyzing and Exploring Your Data



##### As you can see it retrieve the relevant result from the chunks. 



# 👉Creating the LLM-powered RAG Chain for Q&A

In [7]:
# Initialize the LLM with the API key
llm = OpenAI(api_key=api_key)


prompt = ChatPromptTemplate.from_template(
    """
human

You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.

Question: {input} 

Context: {context} 

Answer:

    """
)

document_chain = create_stuff_documents_chain(llm, prompt)

rag_chain = create_retrieval_chain(retriever, document_chain)

# 👉 Now Generate Response 

In [8]:
query = "How can I group a DataFrame in Pandas?"

response = rag_chain.invoke({"input": query})

# Print the response
print(response['answer'])

To group a DataFrame in Pandas, you can use the groupby function. This function involves splitting the object, applying a function, and then combining the results. You can specify which column you want to group by, such as country, and then apply a specific function to the grouped data.
     The groupby function will return a DataFrameGroupBy object, which allows you to perform various operations on the grouped data. This object can be reused by setting it as a variable for easier reference and readability.
     To see specific results based on a certain column, you will need to group the data by that column. The groupby function is specifically designed for this purpose and is used in combination with other functions to split, apply, and combine the data. Referencing the Pandas documentation can provide more information on how this function works.
