In [73]:
from dotenv import load_dotenv
import os
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_openai import OpenAI

# Load the .env file
load_dotenv()

# Get the API key from the environment variables
api_key = os.getenv("OPENAI_API_KEY")

# Initialize the LLM with the API key
llm = OpenAI(api_key=api_key)

In [74]:
import os
import json
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Function to load the transcript from a JSON file
def load_transcript(transcript_file):
    with open(transcript_file, 'r') as file:
        data = json.load(file)
    
    transcript_text = ""
    for channel in data['results']['channels']:
        for alternative in channel['alternatives']:
            transcript_text += alternative['transcript'] + " "
    
    metadata = {
        "video_id": data['metadata']['sha256'],
        "title": data['metadata'].get('title', 'Unknown Title'),
    }
    
    return {
        "text": transcript_text.strip(),
        "metadata": metadata
    }

# Function to chunk the transcript using a RecursiveCharacterTextSplitter
def chunk_transcript(transcript, max_chunk_size=1000, overlap=100):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1024,
        chunk_overlap=0,
        length_function=len,
        keep_separator=True,
        separators=['.', '؟', '!', '،', '\n'] 
    )
    chunks = text_splitter.split_text(transcript['text'])
    
    chunked_data = []
    for i, chunk in enumerate(chunks):
        chunked_data.append({
            "chunk_id": f"{transcript['metadata']['video_id']}_{i}",
            "text": chunk
        })
    return chunked_data


In [75]:
def save_chunks_to_file(chunked_data, output_file):
    output_dir = os.path.join("E:\\ML and Data Science work\\Challenge\\datawars-llm-challenges\\Chunks", chunked_data[0]["chunk_id"].split('_')[0])
    
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    output_file_path = os.path.join(output_dir, os.path.basename(output_file))
    with open(output_file_path, 'w') as file:
        json.dump(chunked_data, file, indent=4)
    
    print(f"Chunks saved to: {output_file_path}")


In [76]:
def combine_chunks(chunks_dir):
    combined_chunks = []
    for folder_name in os.listdir(chunks_dir):
        folder_path = os.path.join(chunks_dir, folder_name)
        if os.path.isdir(folder_path):
            for file_name in os.listdir(folder_path):
                if file_name.startswith('chunked_') and file_name.endswith('.json'):
                    file_path = os.path.join(folder_path, file_name)
                    with open(file_path, 'r') as file:
                        chunks = json.load(file)
                        combined_chunks.extend(chunks)
    return combined_chunks


In [77]:
videos_directory = "E:\\ML and Data Science work\\Challenge\\datawars-llm-challenges\\transcripts\\videos"
chunks_directory = "E:\\ML and Data Science work\\Challenge\\datawars-llm-challenges\\Chunks"

# Step 1: Process each transcript in the videos directory and chunk it
for video_folder in os.listdir(videos_directory):
    video_path = os.path.join(videos_directory, video_folder)
    transcript_path = os.path.join(video_path, "transcript.json")
    
    if os.path.isfile(transcript_path):
        # Load and chunk the transcript
        transcript = load_transcript(transcript_path)
        chunked_data = chunk_transcript(transcript, max_chunk_size=500, overlap=50)
        
        # Save the chunked data to the chunks directory
        output_filename = f"chunked_{video_folder}.json"
        save_chunks_to_file(chunked_data, output_filename)


Chunks saved to: E:\ML and Data Science work\Challenge\datawars-llm-challenges\Chunks\6417f2648d1cf2ee83a5ffaca82bbd60e289d62069c78e3a195816c1530790df\chunked_DCDe29sIKcE.json
Chunks saved to: E:\ML and Data Science work\Challenge\datawars-llm-challenges\Chunks\55a3448fbe97406fc667959b350b0c6b4d38afbe1de9e5bef56c471bc2c1ce0f\chunked_Ercd-Ip5PfQ.json
Chunks saved to: E:\ML and Data Science work\Challenge\datawars-llm-challenges\Chunks\5ef5aa2dcb3fdc44663f30bb71fd0c84ea582477bfbccb6f3e0fb4ba227f2c4b\chunked_HQ6XO9eT-fc.json
Chunks saved to: E:\ML and Data Science work\Challenge\datawars-llm-challenges\Chunks\2f8e35742eda12bdb3310f0ce74751429a280bab561b2f9ce6691910876c8a72\chunked_KdmPHEnPJPs.json
Chunks saved to: E:\ML and Data Science work\Challenge\datawars-llm-challenges\Chunks\8e045af20f5437f5159cbd38f71ed138cba8cafbc1ff340ca9c19d6f382d3161\chunked_Lw2rlcxScZY.json
Chunks saved to: E:\ML and Data Science work\Challenge\datawars-llm-challenges\Chunks\03eb1dcf661c916eb29c469b444d299bc4

In [78]:
combined_chunks_output = "E:\\ML and Data Science work\\Challenge\\datawars-llm-challenges\\combined_chunks.json"

# Step 2: Combine all the chunks into a single JSON file
combined_chunks = combine_chunks(chunks_directory)
with open(combined_chunks_output, "w") as outfile:
    json.dump(combined_chunks, outfile, indent=4)

print(f"Combined chunks saved to: {combined_chunks_output}")


Combined chunks saved to: E:\ML and Data Science work\Challenge\datawars-llm-challenges\combined_chunks.json


# Create embeddings


In [79]:
from langchain.docstore.document import Document
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
import json


# Initialize the OpenAI GPT-3 embedding model
embedding = OpenAIEmbeddings(api_key=api_key)

# Load the chunks
with open("E:\\ML and Data Science work\\Challenge\\datawars-llm-challenges\\combined_chunks.json", "r") as file:
    chunks = json.load(file)

# Convert chunks to Document objects
documents = [Document(page_content=chunk["text"], metadata=chunk) for chunk in chunks]

# Create a new Chroma vector store from the documents
vectorstore = Chroma.from_documents(documents, embedding=embedding)

# You can now use the vectorstore as needed, for example:
retriever = vectorstore.as_retriever()

# Use the retriever to answer a query
response = retriever.get_relevant_documents("How can I group a DataFrame in Pandas?")
print(response)


[Document(metadata={'chunk_id': '90912b0487bb8ee8331333237b0103d9dc06f1c317ff932fcdc9f44b1a76d489_33', 'text': "a bunch of groups, and to better understand what this is, let's take a look at an individual group, that this DataFrame has. Now, before we do that, I am going to set this as a variable so that we can reuse this, and not have to retype our code over and over, and also it will be easier to read. So I am going to call this country group, and I'm just going to set this equal to this df.groupby. And now, instead of typing this every time, we can just reference this country group variable here. So"}, page_content="a bunch of groups, and to better understand what this is, let's take a look at an individual group, that this DataFrame has. Now, before we do that, I am going to set this as a variable so that we can reuse this, and not have to retype our code over and over, and also it will be easier to read. So I am going to call this country group, and I'm just going to set this equa

# create_stuff_documents_chain

In [91]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_openai import OpenAI
from langchain.prompts import ChatPromptTemplate

# Initialize the LLM with the API key
llm = OpenAI(api_key=api_key)

# Define the prompt template
prompt_template = ChatPromptTemplate.from_template(
    """
    Answer the questions based on the provided context only.
    Please provide the most accurate response based on the question.
    <context>
    {context}
    <context>
    Questions: {input}
    """
)

# Create the document chain using the prompt template
document_chain = create_stuff_documents_chain(llm, prompt_template)

# Set up the retriever with search parameters (e.g., returning top 5 documents)
retriever = vectorstore.as_retriever()

# Create the retrieval chain
chain = create_retrieval_chain(retriever, document_chain)

# Define the query
query = "How can I group a DataFrame in Pandas?"

# Stream the response
response = chain.invoke({"input": query, "question": query})
    
       


In [92]:
for document in response["context"]:
    print(document)
    print()

page_content='a bunch of groups, and to better understand what this is, let's take a look at an individual group, that this DataFrame has. Now, before we do that, I am going to set this as a variable so that we can reuse this, and not have to retype our code over and over, and also it will be easier to read. So I am going to call this country group, and I'm just going to set this equal to this df.groupby. And now, instead of typing this every time, we can just reference this country group variable here. So' metadata={'chunk_id': '90912b0487bb8ee8331333237b0103d9dc06f1c317ff932fcdc9f44b1a76d489_33', 'text': "a bunch of groups, and to better understand what this is, let's take a look at an individual group, that this DataFrame has. Now, before we do that, I am going to set this as a variable so that we can reuse this, and not have to retype our code over and over, and also it will be easier to read. So I am going to call this country group, and I'm just going to set this equal to this df