In [None]:
import os,re
import yt_dlp
import json
import time
import math 
import httplib2
import requests
import pinecone 
import pandas as pd
import numpy as np
import urllib.request
from bs4 import BeautifulSoup
from bs4.element import Comment
import matplotlib.pyplot as plt
from langchain.llms import OpenAIChat
from bs4 import BeautifulSoup, SoupStrainer
from langchain.vectorstores import Pinecone
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains import VectorDBQAWithSourcesChain
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

## Zak George GPT

`Here, we will prepare the VectorDB index for Zak George's YouTube channel:`

* Use Whisper to transcribe episodes 
* Chunk data
* Embed it to Pinecone
* Test VectorDBQA chain on it 
 
`1. Get video urls -` 

In [None]:
import os
import requests
import pandas as pd
from youtubesearchpython import *
import yt_dlp
import logging

In [None]:
def clean_channel_title(s: str) -> str:
    """Turn a channel title into a lowercase, hyphen-separated slug.

    Spaces become hyphens; every character that is neither alphanumeric
    nor a hyphen is dropped.
    """
    hyphenated = s.lower().replace(" ", "-")
    kept_chars = [ch for ch in hyphenated if ch == "-" or ch.isalnum()]
    return "".join(kept_chars)

In [None]:
channel_title = 'Dog Training by Kikopup'
# Slug of the channel title, used for per-channel directory names later on.
# (The helper is `clean_channel_title`; calling `cleaned_channel_title` here
# raised a NameError on a fresh kernel.)
cleaned_channel_title = clean_channel_title(channel_title)

# Look the channel up by name and keep the top hit.
channelsSearch = ChannelsSearch(channel_title, limit = 1, region = 'US')
search_hits = channelsSearch.result()['result']
if not search_hits:
    raise ValueError(f"No YouTube channel found for {channel_title!r}")
channel_id = search_hits[0]['id']
print(f"Retrived {search_hits[0]['title']} channel ID: {channel_id}")

# https://pypi.org/project/youtube-search-python/
playlist = Playlist(playlist_from_channel_id(channel_id))

# The playlist is paginated; keep fetching until nothing is left.
while playlist.hasMoreVideos:
    print('Getting more videos...')
    playlist.getNextVideos()
    print(f'Videos Retrieved: {len(playlist.videos)}')

# Episode data
stor_metadata=pd.DataFrame()
for v in playlist.videos:
    try:
        # Read every field before touching the frame, so a video with a
        # missing field is skipped as a whole instead of leaving a row with
        # an empty 'img' that would break the download step.
        title = v['title']
        link = v['link']
        img = v['thumbnails'][3]['url']
    except (KeyError, IndexError, TypeError):
        failed_title = v.get('title', '<unknown title>') if isinstance(v, dict) else v
        print(f"Failed on {failed_title}")
        continue
    stor_metadata.loc[title,'link']=link
    stor_metadata.loc[title,'title']=title
    stor_metadata.loc[title,'img']=img

# Adds a numerical index to the dataframe which we'll use later
stor_metadata = stor_metadata.reset_index()

`2. Get audio -` 

In [None]:
def download_episode(ep_number, img_url, ep_link, channel_title):
    """Downloads the thumbnail and audio track of one episode of a YouTube channel.

    The thumbnail is written to ``../public/<slug>/0<ep_number>.jpg`` and the
    audio is handed to yt-dlp with the output template
    ``audio/<slug>/<ep_number>.m4a``, where ``<slug>`` is
    ``clean_channel_title(channel_title)``.

    Args:
        ep_number (int): The episode number.
        img_url (str): The URL of the episode image.
        ep_link (str): The URL of the episode video.
        channel_title (str): The title of the YouTube channel as shown on the channel.

    Returns:
        None.

    Raises:
        requests.HTTPError: If the thumbnail request returns an error status.
    """

    _cleaned_title = clean_channel_title(channel_title)

    # Create the image directory if needed (no exists-then-create race).
    directory_name = f"../public/{_cleaned_title}"
    os.makedirs(directory_name, exist_ok=True)

    # Fetch the image first and fail loudly on an HTTP error, so that neither
    # an empty file nor an HTML error page ends up saved as a .jpg.
    response = requests.get(img_url, timeout=30)
    response.raise_for_status()
    with open(f"{directory_name}/0{ep_number}.jpg", 'wb') as f:
        f.write(response.content)

    # Write the audio to the directory
    ydl_opts = {
        'format': 'm4a/bestaudio/best',
        'outtmpl': f'audio/{_cleaned_title}/{ep_number}.m4a',
        'noplaylist': True,
        'postprocessors': [{  
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'm4a',
        }]}
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        # download() takes a list of URLs and returns a non-zero code on failure.
        error_code = ydl.download([ep_link])

    # Log the episode information
    if error_code:
        logging.warning(f"yt-dlp returned code {error_code} for episode {ep_number}")
    else:
        logging.info(f"Downloaded episode {ep_number}")


# Iterate through episodes; the numeric row index doubles as the episode number.
for ix in stor_metadata.index:
    ep_number=ix
    print("EPISODE: %s"%ep_number)
    img_url=stor_metadata.loc[ix,'img']
    ep_link=stor_metadata.loc[ix,'link']
    # Write img and audio. `channel_title` is a required argument of
    # download_episode; omitting it raised a TypeError.
    download_episode(ep_number, img_url, ep_link, channel_title)

# Write the metadata to a CSV file. The slug is recomputed here so this cell
# does not depend on a variable from another cell, and the target directory
# is created first because to_csv does not create missing directories.
metadata_dir = f"audio_transcription/{clean_channel_title(channel_title)}"
os.makedirs(metadata_dir, exist_ok=True)
stor_metadata.to_csv(f"{metadata_dir}/episodes.csv")

`3. Run Whisper -`
 
* On GPU, ideally: 10-20 min / video on 2080Ti with `medium` model
* Run `python run_whisper.py`

If running this step on a remote machine:
* scp the episode metadata (not the transcripts themselves) to the remote machine: `audio_transcription/episodes.csv`
* scp the audio files: `audio/*`
* Run `python run_whisper.py`
* Then, scp the `audio_transcription/` back to local 

In [None]:
# Shell escape: runs the project's run_whisper.py script (its source is not part of this notebook).
! python run_whisper.py

`4. Get transcripts -`

In [None]:
# *** Chunk size: key parameter *** 
chunks = 1500
splits_new = [ ]
metadatas_new = [ ]

# The splitter settings are the same for every episode, so build it once.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunks, 
                                               chunk_overlap=50) 

# Read the csv file
# NOTE(review): the download step writes episodes.csv under
# audio_transcription/<channel-slug>/ while this cell reads it from
# audio_transcription/ directly — confirm where run_whisper.py leaves it.
new_ep=pd.read_csv("audio_transcription/episodes.csv",index_col=None)

# The metadata frame built earlier in this notebook has no 'number' column;
# episodes are numbered by row index there. Use 'number' when the CSV has it
# and fall back to the row index otherwise.
has_number_column = 'number' in new_ep.columns

for ix in new_ep.index:

    # Get data
    title=new_ep.loc[ix,'title']
    ep_number=int(new_ep.loc[ix,'number']) if has_number_column else int(ix)
    
    # Ep: transcripts are looked up as audio_transcription/0<ep_number>.txt
    episode_id="0"+str(ep_number) 
    file_path='audio_transcription/%s.txt'%str(episode_id)
    # Tab-separated, no header; three columns named below.
    transcript=pd.read_csv(file_path,sep='\t',header=None)
    transcript.columns=['links','time','chunks']
    
    # Clean text chunks 
    transcript['clean_chunks']=transcript['chunks'].astype(str).apply(lambda x: x.strip())
    links = list(transcript['links'])
    texts = transcript['clean_chunks'].str.cat(sep=' ')

    # Splits 
    splits = text_splitter.split_text(texts)
    print(len(splits)) 

    # Metadata: pick one link per split, evenly spaced over the transcript rows
    N = len(splits) 
    bins = np.linspace(0, len(links)-1, N, dtype=int)
    sampled_links = [links[i] for i in bins]
    
    # Here we can add "link", "title", etc that can be fetched in the app 
    metadatas=[{"source":title + " " +link,"id":episode_id,"link":link,"title":title} for link in sampled_links]
    print(len(metadatas)) 

    # Append to output 
    splits_new.append(splits)
    metadatas_new.append(metadatas)

`5. Assemble final list -`

In [None]:
# Flatten the per-episode lists into one list of chunks and one parallel
# list of metadata dicts.
splits_all = [chunk for episode_chunks in splits_new for chunk in episode_chunks]

metadatas_all = [meta for episode_metas in metadatas_new for meta in episode_metas]

`6. Embed full dataset in Pinecone VectorDB -`

In [None]:
# Pinecone: the API key is read from the PINECONE_API_KEY environment variable.
pinecone.init(api_key=os.environ.get('PINECONE_API_KEY'), environment="us-west4-gcp")

# Update - name of the already-existing index to write into
index_name = "zac-george-gpt"
embeddings = OpenAIEmbeddings()
p = Pinecone.from_existing_index(
    index_name=index_name,
    embedding=embeddings,
)

In [None]:
# Upload the texts in fixed-size batches to avoid data ingest errors.
# `last_chunk` is the first batch number to upload.
chunk_size = 100
last_chunk = 0
num_chunks = math.ceil(len(splits_all) / chunk_size)
for i in range(last_chunk, num_chunks):

    print(i)
    start_time = time.time()

    # Rows of this batch; the upper bound is clamped to the number of texts.
    batch = slice(i * chunk_size, min((i + 1) * chunk_size, len(splits_all)))

    # Add the current batch (texts plus their metadata) to the vector database
    p.add_texts(texts=splits_all[batch], metadatas=metadatas_all[batch])

    print(f"Elapsed time: {time.time() - start_time} seconds")
    print("--------")

`7. Read in VectorDB for testing` 

In [None]:
# Pinecone: reconnect to the existing index for testing.
# The API key is read from the PINECONE_API_KEY environment variable.
pinecone.init(api_key=os.environ.get('PINECONE_API_KEY'), environment="us-west4-gcp")

index_name = "zac-george-gpt"
embeddings = OpenAIEmbeddings()
p = Pinecone.from_existing_index(
    index_name=index_name,
    embedding=embeddings,
)

In [None]:
def _ask_and_report(chain, query, start_time):
    """Run `chain` on `query` and print the answer, its sources and the time elapsed since `start_time`."""
    a = chain({"question": query},return_only_outputs=True)
    print(a["answer"])
    print(a["sources"])
    elapsed_time = time.time() - start_time
    print(f"Elapsed time: {elapsed_time} seconds")
    print("--------")


def run_retrievalQA_sources_chain(llm,query,docstore,k=3):
    """Answer `query` with a "stuff" RetrievalQAWithSourcesChain and print the result.

    Args:
        llm: Language model passed to the chain.
        query (str): The question to ask.
        docstore: Vector store; its retriever supplies the documents.
        k (int): Number of documents to retrieve (default 3).

    Returns:
        None. The answer, sources and elapsed time are printed.
    """
    # Timing includes chain construction.
    start_time = time.time()
    # The number of results is configured through `search_kwargs`; a bare
    # `k=` keyword to as_retriever is not a retriever setting.
    retriever = docstore.as_retriever(search_kwargs={"k": k})
    chain = RetrievalQAWithSourcesChain.from_chain_type(llm,chain_type="stuff",retriever=retriever)
    _ask_and_report(chain, query, start_time)


def run_vectorDBQA_sources_chain(llm,query,docstore,k):
    """Answer `query` with a "stuff" VectorDBQAWithSourcesChain and print the result.

    Args:
        llm: Language model passed to the chain.
        query (str): The question to ask.
        docstore: Vector store passed to the chain as `vectorstore`.
        k (int): Passed to the chain as `k`.

    Returns:
        None. The answer, sources and elapsed time are printed.
    """
    # Timing includes chain construction.
    start_time = time.time()
    chain = VectorDBQAWithSourcesChain.from_chain_type(llm,chain_type="stuff",vectorstore=docstore,k=k)
    _ask_and_report(chain, query, start_time)

In [None]:
# Smoke test with the default OpenAIChat model and k=4.
llm = OpenAIChat(temperature=0)
q = "What are the first things I need to train my new dog?"
run_vectorDBQA_sources_chain(llm, q, p, k=4)

In [None]:
# Smoke test with gpt-4 and k=8.
llm = OpenAIChat(model_name="gpt-4", temperature=0)
q = "What is the first thing to do when I get a new puppy?"
run_vectorDBQA_sources_chain(llm, q, p, k=8)

`8. Evaluation` 

In [None]:
import json

# Load the evaluation set; later cells read the "question" and "answer"
# keys of each entry.
with open('eval/final_eval.json', 'r') as eval_file:
    eval_set = json.load(eval_file)

In [None]:
from langchain.chains import VectorDBQA
from langchain.evaluation.qa import QAEvalChain
from langchain.chat_models import ChatOpenAI 


def _make_qa_chain(model, top_k):
    """Build a "stuff" VectorDBQA chain over `p` that reads its input from the "question" key."""
    return VectorDBQA.from_chain_type(model,chain_type="stuff",vectorstore=p,k=top_k,input_key = "question")


# Default OpenAIChat model at k=1 and k=4
llm = OpenAIChat(temperature=0)
chain_gpt3_k_1 = _make_qa_chain(llm, 1)
chain_gpt3_k_4 = _make_qa_chain(llm, 4)

# gpt-4 at k=1, 4 and 8
llm = OpenAIChat(model_name="gpt-4",temperature=0)
chain_gpt4_k_1 = _make_qa_chain(llm, 1)
chain_gpt4_k_4 = _make_qa_chain(llm, 4)
chain_gpt4_k_8 = _make_qa_chain(llm, 8)

# Grader used to score the predictions
eval_chain = QAEvalChain.from_llm(llm=ChatOpenAI(temperature=0))

In [None]:
def run_eval(chain, dataset=None):
    """Run `chain` over an evaluation set and time each query.

    Args:
        chain: Callable invoked once per example with a dict holding the
            example's "question" and "answer".
        dataset: Iterable of dicts with "question" and "answer" keys.
            Defaults to the notebook-level `eval_set`.

    Returns:
        tuple: (predictions, predicted_dataset, latency) — the chain outputs,
        the dicts that were fed to the chain, and the per-query elapsed time
        in seconds, all in dataset order.
    """
    if dataset is None:
        dataset = eval_set

    predictions = []
    predicted_dataset = []
    latency = []

    for data in dataset:

        # perf_counter is monotonic, so the interval cannot be distorted by
        # system clock adjustments the way time.time() can.
        start_time = time.perf_counter()
        new_data = {"question": data["question"],"answer": data["answer"]}
        predictions.append(chain(new_data))
        predicted_dataset.append(new_data)
        latency.append(time.perf_counter() - start_time)

    return predictions,predicted_dataset,latency

predictions_list = []
scores_list = []
latency_list = []

# Eval on chains.
# The order must match the labels used by the result cells further down:
# position 0 -> "GPT3.5_k_4", 1 -> "GPT4_k_4", 2 -> "GPT4_k_8".
# Evaluating the k=1 chains here as well shifted every position, so the
# results were mislabelled and the three-name latency column assignment
# failed on a five-column frame. To report k=1 too, add the chains here AND
# extend the label lists in the result cells.
chains_to_eval = [chain_gpt3_k_4,chain_gpt4_k_4,chain_gpt4_k_8]
for i,chain in enumerate(chains_to_eval):
    print(f"Evaluating chain {i+1}")
    predictions,predicted_dataset,latency=run_eval(chain)
    predictions_list.append(predictions)
    # Grade each prediction (read from its "result" key) against the reference answer.
    graded_outputs = eval_chain.evaluate(predicted_dataset, predictions, question_key="question", prediction_key="result")
    scores_list.append(graded_outputs)
    latency_list.append(latency)

In [None]:
# Results: count the answers graded exactly 'INCORRECT' for each chain.
stor=pd.DataFrame()

# The loop variable used to be called `chunk_size`, which overwrote the
# ingestion batch-size global with a label string; it holds a chain label.
for i,chain_label in enumerate(["GPT3.5_k_4","GPT4_k_4","GPT4_k_8"]):
    graded = scores_list[i]
    num_incorrect = sum(1 for grade in graded if grade['text'] == 'INCORRECT')
    stor.loc[chain_label,'num_incorrect']=num_incorrect

stor['pct_incorrect'] = stor['num_incorrect']  / len(eval_set)
stor['pct_correct'] = 1 - stor['pct_incorrect']
stor['pct_correct'].plot(kind='bar')
plt.title('Percentage of Correct Answers')
plt.xlabel('Chain')
plt.ylabel('Percentage')
plt.show()

In [None]:
# One column of per-query latencies per evaluated chain.
latency=pd.DataFrame(latency_list).T
latency.columns = ["GPT3.5_k_4","GPT4_k_4","GPT4_k_8"]
# to_csv does not create missing directories.
os.makedirs("results", exist_ok=True)
latency.to_csv("results/latency.csv")
latency.boxplot()
plt.xlabel("Model")
plt.ylabel("Latency per query (seconds)")
# The LaTeX part is a raw string: "\m" is an invalid escape sequence in a
# normal string literal. The rendered title is unchanged.
# NOTE(review): the means and N in the title are hard-coded, not computed
# from `latency` — confirm they match the current run.
plt.title("Latency for QA comparing ChatGPT vs GPT4 \n " r"$\mu$ per model = 4.7s,13.3s,19.1s, $N=52$")

In [None]:
def eval_summary(i):
    """Return the predictions of evaluated chain `i` as a DataFrame with the grader's verdict in a 'score' column."""
    summary = pd.DataFrame(predictions_list[i])
    summary['score'] = [grade["text"] for grade in scores_list[i]]
    return summary

# One summary frame per evaluated chain, by position in the result lists.
GPT35_k_4_result = eval_summary(0)
GPT4_k_4_result = eval_summary(1)
GPT4_k_8_result = eval_summary(2)

In [None]:
# Persist each summary frame under results/.
for result_frame, csv_path in (
    (GPT35_k_4_result, "results/GPT35_k_4_result.csv"),
    (GPT4_k_4_result, "results/GPT4_k_4_result.csv"),
    (GPT4_k_8_result, "results/GPT4_k_8_result.csv"),
):
    result_frame.to_csv(csv_path)

In [None]:
# Rows whose grade is anything other than "CORRECT", exported for manual review.
not_correct = GPT35_k_4_result["score"] != "CORRECT"
wrong3_5 = GPT35_k_4_result.loc[not_correct]
wrong3_5.to_csv("results/wrong3_5.csv")

In [None]:
# Rows whose grade is anything other than "CORRECT", exported for manual review.
not_correct = GPT4_k_4_result["score"] != "CORRECT"
wrong4 = GPT4_k_4_result.loc[not_correct]
wrong4.to_csv("results/wrong4_k4.csv")

In [None]:
# Rows of the k=8 run whose grade is anything other than "CORRECT".
# This cell previously filtered GPT4_k_4_result (copy-paste slip), so
# wrong4_k8.csv was a duplicate of wrong4_k4.csv.
wrong4=GPT4_k_8_result[GPT4_k_8_result.score != "CORRECT"]
wrong4.to_csv("results/wrong4_k8.csv")

In [None]:
# Reload the saved summary frames from results/.
GPT35_k_4_result, GPT4_k_4_result, GPT4_k_8_result = (
    pd.read_csv(csv_path)
    for csv_path in (
        "results/GPT35_k_4_result.csv",
        "results/GPT4_k_4_result.csv",
        "results/GPT4_k_8_result.csv",
    )
)

The accuracy figures below include manual corrections to the automated eval grades (see the spreadsheet linked below):

https://docs.google.com/spreadsheets/d/1zc3lmm23lRkbU0k0j3ueo69_s5RBb-1qbXGwnx0PW00/edit#gid=1347589790

In [None]:
# Hand-entered fraction of correct answers per chain: a single-row frame
# (`d`) and its transpose with one row per chain (`d_`).
d = pd.DataFrame(
    [[48/51., 50/51., 51/51.]],
    columns=["GPT3.5_k_4","GPT4_k_4","GPT4_k_8"],
)
d_ = d.transpose()

In [None]:
# Side-by-side figure: fraction correct (left) and latency distribution (right).
latency = pd.read_csv("results/latency.csv",index_col=None)
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
# NOTE(review): both titles say N=52 while the fractions in `d` are out of 51,
# and the percentages / means are hard-coded — confirm against the data.
axs[0].set_title("Percentage of Correct Answers \n % per model: 94%, 98%, 100%, $N=52$")
# The LaTeX part is a raw string: "\m" is an invalid escape sequence in a
# normal string literal. The rendered title is unchanged.
axs[1].set_title("Latency for QA comparing ChatGPT vs GPT4 \n " r"$\mu$ per model: 4.7s,13.3s,19.1s, $N=52$")
axs[0].set_xlabel('Chain')
axs[0].set_ylabel('Fraction correct')
axs[1].set_xlabel("Model")
axs[1].set_ylabel("Latency per query (seconds)")
d_.plot(kind='bar', ax=axs[0], legend=False)
# Select the model columns explicitly; the CSV also carries the saved index column.
latency[["GPT3.5_k_4","GPT4_k_4","GPT4_k_8"]].boxplot(ax=axs[1])
plt.show()