In [1]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
import sys
sys.path.append('../../')


from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv(), override=True)

In [161]:
from src.preprocessor.preprocessing import FileIO, Utilities
from src.llm.llm_interface import LLM
from src.llm.llm_utils import get_token_count, load_azure_openai
from tqdm import tqdm
import tiktoken
from math import ceil

import os
import asyncio
import nest_asyncio
nest_asyncio.apply()

In [25]:
long_summary = '''
In this episode of the Huberman Lab podcast, Dr. Andy Galpin, Professor of Kinesiology at Cal State 
University Fullerton, joins host Andrew Huberman to discuss optimal protocols for building strength and muscle 
growth. They begin by highlighting the importance of strength and hypertrophy training for everyone, not just 
athletes or those looking to build big muscles. They emphasize the benefits of strength training for neuromuscular 
aging, as strength and power decline faster than muscle mass with age. Additionally, they explain that strength 
training is the only exercise route to keep the nervous system healthy and young. They discuss the various 
adaptations that occur in the body to increase strength, such as improvements in neuromuscular activation, muscle 
contraction, and movement mechanics. 
'''

In [26]:
get_token_count(long_summary)

160

In [7]:
data_path = '../../data/huberman_labs.json'
data = FileIO.load_json(data_path)

In [8]:
def extract_fields(data: list[dict], 
                   extract_keys: list[str]=['title','guest', 'content'], 
                  ) -> list[dict]:
    extracted_fields = []
    for d in data:
        extracted_fields.append({k:v for k,v in d.items() if k in extract_keys})
    return extracted_fields

In [9]:
episodes = extract_fields(data)

In [61]:
short_summary_extraction_system = '''
You are an expert at extracting high-level insights and important concepts from detailed reports or transcripts. 
Your main goal in life is to summarize the concepts found within episode transcripts from the Huberman Lab podcast. 
To aid in your summarization, context will be provided to you which will include the episode title as well as the guest.  
Do not include any extraneous information from the transcript such as giving thanks to corporate sponors.  
'''

In [206]:
long_summary_extraction_system = '''
Provide a comprehensive summary of the following Huberman Lab podcast episode.  
Ensure the summary includes relevant details and examples that support the main ideas, while avoiding 
any unnecessary information or repetition. The summary should provide a clear and accurate 
overview without omitting any important information. For context you will also be provided the 
episode title as well as the guest.  Do not include any extraneous information from the transcript such 
as giving thanks to corporate sponors.  Ensure that the summary is no more than 250 tokens.
'''

In [100]:
user_prompt = '''
Summarize at a high level, the content of the transcript. Do not go over 5 sentences in length. It is very important 
that you summarize the material in a condensed format. 
```
Title:\n
{title}
---------
Guest:\n
{guest}
---------
Transcript:\n
{transcript}
---------
```
Summary:
'''

In [91]:
def generate_user_prompt(data: list[dict], episode_num: int, n: int):
    title, guest, transcript = data[episode_num]['title'], data[episode_num]['guest'], data[episode_num]['content']
    return user_prompt.format(title=title, guest=guest, transcript=transcript, n=n)

In [80]:
# llm = LLM(model_name='azure/gpt-35-turbo-16k',
#           api_key=api_key,
#           api_version=api_version,
#           api_base=api_base)

llm = load_azure_openai(model_name='gpt-4')

In [117]:
def validate_token_count(system_message: str,
                         user_message: str,
                         threshold: int,
                         encoding: str='cl100k_base',
                         verbose: bool=True
                        ) -> str:
    '''
    Ensures that the length of the user_message does not 
    exceed the user-defined token threshold
    '''    
    system_tokens = get_token_count(system_message, return_tokens=True, verbose=False)
    user_tokens = get_token_count(user_message, return_tokens=True, verbose=False)
    sys_token_length, user_token_length = len(system_tokens), len(user_tokens)
    if verbose:
        print(f'Total system tokens: {sys_token_length}')
        print(f'Initial user tokens: {user_token_length}')
    if sys_token_length + user_token_length > threshold:
        user_tokens = user_tokens[:threshold - sys_token_length]
        encoder = tiktoken.get_encoding(encoding)
        user_message = encoder.decode(user_tokens)
        user_token_length = len(get_token_count(user_message, return_tokens=True, verbose=False))
    if verbose:
        print(f'Final user tokens: {user_token_length}')
        print(f'Total input tokens: {sys_token_length + user_token_length}')
    return user_message
        

In [119]:
test_prompt = generate_user_prompt(data, 10, 200)

In [122]:
user_ = validate_token_count(short_summary_extraction_system, test_prompt, threshold=16000, verbose=False)

In [115]:
# print(user_prompt)

In [202]:
async def get_summary(episode: dict, 
                      user_base: str, 
                      system_message: str,
                      max_tokens: int=200,
                      threshold: int=16000
                     ) -> str:
    title = episode['title']
    guest = episode['guest']
    transcript = episode['content']
    user_message = user_base.format(title=title, guest=guest, transcript=transcript, n=max_tokens)
    user_message = validate_token_count(system_message, user_message, threshold, verbose=False)
    summary = await llm.achat_completion(system_message, user_message, max_tokens=max_tokens, temperature=0, raw_response=False)
    return summary

In [203]:
user_prompt

'\nSummarize at a high level, the content of the transcript. Do not go over 5 sentences in length. It is very important \nthat you summarize the material in a condensed format. \n```\nTitle:\n\n{title}\n---------\nGuest:\n\n{guest}\n---------\nTranscript:\n\n{transcript}\n---------\n```\nSummary:\n'

In [216]:
test_sum = await get_summary(episodes[98], user_prompt, long_summary_extraction_system)

In [217]:
test_sum

'In the Huberman Lab Podcast #94 titled "Fitness Toolkit: Protocol & Tools to Optimize Physical Health," Andrew Huberman discusses science-based tools to optimize physical health, covering various fitness aspects such as endurance, strength, flexibility, hypertrophy, and more. He proposes a general fitness protocol that individuals can adapt to pursue specific fitness goals such as increased strength or endurance. Huberman outlines his workout schedule, emphasizing training large muscle groups like legs and adjusting for recovery and biomechanical safety with activities such as heat-cold contrast sessions for recovery.\n\nThe podcast also explores the benefits of long endurance workouts, the importance of resistance training coupled with tailored sets and repetitions for hypertrophy or strength gains, and the significance of incorporating neck exercises for overall stability and safety. Huberman reinforces the necessity of balancing fitness types throughout the week, providing a founda

In [145]:
async def summary_tasks(episodes: list[dict], 
                        user_base: str, 
                        system_message: str, 
                        max_tokens: int=200,
                        threshold: int=16000, 
                        batch_size: int=10):
    batches = ceil(len(episodes)/batch_size)
    summaries = []
    for i in tqdm(range(batches)):
        batch = episodes[i*batch_size:(i+1)*batch_size]
        tasks = [get_summary(episode, user_base, system_message, max_tokens, threshold) for episode in batch]
        responses = await asyncio.gather(*tasks)
        summaries.extend(responses)
        # await asyncio.sleep(60)
    return summaries

In [199]:
%%time
num_53 = asyncio.run(summary_tasks(episodes[53:54], user_prompt, short_summary_extraction_system, threshold=16000, batch_size=10))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [01:05<00:00, 65.79s/it]

CPU times: user 95.6 ms, sys: 7.19 ms, total: 103 ms
Wall time: 1min 5s





In [141]:
first_20 = test_run + test_run2

In [148]:
mid = first_20 + subset_20_100

In [167]:
final = mid + last_93

In [219]:
final_98 = '''
In the Huberman Lab Podcast #94 titled "Fitness Toolkit: Protocol & Tools to Optimize Physical Health," Andrew Huberman discusses science-based tools to optimize physical health, covering various fitness aspects such as endurance, strength, flexibility, hypertrophy, and more. He proposes a general fitness protocol that individuals can adapt to pursue specific fitness goals such as increased strength or endurance. Huberman outlines his workout schedule, emphasizing training large muscle groups like legs and adjusting for recovery and biomechanical safety with activities such as heat-cold contrast sessions for recovery.\n\nThe podcast also explores the benefits of long endurance workouts, the importance of resistance training coupled with tailored sets and repetitions for hypertrophy or strength gains, and the significance of incorporating neck exercises for overall stability and safety. Huberman reinforces the necessity of balancing fitness types throughout the week, providing a foundation for full-body health.\n\nAdditionally, he announces the launch of the Huberman Lab Podcast premium channel, offering in-depth responses to listener questions and exclusive content.
'''

In [224]:
final[98] 

'In the Huberman Lab Podcast #94 titled "Fitness Toolkit: Protocol & Tools to Optimize Physical Health," Andrew Huberman discusses science-based tools to optimize physical health, covering various fitness aspects such as endurance, strength, flexibility, hypertrophy, and more. He proposes a general fitness protocol that individuals can adapt to pursue specific fitness goals such as increased strength or endurance. Huberman outlines his workout schedule, emphasizing training large muscle groups like legs and adjusting for recovery and biomechanical safety with activities such as heat-cold contrast sessions for recovery.\n\nThe podcast also explores the benefits of long endurance workouts, the importance of resistance training coupled with tailored sets and repetitions for hypertrophy or strength gains, and the significance of incorporating neck exercises for overall stability and safety. Huberman reinforces the necessity of balancing fitness types throughout the week, providing a founda

In [169]:
FileIO.save_as_json('/tmp/short_summaries.json', final, overwrite=True)

[32m2024-04-26 04:15:39.948[0m | [1mINFO    [0m | [36msrc.preprocessor.preprocessing[0m:[36msave_as_json[0m:[36m111[0m - [1mData saved as json file here: /tmp/short_summaries.json[0m


In [170]:
old_summaries = [d['summary'] for d in data]

In [225]:
import pandas as pd

old_lens = list(map(len, old_summaries))
new_lens = list(map(len, final))
old_df = pd.DataFrame(old_lens)
new_df = pd.DataFrame(new_lens)

In [95]:
for i, d in enumerate(episodes):
    guest = d['guest']
    print(f'GUEST: {guest}')
    print(final[i])
    print('-'*100)

GUEST: Dr. Matthew Walker
In this episode of the Huberman Lab podcast, Andrew Huberman interviews Dr. Matthew Walker, a professor of neuroscience and psychology, and the director of the Center for Sleep Science at the University of California, Berkeley. The episode marks the beginning of a six-episode series on sleep. In this episode, they discuss the importance of sleep and what happens when we don't get enough quality sleep. They also touch on the different sleep stages, such as non-rapid eye movement (non-REM) sleep and rapid eye movement (REM) sleep. They explore the theory that yawning is a way for the brain to cool down and the impact of body position on sleep. They also discuss the concept of "great sleep" versus "mediocre sleep" and the consequences of not getting enough quality sleep. Overall, the episode provides a comprehensive introduction to the topic of sleep and sets the stage for the upcoming episodes in the series, which will cover various aspects of sleep and offer pr

In [231]:
FileIO.save_as_json('../../data/short_summaries.json', final)

[32m2024-04-26 04:36:28.667[0m | [1mINFO    [0m | [36msrc.preprocessor.preprocessing[0m:[36msave_as_json[0m:[36m111[0m - [1mData saved as json file here: ../../data/short_summaries.json[0m


### Combine Metadata

In [233]:
raw_data = FileIO().load_json('../../data/huberman_labs.json')

In [235]:
raw_data[0]['summary']

'In this episode of the Huberman Lab podcast, Andrew Huberman interviews Dr. Matthew Walker, a professor of neuroscience and psychology, and the director of the Center for Sleep Science at the University of California, Berkeley. The episode marks the beginning of a six-episode series on sleep. In this episode, they discuss the importance of sleep and what happens when we don\'t get enough quality sleep. They also touch on the different sleep stages, such as non-rapid eye movement (non-REM) sleep and rapid eye movement (REM) sleep. They explore the theory that yawning is a way for the brain to cool down and the impact of body position on sleep. They also discuss the concept of "great sleep" versus "mediocre sleep" and the consequences of not getting enough quality sleep. Overall, the episode provides a comprehensive introduction to the topic of sleep and sets the stage for the upcoming episodes in the series, which will cover various aspects of sleep and offer practical tools for improv

In [236]:
for i, d in enumerate(raw_data):
    d['summary'] = final[i]

In [238]:
raw_data[-1]['summary']

"In the inaugural episode of the Huberman Lab Podcast, host Andrew Huberman, a Stanford professor, introduces the show's unique approach to exploring neuroscience and its applications in daily life. The podcast will delve deeply into a single scientific topic each month, discussing the latest research, unknowns, and practical tools for improvement in areas such as motivation, focus, and other aspects of brain function. Huberman highlights the podcast's interactive nature, encouraging listener participation through comments and suggestions, which will guide future content. Additionally, he outlines the types of tools to be discussed, ranging from behavioral practices to technological interventions. Lastly, Huberman introduces his bulldog mastiff, Costello, as a quirky background presence in the podcast."

In [239]:
FileIO.save_as_json('../../data/huberman_labs.json', raw_data, overwrite=True)

[32m2024-04-26 04:38:38.802[0m | [1mINFO    [0m | [36msrc.preprocessor.preprocessing[0m:[36msave_as_json[0m:[36m111[0m - [1mData saved as json file here: ../../data/huberman_labs.json[0m
Bad pipe message: %s [b"\x0f\xdb \xcdQwm\x91Y\xa9\xc0\x17\x99+\xdf\xb0\xb6\xac\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0#\xc0'\x00g\x00@\xc0\n\xc0\x14\x009\x008\xc0\t\xc0\x13\x003\x002\x00\x9d"]
Bad pipe message: %s [b"cl'%>\xab\xcaoch\xb8<\xa1\xce\xe0\xe0\x86w\x00\x00\xa2\xc0\x14\xc0\n\x009\x008\x007\x006\x00\x88\x00\x87\x00\x86\x00\x85\xc0\x19\x00:\x00\x89\xc0\x0f\xc0\x05\x005\x00\x84\xc0\x13\xc0\t\x003\x002\x001\x000\x00\x9a\x00\x99\x00\x98\x00\x97\x00E\x00D\x00C\x00B\xc0\x18\x004\x00\x9b\x00F\xc0\x0e\xc0\x04\x00/\x00\x96\x00A\x00\x07\xc0\x11\xc0\x07\xc0\x16\x00\x18\xc0\x0c\xc0\x02\x00\x05\x00\x04\xc0\x12\xc0\x08\x00\x

## Rearrange ordering of data keys

In [109]:
#separate contents
contents = [d['content'] for d in raw_data]

In [110]:
#create list of dicts with content removed
others = [{k:v for k,v in d.items() if k != 'content'} for d in raw_data]

In [111]:
#update list of dicts with content added back in 
[d.update(content=contents[i]) for i, d in enumerate(others)]

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [114]:
FileIO().save_as_json('../../data/huberman_labs.json', others, overwrite=True)

[32m2024-04-17 11:35:05.098[0m | [1mINFO    [0m | [36msrc.preprocessor.preprocessing[0m:[36msave_as_json[0m:[36m107[0m - [1mData saved as json file here: ../../data/huberman_labs.json[0m
