In [1]:
from urllib.parse import urlparse, parse_qs
import whisper_timestamped as whisper

from pytube import YouTube

import whisper

import json
import os


Importing the dtw module. When using in academic works please cite:
  T. Giorgino. Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package.
  J. Stat. Soft., doi:10.18637/jss.v031.i07.



In [9]:
# Directory handed to YouTubeTranscriber as `output_path_youtube` (downloaded audio).
output_path_youtube = "YoutubeAudios"
# Directory handed to YouTubeTranscriber as `output_path_transcription`.
output_path_transcription = "transcriptions"


In [33]:
from pytube import YouTube
import json
import whisper  # Assuming you have a package or module named whisper
from logger import logger

class YouTubeTranscriber:
    """Download a YouTube video's audio, transcribe it with Whisper and save the result as JSON.

    The individual steps can be called one by one, or all together through
    :meth:`run`. Steps depend on state left behind by earlier ones
    (``video_id`` -> ``filename`` -> ``transcription``), so when calling them
    manually keep the same order as :meth:`run`.
    """

    def __init__(self, url, output_path_youtube, output_path_transcription):
        """Create a transcriber for a single video.

        Args:
            url: URL of the YouTube video; also passed to ``pytube.YouTube``.
            output_path_youtube: Directory that receives the downloaded audio file.
            output_path_transcription: Directory that receives the transcription JSON.
        """
        self.output_path_youtube = output_path_youtube
        self.yt = YouTube(url)
        self.transcription = None
        self.url = url
        # Filled in by extract_main_domain_and_video_id().
        self.video_id = None
        # Filled in by download_youtube().
        self.filename = None
        self.filename_path = None
        self.output_path_transcription = output_path_transcription

    def extract_main_domain_and_video_id(self):
        """Derive ``self.video_id`` as ``"<main domain>_<video id>"`` from ``self.url``.

        The main domain is the second-to-last dot-separated label of the host
        (``None`` when the host has fewer than two labels). The video id is the
        ``v`` query parameter; when the URL has none (short or embed style
        links), the last path segment is used instead.
        """
        parsed_url = urlparse(self.url)
        domain_parts = parsed_url.netloc.split('.')
        main_domain = domain_parts[-2] if len(domain_parts) >= 2 else None

        video_id = parse_qs(parsed_url.query).get('v', [None])[0]
        if video_id is None:
            # Links such as https://youtu.be/<id> carry the id in the path; without
            # this fallback every such video would be named "<domain>_None".
            path_parts = [part for part in parsed_url.path.split('/') if part]
            if path_parts and path_parts[-1] != "watch":
                video_id = path_parts[-1]

        self.video_id = f"{main_domain}_{video_id}"

    def download_youtube(self):
        """Download the first audio-only stream to ``output_path_youtube``.

        The file is saved as ``"<video_id>.mp3"``; the name is stored in
        ``self.filename`` and the full path in ``self.filename_path``.

        Raises:
            RuntimeError: If the video exposes no audio-only stream.
        """
        # NOTE(review): the ".mp3" name is applied regardless of the stream's real
        # container format — confirm downstream tools do not rely on the extension.
        self.filename = f"{self.video_id}.mp3"

        audio_stream = self.yt.streams.filter(only_audio=True).first()
        if audio_stream is None:
            raise RuntimeError(f"No audio-only stream available for {self.url}")

        audio_stream.download(output_path=self.output_path_youtube, filename=self.filename)
        self.filename_path = f"{self.output_path_youtube}/{self.filename}"
        logger.info(f"Audio downloaded to {self.output_path_youtube}/{self.filename}")

    def transcribe_audio(self, model_name, device):
        """Transcribe the downloaded audio and store the result in ``self.transcription``.

        Args:
            model_name: Model identifier forwarded to ``whisper.load_model``.
            device: Device forwarded to ``whisper.load_model`` (e.g. ``"cpu"``).
        """
        audio = whisper.load_audio(f"{self.output_path_youtube}/{self.filename}")
        model = whisper.load_model(model_name, device=device)
        self.transcription = whisper.transcribe(model, audio)

    def write_to_json(self):
        """Write ``self.transcription`` to ``<output_path_transcription>/<video_id>.json``.

        The target directory is created when it does not exist yet.
        """
        # Write to the transcription directory (the location the log message reports),
        # not to the audio directory.
        os.makedirs(self.output_path_transcription, exist_ok=True)
        with open(f"{self.output_path_transcription}/{self.video_id}.json", 'w', encoding="utf-8") as f:
            json.dump(self.transcription, f)
        logger.info(f"Transcription downloaded to {self.output_path_transcription}/{self.video_id}.json")

    def merge_segments(self, num_to_merge):
        """Group consecutive transcription segments into coarser ones.

        Every run of ``num_to_merge`` segments (the last run may be shorter) is
        collapsed into one dict with the space-joined ``text``, the ``start`` of
        the first segment and the ``end`` of the last one, both truncated to
        whole numbers with ``int``. The list is stored under
        ``self.transcription["merged_segments"]``.

        Args:
            num_to_merge: Number of segments per merged entry; must be >= 1.

        Raises:
            ValueError: If ``num_to_merge`` is smaller than 1.
        """
        if num_to_merge < 1:
            raise ValueError(f"num_to_merge must be a positive integer, got {num_to_merge!r}")

        segments = self.transcription["segments"]
        merged_segments = []
        for i in range(0, len(segments), num_to_merge):
            chunk = segments[i:i + num_to_merge]
            merged_segments.append({
                'text': " ".join(item['text'] for item in chunk),
                'start': int(chunk[0]['start']),
                'end': int(chunk[-1]['end']),
            })

        self.transcription["merged_segments"] = merged_segments

    def run(self, num_to_merge=4, model_name="base", device="cpu"):
        """Run the whole pipeline: id extraction, download, transcription, merge, JSON export.

        Args:
            num_to_merge: Number of segments per merged entry (see merge_segments).
            model_name: Whisper model name (see transcribe_audio).
            device: Device used for the model (see transcribe_audio).
        """
        logger.info("extract_main_domain_and_video_id")
        self.extract_main_domain_and_video_id()

        logger.info("download_youtube")
        self.download_youtube()

        logger.info("transcribe_audio")
        self.transcribe_audio(model_name=model_name,
                              device=device)

        logger.info("merge_segments")
        self.merge_segments(num_to_merge)

        logger.info("write_to_json")
        self.write_to_json()
        


# Usage
# Alias of the audio directory; not used by the statements in this cell.
output_path = output_path_youtube
# Alternative sample video: https://www.youtube.com/watch?v=5p248yoa3oE
url = "https://www.youtube.com/watch?v=UyoXmHS-KGc"
yt_transcriber = YouTubeTranscriber(
    url=url,
    output_path_youtube=output_path_youtube,
    output_path_transcription=output_path_transcription,
)

# Runs the full pipeline with the defaults declared on run().
yt_transcriber.run()

2023-09-29 17:11:22,359 ./logs/auto-labeler INFO extract_main_domain_and_video_id [3598473496.py]
2023-09-29 17:11:22,360 ./logs/auto-labeler INFO download_youtube [3598473496.py]
2023-09-29 17:11:23,525 ./logs/auto-labeler INFO Audio downloaded to YoutubeAudios/youtube_UyoXmHS-KGc.mp3 [3598473496.py]
2023-09-29 17:11:23,526 ./logs/auto-labeler INFO transcribe_audio [3598473496.py]
2023-09-29 17:11:41,418 ./logs/auto-labeler INFO merge_segments [3598473496.py]
2023-09-29 17:11:41,420 ./logs/auto-labeler INFO write_to_json [3598473496.py]
2023-09-29 17:11:41,424 ./logs/auto-labeler INFO Transcription downloaded to transcriptions/youtube_UyoXmHS-KGc.json [3598473496.py]


In [32]:
import json

In [84]:
# Define the path to the JSON file
# NOTE(review): this "002"-prefixed file name does not match the "<video_id>.json"
# name that YouTubeTranscriber.write_to_json produces; presumably the file was
# renamed by hand — confirm, otherwise this cell fails on a fresh kernel run.
file_path = "YoutubeAudios/002youtube_UyoXmHS-KGc.json"

# Open the file for reading
with open(file_path, 'r') as f:
    data = json.load(f)

# LangChain

In [34]:
from dotenv import load_dotenv, find_dotenv

# Load variables from the .env file located by find_dotenv(); the return value is
# bound to `_` so the cell does not display it.
# NOTE(review): presumably this supplies the API tokens used by the LLM cells below — confirm.
_ = load_dotenv(find_dotenv())


In [35]:
from langchain.llms import HuggingFaceHub

In [42]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
)

In [37]:
# Sample question for a step-by-step prompt experiment.
# NOTE(review): the only visible consumer of `prompt`/`question` is a commented-out
# LLMChain cell further down — confirm whether this cell is still needed.
question = "Who won the FIFA World Cup in the year 1994? "

# Prompt text with a single {question} placeholder.
template = """Question: {question}

Answer: Let's think step by step."""

prompt = PromptTemplate(template=template, input_variables=["question"])

In [57]:
# repo_id = "mistralai/Mistral-7B-Instruct-v0.1"  # See https://huggingface.co/models?pipeline_tag=text-generation&sort=downloads for some other options

In [64]:
# llm_chain = LLMChain(prompt=prompt, llm=llm)

# print(llm_chain.run(question))

In [88]:
from langchain.llms import Replicate
import openai
# Instruction prompt for the summarizer; {transcription} is replaced with the
# transcript text when the chain runs. (Rebinds the `template` name used by the
# earlier question prompt.)
template = """You're the best summarizer of video talks, either webinars, talks, coaching, you can find the most relevant information
from the transcription of a video talk:

{transcription}


your response: """

# Define the human message prompt template.
human_message_prompt = HumanMessagePromptTemplate(
    prompt=PromptTemplate(
        template=template,
        input_variables=["transcription"],
    )
)

# Create the chat prompt template.
chat_prompt_template = ChatPromptTemplate.from_messages([human_message_prompt])

# NOTE(review): the recorded output of this cell stops mid-sentence, which suggests
# "max_length" is not the generation-length parameter this model version honours —
# confirm against the model's input schema (it may expect "max_new_tokens").
llm = Replicate(
    model="meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3",
    model_kwargs={"temperature": 0.9, "max_length": 4000, "top_p": 1},
)
chat = LLMChain(llm=llm, prompt=chat_prompt_template, verbose=True)

# Summarize the full transcript text loaded into `data` by the JSON-loading cell;
# as the last expression, the returned summary is displayed as the cell output.
chat.run({"transcription": data['text']})



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mHuman: You're the best summarizer of video taks, either wabinars, talk, coatching, you can find the most relevent informations 
from the transcription of a video talk: 

 Today we are going to look at an amazing new AI system that can perform 4 tricks. The first is cool, the second is great, the third one is simply incredible to the point that I couldn't believe the results and had to look over and over again. And the fourth is a thing of beauty. So what are the tricks? First, this work is called Gigagen and it can perform tax-to-image. We fellow scholars have seen this before many, many times you enter a tax prompt and it paints you an image. What is great about it is that it can give us reasonably high quality images, that is okay, but here is the kicker. It can perform this in a fraction of a second. That is extremely quick. For instance, the previous StarGain-based method that could be roughly as fast 

' The speaker discusses a new AI system capable of performing four tricks, including tax-to-image generation, super resolution, and controllable latent space. The system uses a generative adversarial network (GAN) and differs from previous methods as it can generate high-quality images rapidly and produce multiple images per second. Additionally, the method provides a controllable latent space, allowing for artistic control over generated images. The speaker considers this technology a game-changer as it offers immense value by providing four capabilities with only one tool while being faster than previous techniques. The speaker encourages'

In [79]:
from langchain.llms import OpenAI