https://github.com/Kabanosk/whisper-website
https://github.com/openai/whisper/discussions/264

In [118]:
# Import libraries
import pandas as pd
import numpy as np
import yt_dlp
import unzip

# Let long text cells (e.g. transcript lines) render without being truncated.
pd.options.display.max_colwidth = 1000

# Download Audio and Transcribe

In [27]:
# Notebook-wide settings for the yt-dlp / Whisper cells below.

# Video whose subtitles are fetched by the "Download Transcript" cell.
VIDEO_URL = "https://youtu.be/DgTjSrrf6GQ"
# Audio settings. In this notebook they are only referenced by the commented-out
# yt-dlp command (AUDIO_QUALITY is not referenced by any cell shown here).
AUDIO_FILE_NAME = "./data/Lex_Podcast.mp3"
AUDIO_QUALITY = 5 # 0 best - 10 worst (default 5)
AUDIO_FORMAT = "mp3"
# Relative path to the bin/ folder of the ffmpeg build unpacked by the download cell.
FFMPEG_LOCATION = "ffmpeg-master-latest-win64-gpl/bin"
# Passed to yt-dlp --sub-lang in the "Download Transcript" cell.
SUBTITLE_LANGUAGE = "en.*"
# Passed to yt-dlp --output in the "Download Transcript" cell.
# NOTE(review): this name is reassigned later in the "Data Cleaning" cell.
TRANSCRIPT_FILE_NAME = "./data/transcript.txt"
# Used both as yt-dlp --sub-format and as the Whisper API response_format.
SUBTITLE_FORMAT = "srt"

In [19]:
import wget
import os
import zipfile

# Fetch and unpack a Windows ffmpeg build once, so yt-dlp can post-process audio.
FFMPEG_URL = 'https://github.com/yt-dlp/FFmpeg-Builds/releases/download/latest/ffmpeg-master-latest-win64-gpl.zip'
ZIP_PATH = './ffmpeg.zip'
EXTRACT_DIR = './'
# Folder the archive is expected to unpack into (archive name without ".zip").
# The rest of the notebook points at "ffmpeg-master-latest-win64-gpl/bin", which matches.
FFMPEG_DIR = os.path.join(EXTRACT_DIR, os.path.splitext(os.path.basename(FFMPEG_URL))[0])

# Guard on the extracted folder, not on the zip: the zip is deleted after a
# successful run, so checking for it would trigger a fresh download every time
# (and would wrongly report "Already downloaded." for a leftover, un-extracted zip).
if os.path.isdir(FFMPEG_DIR):
    print('Already downloaded.')
else:
    # Reuse a zip left behind by an interrupted run instead of downloading again.
    if not os.path.exists(ZIP_PATH):
        print('Downloading ffmpeg...')
        wget.download(FFMPEG_URL, ZIP_PATH)

    print('Unzipping...')
    with zipfile.ZipFile(ZIP_PATH, 'r') as zip_ref:
        zip_ref.extractall(EXTRACT_DIR)

    print('Removing zip file...')
    os.remove(ZIP_PATH)

Downloading ffmpeg...
Unzipping...
Removing zip file...


In [None]:
# Download the video with yt-dlp and extract its audio track as mp3 (-x), with
# verbose logging (-v), using the ffmpeg build unpacked by the previous cell.
# NOTE(review): the ffmpeg path, audio format and output name are hard-coded here
# instead of using the config constants, and the URL differs from VIDEO_URL —
# confirm which video is intended.
!yt-dlp -xv --ffmpeg-location ffmpeg-master-latest-win64-gpl/bin --audio-format mp3  -o data/Lex_Podcast -- {"https://youtu.be/DEu24V8vfb8"}
# Parameterised variant of the same command (currently disabled):
#!yt-dlp -xv --ffmpeg-location {FFMPEG_LOCATION} --audio-format {AUDIO_FORMAT}  -o {AUDIO_FILE_NAME} -- {"https://youtu.be/DEu24V8vfb8"}

In [None]:
import os

import openai

# Read the API key from the environment instead of committing it to the notebook.
# Raises KeyError if OPENAI_API_KEY is not set, which is clearer than a failed API call.
# The key that used to be hard-coded here must be treated as leaked and revoked.
openai.api_key = os.environ["OPENAI_API_KEY"]

# Send the audio to the Whisper API and get the transcript back in SUBTITLE_FORMAT.
# The context manager makes sure the file handle is closed even if the request fails.
# NOTE(review): "audio.mp3" is not produced by any cell in this notebook;
# AUDIO_FILE_NAME from the config cell may be the intended input — confirm.
with open("audio.mp3", "rb") as audio_file:
    transcript = openai.Audio.transcribe("whisper-1", audio_file, response_format=SUBTITLE_FORMAT)

### Longer Inputs
By default, the Whisper API only supports files that are less than 25 MB. If you have an audio file that is larger than that, you will need to break it up into chunks of 25 MB or less, or use a compressed audio format. To get the best performance, we suggest that you avoid breaking the audio up mid-sentence, as this may cause some context to be lost.

One way to handle this is to use the [PyDub open source Python package](https://github.com/jiaaro/pydub) to split the audio:

In [None]:
from pydub import AudioSegment

# Load the full recording from disk.
song = AudioSegment.from_mp3("good_morning.mp3")

# PyDub indexes audio in milliseconds, so express ten minutes in ms.
ten_minutes = 1000 * 60 * 10

# Keep only the opening ten minutes and write them out as a separate mp3.
first_10_minutes = song[:ten_minutes]
first_10_minutes.export("good_morning_10.mp3", format="mp3")

### Prompting
* Check out [OpenAI](https://platform.openai.com/docs/guides/speech-to-text/prompting)

# Download Transcript

In [13]:
# Download the transcript with yt-dlp
# --write-auto-sub with --skip-download fetches only the auto-generated subtitles,
# not the video itself; language, format and output name come from the config cell.
# NOTE(review): the recorded output below shows the files being written as .vtt
# (e.g. transcript.txt.en-orig.vtt) even though --sub-format is "srt" — confirm
# whether a conversion step is wanted.
!yt-dlp --write-auto-sub --skip-download --sub-format {SUBTITLE_FORMAT} --sub-lang {SUBTITLE_LANGUAGE} --output {TRANSCRIPT_FILE_NAME} -- {VIDEO_URL}

[youtube] Extracting URL: https://youtu.be/DgTjSrrf6GQ
[youtube] DgTjSrrf6GQ: Downloading webpage
[youtube] DgTjSrrf6GQ: Downloading android player API JSON
[info] DgTjSrrf6GQ: Downloading subtitles: en-orig, en, en-en-ehkg1hFWq8A
[info] DgTjSrrf6GQ: Downloading 1 format(s): 22
[info] Writing video subtitles to: transcript.txt.en-orig.vtt
[download] Destination: transcript.txt.en-orig.vtt

[download]    1.00KiB at  909.63KiB/s (00:00:00)
[download]    3.00KiB at    1.45MiB/s (00:00:00)
[download]    7.00KiB at    1.32MiB/s (00:00:00)
[download]   15.00KiB at    1.17MiB/s (00:00:00)
[download]   31.00KiB at  729.35KiB/s (00:00:00)
[download]   63.00KiB at  978.15KiB/s (00:00:00)
[download]  127.00KiB at    1.33MiB/s (00:00:00)
[download]  255.00KiB at    2.07MiB/s (00:00:00)
[download]  511.00KiB at    3.35MiB/s (00:00:00)
[download]  816.60KiB at    4.29MiB/s (00:00:00)
[download] 100% of  816.60KiB in 00:00:00 at 1.88MiB/s
[info] Writing video subtitles to: transcript.txt.en.vtt
[down



## Data Preparation
Let's change the name of the raw caption files:

In [79]:
# Get a clean list of podcast titles
import re 

def clean_titles(title):
    """Turn one line of the title list into a lowercase, filename-safe slug.

    Steps, in order:
      1. Markdown links ``[label](url)`` are reduced to ``label``.
      2. Anything from a ``|`` up to and including the next run of digits is dropped.
      3. Trailing whitespace is stripped; spaces and colons become ``_`` and
         ``&`` becomes ``and``; the result is lowercased.
      4. Every character other than ASCII letters, digits and ``_`` is removed.

    Args:
        title: A single raw title line.

    Returns:
        The cleaned slug as a string.
    """
    without_links = re.sub(r'\[(.*?)\]\((.*?)\)', r'\1', title)
    without_suffix = re.sub(r'\|.*?\d+', '', without_links)

    slug = without_suffix.rstrip()
    for old, new in ((' ', '_'), (':', '_'), ('&', 'and')):
        slug = slug.replace(old, new)

    return re.sub(r'[^a-zA-Z0-9_]', '', slug.lower())

# Read the whole title list into memory.
with open('./data/Lexicap.md', 'r') as f:
    text = f.read()

# One cleaned slug per non-empty line, in file order.
titles = []
for raw_title in text.split('\n'):
    if raw_title != '':
        titles.append(clean_titles(raw_title))

In [None]:
import os

TRANSCRIPT_PATH = "./data/transcripts/"
FILE_EXTENSION = '.vtt'

# Rename each raw caption file to "<cleaned title>.vtt".
#
# Only caption files are paired with titles: filtering *before* zipping keeps a
# stray non-.vtt entry in the folder from consuming a title and shifting every
# later pairing. os.listdir() returns entries in arbitrary order, so the names
# are sorted to make the pairing deterministic.
# NOTE(review): this assumes the sorted raw file names follow the same order as
# the lines of the title list — confirm. The cell is not idempotent: run it once,
# on the raw files only.
vtt_filenames = sorted(
    name for name in os.listdir(path=TRANSCRIPT_PATH) if name.endswith(FILE_EXTENSION)
)

if len(vtt_filenames) != len(titles):
    # zip() stops at the shorter sequence; make the mismatch visible instead of silent.
    print(f"Warning: {len(vtt_filenames)} transcript files but {len(titles)} titles; "
          "only the first matching pairs are renamed.")

for org_filename, title in zip(vtt_filenames, titles):
    print(org_filename)
    # rename file
    new_filename = f"{title}{FILE_EXTENSION}"
    os.rename(os.path.join(TRANSCRIPT_PATH, org_filename),
              os.path.join(TRANSCRIPT_PATH, new_filename))

### Data Cleaning

In [116]:
import csv

# Read text file in transcripts folder
TRANSCRIPT_FILE_NAME = "45_michio_kaku__future_of_humans_aliens_space_travel_and_physics.vtt"
NEW_TRANSCRIPT_FILE_NAME = "45_michio_kaku__future_of_humans_aliens_space_travel_and_physics.csv"

# Convert the caption file into a two-column, ';'-delimited CSV (timestamp, text).
#
# * Encodings are explicit: WebVTT is UTF-8 (optionally with a BOM, hence
#   "utf-8-sig" on read), and pandas reads CSV as UTF-8 by default, so the
#   platform's locale encoding must not be used on either side.
# * csv.writer quotes any field that contains ';', quotes or newlines, so a
#   caption containing the delimiter no longer corrupts the row.
# * newline='' is what the csv module requires of the output file object.
with open(f"{TRANSCRIPT_PATH}{TRANSCRIPT_FILE_NAME}", encoding="utf-8-sig") as oldfile, \
        open(NEW_TRANSCRIPT_FILE_NAME, 'w', encoding="utf-8", newline='') as newfile:
    old_lines = oldfile.read().split('\n')
    # Drop blank separator lines and the "WEBVTT" header.
    clean_lines = [line for line in old_lines if line not in ['', 'WEBVTT']]

    writer = csv.writer(newfile, delimiter=';')
    # The remaining lines are consumed as (timestamp, text) pairs; a trailing
    # unpaired line, if any, is ignored.
    # NOTE(review): assumes every cue has exactly one text line — confirm for other files.
    for timestamp, text in zip(clean_lines[0::2], clean_lines[1::2]):
        writer.writerow([timestamp.strip(), text.strip()])


In [113]:
# NOTE(review): only the loop header survives in this cell — the loop body is
# missing from the notebook as saved, so the statement is incomplete as written.
# The recorded output for the cell is 2604.
for line_idx in range(0, len(clean_lines)-1, 2):

2604

In [119]:
import pandas as pd

# Load the ';'-delimited transcript; the file has no header row, so name the columns here.
transcript_df = pd.read_csv(
    NEW_TRANSCRIPT_FILE_NAME,
    sep=';',
    header=None,
    names=['timestamps', 'text'],
)

transcript_df


Unnamed: 0,timestamps,text
0,00:00.000 --> 00:02.800,The following is a conversation with Michio Kaku.
1,00:02.800 --> 00:05.120,"He's a theoretical physicist, futurist,"
2,00:05.120 --> 00:08.360,and professor at the City College of New York.
3,00:08.360 --> 00:10.760,He's the author of many fascinating books
4,00:10.760 --> 00:12.840,that explore the nature of our reality
...,...,...
1297,1:00:29.760 --> 1:00:35.120,"I think you've inspired and educated thousands,"
1298,1:00:35.120 --> 1:00:36.240,if not millions.
1299,1:00:36.240 --> 1:00:37.400,"Michio, it's been an honor."
1300,1:00:37.400 --> 1:00:39.000,Thank you so much for talking today.


# Ideas
# Summarization of main topics in the audio
# Go to the mentions of the topics
# Translation to Arabic
# The app asks me questions about the text (for language learning) and creates a discussion
# Overall sentiment in the text
# Webapp or Mobile app