In [28]:
# %pip install moviepy
# %pip install openai pydub
# %pip install python-dotenv
%pip install --upgrade openai



Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


# Converting MP4s to MP3s

In [29]:
import os
import subprocess
import shutil

# Make the FFmpeg executable discoverable for the subprocess calls below.
# NOTE(review): this is a machine-specific WinGet install location; adjust it
# when running the notebook on another machine.
path = r'C:\Users\agish\AppData\Local\Microsoft\WinGet\Packages\Gyan.FFmpeg.Essentials_Microsoft.Winget.Source_8wekyb3d8bbwe\ffmpeg-7.1-essentials_build\bin'
if not shutil.which("ffmpeg"):
    # Extend PATH only once: re-running this cell while ffmpeg is still not
    # resolvable must not keep appending the same directory. Using .get() also
    # avoids a KeyError when PATH is not defined at all.
    current_path = os.environ.get("PATH", "")
    if path not in current_path.split(os.pathsep):
        os.environ["PATH"] = current_path + os.pathsep + path if current_path else path

# Function to convert MP4 to MP3 using FFmpeg
def convert_mp4_to_mp3_ffmpeg(input_file, output_file):
    """Extract the audio of ``input_file`` into an MP3 at ``output_file`` using FFmpeg.

    Parameters:
        input_file (str): Path of the media file handed to ffmpeg via ``-i``.
        output_file (str): Path of the MP3 file ffmpeg should write.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
            An output file that this call created is removed first, so a
            failed conversion is not mistaken for a finished one later.
    """
    # Remember whether the target was already there: a pre-existing file must
    # never be deleted by the cleanup below.
    existed_before = os.path.exists(output_file)
    command = [
        "ffmpeg",
        "-nostdin",   # Never wait for keyboard input when run from a notebook
        "-i", input_file,
        "-q:a", "0",  # Best quality audio
        "-map", "a",  # Extract only the audio stream
        output_file,
    ]
    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError:
        # Drop the partial file so callers that skip existing outputs retry it.
        if not existed_before and os.path.exists(output_file):
            os.remove(output_file)
        raise

# Process all files in the input folder
def process_files(input_folder, output_folder):
    """Convert MP4 files to MP3 and copy existing MP3 files.

    Every regular file in ``input_folder`` is visited in sorted order. ``.mp3``
    files are copied and ``.mp4`` files are converted with
    ``convert_mp4_to_mp3_ffmpeg``; both end up in ``output_folder`` as
    ``<name>.mp3``. Files with any other extension are reported and skipped.
    A file whose target already exists is skipped, so the function can be
    re-run safely. A failed conversion is reported and does not stop the batch.

    Parameters:
        input_folder (str): Folder containing the source media files.
        output_folder (str): Folder receiving the MP3 files (created if missing).
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    # Sorted so numbering and processing order are stable across runs.
    files = sorted(os.listdir(input_folder))
    print(f"Found {len(files)} files in '{input_folder}'.")

    for i, file_name in enumerate(files, start=1):
        input_path = os.path.join(input_folder, file_name)
        if not os.path.isfile(input_path):
            continue  # Skip directories or invalid files

        lowered_name = file_name.lower()
        is_mp3 = lowered_name.endswith(".mp3")
        is_mp4 = lowered_name.endswith(".mp4")

        # Classify first: an unsupported file must not be reported as
        # "already exists" just because an MP3 with the same stem is present.
        if not (is_mp3 or is_mp4):
            print(f"[{i}] Skipping unsupported file: {file_name}")
            continue

        # Define the output file path
        output_file = os.path.join(output_folder, os.path.splitext(file_name)[0] + ".mp3")

        # Skip if the output file already exists
        if os.path.exists(output_file):
            print(f"[{i}] Skipping (already exists): {file_name}")
            continue

        if is_mp3:
            print(f"[{i}] Copying MP3: {file_name}")
            shutil.copy(input_path, output_file)
        else:
            print(f"[{i}] Converting MP4 to MP3: {file_name}")
            try:
                convert_mp4_to_mp3_ffmpeg(input_path, output_file)
                print(f"[{i}] Converted and saved to: {output_file}")
            except subprocess.CalledProcessError as e:
                print(f"[{i}] Failed to convert {file_name}: {e}")

    print("Processing completed!")

# Example usage
# NOTE(review): both folders are absolute, machine-specific paths; change them
# before running this cell on another machine. The module-level names
# input_folder / output_folder are reassigned by later cells.
input_folder = r"C:\Users\agish\Documents\GitHub\WhisperUse\Input"
output_folder = r"C:\Users\agish\Documents\GitHub\WhisperUse\Output"
# Runs the MP4 -> MP3 conversion / MP3 copy as soon as the cell executes.
process_files(input_folder, output_folder)


Found 11 files in 'C:\Users\agish\Documents\GitHub\WhisperUse\Input'.
[1] Skipping (already exists): DINK1.1 Intro (Original).mp4
[2] Skipping (already exists): DINK1.2.mp4
[3] Skipping (already exists): DINK1.3.mp3
[4] Skipping (already exists): DINK2.1 introvideo_final (Original).mp4
[5] Skipping (already exists): DINK2.2 per_-_clarification_of_concepts_final (Original).mp4
[6] Skipping (already exists): DINK2.3 case__mobilepay_final (Original).mp4
[7] Skipping (already exists): DINK2.4 veo_case_final (Original).mp4
[8] Skipping (already exists): DINK2.5 corti_case_final (Original).mp4
[9] Skipping (already exists): DINK2.6 paul_pop_-_ressource_management_final (Original).mp4
[10] Skipping (already exists): DINK2.7 christian_d._jensen_final (Original).mp4
[11] Skipping (already exists): DINK2.8 podcast final.mp3
Processing completed!


# Transcribing

In [30]:
import os
from pydub import AudioSegment
import openai

In [31]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file so the key is never hard-coded here
load_dotenv()

# Set OpenAI API key (transcribe_audio reads openai.api_key for its
# Authorization header)
openai.api_key = os.getenv("OPENAI_API_KEY")

# Without a key every request would be sent as "Bearer None" and only fail
# later with an authentication error, so surface the problem right here.
if not openai.api_key:
    print("Warning: OPENAI_API_KEY is not set; transcription requests will fail until it is provided.")

In [32]:
# Limits used by the transcription steps below.
# Files larger than MAX_FILE_SIZE_BYTES are split into chunks before upload
# (see process_folder); presumably this mirrors the API's upload limit - confirm.
MAX_FILE_SIZE_MB = 25
MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 ** 2
# Length of one audio chunk produced by split_audio: 10 minutes in milliseconds.
CHUNK_DURATION_MS = 10 * 60_000

In [46]:
import time
import requests

def split_audio(file_path, output_folder):
    """Cut an audio file into consecutive MP3 segments of fixed duration.

    Each segment covers CHUNK_DURATION_MS milliseconds (the last one may be
    shorter) and is stored in ``output_folder`` as ``<stem>_chunk_<index>.mp3``.
    A segment whose file already exists is reused instead of being exported
    again.

    Parameters:
        file_path (str): Audio file to split.
        output_folder (str): Folder that receives the segment files.

    Returns:
        list[str]: Paths of all segments in playback order.
    """
    recording = AudioSegment.from_file(file_path)
    total_ms = len(recording)
    stem = os.path.splitext(os.path.basename(file_path))[0]
    segment_paths = []

    for index, offset in enumerate(range(0, total_ms, CHUNK_DURATION_MS)):
        segment_name = f"{stem}_chunk_{index}.mp3"
        segment_path = os.path.join(output_folder, segment_name)

        if not os.path.exists(segment_path):
            # Slice by milliseconds and write the piece out as MP3.
            recording[offset:offset + CHUNK_DURATION_MS].export(segment_path, format="mp3")
        else:
            # Left over from an earlier run: keep it as is.
            print(f"Chunk already exists: {segment_name}")

        segment_paths.append(segment_path)

    return segment_paths


def transcribe_audio(file_path):
    """Transcribe audio using OpenAI API.

    Uploads ``file_path`` to the ``/v1/audio/transcriptions`` endpoint with the
    ``whisper-1`` model and ``en`` as language, authenticating with
    ``openai.api_key``. A 429 response is retried after a one-minute pause, up
    to three attempts in total; any other request error ends the attempt loop
    immediately.

    Parameters:
        file_path (str): Path of the audio file to upload.

    Returns:
        str: The ``text`` field of the JSON response, or ``""`` when the
        request failed or the retries were exhausted.
    """
    url = "https://api.openai.com/v1/audio/transcriptions"
    headers = {"Authorization": f"Bearer {openai.api_key}"}
    max_retries = 3
    # (connect, read) seconds - keeps a stalled connection from hanging the cell.
    timeout = (10, 600)

    for attempt in range(1, max_retries + 1):
        try:
            # Re-open the file for every attempt: a handle that was already
            # consumed by a previous upload would send an empty body on retry,
            # and the context manager guarantees the handle is closed.
            with open(file_path, "rb") as audio_file:
                files = {
                    "file": (os.path.basename(file_path), audio_file),
                    "model": (None, "whisper-1"),
                    "language": (None, "en"),
                }
                response = requests.post(url, headers=headers, files=files, timeout=timeout)
            response.raise_for_status()
            return response.json().get("text", "")
        except requests.exceptions.RequestException as e:
            # Take the status from the exception itself: when the request never
            # got a response (e.g. connection error) there is no `response`
            # variable from this attempt to inspect.
            failed_response = getattr(e, "response", None)
            status_code = getattr(failed_response, "status_code", None)
            if status_code == 429:
                if attempt < max_retries:
                    print("429 error: Too many requests. Waiting for 1 minute before retrying...")
                    time.sleep(60)  # Wait 1 minute
                else:
                    print(f"429 error: Too many requests. Giving up on {file_path} after {max_retries} attempts.")
            else:
                print(f"Error with API request for {file_path}: {e}")
                break

    return ""  # Return empty transcript if retries are exhausted


def process_folder(input_folder, output_folder):
    """Process MP3 files in input folder and save transcriptions.

    For every ``.mp3`` file (extension matched case-insensitively) in
    ``input_folder`` a ``<name>.txt`` transcript is written to
    ``output_folder``. Files that already have a transcript are skipped, so the
    function can be re-run to fill in the gaps. Files larger than
    MAX_FILE_SIZE_BYTES are first split with ``split_audio`` (the chunks are
    written to ``output_folder``) and transcribed chunk by chunk.

    Parameters:
        input_folder (str): Folder containing the MP3 files.
        output_folder (str): Folder receiving the transcripts (created if missing).
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    # Sorted so files are processed in a stable order across runs.
    for file_name in sorted(os.listdir(input_folder)):
        # Same case-insensitive extension match that process_files uses.
        if not file_name.lower().endswith(".mp3"):
            continue

        input_file_path = os.path.join(input_folder, file_name)
        if not os.path.isfile(input_file_path):
            continue  # A directory that merely looks like an MP3

        output_file_name = os.path.splitext(file_name)[0] + ".txt"
        output_file_path = os.path.join(output_folder, output_file_name)

        # Skip already processed files
        if os.path.exists(output_file_path):
            print(f"Skipping {file_name} (transcription already exists).")
            continue

        # Handle large files and check for existing chunks
        if os.path.getsize(input_file_path) > MAX_FILE_SIZE_BYTES:
            print(f"Splitting large file: {file_name}")
            chunks = split_audio(input_file_path, output_folder)
        else:
            chunks = [input_file_path]

        # Transcribe and combine results
        pieces = []
        empty_chunks = []
        for chunk in chunks:
            print(f"Transcribing {chunk}...")
            chunk_transcript = transcribe_audio(chunk)
            if chunk_transcript:
                pieces.append(chunk_transcript + "\n")
            else:
                empty_chunks.append(chunk)
        transcript = "".join(pieces)

        # Save transcript if non-empty
        if transcript.strip():
            with open(output_file_path, "w", encoding="utf-8") as output_file:
                output_file.write(transcript)
            print(f"Transcription completed for {file_name}.")
            if empty_chunks:
                # transcribe_audio returns "" on failure; a saved transcript
                # would otherwise hide the gap and be skipped on later runs.
                print(
                    f"Warning: {len(empty_chunks)} of {len(chunks)} chunks returned no text; "
                    f"'{output_file_name}' may be incomplete."
                )
        else:
            print(f"No transcription generated for {file_name}.")



In [47]:
# Usage
# Both folders are relative paths, so they are resolved against the kernel's
# current working directory.
# NOTE(review): "Output" is presumably the folder the MP4 -> MP3 step wrote to
# (that step uses an absolute path) - confirm the working directory matches.
input_folder = "Output"
output_folder = "Transcripts"
# Starts transcription as soon as the cell executes; files that already have a
# transcript are skipped by process_folder.
process_folder(input_folder, output_folder)

Skipping DINK1.1 Intro (Original).mp3 (transcription already exists).
Skipping DINK1.2.mp3 (transcription already exists).
Skipping DINK1.3.mp3 (transcription already exists).
Skipping DINK2.1 introvideo_final (Original).mp3 (transcription already exists).
Transcribing Output\DINK2.2 per_-_clarification_of_concepts_final (Original).mp3...
Transcription completed for DINK2.2 per_-_clarification_of_concepts_final (Original).mp3.
Skipping DINK2.3 case__mobilepay_final (Original).mp3 (transcription already exists).
Skipping DINK2.4 veo_case_final (Original).mp3 (transcription already exists).
Skipping DINK2.5 corti_case_final (Original).mp3 (transcription already exists).
Skipping DINK2.6 paul_pop_-_ressource_management_final (Original).mp3 (transcription already exists).
Skipping DINK2.7 christian_d._jensen_final (Original).mp3 (transcription already exists).
Skipping DINK2.8 podcast final.mp3 (transcription already exists).


In [49]:
import os
import shutil

# Source folder with the raw transcripts and target folder for the working copies
output_folder = "Transcripts"
cleaned_folder = "Transcripts_Cleaned"

# Make sure the target folder is there before copying anything into it
os.makedirs(cleaned_folder, exist_ok=True)

# Mirror every transcript text file into the target folder
for filename in os.listdir(output_folder):
    if not filename.endswith(".txt"):
        continue  # Anything that is not a .txt file stays where it is
    source_path = os.path.join(output_folder, filename)
    destination_path = os.path.join(cleaned_folder, filename)
    shutil.copy(source_path, destination_path)

# Last expression of the cell: shown as the cell's output to confirm the copy
f"Text files have been copied from '{output_folder}' to '{cleaned_folder}'."


"Text files have been copied from 'Transcripts' to 'Transcripts_Cleaned'."

In [50]:
import os

def combine_text_files(input_directory, output_file):
    """
    Combines all text files in a directory into a single text file.

    The ``.txt`` files are appended in sorted filename order, each wrapped in
    ``--- Start of <name> ---`` / ``--- End of <name> ---`` marker lines.
    Errors are reported with a printed message instead of being raised.

    Parameters:
        input_directory (str): The directory containing the text files to combine.
        output_file (str): The path to the output text file.
    """
    try:
        output_abspath = os.path.abspath(output_file)
        # List the inputs BEFORE opening the output for writing: a missing or
        # unreadable directory must not truncate an existing combined file.
        # Sorting makes the order reproducible, and the output itself is
        # excluded in case it lives inside the input directory.
        text_files = sorted(
            name
            for name in os.listdir(input_directory)
            if name.endswith(".txt")
            and os.path.abspath(os.path.join(input_directory, name)) != output_abspath
        )

        with open(output_file, 'w', encoding='utf-8') as outfile:
            for filename in text_files:
                file_path = os.path.join(input_directory, filename)
                with open(file_path, 'r', encoding='utf-8') as infile:
                    outfile.write(f"--- Start of {filename} ---\n\n")
                    outfile.write(infile.read())
                    outfile.write(f"\n\n--- End of {filename} ---\n\n")
        print(f"All files combined successfully into {output_file}")
    except Exception as e:
        # Best-effort by design: report the problem and let the notebook go on.
        print(f"An error occurred: {e}")

# Customize these variables
# Both values are relative paths, so they are resolved against the kernel's
# current working directory.
input_directory = "Transcripts_Cleaned"  # Replace with the directory containing text files
output_file = "combined_texts.txt"         # Replace with your desired output file name

# Call the function (opens output_file in write mode, replacing earlier content)
combine_text_files(input_directory, output_file)


All files combined successfully into combined_texts.txt
