In [None]:
#programming language/compiler to optimize code for GPU inside Python
!pip install triton

#faster implementation of OpenAI Whisper model
!pip install faster-whisper

# installs hugging face to download and run Mistral
# installs accelerate for an efficient use of the hardware
# installs bitsanbytes to allow a big model run in the free Colab RAM
!pip install transformers==4.41.2 accelerate==0.30.1 bitsandbytes --upgrade

# dependency fix forcing SymPy to be upgraded
!pip install --upgrade sympy

# converts MP3 and M4A into raw audio files
!apt install ffmpeg

# connects Google Drive to Colab
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os #file manager
import subprocess
import gc #memory cleaner
import torch
from faster_whisper import WhisperModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

# CONFIGURATION

# Google Drive folder (under the /content/drive mount) that is scanned for
# recordings and where the transcripts and summaries are written
folder_path = "/content/drive/MyDrive/transcriber"

# helper function: keeps the GPU from running out of memory.
# Runs the garbage collector and clears the CUDA cache; call it right after
# deleting one model so that there is room to load the next one.
def clean_memory():
    """Release memory that is no longer referenced.

    Runs Python's garbage collector first and then asks PyTorch to empty its
    CUDA cache. The caller is responsible for dropping its own references
    (e.g. ``del model``) before calling this. Returns None.
    """
    # Order matters: collect unreachable objects before clearing the cache.
    for release in (gc.collect, torch.cuda.empty_cache):
        release()

# TRANSCRIPTION
# For every recording in folder_path: convert it to a 16 kHz mono WAV with
# ffmpeg, transcribe it with Whisper and save "<name>.txt" next to it.
# files_to_summarize ends up holding only transcripts that exist on disk.
print("\nStarting Transcription...")

# loads the whisper model 'large-v3' for maximum accuracy on financial terms
model = WhisperModel("large-v3", device="cuda", compute_type="float16")

# scratch WAV that is re-created for every recording
temp_wav_name = "temp_processing.wav"
temp_wav = os.path.join(folder_path, temp_wav_name)

# find the files (sorted so every run processes them in the same order)
all_files = sorted(os.listdir(folder_path))
media_extensions = (".m4a", ".mp3", ".wav", ".mp4", ".mkv")
# uses list comprehension to filter files; the scratch WAV is excluded because
# a leftover from an interrupted run would otherwise be treated as a recording
recordings = [
    f for f in all_files
    if f.lower().endswith(media_extensions) and f != temp_wav_name
]

files_to_summarize = []

for filename in recordings:
    input_path = os.path.join(folder_path, filename)

    text_filename = os.path.splitext(filename)[0] + ".txt"
    output_path = os.path.join(folder_path, text_filename)

    if os.path.exists(output_path):
        # two recordings can share a stem (a.mp3 / a.wav); queue the transcript once
        if output_path not in files_to_summarize:
            files_to_summarize.append(output_path)
        print(f"⏭️  Skipping transcription for {filename} (Already exists)")
        continue

    print(f"\nTranscribing: {filename}...")

    # The transcript is written to a ".part" file and renamed only when it is
    # complete, so a failed run never leaves a truncated .txt that later runs
    # would skip as "Already exists".
    partial_path = output_path + ".part"

    try:
        # convert to clean WAV (fixes "malformed" errors);
        # check=True turns a failed conversion into an exception
        subprocess.run([
            "ffmpeg", "-y", "-i", input_path,
            "-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le",
            temp_wav
        ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)

        # transcribe
        segments, info = model.transcribe(temp_wav, beam_size=5)

        print("writing transcript...")

        with open(partial_path, "w", encoding="utf-8") as f:
            for segment in segments:
                line = f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}"
                f.write(line + "\n")
                print("|", end="", flush=True) # progress

        os.replace(partial_path, output_path)
        files_to_summarize.append(output_path)

        print(f"\ntranscript saved: {text_filename}")

    except subprocess.CalledProcessError as e:
        print(f"error converting {filename}: ffmpeg exited with code {e.returncode}")

    except Exception as e:
        print(f"error transcribing {filename}: {e}")

    finally:
        # cleanup temp files, whether or not the transcription succeeded
        for leftover in (temp_wav, partial_path):
            if os.path.exists(leftover):
                os.remove(leftover)

# delete the whisper to make room for the summary model
print("\n unloading whisper to free up memory")
del model
clean_memory()


# SUMMARIZATION
# Loads Mistral-7B-Instruct once, then writes "<name>_SUMMARY.txt" for every
# transcript in files_to_summarize that does not have a summary yet.
print("\n Starting Summarization...")

# load Mistral 7B using 4-bit quantization to fit in the free GPU
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

# Mistral-7B-Instruct-v0.2 because it is powerful and ungated model
model_id = "mistralai/Mistral-7B-Instruct-v0.2"

# only the first N characters of a transcript are sent to the model,
# to fit the context window if the file is massive
max_transcript_chars = 25000

try:
    # translate english words to tokens using model's 'dictionary'
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # smart loading the model
    summary_model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map="auto"
    )

    text_generator = pipeline(
        "text-generation",
        model=summary_model,
        tokenizer=tokenizer
    )

    for txt_path in files_to_summarize:
        transcript_name = os.path.basename(txt_path)
        # swap only the extension; str.replace would also rewrite a ".txt"
        # that appears elsewhere in the path
        summary_path = os.path.splitext(txt_path)[0] + "_SUMMARY.txt"

        if os.path.exists(summary_path):
            print(f" Skipping summary for {transcript_name}")
            continue

        if not os.path.exists(txt_path):
            print(f" Skipping summary for {transcript_name} (transcript not found)")
            continue

        print(f"\n Generating Summary for: {transcript_name}")

        # each file has its own try so one failure does not cancel the rest
        try:
            # read the transcript, limited to the configured number of characters
            with open(txt_path, "r", encoding="utf-8", errors="replace") as f:
                full_text = f.read(max_transcript_chars)

            if not full_text.strip():
                print(f" Skipping summary for {transcript_name} (transcript is empty)")
                continue

            # Mistral instruct format: the instruction sits between [INST] and [/INST]
            prompt = f"""[INST] Act as an expert minute-taker. Analyze the transcript and produce a structured report including:
- Summary: A narrative overview of the meeting's progression (start, middle, and end).
- Tone: The general atmosphere or sentiment.
- Key Topics: Bulleted details of primary discussions.
- Action Items: Tasks assigned, owners, and due dates.
- Decisions: Final agreements and approvals.

TRANSCRIPT:
{full_text}
[/INST]"""

            # settings of the prompt; return_full_text=False makes the pipeline
            # return only the newly generated text, without the prompt
            sequences = text_generator(
                prompt,
                do_sample=True,
                max_new_tokens=1000,
                temperature=0.7,
                top_k=50,
                top_p=0.95,
                num_return_sequences=1,
                return_full_text=False,
            )

            # extract the summary text
            summary_only = sequences[0]['generated_text'].strip()

            # save
            with open(summary_path, "w", encoding="utf-8") as f:
                f.write(summary_only)

            print(f"Summary Saved: {os.path.basename(summary_path)}")

        except Exception as e:
            print(f"Error summarizing {transcript_name}: {e}")

except Exception as e:
    # reached when the tokenizer/model/pipeline cannot be loaded
    print(f"Error in summarization: {e}")

print("\nCheck Drive folder for .txt and _SUMMARY.txt files.")