In [None]:

!pip install git+https://github.com/openai/whisper.git
!pip install nltk transformers

import whisper
import nltk
import os
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import pipeline
from google.colab import files


# Download the NLTK resources this notebook relies on, in a fixed order.
for resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(resource)


# Ask the user for an audio file through the Colab upload widget.
print("Please upload your audio file...")
uploaded = files.upload()

# Only the first uploaded file is used. Fall back to an empty name instead of
# letting next() raise StopIteration when nothing was uploaded (presumably
# what happens when the dialog is cancelled -- confirm against Colab docs);
# an empty name fails the os.path.exists() check further down.
audio_file = next(iter(uploaded), "")
if audio_file:
    print(f"Uploaded file: {audio_file}")
else:
    print("No file was uploaded.")


def transcribe_audio(audio_path, model_name="base", output_path="transcript.txt"):
    """Transcribe an audio file with Whisper and optionally save the text.

    Args:
        audio_path: Path of the audio file passed to ``model.transcribe``.
        model_name: Name of the Whisper model to load. Defaults to ``"base"``,
            the model this function previously always used.
        output_path: File the transcript is written to as UTF-8. Defaults to
            ``"transcript.txt"``; pass ``None`` to skip writing a file.

    Returns:
        The transcript string taken from the ``'text'`` entry of the
        transcription result.
    """
    model = whisper.load_model(model_name)
    print("Transcribing...")
    result = model.transcribe(audio_path)
    transcript = result['text']

    if output_path is not None:
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(transcript)
    print("Transcription completed.")
    return transcript

# Step 5: Extract Keywords
def extract_keywords(text, top_n=10):
    """Return the most frequent non-stopword words in *text*.

    The text is lower-cased and tokenized; tokens that are not purely
    alphabetic or that appear in NLTK's English stopword list are dropped.

    Args:
        text: Text to analyse. Empty input yields an empty list.
        top_n: Upper bound of the slice applied to the sorted result
            (default 10).

    Returns:
        A list of ``(word, count)`` tuples sorted by count, highest first.
        Words with equal counts keep the order in which they first appeared.
    """
    # Local import so the function does not depend on a file-level import.
    from collections import Counter

    if not text:
        # Nothing to count; skip loading the stopword corpus.
        return []

    stop_words = set(stopwords.words("english"))
    tokens = word_tokenize(text.lower())
    counts = Counter(
        token for token in tokens
        if token.isalpha() and token not in stop_words
    )

    # most_common() sorts by count descending and keeps first-seen order for
    # ties; slicing afterwards preserves the original `[:top_n]` semantics.
    return counts.most_common()[:top_n]


def summarize_text(text, max_tokens=1000):
    """Summarize *text* with a Hugging Face summarization pipeline.

    Args:
        text: Text to summarize.
        max_tokens: Maximum number of *characters* of ``text`` handed to the
            model. Despite its name the cap is applied by string slicing, not
            by counting tokens; the name is kept for backward compatibility.

    Returns:
        The generated summary string, or ``""`` when the (capped) text is
        empty or only whitespace.
    """
    text = text[:max_tokens]
    if not text.strip():
        # Nothing to summarize; avoid loading the model for empty input.
        return ""

    # Pin the checkpoint and revision the pipeline previously fell back to by
    # default, so the result does not depend on the library's default model.
    summarizer = pipeline(
        "summarization",
        model="sshleifer/distilbart-cnn-12-6",
        revision="a4f8f3e",
    )
    summary = summarizer(
        text,
        max_length=150,
        min_length=40,
        do_sample=False,
        # The character cap above does not bound the token count, so let the
        # tokenizer truncate inputs that exceed the model's limit.
        truncation=True,
    )
    return summary[0]['summary_text']


# Drive the whole workflow for the uploaded file: transcribe it, then print a
# short preview, the most frequent keywords, and a generated summary.
if not os.path.exists(audio_file):
    print("Audio file not found.")
else:
    transcript = transcribe_audio(audio_file)

    # Only the first 500 characters are shown as a preview.
    print("\n--- Transcript Preview ---", transcript[:500], sep="\n")

    print("\n--- Extracted Keywords ---")
    keywords = extract_keywords(transcript)
    for word, freq in keywords:
        print(f"{word}: {freq}")

    print("\n--- Summary ---")
    summary = summarize_text(transcript)
    print(summary)

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-45cz_046
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-45cz_046
  Resolved https://github.com/openai/whisper.git to commit dd985ac4b90cafeef8712f2998d62c59c3e62d22
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
Please upload your audio file...


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Saving _meeting.mp3_ to _meeting.mp3_
Uploaded file: _meeting.mp3_


100%|████████████████████████████████████████| 139M/139M [00:00<00:00, 275MiB/s]


Transcribing...


No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


Transcription completed.

--- Transcript Preview ---
 Hosting or presenting content in a meeting can be intimidating, so we created this video to help you feel more confident with Microsoft Teams meetings.

--- Extracted Keywords ---
hosting: 1
presenting: 1
content: 1
meeting: 1
intimidating: 1
created: 1
video: 1
help: 1
feel: 1
confident: 1

--- Summary ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Device set to use cpu
Your max_length is set to 150, but your input_length is only 31. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=15)


 Microsoft Teams meetings can be intimidating, so we created this video to help you feel more confident with Microsoft Teams .  Hosting or presenting content in a meeting can be daunting, so here's how to do it .
