In [1]:
import asyncio
import logging
import nest_asyncio
import ollama
import os
import pydub
import telegram
import torch
import warnings
from datetime import datetime
from dotenv import load_dotenv
from telegram.ext import Application, CommandHandler, MessageHandler, filters
from transformers import pipeline

In [2]:
# Suppress every warning category for the whole session.
warnings.simplefilter("ignore")
# Logging setup: INFO level, records formatted as "time - level - message".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

In [3]:
# Patch asyncio so an event loop can be re-entered while one is already running.
nest_asyncio.apply()

# Load variables from a .env file, then read the bot token from the environment.
load_dotenv()
TELEGRAM_BOT_TOKEN = os.environ.get("TELEGRAM_BOT_TOKEN")
if TELEGRAM_BOT_TOKEN is None or TELEGRAM_BOT_TOKEN == "":
    # Fail fast: nothing below can work without a token.
    raise ValueError("TELEGRAM_BOT_TOKEN not found in .env file")

In [4]:
# Model initialization.
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU
# (device=-1) instead of failing on machines without a GPU.
PIPELINE_DEVICE = 0 if torch.cuda.is_available() else -1
# Speech-to-text model.
whisper = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3", device=PIPELINE_DEVICE)
# Token classifier that predicts the punctuation mark following each word.
punctuation_model = pipeline("token-classification", model="oliverguhr/fullstop-punctuation-multilang-large", device=PIPELINE_DEVICE)

Device set to use cuda:0
Device set to use cuda:0


In [5]:
# Convert the sample MP3 to WAV next to the original file.
file_path = "Гудтаймс - Натальные карты.mp3"
# Swap only the extension; str.replace would also rewrite ".mp3" elsewhere in the name.
wav_path = os.path.splitext(file_path)[0] + ".wav"
audio = pydub.AudioSegment.from_mp3(file_path)
# export() returns the open output file object; close it so the handle is not leaked.
audio.export(wav_path, format="wav").close()

<_io.BufferedRandom name='Гудтаймс - Натальные карты.wav'>

In [6]:
# Transcribe the converted WAV file; return_timestamps=True requests timestamps along with the text.
result = whisper(wav_path, return_timestamps=True)

Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.
Transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English. This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`. See https://github.com/huggingface/transformers/pull/28687 for more details.


In [7]:
# Display the transcribed text of the last run.
result["text"]

' Киса Свои планы, тебя в этих планах нет Тебя в этих планах нет Данные на тайных карт И астрологов прогнозы Старо под угрозу наш союз У вселенной есть свой план Если верить гороскопам Мы совместимы, я варю Я всё понимаю, киса в это верится с трудом У вселенной свои планы, ты поймёшь это потом С этим ничего не сделать, ничего не изменить Всё предрешено от самой встречи по этот миг Дай мне на проникать И астрологов прогнозы Старом под угрозу нас дают У Вселенной есть свой план Если верить гороскопам Мы совместимы, я боюсь Данные на тайных карт И астрологов прогнозы Старо пахнут грозы в наш союз У вселенной есть свой план Если верить гороскопам Мы не совместимы, я боюсь Я боюсь Субтитры создавал DimaTorzok'

In [8]:
# Convert the sample OGG voice note to WAV next to the original file.
file_path = "5440656767170147977.ogg"
# Swap only the extension; str.replace would also rewrite ".ogg" elsewhere in the name.
wav_path = os.path.splitext(file_path)[0] + ".wav"
audio = pydub.AudioSegment.from_ogg(file_path)
# export() returns the open output file object; close it so the handle is not leaked.
audio.export(wav_path, format="wav").close()

<_io.BufferedRandom name='5440656767170147977.wav'>

In [9]:
# Transcribe the converted WAV file; return_timestamps=True requests timestamps along with the text.
result = whisper(wav_path, return_timestamps=True)

Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.
Transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English. This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`. See https://github.com/huggingface/transformers/pull/28687 for more details.


In [11]:
# Display the transcribed text of the last run.
result["text"]

' Привет! Прослушай это сообщение и установи, пожалуйста, пунктуацию здесь.'

In [5]:
# Maps the punctuation label predicted for a word to the text appended after it.
# The symbolic names are kept for backward compatibility. The bare symbols are
# the labels the configured fullstop-punctuation model is documented to emit
# ("0" means "no punctuation") -- NOTE(review): confirm against
# punctuation_model.model.config.id2label.
PUNCTUATION_MAP = {
    "COMMA": ", ",
    "PERIOD": ". ",
    "QUESTION": "? ",
    "EXCLAMATION": "! ",
    "NONE": " ",
    "0": " ",
    ".": ". ",
    ",": ", ",
    "?": "? ",
    "-": "- ",
    ":": ": ",
}

def split_text(text, max_length=500):
    """Split text on whitespace into chunks of whole words.

    Each word is charged ``len(word) + 1`` (the word plus one separator). A
    chunk is closed as soon as adding the next word would push its charge
    above ``max_length``.

    Args:
        text: The text to split.
        max_length: Maximum charge per chunk.

    Returns:
        A list of chunks, each a string of space-joined words. Empty or
        whitespace-only input yields an empty list. A single word longer than
        ``max_length`` is kept intact as its own chunk.
    """
    chunks = []
    current_chunk = []
    current_length = 0
    for word in text.split():
        word_cost = len(word) + 1
        # Only flush a non-empty chunk: an oversized first word must not
        # produce a leading empty-string chunk.
        if current_chunk and current_length + word_cost > max_length:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(word)
        current_length += word_cost
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

def restore_punctuation(text, punctuation_model):
    """Re-insert punctuation and sentence capitalization into raw text.

    The text is split with ``split_text`` and each chunk is passed to
    ``punctuation_model`` (called with ``aggregation_strategy="simple"``).
    Every predicted span is followed by the string that ``PUNCTUATION_MAP``
    assigns to its label (a single space for unknown labels). The first span
    of each chunk and any span following ".", "?" or "!" gets its first
    character upper-cased.

    Args:
        text: Text to punctuate.
        punctuation_model: Callable returning a list of prediction dicts, each
            with a "word" key and a label under "entity_group" or "entity".

    Returns:
        The punctuated text. Empty/blank input is returned unchanged, and the
        original text is returned if anything raises during processing.
    """
    if not text or not text.strip():
        logging.warning("Empty text passed to restore_punctuation")
        return text
    try:
        result = ""
        for chunk in split_text(text):
            # Predict punctuation for each chunk
            predictions = punctuation_model(chunk, aggregation_strategy="simple")
            if not predictions:
                logging.warning(f"Punctuation model returned empty result for text: {chunk}")
                result += chunk + " "
                continue
            chunk_result = ""
            capitalize_next = True  # Capitalize first word of the chunk
            for pred in predictions:
                label = None
                if isinstance(pred, dict):
                    # Aggregated predictions carry the label under
                    # "entity_group"; un-aggregated ones under "entity".
                    label = pred.get("entity_group", pred.get("entity"))
                if label is None or "word" not in pred:
                    logging.error(f"Invalid prediction format: {pred}")
                    continue
                word = pred["word"]
                if capitalize_next:
                    # Upper-case only the first character; str.capitalize()
                    # would also lower-case the rest of the span.
                    word = word[:1].upper() + word[1:]
                    capitalize_next = False
                mark = PUNCTUATION_MAP.get(label, " ")
                chunk_result += word + mark
                # Decide on the emitted mark rather than the label name so the
                # rule holds for whichever label vocabulary the map contains.
                if mark.strip() in (".", "?", "!"):
                    capitalize_next = True
            result += chunk_result + " "
        return result.strip()
    except Exception as e:
        # Best effort: a punctuation failure must not lose the transcription.
        logging.error(f"Error in punctuation restoration: {e}, text: {text}")
        return text  # Return original text on error

In [6]:
# Directory where downloaded audio and transcription files are stored.
OUTPUT_DIR = "transcriptions"
# exist_ok=True avoids the check-then-create race of a separate exists() test.
os.makedirs(OUTPUT_DIR, exist_ok=True)

async def start(update, context):
    """Answer the incoming message with the bot's usage instructions."""
    usage_text = "Отправьте голосовое сообщение (длиннее 10 минут), и я преобразую его в текст, восстановлю пунктуацию и создам саммари."
    incoming = update.message
    await incoming.reply_text(usage_text)

# The Bot API caps a text message at 4096 characters; stay a little below it.
_MAX_REPLY_CHARS = 4000


def _format_seconds(value):
    """Format a timestamp in seconds; a missing (None) boundary becomes '?'."""
    return "?" if value is None else f"{value:.2f}"


def _split_for_telegram(text, limit=_MAX_REPLY_CHARS):
    """Cut text into consecutive pieces of at most ``limit`` characters."""
    return [text[i:i + limit] for i in range(0, len(text), limit)]


async def handle_voice(update, context):
    """Transcribe a voice message, restore punctuation, summarize, and reply.

    Steps: download the voice note as OGG into OUTPUT_DIR, convert it to WAV,
    transcribe it with ``whisper``, restore punctuation for the full text and
    for every timestamped segment, summarize the text through Ollama, write a
    Markdown report into OUTPUT_DIR, send the text (split into several
    messages when long) and the report to the chat, and finally delete the
    temporary audio files -- also when a step fails.

    Args:
        update: Incoming update; ``update.message.voice`` must be set.
        context: Handler context providing ``context.bot``.
    """
    voice = update.message.voice

    file = await context.bot.get_file(voice.file_id)
    file_path = os.path.join(OUTPUT_DIR, f"voice_{voice.file_id}.ogg")
    # Swap only the extension; str.replace would rewrite every ".ogg" in the path.
    wav_path = os.path.splitext(file_path)[0] + ".wav"

    try:
        # Download voice message
        await file.download_to_drive(file_path)

        # Convert OGG to WAV for Whisper model compatibility.
        audio = pydub.AudioSegment.from_ogg(file_path)
        # export() returns the open output file object; close it so the handle
        # is not leaked (an open handle also blocks deletion on some platforms).
        audio.export(wav_path, format="wav").close()

        # NOTE(review): the model and Ollama calls below are synchronous and
        # keep the event loop busy for their whole duration -- consider moving
        # them to a worker thread if the bot must stay responsive meanwhile.

        # Transcribe audio to text
        logging.info("Starting transcription...")
        result = whisper(wav_path, return_timestamps=True)
        text = result["text"]
        chunks = result["chunks"]  # Segments with timestamps

        # Restore punctuation in transcribed text
        logging.info("Restoring punctuation...")
        punctuated_text = restore_punctuation(text, punctuation_model)

        # Format transcription with timestamps
        timestamped_lines = []
        for chunk in chunks:
            start_time = chunk["timestamp"][0]
            end_time = chunk["timestamp"][1]
            punctuated_chunk = restore_punctuation(chunk["text"], punctuation_model)
            # A boundary may be None (e.g. audio cut off mid-word); formatting
            # None with ":.2f" would raise TypeError.
            timestamped_lines.append(
                f"[{_format_seconds(start_time)} - {_format_seconds(end_time)}] {punctuated_chunk}\n"
            )
        timestamped_text = "".join(timestamped_lines)

        # Summarize text using Gemma3n via Ollama
        logging.info("Summarizing text...")
        summary_prompt = f"Write a summary of the text in 3-5 sentences in Russian:\n{punctuated_text}"
        try:
            response = ollama.generate(model="gemma3n", prompt=summary_prompt, options={"num_predict": 200})
            summary = response["response"]
        except Exception as e:
            # Best effort: still deliver the transcription without a summary.
            logging.error(f"Error during summarization: {e}")
            summary = "Error generating summary."

        # Save transcription and summary to Markdown file
        timestamp = datetime.now()
        timestamp_for_name = timestamp.strftime("%Y%m%d_%H%M%S")
        timestamp_for_summary = timestamp.strftime('%B %d, %Y')
        md_file = os.path.join(OUTPUT_DIR, f"transcription_{timestamp_for_name}.md")
        with open(md_file, "w", encoding="utf-8") as f:
            f.write(f"# Transcription of Voice Message from {timestamp_for_summary}\n\n")
            f.write("## Full Text\n")
            f.write(timestamped_text)
            f.write("\n## Summary\n")
            f.write(summary)

        # Send results to user. Long transcriptions exceed the per-message
        # limit, so the reply is delivered in consecutive parts.
        reply = f"Transcription completed. Full text:\n{punctuated_text}\n\nSummary:\n{summary}"
        for part in _split_for_telegram(reply):
            await update.message.reply_text(part)
        with open(md_file, "rb") as f:
            await context.bot.send_document(chat_id=update.message.chat_id, document=f)
    finally:
        # Clean up temporary files, including after a failure part-way through.
        for temp_path in (file_path, wav_path):
            if os.path.exists(temp_path):
                os.remove(temp_path)

In [7]:
async def main():
    """
    Set up and run the Telegram bot.

    Builds the application from TELEGRAM_BOT_TOKEN, registers the /start
    command handler and the voice-message handler, then polls for updates
    until the application is stopped.
    """
    # Initialize Telegram bot with token
    application = Application.builder().token(TELEGRAM_BOT_TOKEN).build()
    # Register command and message handlers
    application.add_handler(CommandHandler("start", start))
    application.add_handler(MessageHandler(filters.VOICE, handle_voice))
    # Start polling for updates. run_polling() is a regular blocking method
    # that drives the event loop itself and returns None, so awaiting its
    # result raises TypeError once polling stops. Calling it from inside this
    # coroutine relies on the nest_asyncio.apply() done earlier in the notebook.
    # close_loop=False keeps the surrounding, still-running loop open on shutdown.
    application.run_polling(close_loop=False)